{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.05374754776813308, "eval_steps": 500, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 5.374754776813308e-05, "grad_norm": 9.018385887145996, "learning_rate": 5e-09, "logits/chosen": -0.5629481673240662, "logits/rejected": -0.9253309965133667, "logps/chosen": -80.73175048828125, "logps/rejected": -120.37657165527344, "loss": 1.3863, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.00010749509553626615, "grad_norm": 8.409211158752441, "learning_rate": 1e-08, "logits/chosen": -0.6839612722396851, "logits/rejected": -0.8808996677398682, "logps/chosen": -89.26901245117188, "logps/rejected": -102.36270904541016, "loss": 1.3863, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2 }, { "epoch": 0.00016124264330439922, "grad_norm": 10.9932279586792, "learning_rate": 1.5e-08, "logits/chosen": -0.39078935980796814, "logits/rejected": -1.0275822877883911, "logps/chosen": -91.3505859375, "logps/rejected": -113.90643310546875, "loss": 1.3638, "rewards/accuracies": 0.625, "rewards/chosen": 0.016756106168031693, "rewards/margins": 0.023180630058050156, "rewards/rejected": -0.006424522493034601, "step": 3 }, { "epoch": 0.0002149901910725323, "grad_norm": 9.944060325622559, "learning_rate": 2e-08, "logits/chosen": -0.663591206073761, "logits/rejected": -1.0442650318145752, "logps/chosen": -68.45164489746094, "logps/rejected": -113.84129333496094, "loss": 1.3984, "rewards/accuracies": 0.5, "rewards/chosen": 0.001784514868631959, "rewards/margins": -0.011848259717226028, "rewards/rejected": 0.013632774353027344, "step": 4 }, { "epoch": 0.0002687377388406654, "grad_norm": 8.76212215423584, "learning_rate": 2.5e-08, "logits/chosen": -0.5241557955741882, "logits/rejected": -0.9429107904434204, "logps/chosen": -79.99862670898438, "logps/rejected": -97.68046569824219, "loss": 1.366, "rewards/accuracies": 0.875, "rewards/chosen": 0.005249070934951305, "rewards/margins": 0.020516324788331985, "rewards/rejected": -0.015267252922058105, "step": 5 }, { "epoch": 0.00032248528660879845, "grad_norm": 9.241307258605957, "learning_rate": 3e-08, "logits/chosen": -0.554192841053009, "logits/rejected": -0.9173457026481628, "logps/chosen": -93.0028076171875, "logps/rejected": -127.36787414550781, "loss": 1.379, "rewards/accuracies": 0.625, "rewards/chosen": 0.0008242604089900851, "rewards/margins": 0.007851887494325638, "rewards/rejected": -0.007027626968920231, "step": 6 }, { "epoch": 0.00037623283437693153, "grad_norm": 10.087677955627441, "learning_rate": 3.5e-08, "logits/chosen": -0.5847232341766357, "logits/rejected": -0.7745344042778015, "logps/chosen": -84.73774719238281, "logps/rejected": -112.95906066894531, "loss": 1.3738, "rewards/accuracies": 0.5, "rewards/chosen": -0.0006923675537109375, "rewards/margins": 0.013265704736113548, "rewards/rejected": -0.013958072289824486, "step": 7 }, { "epoch": 0.0004299803821450646, "grad_norm": 7.152434349060059, "learning_rate": 4e-08, "logits/chosen": -0.6588990688323975, "logits/rejected": -0.7417616844177246, "logps/chosen": -82.9864273071289, "logps/rejected": -89.05368041992188, "loss": 1.3747, "rewards/accuracies": 0.75, "rewards/chosen": 0.0030961036682128906, "rewards/margins": 0.011989163234829903, "rewards/rejected": -0.008893061429262161, "step": 8 }, { "epoch": 0.0004837279299131977, "grad_norm": 9.751290321350098, "learning_rate": 4.5e-08, "logits/chosen": -0.49971503019332886, "logits/rejected": -0.7068791389465332, "logps/chosen": -83.61515808105469, "logps/rejected": -109.05160522460938, "loss": 1.409, "rewards/accuracies": 0.375, "rewards/chosen": 0.0018943313043564558, "rewards/margins": -0.02229447290301323, "rewards/rejected": 0.024188805371522903, "step": 9 }, { "epoch": 0.0005374754776813308, "grad_norm": 10.492074966430664, "learning_rate": 5e-08, "logits/chosen": -0.5029646158218384, "logits/rejected": -0.8592510223388672, "logps/chosen": -96.91883850097656, "logps/rejected": -131.0804443359375, "loss": 1.3865, "rewards/accuracies": 0.625, "rewards/chosen": 0.009151006117463112, "rewards/margins": 7.531698793172836e-05, "rewards/rejected": 0.009075689129531384, "step": 10 }, { "epoch": 0.0005912230254494639, "grad_norm": 8.155610084533691, "learning_rate": 5.4999999999999996e-08, "logits/chosen": -0.5331442356109619, "logits/rejected": -0.8224359750747681, "logps/chosen": -88.8160400390625, "logps/rejected": -112.42459106445312, "loss": 1.3885, "rewards/accuracies": 0.5, "rewards/chosen": 0.008426809683442116, "rewards/margins": -0.001995563507080078, "rewards/rejected": 0.010422373190522194, "step": 11 }, { "epoch": 0.0006449705732175969, "grad_norm": 7.364086151123047, "learning_rate": 6e-08, "logits/chosen": -0.5735334157943726, "logits/rejected": -1.0129125118255615, "logps/chosen": -77.84087371826172, "logps/rejected": -89.80377960205078, "loss": 1.3722, "rewards/accuracies": 0.5, "rewards/chosen": -0.0021553754340857267, "rewards/margins": 0.014481187798082829, "rewards/rejected": -0.01663656160235405, "step": 12 }, { "epoch": 0.00069871812098573, "grad_norm": 8.48727798461914, "learning_rate": 6.5e-08, "logits/chosen": -0.7501606941223145, "logits/rejected": -0.9889898896217346, "logps/chosen": -84.73579406738281, "logps/rejected": -113.06980895996094, "loss": 1.3603, "rewards/accuracies": 0.75, "rewards/chosen": 0.015059733763337135, "rewards/margins": 0.02650745026767254, "rewards/rejected": -0.011447716504335403, "step": 13 }, { "epoch": 0.0007524656687538631, "grad_norm": 7.543310165405273, "learning_rate": 7e-08, "logits/chosen": -0.5986064672470093, "logits/rejected": -1.0369253158569336, "logps/chosen": -76.786376953125, "logps/rejected": -95.89421844482422, "loss": 1.3896, "rewards/accuracies": 0.375, "rewards/chosen": 0.01045694388449192, "rewards/margins": -0.003253889037296176, "rewards/rejected": 0.013710832223296165, "step": 14 }, { "epoch": 0.0008062132165219961, "grad_norm": 8.725497245788574, "learning_rate": 7.5e-08, "logits/chosen": -0.40047967433929443, "logits/rejected": -0.42519611120224, "logps/chosen": -91.95352172851562, "logps/rejected": -109.30005645751953, "loss": 1.3812, "rewards/accuracies": 0.5, "rewards/chosen": 0.011234855279326439, "rewards/margins": 0.005371189676225185, "rewards/rejected": 0.005863667465746403, "step": 15 }, { "epoch": 0.0008599607642901292, "grad_norm": 8.531486511230469, "learning_rate": 8e-08, "logits/chosen": -0.4561946988105774, "logits/rejected": -0.715956449508667, "logps/chosen": -75.9218521118164, "logps/rejected": -120.33627319335938, "loss": 1.4012, "rewards/accuracies": 0.375, "rewards/chosen": 0.003715181490406394, "rewards/margins": -0.014596652239561081, "rewards/rejected": 0.01831183396279812, "step": 16 }, { "epoch": 0.0009137083120582623, "grad_norm": 8.766817092895508, "learning_rate": 8.500000000000001e-08, "logits/chosen": -0.5665198564529419, "logits/rejected": -0.5304955244064331, "logps/chosen": -100.96295166015625, "logps/rejected": -117.26142120361328, "loss": 1.3759, "rewards/accuracies": 0.625, "rewards/chosen": 0.00030202907510101795, "rewards/margins": 0.010729026980698109, "rewards/rejected": -0.01042699720710516, "step": 17 }, { "epoch": 0.0009674558598263954, "grad_norm": 9.407784461975098, "learning_rate": 9e-08, "logits/chosen": -0.7044994831085205, "logits/rejected": -0.8162285685539246, "logps/chosen": -75.76687622070312, "logps/rejected": -113.43864440917969, "loss": 1.3908, "rewards/accuracies": 0.625, "rewards/chosen": -0.0002802375238388777, "rewards/margins": -0.004327821545302868, "rewards/rejected": 0.004047584254294634, "step": 18 }, { "epoch": 0.0010212034075945285, "grad_norm": 7.906011581420898, "learning_rate": 9.499999999999999e-08, "logits/chosen": -0.6232011318206787, "logits/rejected": -0.901610791683197, "logps/chosen": -80.9669189453125, "logps/rejected": -119.91120910644531, "loss": 1.3763, "rewards/accuracies": 0.875, "rewards/chosen": 0.008800078183412552, "rewards/margins": 0.010257960297167301, "rewards/rejected": -0.0014578821137547493, "step": 19 }, { "epoch": 0.0010749509553626617, "grad_norm": 8.806061744689941, "learning_rate": 1e-07, "logits/chosen": -0.4426119029521942, "logits/rejected": -1.1311769485473633, "logps/chosen": -79.48087310791016, "logps/rejected": -113.45155334472656, "loss": 1.3945, "rewards/accuracies": 0.25, "rewards/chosen": -0.0027163506019860506, "rewards/margins": -0.00813446007668972, "rewards/rejected": 0.005418109707534313, "step": 20 }, { "epoch": 0.0011286985031307946, "grad_norm": 8.660501480102539, "learning_rate": 1.0499999999999999e-07, "logits/chosen": -0.5322842001914978, "logits/rejected": -0.8683191537857056, "logps/chosen": -81.68772888183594, "logps/rejected": -115.72924041748047, "loss": 1.3929, "rewards/accuracies": 0.25, "rewards/chosen": -0.012896967120468616, "rewards/margins": -0.006387948989868164, "rewards/rejected": -0.006509017664939165, "step": 21 }, { "epoch": 0.0011824460508989278, "grad_norm": 7.001013278961182, "learning_rate": 1.0999999999999999e-07, "logits/chosen": -0.5830490589141846, "logits/rejected": -0.8426412343978882, "logps/chosen": -59.00617218017578, "logps/rejected": -77.81758880615234, "loss": 1.3919, "rewards/accuracies": 0.5, "rewards/chosen": 0.0033678063191473484, "rewards/margins": -0.005130481906235218, "rewards/rejected": 0.00849828775972128, "step": 22 }, { "epoch": 0.0012361935986670608, "grad_norm": 8.08579158782959, "learning_rate": 1.15e-07, "logits/chosen": -0.5881620645523071, "logits/rejected": -0.8506753444671631, "logps/chosen": -85.85636901855469, "logps/rejected": -94.69215393066406, "loss": 1.383, "rewards/accuracies": 0.25, "rewards/chosen": 0.0047182561829686165, "rewards/margins": 0.0037292949855327606, "rewards/rejected": 0.0009889607317745686, "step": 23 }, { "epoch": 0.0012899411464351938, "grad_norm": 8.288750648498535, "learning_rate": 1.2e-07, "logits/chosen": -0.34049859642982483, "logits/rejected": -0.704309344291687, "logps/chosen": -93.78474426269531, "logps/rejected": -129.32717895507812, "loss": 1.3858, "rewards/accuracies": 0.375, "rewards/chosen": 0.0014446261338889599, "rewards/margins": 0.0009137153392657638, "rewards/rejected": 0.0005309106782078743, "step": 24 }, { "epoch": 0.001343688694203327, "grad_norm": 6.940954685211182, "learning_rate": 1.25e-07, "logits/chosen": -0.7932472229003906, "logits/rejected": -0.9097551107406616, "logps/chosen": -65.62593078613281, "logps/rejected": -92.70841979980469, "loss": 1.3893, "rewards/accuracies": 0.25, "rewards/chosen": 0.012067246250808239, "rewards/margins": -0.002649521455168724, "rewards/rejected": 0.014716767705976963, "step": 25 }, { "epoch": 0.00139743624197146, "grad_norm": 9.167326927185059, "learning_rate": 1.3e-07, "logits/chosen": -0.5547146201133728, "logits/rejected": -1.0815478563308716, "logps/chosen": -100.64911651611328, "logps/rejected": -103.84555053710938, "loss": 1.3763, "rewards/accuracies": 0.625, "rewards/chosen": 0.00014717550948262215, "rewards/margins": 0.010152269154787064, "rewards/rejected": -0.010005094110965729, "step": 26 }, { "epoch": 0.0014511837897395932, "grad_norm": 8.25241470336914, "learning_rate": 1.35e-07, "logits/chosen": -0.5232434868812561, "logits/rejected": -0.6390266418457031, "logps/chosen": -91.79365539550781, "logps/rejected": -116.89897155761719, "loss": 1.3924, "rewards/accuracies": 0.375, "rewards/chosen": -0.0075927735306322575, "rewards/margins": -0.005957889836281538, "rewards/rejected": -0.0016348841600120068, "step": 27 }, { "epoch": 0.0015049313375077261, "grad_norm": 9.182291984558105, "learning_rate": 1.4e-07, "logits/chosen": -0.39191919565200806, "logits/rejected": -0.7724546194076538, "logps/chosen": -96.36701202392578, "logps/rejected": -110.5954360961914, "loss": 1.3972, "rewards/accuracies": 0.5, "rewards/chosen": -0.008513402193784714, "rewards/margins": -0.010582972317934036, "rewards/rejected": 0.002069568494334817, "step": 28 }, { "epoch": 0.0015586788852758593, "grad_norm": 9.077556610107422, "learning_rate": 1.45e-07, "logits/chosen": -0.5273479223251343, "logits/rejected": -0.6808032989501953, "logps/chosen": -98.66131591796875, "logps/rejected": -151.17465209960938, "loss": 1.3878, "rewards/accuracies": 0.5, "rewards/chosen": 0.0006317612715065479, "rewards/margins": -0.0013585565611720085, "rewards/rejected": 0.0019903180655092, "step": 29 }, { "epoch": 0.0016124264330439923, "grad_norm": 7.897831916809082, "learning_rate": 1.5e-07, "logits/chosen": -0.7506681680679321, "logits/rejected": -1.0439293384552002, "logps/chosen": -70.08816528320312, "logps/rejected": -110.37745666503906, "loss": 1.3898, "rewards/accuracies": 0.375, "rewards/chosen": 0.0013779643923044205, "rewards/margins": -0.003206300549209118, "rewards/rejected": 0.004584264941513538, "step": 30 }, { "epoch": 0.0016661739808121255, "grad_norm": 8.56389045715332, "learning_rate": 1.55e-07, "logits/chosen": -0.5107775330543518, "logits/rejected": -0.7187220454216003, "logps/chosen": -106.04706573486328, "logps/rejected": -110.80841064453125, "loss": 1.3777, "rewards/accuracies": 0.625, "rewards/chosen": 0.006948089227080345, "rewards/margins": 0.008790111169219017, "rewards/rejected": -0.0018420221749693155, "step": 31 }, { "epoch": 0.0017199215285802585, "grad_norm": 7.16372013092041, "learning_rate": 1.6e-07, "logits/chosen": -0.6406233906745911, "logits/rejected": -0.7985771894454956, "logps/chosen": -79.37033081054688, "logps/rejected": -100.94114685058594, "loss": 1.4107, "rewards/accuracies": 0.25, "rewards/chosen": -0.006321430206298828, "rewards/margins": -0.024100303649902344, "rewards/rejected": 0.017778873443603516, "step": 32 }, { "epoch": 0.0017736690763483917, "grad_norm": 7.901755332946777, "learning_rate": 1.65e-07, "logits/chosen": -0.777077317237854, "logits/rejected": -0.8622071743011475, "logps/chosen": -67.00970458984375, "logps/rejected": -92.73141479492188, "loss": 1.3792, "rewards/accuracies": 0.5, "rewards/chosen": 0.0007941480726003647, "rewards/margins": 0.007220578379929066, "rewards/rejected": -0.006426429376006126, "step": 33 }, { "epoch": 0.0018274166241165246, "grad_norm": 7.719196796417236, "learning_rate": 1.7000000000000001e-07, "logits/chosen": -0.2993572950363159, "logits/rejected": -0.6201244592666626, "logps/chosen": -85.47561645507812, "logps/rejected": -100.66415405273438, "loss": 1.3969, "rewards/accuracies": 0.375, "rewards/chosen": -0.003897094866260886, "rewards/margins": -0.01031260471791029, "rewards/rejected": 0.0064155105501413345, "step": 34 }, { "epoch": 0.0018811641718846578, "grad_norm": 11.600595474243164, "learning_rate": 1.75e-07, "logits/chosen": -0.6471947431564331, "logits/rejected": -0.9437248706817627, "logps/chosen": -112.87066650390625, "logps/rejected": -130.8255615234375, "loss": 1.3927, "rewards/accuracies": 0.375, "rewards/chosen": -0.015840481966733932, "rewards/margins": -0.005790282040834427, "rewards/rejected": -0.010050201788544655, "step": 35 }, { "epoch": 0.0019349117196527908, "grad_norm": 8.569315910339355, "learning_rate": 1.8e-07, "logits/chosen": -0.5607515573501587, "logits/rejected": -0.9741392731666565, "logps/chosen": -89.04241180419922, "logps/rejected": -141.5797119140625, "loss": 1.3852, "rewards/accuracies": 0.5, "rewards/chosen": 0.009252786636352539, "rewards/margins": 0.0013018138706684113, "rewards/rejected": 0.007950973697006702, "step": 36 }, { "epoch": 0.0019886592674209238, "grad_norm": 7.82846736907959, "learning_rate": 1.85e-07, "logits/chosen": -0.3905686140060425, "logits/rejected": -1.1150908470153809, "logps/chosen": -89.02799987792969, "logps/rejected": -101.53260803222656, "loss": 1.3894, "rewards/accuracies": 0.375, "rewards/chosen": -0.007035780698060989, "rewards/margins": -0.002919721882790327, "rewards/rejected": -0.004116058815270662, "step": 37 }, { "epoch": 0.002042406815189057, "grad_norm": 8.96768569946289, "learning_rate": 1.8999999999999998e-07, "logits/chosen": -0.4186624586582184, "logits/rejected": -0.7087715864181519, "logps/chosen": -78.67716217041016, "logps/rejected": -105.57879638671875, "loss": 1.3895, "rewards/accuracies": 0.25, "rewards/chosen": 0.0027568337973207235, "rewards/margins": -0.003093290375545621, "rewards/rejected": 0.0058501241728663445, "step": 38 }, { "epoch": 0.00209615436295719, "grad_norm": 7.81805419921875, "learning_rate": 1.9499999999999999e-07, "logits/chosen": -0.7153301239013672, "logits/rejected": -0.6971107721328735, "logps/chosen": -72.32463836669922, "logps/rejected": -105.63674926757812, "loss": 1.3729, "rewards/accuracies": 0.625, "rewards/chosen": 0.009188270196318626, "rewards/margins": 0.01365575846284628, "rewards/rejected": -0.004467487800866365, "step": 39 }, { "epoch": 0.0021499019107253233, "grad_norm": 8.586557388305664, "learning_rate": 2e-07, "logits/chosen": -0.5092922449111938, "logits/rejected": -0.8808071613311768, "logps/chosen": -81.26252746582031, "logps/rejected": -105.72013092041016, "loss": 1.3932, "rewards/accuracies": 0.25, "rewards/chosen": -0.01609921269118786, "rewards/margins": -0.006821251008659601, "rewards/rejected": -0.009277964010834694, "step": 40 }, { "epoch": 0.002203649458493456, "grad_norm": 7.373818397521973, "learning_rate": 2.0499999999999997e-07, "logits/chosen": -0.49268585443496704, "logits/rejected": -0.7908859252929688, "logps/chosen": -86.733642578125, "logps/rejected": -95.32139587402344, "loss": 1.369, "rewards/accuracies": 0.75, "rewards/chosen": 0.009233569726347923, "rewards/margins": 0.017481137067079544, "rewards/rejected": -0.00824756734073162, "step": 41 }, { "epoch": 0.0022573970062615893, "grad_norm": 8.270150184631348, "learning_rate": 2.0999999999999997e-07, "logits/chosen": -0.5285747051239014, "logits/rejected": -0.722440242767334, "logps/chosen": -83.8260726928711, "logps/rejected": -125.6030502319336, "loss": 1.385, "rewards/accuracies": 0.5, "rewards/chosen": 0.007476449478417635, "rewards/margins": 0.0014731173869222403, "rewards/rejected": 0.006003332324326038, "step": 42 }, { "epoch": 0.0023111445540297225, "grad_norm": 10.629551887512207, "learning_rate": 2.1499999999999998e-07, "logits/chosen": -0.49013882875442505, "logits/rejected": -0.6852960586547852, "logps/chosen": -105.13156127929688, "logps/rejected": -123.69252014160156, "loss": 1.3746, "rewards/accuracies": 0.5, "rewards/chosen": 0.009237194433808327, "rewards/margins": 0.012375927530229092, "rewards/rejected": -0.0031387328635901213, "step": 43 }, { "epoch": 0.0023648921017978557, "grad_norm": 7.858402252197266, "learning_rate": 2.1999999999999998e-07, "logits/chosen": -0.5702916383743286, "logits/rejected": -0.8965415954589844, "logps/chosen": -68.97383117675781, "logps/rejected": -101.99735260009766, "loss": 1.3733, "rewards/accuracies": 0.625, "rewards/chosen": 0.005028081592172384, "rewards/margins": 0.013224005699157715, "rewards/rejected": -0.008195924572646618, "step": 44 }, { "epoch": 0.0024186396495659884, "grad_norm": 8.891136169433594, "learning_rate": 2.25e-07, "logits/chosen": -0.6039612293243408, "logits/rejected": -1.0555522441864014, "logps/chosen": -93.02165222167969, "logps/rejected": -129.453857421875, "loss": 1.3752, "rewards/accuracies": 0.625, "rewards/chosen": 0.009529518894851208, "rewards/margins": 0.011351561173796654, "rewards/rejected": -0.001822042977437377, "step": 45 }, { "epoch": 0.0024723871973341216, "grad_norm": 9.362869262695312, "learning_rate": 2.3e-07, "logits/chosen": -0.7231209874153137, "logits/rejected": -0.6981338858604431, "logps/chosen": -94.05708312988281, "logps/rejected": -139.45484924316406, "loss": 1.3876, "rewards/accuracies": 0.625, "rewards/chosen": 0.02146873250603676, "rewards/margins": -0.0009884836617857218, "rewards/rejected": 0.022457217797636986, "step": 46 }, { "epoch": 0.002526134745102255, "grad_norm": 8.48691177368164, "learning_rate": 2.3499999999999997e-07, "logits/chosen": -0.7336191534996033, "logits/rejected": -0.9868717193603516, "logps/chosen": -75.6761474609375, "logps/rejected": -105.52317810058594, "loss": 1.3536, "rewards/accuracies": 0.875, "rewards/chosen": 0.00992603413760662, "rewards/margins": 0.03316822275519371, "rewards/rejected": -0.02324218861758709, "step": 47 }, { "epoch": 0.0025798822928703876, "grad_norm": 7.733593940734863, "learning_rate": 2.4e-07, "logits/chosen": -0.5540213584899902, "logits/rejected": -0.8223700523376465, "logps/chosen": -71.37220001220703, "logps/rejected": -81.74217224121094, "loss": 1.4, "rewards/accuracies": 0.5, "rewards/chosen": -0.0020942208357155323, "rewards/margins": -0.013409614562988281, "rewards/rejected": 0.011315394192934036, "step": 48 }, { "epoch": 0.0026336298406385208, "grad_norm": 9.125642776489258, "learning_rate": 2.45e-07, "logits/chosen": -0.49975132942199707, "logits/rejected": -0.7237377166748047, "logps/chosen": -79.24974822998047, "logps/rejected": -149.38690185546875, "loss": 1.3972, "rewards/accuracies": 0.375, "rewards/chosen": -0.013892841525375843, "rewards/margins": -0.010572625324130058, "rewards/rejected": -0.0033202162012457848, "step": 49 }, { "epoch": 0.002687377388406654, "grad_norm": 7.797073841094971, "learning_rate": 2.5e-07, "logits/chosen": -0.7408368587493896, "logits/rejected": -0.6117101907730103, "logps/chosen": -96.25028228759766, "logps/rejected": -127.87377166748047, "loss": 1.3682, "rewards/accuracies": 0.75, "rewards/chosen": 0.006863451562821865, "rewards/margins": 0.018326900899410248, "rewards/rejected": -0.011463451199233532, "step": 50 }, { "epoch": 0.002741124936174787, "grad_norm": 8.364821434020996, "learning_rate": 2.55e-07, "logits/chosen": -0.5893720388412476, "logits/rejected": -0.9179937243461609, "logps/chosen": -94.17994689941406, "logps/rejected": -111.77970123291016, "loss": 1.4037, "rewards/accuracies": 0.375, "rewards/chosen": -0.005445909686386585, "rewards/margins": -0.016887545585632324, "rewards/rejected": 0.011441635899245739, "step": 51 }, { "epoch": 0.00279487248394292, "grad_norm": 8.976974487304688, "learning_rate": 2.6e-07, "logits/chosen": -0.5119687914848328, "logits/rejected": -0.8165309429168701, "logps/chosen": -82.34027099609375, "logps/rejected": -134.41204833984375, "loss": 1.3866, "rewards/accuracies": 0.375, "rewards/chosen": 0.0027532577514648438, "rewards/margins": -0.00021648360416293144, "rewards/rejected": 0.002969742054119706, "step": 52 }, { "epoch": 0.002848620031711053, "grad_norm": 9.535009384155273, "learning_rate": 2.65e-07, "logits/chosen": -0.48858749866485596, "logits/rejected": -0.7641533613204956, "logps/chosen": -108.45321655273438, "logps/rejected": -134.2806854248047, "loss": 1.388, "rewards/accuracies": 0.5, "rewards/chosen": -0.0011934274807572365, "rewards/margins": -0.0014282232150435448, "rewards/rejected": 0.00023479480296373367, "step": 53 }, { "epoch": 0.0029023675794791863, "grad_norm": 7.457547187805176, "learning_rate": 2.7e-07, "logits/chosen": -0.6441543698310852, "logits/rejected": -0.5924786329269409, "logps/chosen": -88.5523910522461, "logps/rejected": -101.81995391845703, "loss": 1.3794, "rewards/accuracies": 0.625, "rewards/chosen": -0.0008887768490239978, "rewards/margins": 0.007016850169748068, "rewards/rejected": -0.0079056266695261, "step": 54 }, { "epoch": 0.0029561151272473195, "grad_norm": 9.02698040008545, "learning_rate": 2.75e-07, "logits/chosen": -0.468053936958313, "logits/rejected": -0.9804271459579468, "logps/chosen": -94.00070190429688, "logps/rejected": -149.0098419189453, "loss": 1.4167, "rewards/accuracies": 0.25, "rewards/chosen": -0.015568828210234642, "rewards/margins": -0.029503628611564636, "rewards/rejected": 0.013934802263975143, "step": 55 }, { "epoch": 0.0030098626750154523, "grad_norm": 10.612890243530273, "learning_rate": 2.8e-07, "logits/chosen": -0.6267477869987488, "logits/rejected": -0.8941481113433838, "logps/chosen": -84.3619155883789, "logps/rejected": -106.56355285644531, "loss": 1.3721, "rewards/accuracies": 0.75, "rewards/chosen": 0.010368634015321732, "rewards/margins": 0.014438199810683727, "rewards/rejected": -0.004069566261023283, "step": 56 }, { "epoch": 0.0030636102227835854, "grad_norm": 9.122997283935547, "learning_rate": 2.8499999999999997e-07, "logits/chosen": -0.4833652377128601, "logits/rejected": -0.5830209255218506, "logps/chosen": -96.44157409667969, "logps/rejected": -137.86610412597656, "loss": 1.3913, "rewards/accuracies": 0.625, "rewards/chosen": -0.00467534177005291, "rewards/margins": -0.004872847348451614, "rewards/rejected": 0.0001975062768906355, "step": 57 }, { "epoch": 0.0031173577705517186, "grad_norm": 7.18145227432251, "learning_rate": 2.9e-07, "logits/chosen": -0.5169291496276855, "logits/rejected": -0.7388797402381897, "logps/chosen": -77.57707214355469, "logps/rejected": -91.94204711914062, "loss": 1.3668, "rewards/accuracies": 0.75, "rewards/chosen": 0.019289113581180573, "rewards/margins": 0.019966794177889824, "rewards/rejected": -0.0006776812952011824, "step": 58 }, { "epoch": 0.003171105318319852, "grad_norm": 8.925081253051758, "learning_rate": 2.95e-07, "logits/chosen": -0.6823402643203735, "logits/rejected": -0.8340035676956177, "logps/chosen": -93.4341812133789, "logps/rejected": -125.25653076171875, "loss": 1.402, "rewards/accuracies": 0.375, "rewards/chosen": -0.018242502585053444, "rewards/margins": -0.015286922454833984, "rewards/rejected": -0.0029555796645581722, "step": 59 }, { "epoch": 0.0032248528660879846, "grad_norm": 8.46120834350586, "learning_rate": 3e-07, "logits/chosen": -0.5364607572555542, "logits/rejected": -0.74739670753479, "logps/chosen": -73.58871459960938, "logps/rejected": -86.86251831054688, "loss": 1.4047, "rewards/accuracies": 0.25, "rewards/chosen": -0.010658741928637028, "rewards/margins": -0.018239475786685944, "rewards/rejected": 0.007580732926726341, "step": 60 }, { "epoch": 0.0032786004138561178, "grad_norm": 11.018837928771973, "learning_rate": 3.05e-07, "logits/chosen": -0.529459536075592, "logits/rejected": -1.0881057977676392, "logps/chosen": -92.7784652709961, "logps/rejected": -140.6796875, "loss": 1.3855, "rewards/accuracies": 0.375, "rewards/chosen": -0.007517814636230469, "rewards/margins": 0.0014034267514944077, "rewards/rejected": -0.008921242319047451, "step": 61 }, { "epoch": 0.003332347961624251, "grad_norm": 7.908779621124268, "learning_rate": 3.1e-07, "logits/chosen": -0.3675628900527954, "logits/rejected": -0.7041562795639038, "logps/chosen": -72.2803955078125, "logps/rejected": -98.57533264160156, "loss": 1.3855, "rewards/accuracies": 0.625, "rewards/chosen": -0.0011669636005535722, "rewards/margins": 0.0009401319548487663, "rewards/rejected": -0.0021070956718176603, "step": 62 }, { "epoch": 0.003386095509392384, "grad_norm": 10.110648155212402, "learning_rate": 3.15e-07, "logits/chosen": -0.7167191505432129, "logits/rejected": -0.7103843688964844, "logps/chosen": -72.10398864746094, "logps/rejected": -112.51895904541016, "loss": 1.3809, "rewards/accuracies": 0.5, "rewards/chosen": 0.004012250807136297, "rewards/margins": 0.005618619732558727, "rewards/rejected": -0.0016063684597611427, "step": 63 }, { "epoch": 0.003439843057160517, "grad_norm": 9.034467697143555, "learning_rate": 3.2e-07, "logits/chosen": -0.7524758577346802, "logits/rejected": -0.7390173673629761, "logps/chosen": -71.0537109375, "logps/rejected": -110.75092315673828, "loss": 1.3879, "rewards/accuracies": 0.625, "rewards/chosen": -0.003200674429535866, "rewards/margins": -0.0012365346774458885, "rewards/rejected": -0.0019641402177512646, "step": 64 }, { "epoch": 0.00349359060492865, "grad_norm": 6.976632118225098, "learning_rate": 3.25e-07, "logits/chosen": -0.8358469009399414, "logits/rejected": -1.016180157661438, "logps/chosen": -66.08577728271484, "logps/rejected": -89.9134750366211, "loss": 1.3842, "rewards/accuracies": 0.375, "rewards/chosen": 0.010739064775407314, "rewards/margins": 0.0022439719177782536, "rewards/rejected": 0.008495092391967773, "step": 65 }, { "epoch": 0.0035473381526967833, "grad_norm": 8.591444969177246, "learning_rate": 3.3e-07, "logits/chosen": -0.6935573220252991, "logits/rejected": -0.6134279370307922, "logps/chosen": -88.94845581054688, "logps/rejected": -104.41876220703125, "loss": 1.3835, "rewards/accuracies": 0.5, "rewards/chosen": 0.0005981442518532276, "rewards/margins": 0.0029290677048265934, "rewards/rejected": -0.002330922521650791, "step": 66 }, { "epoch": 0.0036010857004649165, "grad_norm": 7.711324214935303, "learning_rate": 3.35e-07, "logits/chosen": -0.7065850496292114, "logits/rejected": -0.5953470468521118, "logps/chosen": -89.66769409179688, "logps/rejected": -95.72993469238281, "loss": 1.3829, "rewards/accuracies": 0.375, "rewards/chosen": 0.0043515912257134914, "rewards/margins": 0.0034556628670543432, "rewards/rejected": 0.0008959295228123665, "step": 67 }, { "epoch": 0.0036548332482330493, "grad_norm": 9.339879035949707, "learning_rate": 3.4000000000000003e-07, "logits/chosen": -0.5466330051422119, "logits/rejected": -0.9444475173950195, "logps/chosen": -88.27287292480469, "logps/rejected": -127.44645690917969, "loss": 1.387, "rewards/accuracies": 0.625, "rewards/chosen": -0.0009190088603645563, "rewards/margins": -0.0006193636218085885, "rewards/rejected": -0.0002996448893100023, "step": 68 }, { "epoch": 0.0037085807960011825, "grad_norm": 9.99643611907959, "learning_rate": 3.45e-07, "logits/chosen": -0.633792519569397, "logits/rejected": -0.6285285949707031, "logps/chosen": -102.36030578613281, "logps/rejected": -129.0988311767578, "loss": 1.3821, "rewards/accuracies": 0.375, "rewards/chosen": -0.0004914281889796257, "rewards/margins": 0.0044272420927882195, "rewards/rejected": -0.0049186707474291325, "step": 69 }, { "epoch": 0.0037623283437693156, "grad_norm": 9.309488296508789, "learning_rate": 3.5e-07, "logits/chosen": -0.3174594044685364, "logits/rejected": -0.789804995059967, "logps/chosen": -84.69456481933594, "logps/rejected": -107.91668701171875, "loss": 1.3789, "rewards/accuracies": 0.5, "rewards/chosen": 0.012507820501923561, "rewards/margins": 0.007670545484870672, "rewards/rejected": 0.0048372745513916016, "step": 70 }, { "epoch": 0.0038160758915374484, "grad_norm": 8.66744613647461, "learning_rate": 3.55e-07, "logits/chosen": -0.8024520874023438, "logits/rejected": -0.7307050228118896, "logps/chosen": -76.72541809082031, "logps/rejected": -106.57899475097656, "loss": 1.3894, "rewards/accuracies": 0.375, "rewards/chosen": 0.012390756979584694, "rewards/margins": -0.0021251197904348373, "rewards/rejected": 0.014515876770019531, "step": 71 }, { "epoch": 0.0038698234393055816, "grad_norm": 9.889826774597168, "learning_rate": 3.6e-07, "logits/chosen": -0.5179650783538818, "logits/rejected": -0.8690033555030823, "logps/chosen": -81.47711181640625, "logps/rejected": -150.914306640625, "loss": 1.3664, "rewards/accuracies": 0.75, "rewards/chosen": 0.004861020483076572, "rewards/margins": 0.020183132961392403, "rewards/rejected": -0.015322113409638405, "step": 72 }, { "epoch": 0.003923570987073715, "grad_norm": 8.243026733398438, "learning_rate": 3.65e-07, "logits/chosen": -0.3261711597442627, "logits/rejected": -0.6148155331611633, "logps/chosen": -84.84300231933594, "logps/rejected": -118.35273742675781, "loss": 1.3865, "rewards/accuracies": 0.625, "rewards/chosen": 0.009526778012514114, "rewards/margins": 0.00015711761079728603, "rewards/rejected": 0.00936965923756361, "step": 73 }, { "epoch": 0.0039773185348418475, "grad_norm": 9.10533332824707, "learning_rate": 3.7e-07, "logits/chosen": -0.5265448689460754, "logits/rejected": -0.4441685676574707, "logps/chosen": -101.1104736328125, "logps/rejected": -118.29222106933594, "loss": 1.3782, "rewards/accuracies": 0.5, "rewards/chosen": 0.01149735413491726, "rewards/margins": 0.008614778518676758, "rewards/rejected": 0.002882576547563076, "step": 74 }, { "epoch": 0.004031066082609981, "grad_norm": 8.713000297546387, "learning_rate": 3.75e-07, "logits/chosen": -0.8108004331588745, "logits/rejected": -0.740679144859314, "logps/chosen": -93.86286926269531, "logps/rejected": -128.41632080078125, "loss": 1.3941, "rewards/accuracies": 0.375, "rewards/chosen": -0.0016858575399965048, "rewards/margins": -0.007669306360185146, "rewards/rejected": 0.005983447656035423, "step": 75 }, { "epoch": 0.004084813630378114, "grad_norm": 9.181607246398926, "learning_rate": 3.7999999999999996e-07, "logits/chosen": -0.47283488512039185, "logits/rejected": -0.935647189617157, "logps/chosen": -95.63761901855469, "logps/rejected": -115.92280578613281, "loss": 1.3933, "rewards/accuracies": 0.5, "rewards/chosen": -0.0005431172903627157, "rewards/margins": -0.006902456283569336, "rewards/rejected": 0.006359338294714689, "step": 76 }, { "epoch": 0.004138561178146247, "grad_norm": 6.962100982666016, "learning_rate": 3.8499999999999997e-07, "logits/chosen": -0.6948473453521729, "logits/rejected": -1.3128026723861694, "logps/chosen": -71.61614227294922, "logps/rejected": -90.48023223876953, "loss": 1.3758, "rewards/accuracies": 0.625, "rewards/chosen": 0.00525279063731432, "rewards/margins": 0.010612440295517445, "rewards/rejected": -0.005359649192541838, "step": 77 }, { "epoch": 0.00419230872591438, "grad_norm": 8.91589069366455, "learning_rate": 3.8999999999999997e-07, "logits/chosen": -0.3106628954410553, "logits/rejected": -0.6929190158843994, "logps/chosen": -80.65679168701172, "logps/rejected": -115.49663543701172, "loss": 1.3935, "rewards/accuracies": 0.375, "rewards/chosen": -0.009992456063628197, "rewards/margins": -0.006991195492446423, "rewards/rejected": -0.003001260804012418, "step": 78 }, { "epoch": 0.004246056273682513, "grad_norm": 9.223258018493652, "learning_rate": 3.95e-07, "logits/chosen": -0.5829489827156067, "logits/rejected": -0.9253657460212708, "logps/chosen": -97.073974609375, "logps/rejected": -119.96102142333984, "loss": 1.3855, "rewards/accuracies": 0.5, "rewards/chosen": 0.0019248956814408302, "rewards/margins": 0.0013014795258641243, "rewards/rejected": 0.0006234167376533151, "step": 79 }, { "epoch": 0.004299803821450647, "grad_norm": 7.335805892944336, "learning_rate": 4e-07, "logits/chosen": -0.6180788278579712, "logits/rejected": -0.8246430158615112, "logps/chosen": -65.96183776855469, "logps/rejected": -90.71397399902344, "loss": 1.3685, "rewards/accuracies": 0.5, "rewards/chosen": 0.011597157455980778, "rewards/margins": 0.018112804740667343, "rewards/rejected": -0.006515646353363991, "step": 80 }, { "epoch": 0.0043535513692187795, "grad_norm": 9.195491790771484, "learning_rate": 4.05e-07, "logits/chosen": -0.815979540348053, "logits/rejected": -0.7271102070808411, "logps/chosen": -91.46857452392578, "logps/rejected": -138.6331329345703, "loss": 1.3745, "rewards/accuracies": 0.5, "rewards/chosen": 0.007442760281264782, "rewards/margins": 0.012495232746005058, "rewards/rejected": -0.005052470602095127, "step": 81 }, { "epoch": 0.004407298916986912, "grad_norm": 8.091022491455078, "learning_rate": 4.0999999999999994e-07, "logits/chosen": -0.49595868587493896, "logits/rejected": -0.903720498085022, "logps/chosen": -87.2410659790039, "logps/rejected": -123.95703125, "loss": 1.3733, "rewards/accuracies": 0.75, "rewards/chosen": -0.0067462921142578125, "rewards/margins": 0.013173293322324753, "rewards/rejected": -0.019919587299227715, "step": 82 }, { "epoch": 0.004461046464755046, "grad_norm": 8.640301704406738, "learning_rate": 4.1499999999999994e-07, "logits/chosen": -0.7808905839920044, "logits/rejected": -0.9601308107376099, "logps/chosen": -92.71270751953125, "logps/rejected": -103.0289306640625, "loss": 1.392, "rewards/accuracies": 0.375, "rewards/chosen": -0.00455665634945035, "rewards/margins": -0.005107973702251911, "rewards/rejected": 0.0005513187497854233, "step": 83 }, { "epoch": 0.004514794012523179, "grad_norm": 10.411420822143555, "learning_rate": 4.1999999999999995e-07, "logits/chosen": -0.5638555288314819, "logits/rejected": -0.7834076285362244, "logps/chosen": -104.28919982910156, "logps/rejected": -127.33184814453125, "loss": 1.3779, "rewards/accuracies": 0.625, "rewards/chosen": -0.003203297033905983, "rewards/margins": 0.008511114865541458, "rewards/rejected": -0.011714410968124866, "step": 84 }, { "epoch": 0.004568541560291311, "grad_norm": 8.805337905883789, "learning_rate": 4.2499999999999995e-07, "logits/chosen": -0.5656499862670898, "logits/rejected": -0.790486216545105, "logps/chosen": -89.9202651977539, "logps/rejected": -131.35479736328125, "loss": 1.3817, "rewards/accuracies": 0.625, "rewards/chosen": -0.00927596166729927, "rewards/margins": 0.004767702892422676, "rewards/rejected": -0.014043664559721947, "step": 85 }, { "epoch": 0.004622289108059445, "grad_norm": 8.494221687316895, "learning_rate": 4.2999999999999996e-07, "logits/chosen": -0.7255963087081909, "logits/rejected": -1.2127760648727417, "logps/chosen": -81.29212951660156, "logps/rejected": -100.43565368652344, "loss": 1.398, "rewards/accuracies": 0.625, "rewards/chosen": 0.0021460053976625204, "rewards/margins": -0.010950803756713867, "rewards/rejected": 0.013096809387207031, "step": 86 }, { "epoch": 0.004676036655827578, "grad_norm": 9.525997161865234, "learning_rate": 4.3499999999999996e-07, "logits/chosen": -0.67186439037323, "logits/rejected": -0.680492103099823, "logps/chosen": -83.36714172363281, "logps/rejected": -120.49726104736328, "loss": 1.3834, "rewards/accuracies": 0.5, "rewards/chosen": 0.008803272619843483, "rewards/margins": 0.003020667703822255, "rewards/rejected": 0.005782604217529297, "step": 87 }, { "epoch": 0.004729784203595711, "grad_norm": 10.27984619140625, "learning_rate": 4.3999999999999997e-07, "logits/chosen": -0.6513060331344604, "logits/rejected": -0.6883199214935303, "logps/chosen": -75.36994934082031, "logps/rejected": -114.74723815917969, "loss": 1.3685, "rewards/accuracies": 0.625, "rewards/chosen": 0.0018049958162009716, "rewards/margins": 0.018433403223752975, "rewards/rejected": -0.016628408804535866, "step": 88 }, { "epoch": 0.004783531751363844, "grad_norm": 8.953598976135254, "learning_rate": 4.45e-07, "logits/chosen": -0.7916650772094727, "logits/rejected": -1.2761249542236328, "logps/chosen": -98.39886474609375, "logps/rejected": -123.22477722167969, "loss": 1.3888, "rewards/accuracies": 0.25, "rewards/chosen": -0.013754701241850853, "rewards/margins": -0.0023045053239911795, "rewards/rejected": -0.01145019568502903, "step": 89 }, { "epoch": 0.004837279299131977, "grad_norm": 9.589315414428711, "learning_rate": 4.5e-07, "logits/chosen": -0.6036202311515808, "logits/rejected": -0.9316726922988892, "logps/chosen": -93.84979248046875, "logps/rejected": -125.3914794921875, "loss": 1.3902, "rewards/accuracies": 0.5, "rewards/chosen": -0.016410110518336296, "rewards/margins": -0.0034947870299220085, "rewards/rejected": -0.012915325351059437, "step": 90 }, { "epoch": 0.0048910268469001105, "grad_norm": 8.582666397094727, "learning_rate": 4.55e-07, "logits/chosen": -0.9905259609222412, "logits/rejected": -1.2266937494277954, "logps/chosen": -79.10990905761719, "logps/rejected": -105.07563781738281, "loss": 1.3924, "rewards/accuracies": 0.375, "rewards/chosen": 0.002523183822631836, "rewards/margins": -0.005945872515439987, "rewards/rejected": 0.008469056338071823, "step": 91 }, { "epoch": 0.004944774394668243, "grad_norm": 8.097585678100586, "learning_rate": 4.6e-07, "logits/chosen": -0.7338860034942627, "logits/rejected": -0.5818134546279907, "logps/chosen": -87.86322021484375, "logps/rejected": -100.40827941894531, "loss": 1.3985, "rewards/accuracies": 0.375, "rewards/chosen": -0.002400016877800226, "rewards/margins": -0.011916255578398705, "rewards/rejected": 0.009516239166259766, "step": 92 }, { "epoch": 0.004998521942436376, "grad_norm": 8.177687644958496, "learning_rate": 4.65e-07, "logits/chosen": -0.5757491588592529, "logits/rejected": -0.8946194648742676, "logps/chosen": -70.17253112792969, "logps/rejected": -105.48762512207031, "loss": 1.3723, "rewards/accuracies": 0.75, "rewards/chosen": 0.020789574831724167, "rewards/margins": 0.014466093853116035, "rewards/rejected": 0.006323480512946844, "step": 93 }, { "epoch": 0.00505226949020451, "grad_norm": 8.56423282623291, "learning_rate": 4.6999999999999995e-07, "logits/chosen": -0.5390354990959167, "logits/rejected": -0.40581464767456055, "logps/chosen": -82.2030029296875, "logps/rejected": -110.38099670410156, "loss": 1.3495, "rewards/accuracies": 0.75, "rewards/chosen": 0.020420502871274948, "rewards/margins": 0.03794069588184357, "rewards/rejected": -0.01752018928527832, "step": 94 }, { "epoch": 0.005106017037972642, "grad_norm": 8.461507797241211, "learning_rate": 4.7499999999999995e-07, "logits/chosen": -0.6540727615356445, "logits/rejected": -0.9722909927368164, "logps/chosen": -103.54087829589844, "logps/rejected": -112.89604187011719, "loss": 1.3667, "rewards/accuracies": 0.875, "rewards/chosen": 0.0024640089832246304, "rewards/margins": 0.01972184330224991, "rewards/rejected": -0.01725783385336399, "step": 95 }, { "epoch": 0.005159764585740775, "grad_norm": 8.544709205627441, "learning_rate": 4.8e-07, "logits/chosen": -1.1151082515716553, "logits/rejected": -0.8405168652534485, "logps/chosen": -93.03946685791016, "logps/rejected": -121.34730529785156, "loss": 1.3508, "rewards/accuracies": 0.625, "rewards/chosen": 0.015212059952318668, "rewards/margins": 0.03674693405628204, "rewards/rejected": -0.02153487130999565, "step": 96 }, { "epoch": 0.005213512133508909, "grad_norm": 11.145546913146973, "learning_rate": 4.85e-07, "logits/chosen": -0.8847471475601196, "logits/rejected": -1.0487537384033203, "logps/chosen": -101.20974731445312, "logps/rejected": -154.65463256835938, "loss": 1.3733, "rewards/accuracies": 0.625, "rewards/chosen": 0.01565098948776722, "rewards/margins": 0.013369608670473099, "rewards/rejected": 0.002281379420310259, "step": 97 }, { "epoch": 0.0052672596812770416, "grad_norm": 8.07213306427002, "learning_rate": 4.9e-07, "logits/chosen": -0.6318795084953308, "logits/rejected": -0.9646536111831665, "logps/chosen": -95.36701965332031, "logps/rejected": -107.36691284179688, "loss": 1.3728, "rewards/accuracies": 0.75, "rewards/chosen": 0.012050819583237171, "rewards/margins": 0.013714075088500977, "rewards/rejected": -0.001663255738094449, "step": 98 }, { "epoch": 0.005321007229045175, "grad_norm": 9.691338539123535, "learning_rate": 4.95e-07, "logits/chosen": -0.4308117628097534, "logits/rejected": -0.869768500328064, "logps/chosen": -90.40399169921875, "logps/rejected": -161.551025390625, "loss": 1.3575, "rewards/accuracies": 0.75, "rewards/chosen": 0.019125938415527344, "rewards/margins": 0.029542922973632812, "rewards/rejected": -0.010416984558105469, "step": 99 }, { "epoch": 0.005374754776813308, "grad_norm": 9.5785493850708, "learning_rate": 5e-07, "logits/chosen": -0.32433757185935974, "logits/rejected": -0.9152166843414307, "logps/chosen": -87.89921569824219, "logps/rejected": -110.95304870605469, "loss": 1.3612, "rewards/accuracies": 0.875, "rewards/chosen": 0.004671669099479914, "rewards/margins": 0.02547921985387802, "rewards/rejected": -0.020807553082704544, "step": 100 }, { "epoch": 0.005428502324581441, "grad_norm": 9.884817123413086, "learning_rate": 5.049999999999999e-07, "logits/chosen": -0.594855785369873, "logits/rejected": -0.9691734313964844, "logps/chosen": -95.22991180419922, "logps/rejected": -114.88735961914062, "loss": 1.3408, "rewards/accuracies": 0.75, "rewards/chosen": 0.034766387194395065, "rewards/margins": 0.04664893448352814, "rewards/rejected": -0.011882543563842773, "step": 101 }, { "epoch": 0.005482249872349574, "grad_norm": 7.850556373596191, "learning_rate": 5.1e-07, "logits/chosen": -0.7462785840034485, "logits/rejected": -0.9806910753250122, "logps/chosen": -81.7419662475586, "logps/rejected": -91.54460906982422, "loss": 1.3932, "rewards/accuracies": 0.5, "rewards/chosen": -0.0034491063561290503, "rewards/margins": -0.0068105231039226055, "rewards/rejected": 0.003361416282132268, "step": 102 }, { "epoch": 0.005535997420117707, "grad_norm": 8.684331893920898, "learning_rate": 5.149999999999999e-07, "logits/chosen": -0.7205624580383301, "logits/rejected": -1.0655252933502197, "logps/chosen": -87.99021911621094, "logps/rejected": -109.62840270996094, "loss": 1.359, "rewards/accuracies": 0.75, "rewards/chosen": 0.006158209405839443, "rewards/margins": 0.027988292276859283, "rewards/rejected": -0.021830083802342415, "step": 103 }, { "epoch": 0.00558974496788584, "grad_norm": 8.371237754821777, "learning_rate": 5.2e-07, "logits/chosen": -0.5356409549713135, "logits/rejected": -0.8478841185569763, "logps/chosen": -71.43356323242188, "logps/rejected": -124.50373077392578, "loss": 1.3623, "rewards/accuracies": 0.625, "rewards/chosen": 0.0047377352602779865, "rewards/margins": 0.02444911003112793, "rewards/rejected": -0.01971137523651123, "step": 104 }, { "epoch": 0.0056434925156539735, "grad_norm": 7.612960338592529, "learning_rate": 5.25e-07, "logits/chosen": -0.5328823328018188, "logits/rejected": -0.9160935878753662, "logps/chosen": -71.94480895996094, "logps/rejected": -107.04407501220703, "loss": 1.3587, "rewards/accuracies": 0.75, "rewards/chosen": 0.021870041266083717, "rewards/margins": 0.028014566749334335, "rewards/rejected": -0.006144524551928043, "step": 105 }, { "epoch": 0.005697240063422106, "grad_norm": 10.74140739440918, "learning_rate": 5.3e-07, "logits/chosen": -0.5819361805915833, "logits/rejected": -0.7985912561416626, "logps/chosen": -75.74542999267578, "logps/rejected": -120.71178436279297, "loss": 1.4054, "rewards/accuracies": 0.25, "rewards/chosen": -0.01317748986184597, "rewards/margins": -0.01863894611597061, "rewards/rejected": 0.005461455322802067, "step": 106 }, { "epoch": 0.00575098761119024, "grad_norm": 9.74856948852539, "learning_rate": 5.35e-07, "logits/chosen": -0.5082576870918274, "logits/rejected": -0.6974226236343384, "logps/chosen": -82.81983184814453, "logps/rejected": -128.2376251220703, "loss": 1.3222, "rewards/accuracies": 1.0, "rewards/chosen": 0.03298177942633629, "rewards/margins": 0.0655616745352745, "rewards/rejected": -0.032579898834228516, "step": 107 }, { "epoch": 0.005804735158958373, "grad_norm": 9.146522521972656, "learning_rate": 5.4e-07, "logits/chosen": -0.3765265941619873, "logits/rejected": -0.7558135986328125, "logps/chosen": -103.66470336914062, "logps/rejected": -125.59317779541016, "loss": 1.3431, "rewards/accuracies": 0.875, "rewards/chosen": 0.015331363305449486, "rewards/margins": 0.04393496364355087, "rewards/rejected": -0.028603600338101387, "step": 108 }, { "epoch": 0.005858482706726505, "grad_norm": 8.527243614196777, "learning_rate": 5.45e-07, "logits/chosen": -0.5010079145431519, "logits/rejected": -0.8275881409645081, "logps/chosen": -86.48405456542969, "logps/rejected": -119.30473327636719, "loss": 1.3383, "rewards/accuracies": 0.875, "rewards/chosen": 0.020288610830903053, "rewards/margins": 0.04892773926258087, "rewards/rejected": -0.02863912656903267, "step": 109 }, { "epoch": 0.005912230254494639, "grad_norm": 9.14734935760498, "learning_rate": 5.5e-07, "logits/chosen": -0.4768997132778168, "logits/rejected": -0.898991584777832, "logps/chosen": -95.84410095214844, "logps/rejected": -127.80978393554688, "loss": 1.3449, "rewards/accuracies": 0.875, "rewards/chosen": 0.005844164174050093, "rewards/margins": 0.042428016662597656, "rewards/rejected": -0.036583855748176575, "step": 110 }, { "epoch": 0.005965977802262772, "grad_norm": 9.342902183532715, "learning_rate": 5.55e-07, "logits/chosen": -0.5109285116195679, "logits/rejected": -1.2183165550231934, "logps/chosen": -104.57946014404297, "logps/rejected": -116.25564575195312, "loss": 1.3351, "rewards/accuracies": 0.875, "rewards/chosen": 0.017390966415405273, "rewards/margins": 0.0521213561296463, "rewards/rejected": -0.03473038226366043, "step": 111 }, { "epoch": 0.0060197253500309045, "grad_norm": 9.204907417297363, "learning_rate": 5.6e-07, "logits/chosen": -0.6733009815216064, "logits/rejected": -0.8024317026138306, "logps/chosen": -80.10874938964844, "logps/rejected": -122.90025329589844, "loss": 1.3282, "rewards/accuracies": 0.75, "rewards/chosen": 0.026174068450927734, "rewards/margins": 0.05993619188666344, "rewards/rejected": -0.0337621234357357, "step": 112 }, { "epoch": 0.006073472897799038, "grad_norm": 9.171602249145508, "learning_rate": 5.649999999999999e-07, "logits/chosen": -0.7525988221168518, "logits/rejected": -1.047884225845337, "logps/chosen": -76.10604858398438, "logps/rejected": -121.81997680664062, "loss": 1.3556, "rewards/accuracies": 0.75, "rewards/chosen": 0.01772494427859783, "rewards/margins": 0.031504061073064804, "rewards/rejected": -0.013779114931821823, "step": 113 }, { "epoch": 0.006127220445567171, "grad_norm": 21.33026123046875, "learning_rate": 5.699999999999999e-07, "logits/chosen": -0.6616357564926147, "logits/rejected": -0.8994951248168945, "logps/chosen": -89.33460998535156, "logps/rejected": -112.39480590820312, "loss": 1.3747, "rewards/accuracies": 0.5, "rewards/chosen": 0.025089550763368607, "rewards/margins": 0.012133168987929821, "rewards/rejected": 0.012956379912793636, "step": 114 }, { "epoch": 0.006180967993335304, "grad_norm": 7.7999467849731445, "learning_rate": 5.749999999999999e-07, "logits/chosen": -0.7610051035881042, "logits/rejected": -1.188895583152771, "logps/chosen": -72.83747863769531, "logps/rejected": -109.6019287109375, "loss": 1.358, "rewards/accuracies": 0.875, "rewards/chosen": 0.020053481683135033, "rewards/margins": 0.028916122391819954, "rewards/rejected": -0.008862639777362347, "step": 115 }, { "epoch": 0.006234715541103437, "grad_norm": 7.734726905822754, "learning_rate": 5.8e-07, "logits/chosen": -0.6701956987380981, "logits/rejected": -0.7872592210769653, "logps/chosen": -88.29141235351562, "logps/rejected": -101.36444091796875, "loss": 1.36, "rewards/accuracies": 0.75, "rewards/chosen": 0.006640911102294922, "rewards/margins": 0.026843883097171783, "rewards/rejected": -0.020202970132231712, "step": 116 }, { "epoch": 0.00628846308887157, "grad_norm": 8.046977996826172, "learning_rate": 5.849999999999999e-07, "logits/chosen": -0.755643367767334, "logits/rejected": -0.7978894710540771, "logps/chosen": -89.4335708618164, "logps/rejected": -101.10948944091797, "loss": 1.3541, "rewards/accuracies": 0.5, "rewards/chosen": 0.036292076110839844, "rewards/margins": 0.033278897404670715, "rewards/rejected": 0.0030131821986287832, "step": 117 }, { "epoch": 0.006342210636639704, "grad_norm": 8.519963264465332, "learning_rate": 5.9e-07, "logits/chosen": -0.44737139344215393, "logits/rejected": -0.6966142654418945, "logps/chosen": -80.47132873535156, "logps/rejected": -106.39862823486328, "loss": 1.3645, "rewards/accuracies": 0.625, "rewards/chosen": 0.011769914999604225, "rewards/margins": 0.022649336606264114, "rewards/rejected": -0.010879422537982464, "step": 118 }, { "epoch": 0.006395958184407836, "grad_norm": 7.753148555755615, "learning_rate": 5.949999999999999e-07, "logits/chosen": -0.5068751573562622, "logits/rejected": -0.6187183856964111, "logps/chosen": -79.90748596191406, "logps/rejected": -98.16801452636719, "loss": 1.3477, "rewards/accuracies": 0.75, "rewards/chosen": 0.031086064875125885, "rewards/margins": 0.040039923042058945, "rewards/rejected": -0.00895385816693306, "step": 119 }, { "epoch": 0.006449705732175969, "grad_norm": 8.345486640930176, "learning_rate": 6e-07, "logits/chosen": -0.6839581727981567, "logits/rejected": -0.6954530477523804, "logps/chosen": -96.79466247558594, "logps/rejected": -122.98995208740234, "loss": 1.3128, "rewards/accuracies": 0.875, "rewards/chosen": 0.040169715881347656, "rewards/margins": 0.07549968361854553, "rewards/rejected": -0.03532996401190758, "step": 120 }, { "epoch": 0.006503453279944103, "grad_norm": 10.284939765930176, "learning_rate": 6.049999999999999e-07, "logits/chosen": -0.5686594843864441, "logits/rejected": -0.8320870399475098, "logps/chosen": -100.07514953613281, "logps/rejected": -136.882080078125, "loss": 1.3441, "rewards/accuracies": 0.75, "rewards/chosen": 0.024566650390625, "rewards/margins": 0.04393329843878746, "rewards/rejected": -0.019366644322872162, "step": 121 }, { "epoch": 0.0065572008277122356, "grad_norm": 8.252532958984375, "learning_rate": 6.1e-07, "logits/chosen": -0.6287285089492798, "logits/rejected": -1.0863521099090576, "logps/chosen": -84.47332763671875, "logps/rejected": -99.42669677734375, "loss": 1.3532, "rewards/accuracies": 0.75, "rewards/chosen": 0.04068718105554581, "rewards/margins": 0.033846043050289154, "rewards/rejected": 0.006841135676950216, "step": 122 }, { "epoch": 0.006610948375480368, "grad_norm": 7.722995281219482, "learning_rate": 6.149999999999999e-07, "logits/chosen": -0.5881807208061218, "logits/rejected": -0.8277117013931274, "logps/chosen": -75.47027587890625, "logps/rejected": -109.70950317382812, "loss": 1.3327, "rewards/accuracies": 1.0, "rewards/chosen": 0.036786746233701706, "rewards/margins": 0.0547664612531662, "rewards/rejected": -0.01797971874475479, "step": 123 }, { "epoch": 0.006664695923248502, "grad_norm": 8.314743995666504, "learning_rate": 6.2e-07, "logits/chosen": -0.4414648711681366, "logits/rejected": -0.8960204720497131, "logps/chosen": -105.69581604003906, "logps/rejected": -111.28223419189453, "loss": 1.3449, "rewards/accuracies": 0.875, "rewards/chosen": 0.0160704143345356, "rewards/margins": 0.04225125163793564, "rewards/rejected": -0.02618083916604519, "step": 124 }, { "epoch": 0.006718443471016635, "grad_norm": 9.08999252319336, "learning_rate": 6.249999999999999e-07, "logits/chosen": -0.46431779861450195, "logits/rejected": -0.8100011348724365, "logps/chosen": -89.24364471435547, "logps/rejected": -113.09158325195312, "loss": 1.3166, "rewards/accuracies": 1.0, "rewards/chosen": 0.05551939457654953, "rewards/margins": 0.07135190814733505, "rewards/rejected": -0.01583251915872097, "step": 125 }, { "epoch": 0.006772191018784768, "grad_norm": 8.56413745880127, "learning_rate": 6.3e-07, "logits/chosen": -0.736238956451416, "logits/rejected": -0.91480553150177, "logps/chosen": -91.28163146972656, "logps/rejected": -128.42153930664062, "loss": 1.3612, "rewards/accuracies": 0.75, "rewards/chosen": 0.014629578217864037, "rewards/margins": 0.025527216494083405, "rewards/rejected": -0.010897635482251644, "step": 126 }, { "epoch": 0.006825938566552901, "grad_norm": 9.683178901672363, "learning_rate": 6.35e-07, "logits/chosen": -0.5534720420837402, "logits/rejected": -0.5627380609512329, "logps/chosen": -102.00486755371094, "logps/rejected": -132.73085021972656, "loss": 1.3379, "rewards/accuracies": 0.875, "rewards/chosen": 0.0065771108493208885, "rewards/margins": 0.050148867070674896, "rewards/rejected": -0.04357175529003143, "step": 127 }, { "epoch": 0.006879686114321034, "grad_norm": 7.932616710662842, "learning_rate": 6.4e-07, "logits/chosen": -0.612842321395874, "logits/rejected": -0.6996155977249146, "logps/chosen": -84.4884262084961, "logps/rejected": -108.09532165527344, "loss": 1.3425, "rewards/accuracies": 0.875, "rewards/chosen": 0.03657136112451553, "rewards/margins": 0.04477734491229057, "rewards/rejected": -0.008205986581742764, "step": 128 }, { "epoch": 0.0069334336620891675, "grad_norm": 8.367311477661133, "learning_rate": 6.45e-07, "logits/chosen": -0.5767112374305725, "logits/rejected": -0.8156481385231018, "logps/chosen": -88.93170166015625, "logps/rejected": -111.56533813476562, "loss": 1.3466, "rewards/accuracies": 0.75, "rewards/chosen": 0.003009557258337736, "rewards/margins": 0.04060201719403267, "rewards/rejected": -0.037592459470033646, "step": 129 }, { "epoch": 0.0069871812098573, "grad_norm": 10.67454719543457, "learning_rate": 6.5e-07, "logits/chosen": -0.5178333520889282, "logits/rejected": -0.7822754383087158, "logps/chosen": -95.70928192138672, "logps/rejected": -119.69634246826172, "loss": 1.3325, "rewards/accuracies": 0.875, "rewards/chosen": 0.03490953519940376, "rewards/margins": 0.05554027855396271, "rewards/rejected": -0.020630741491913795, "step": 130 }, { "epoch": 0.007040928757625433, "grad_norm": 9.503974914550781, "learning_rate": 6.55e-07, "logits/chosen": -0.37673330307006836, "logits/rejected": -0.7126582860946655, "logps/chosen": -82.8017349243164, "logps/rejected": -115.95632934570312, "loss": 1.3269, "rewards/accuracies": 0.75, "rewards/chosen": 0.034896187484264374, "rewards/margins": 0.061892323195934296, "rewards/rejected": -0.026996135711669922, "step": 131 }, { "epoch": 0.007094676305393567, "grad_norm": 8.934268951416016, "learning_rate": 6.6e-07, "logits/chosen": -0.4502097964286804, "logits/rejected": -0.8681063652038574, "logps/chosen": -79.98454284667969, "logps/rejected": -114.90025329589844, "loss": 1.322, "rewards/accuracies": 0.875, "rewards/chosen": 0.03669185936450958, "rewards/margins": 0.06607665866613388, "rewards/rejected": -0.029384803026914597, "step": 132 }, { "epoch": 0.007148423853161699, "grad_norm": 13.197391510009766, "learning_rate": 6.65e-07, "logits/chosen": -0.6386598348617554, "logits/rejected": -0.925796627998352, "logps/chosen": -95.45718383789062, "logps/rejected": -138.4733428955078, "loss": 1.3025, "rewards/accuracies": 1.0, "rewards/chosen": 0.03926239162683487, "rewards/margins": 0.08666572719812393, "rewards/rejected": -0.04740333557128906, "step": 133 }, { "epoch": 0.007202171400929833, "grad_norm": 6.120726585388184, "learning_rate": 6.7e-07, "logits/chosen": -0.9512741565704346, "logits/rejected": -1.089170217514038, "logps/chosen": -60.85023498535156, "logps/rejected": -70.33056640625, "loss": 1.3868, "rewards/accuracies": 0.5, "rewards/chosen": 0.012539004907011986, "rewards/margins": -0.00025792093947529793, "rewards/rejected": 0.012796926312148571, "step": 134 }, { "epoch": 0.007255918948697966, "grad_norm": 8.543952941894531, "learning_rate": 6.75e-07, "logits/chosen": -0.30307191610336304, "logits/rejected": -0.7091283798217773, "logps/chosen": -85.25006103515625, "logps/rejected": -118.5132064819336, "loss": 1.3113, "rewards/accuracies": 1.0, "rewards/chosen": 0.051970481872558594, "rewards/margins": 0.07699575275182724, "rewards/rejected": -0.025025272741913795, "step": 135 }, { "epoch": 0.0073096664964660985, "grad_norm": 8.328312873840332, "learning_rate": 6.800000000000001e-07, "logits/chosen": -0.4594718813896179, "logits/rejected": -1.032567024230957, "logps/chosen": -96.80720520019531, "logps/rejected": -124.26054382324219, "loss": 1.3398, "rewards/accuracies": 0.625, "rewards/chosen": 0.01257395837455988, "rewards/margins": 0.04889388382434845, "rewards/rejected": -0.036319926381111145, "step": 136 }, { "epoch": 0.007363414044234232, "grad_norm": 8.548255920410156, "learning_rate": 6.85e-07, "logits/chosen": -0.44986873865127563, "logits/rejected": -0.8132184743881226, "logps/chosen": -80.43583679199219, "logps/rejected": -115.8111572265625, "loss": 1.3085, "rewards/accuracies": 1.0, "rewards/chosen": 0.02393818087875843, "rewards/margins": 0.08009310066699982, "rewards/rejected": -0.056154921650886536, "step": 137 }, { "epoch": 0.007417161592002365, "grad_norm": 8.635651588439941, "learning_rate": 6.9e-07, "logits/chosen": -0.44903600215911865, "logits/rejected": -0.6189286708831787, "logps/chosen": -83.2327880859375, "logps/rejected": -108.99787902832031, "loss": 1.3055, "rewards/accuracies": 1.0, "rewards/chosen": 0.051844216883182526, "rewards/margins": 0.08314299583435059, "rewards/rejected": -0.03129878267645836, "step": 138 }, { "epoch": 0.007470909139770498, "grad_norm": 8.348700523376465, "learning_rate": 6.949999999999999e-07, "logits/chosen": -0.5882418751716614, "logits/rejected": -0.6412063837051392, "logps/chosen": -96.15507507324219, "logps/rejected": -137.0690155029297, "loss": 1.357, "rewards/accuracies": 0.625, "rewards/chosen": 0.03367133438587189, "rewards/margins": 0.03130665048956871, "rewards/rejected": 0.0023646820336580276, "step": 139 }, { "epoch": 0.007524656687538631, "grad_norm": 7.767421245574951, "learning_rate": 7e-07, "logits/chosen": -0.6120747923851013, "logits/rejected": -0.7664321660995483, "logps/chosen": -74.74644470214844, "logps/rejected": -101.01390838623047, "loss": 1.3626, "rewards/accuracies": 0.625, "rewards/chosen": 0.017233848571777344, "rewards/margins": 0.025109339505434036, "rewards/rejected": -0.007875489071011543, "step": 140 }, { "epoch": 0.007578404235306764, "grad_norm": 8.301179885864258, "learning_rate": 7.049999999999999e-07, "logits/chosen": -0.4770601987838745, "logits/rejected": -0.8895702362060547, "logps/chosen": -91.02867126464844, "logps/rejected": -110.958984375, "loss": 1.3112, "rewards/accuracies": 0.75, "rewards/chosen": 0.05669122189283371, "rewards/margins": 0.0791742354631424, "rewards/rejected": -0.022483013570308685, "step": 141 }, { "epoch": 0.007632151783074897, "grad_norm": 9.363536834716797, "learning_rate": 7.1e-07, "logits/chosen": -0.48902779817581177, "logits/rejected": -0.9160875082015991, "logps/chosen": -89.20378875732422, "logps/rejected": -128.65347290039062, "loss": 1.3272, "rewards/accuracies": 0.75, "rewards/chosen": 0.04372618347406387, "rewards/margins": 0.06114180013537407, "rewards/rejected": -0.017415620386600494, "step": 142 }, { "epoch": 0.00768589933084303, "grad_norm": 8.716055870056152, "learning_rate": 7.149999999999999e-07, "logits/chosen": -0.347247451543808, "logits/rejected": -0.7006733417510986, "logps/chosen": -86.57768249511719, "logps/rejected": -122.77500915527344, "loss": 1.3097, "rewards/accuracies": 0.625, "rewards/chosen": 0.04675121232867241, "rewards/margins": 0.08012666553258896, "rewards/rejected": -0.03337545692920685, "step": 143 }, { "epoch": 0.007739646878611163, "grad_norm": 8.482487678527832, "learning_rate": 7.2e-07, "logits/chosen": -0.673027515411377, "logits/rejected": -0.6785482168197632, "logps/chosen": -79.43406677246094, "logps/rejected": -108.27889251708984, "loss": 1.3283, "rewards/accuracies": 1.0, "rewards/chosen": 0.05551452934741974, "rewards/margins": 0.059876732528209686, "rewards/rejected": -0.004362202249467373, "step": 144 }, { "epoch": 0.007793394426379297, "grad_norm": 8.253223419189453, "learning_rate": 7.249999999999999e-07, "logits/chosen": -0.5571776032447815, "logits/rejected": -0.8740231990814209, "logps/chosen": -75.2855453491211, "logps/rejected": -92.43083953857422, "loss": 1.3403, "rewards/accuracies": 0.75, "rewards/chosen": 0.012529850006103516, "rewards/margins": 0.04750847816467285, "rewards/rejected": -0.034978628158569336, "step": 145 }, { "epoch": 0.00784714197414743, "grad_norm": 7.636621475219727, "learning_rate": 7.3e-07, "logits/chosen": -0.645206093788147, "logits/rejected": -0.7781713008880615, "logps/chosen": -86.50373077392578, "logps/rejected": -98.16445922851562, "loss": 1.3357, "rewards/accuracies": 1.0, "rewards/chosen": 0.015281391330063343, "rewards/margins": 0.05157160758972168, "rewards/rejected": -0.03629021346569061, "step": 146 }, { "epoch": 0.007900889521915562, "grad_norm": 10.003756523132324, "learning_rate": 7.35e-07, "logits/chosen": -0.7389339804649353, "logits/rejected": -0.7869266271591187, "logps/chosen": -89.31993865966797, "logps/rejected": -128.67466735839844, "loss": 1.3043, "rewards/accuracies": 0.625, "rewards/chosen": 0.05218982696533203, "rewards/margins": 0.08645191788673401, "rewards/rejected": -0.03426208719611168, "step": 147 }, { "epoch": 0.007954637069683695, "grad_norm": 9.176275253295898, "learning_rate": 7.4e-07, "logits/chosen": -0.6036407351493835, "logits/rejected": -0.6493508815765381, "logps/chosen": -87.88677978515625, "logps/rejected": -105.86882019042969, "loss": 1.383, "rewards/accuracies": 0.5, "rewards/chosen": 0.028727341443300247, "rewards/margins": 0.003834581933915615, "rewards/rejected": 0.024892758578062057, "step": 148 }, { "epoch": 0.00800838461745183, "grad_norm": 8.858968734741211, "learning_rate": 7.45e-07, "logits/chosen": -0.4755464196205139, "logits/rejected": -0.6751483678817749, "logps/chosen": -91.70085906982422, "logps/rejected": -131.9779052734375, "loss": 1.3306, "rewards/accuracies": 0.75, "rewards/chosen": 0.025980046018958092, "rewards/margins": 0.05764317512512207, "rewards/rejected": -0.03166313096880913, "step": 149 }, { "epoch": 0.008062132165219962, "grad_norm": 8.020471572875977, "learning_rate": 7.5e-07, "logits/chosen": -0.6035898923873901, "logits/rejected": -0.8535783290863037, "logps/chosen": -86.87428283691406, "logps/rejected": -105.51277923583984, "loss": 1.3315, "rewards/accuracies": 0.875, "rewards/chosen": 0.015166472643613815, "rewards/margins": 0.05631585419178009, "rewards/rejected": -0.04114937782287598, "step": 150 }, { "epoch": 0.008115879712988095, "grad_norm": 7.718076705932617, "learning_rate": 7.55e-07, "logits/chosen": -0.7106082439422607, "logits/rejected": -0.8762080669403076, "logps/chosen": -59.839927673339844, "logps/rejected": -103.9046630859375, "loss": 1.327, "rewards/accuracies": 0.875, "rewards/chosen": 0.015261316671967506, "rewards/margins": 0.06102485954761505, "rewards/rejected": -0.0457635372877121, "step": 151 }, { "epoch": 0.008169627260756228, "grad_norm": 9.431389808654785, "learning_rate": 7.599999999999999e-07, "logits/chosen": -0.46898919343948364, "logits/rejected": -0.7692798972129822, "logps/chosen": -86.0394287109375, "logps/rejected": -112.06622314453125, "loss": 1.3021, "rewards/accuracies": 0.875, "rewards/chosen": 0.0742824524641037, "rewards/margins": 0.08877305686473846, "rewards/rejected": -0.014490603469312191, "step": 152 }, { "epoch": 0.00822337480852436, "grad_norm": 10.360591888427734, "learning_rate": 7.65e-07, "logits/chosen": -0.4002518653869629, "logits/rejected": -0.6833705902099609, "logps/chosen": -92.29054260253906, "logps/rejected": -122.76181030273438, "loss": 1.2331, "rewards/accuracies": 1.0, "rewards/chosen": 0.10551580786705017, "rewards/margins": 0.16189125180244446, "rewards/rejected": -0.05637545883655548, "step": 153 }, { "epoch": 0.008277122356292493, "grad_norm": 8.860506057739258, "learning_rate": 7.699999999999999e-07, "logits/chosen": -0.5457747578620911, "logits/rejected": -1.0791329145431519, "logps/chosen": -75.7874526977539, "logps/rejected": -112.1403579711914, "loss": 1.3353, "rewards/accuracies": 0.75, "rewards/chosen": 0.04450831562280655, "rewards/margins": 0.05607190355658531, "rewards/rejected": -0.011563587933778763, "step": 154 }, { "epoch": 0.008330869904060628, "grad_norm": 8.489481925964355, "learning_rate": 7.75e-07, "logits/chosen": -0.5127053260803223, "logits/rejected": -0.8684940338134766, "logps/chosen": -96.62840270996094, "logps/rejected": -92.65662384033203, "loss": 1.3104, "rewards/accuracies": 0.875, "rewards/chosen": 0.057338617742061615, "rewards/margins": 0.07819996029138565, "rewards/rejected": -0.020861340686678886, "step": 155 }, { "epoch": 0.00838461745182876, "grad_norm": 9.592909812927246, "learning_rate": 7.799999999999999e-07, "logits/chosen": -0.39823704957962036, "logits/rejected": -0.8709710240364075, "logps/chosen": -87.85802459716797, "logps/rejected": -140.6796875, "loss": 1.3699, "rewards/accuracies": 0.625, "rewards/chosen": 0.018831826746463776, "rewards/margins": 0.01864795759320259, "rewards/rejected": 0.00018386892043054104, "step": 156 }, { "epoch": 0.008438364999596893, "grad_norm": 8.327983856201172, "learning_rate": 7.85e-07, "logits/chosen": -0.5753642320632935, "logits/rejected": -0.6110097169876099, "logps/chosen": -72.95355987548828, "logps/rejected": -114.62969207763672, "loss": 1.3577, "rewards/accuracies": 0.5, "rewards/chosen": 0.06254434585571289, "rewards/margins": 0.030099868774414062, "rewards/rejected": 0.03244447708129883, "step": 157 }, { "epoch": 0.008492112547365026, "grad_norm": 9.055319786071777, "learning_rate": 7.9e-07, "logits/chosen": -0.37607789039611816, "logits/rejected": -0.7452758550643921, "logps/chosen": -78.7669906616211, "logps/rejected": -146.11627197265625, "loss": 1.2797, "rewards/accuracies": 0.75, "rewards/chosen": 0.08929309248924255, "rewards/margins": 0.11435070633888245, "rewards/rejected": -0.025057602673768997, "step": 158 }, { "epoch": 0.008545860095133159, "grad_norm": 8.505377769470215, "learning_rate": 7.95e-07, "logits/chosen": -0.6849159598350525, "logits/rejected": -1.346714735031128, "logps/chosen": -82.99136352539062, "logps/rejected": -111.74497985839844, "loss": 1.3122, "rewards/accuracies": 0.875, "rewards/chosen": 0.03172416612505913, "rewards/margins": 0.07659444957971573, "rewards/rejected": -0.0448702834546566, "step": 159 }, { "epoch": 0.008599607642901293, "grad_norm": 8.17264461517334, "learning_rate": 8e-07, "logits/chosen": -0.527755618095398, "logits/rejected": -0.6062994003295898, "logps/chosen": -80.26480102539062, "logps/rejected": -99.29087829589844, "loss": 1.3211, "rewards/accuracies": 0.875, "rewards/chosen": 0.04537510871887207, "rewards/margins": 0.06676492840051651, "rewards/rejected": -0.02138981968164444, "step": 160 }, { "epoch": 0.008653355190669426, "grad_norm": 9.176811218261719, "learning_rate": 8.05e-07, "logits/chosen": -0.36080020666122437, "logits/rejected": -0.8093994855880737, "logps/chosen": -86.1155014038086, "logps/rejected": -112.10587310791016, "loss": 1.2684, "rewards/accuracies": 0.875, "rewards/chosen": 0.050379373133182526, "rewards/margins": 0.1239389032125473, "rewards/rejected": -0.07355952262878418, "step": 161 }, { "epoch": 0.008707102738437559, "grad_norm": 9.302167892456055, "learning_rate": 8.1e-07, "logits/chosen": -0.8071354031562805, "logits/rejected": -0.6821689009666443, "logps/chosen": -83.2026596069336, "logps/rejected": -127.60516357421875, "loss": 1.2635, "rewards/accuracies": 0.875, "rewards/chosen": 0.057270050048828125, "rewards/margins": 0.13212990760803223, "rewards/rejected": -0.0748598575592041, "step": 162 }, { "epoch": 0.008760850286205692, "grad_norm": 8.486352920532227, "learning_rate": 8.149999999999999e-07, "logits/chosen": -0.6618848443031311, "logits/rejected": -0.7918269634246826, "logps/chosen": -86.20860290527344, "logps/rejected": -101.74458312988281, "loss": 1.2866, "rewards/accuracies": 0.875, "rewards/chosen": 0.06387773156166077, "rewards/margins": 0.10453473031520844, "rewards/rejected": -0.04065699502825737, "step": 163 }, { "epoch": 0.008814597833973824, "grad_norm": 8.054268836975098, "learning_rate": 8.199999999999999e-07, "logits/chosen": -0.8835512399673462, "logits/rejected": -1.0847688913345337, "logps/chosen": -82.28048706054688, "logps/rejected": -86.61480712890625, "loss": 1.2999, "rewards/accuracies": 0.875, "rewards/chosen": 0.048715829849243164, "rewards/margins": 0.09011626243591309, "rewards/rejected": -0.04140043258666992, "step": 164 }, { "epoch": 0.008868345381741957, "grad_norm": 9.200186729431152, "learning_rate": 8.249999999999999e-07, "logits/chosen": -0.63123619556427, "logits/rejected": -0.8014737367630005, "logps/chosen": -96.98667907714844, "logps/rejected": -126.99364471435547, "loss": 1.2724, "rewards/accuracies": 0.75, "rewards/chosen": 0.05878172069787979, "rewards/margins": 0.1242380142211914, "rewards/rejected": -0.06545629352331161, "step": 165 }, { "epoch": 0.008922092929510092, "grad_norm": 10.122492790222168, "learning_rate": 8.299999999999999e-07, "logits/chosen": -0.3994729816913605, "logits/rejected": -0.6104108095169067, "logps/chosen": -94.57304382324219, "logps/rejected": -150.10963439941406, "loss": 1.2236, "rewards/accuracies": 1.0, "rewards/chosen": 0.0750236064195633, "rewards/margins": 0.17520728707313538, "rewards/rejected": -0.10018367320299149, "step": 166 }, { "epoch": 0.008975840477278224, "grad_norm": 9.124246597290039, "learning_rate": 8.349999999999999e-07, "logits/chosen": -0.5109861493110657, "logits/rejected": -0.8957618474960327, "logps/chosen": -96.60360717773438, "logps/rejected": -143.75567626953125, "loss": 1.2637, "rewards/accuracies": 1.0, "rewards/chosen": 0.08805136382579803, "rewards/margins": 0.12889742851257324, "rewards/rejected": -0.04084606468677521, "step": 167 }, { "epoch": 0.009029588025046357, "grad_norm": 8.598995208740234, "learning_rate": 8.399999999999999e-07, "logits/chosen": -0.604156494140625, "logits/rejected": -1.0018030405044556, "logps/chosen": -89.50662231445312, "logps/rejected": -141.1640625, "loss": 1.267, "rewards/accuracies": 0.875, "rewards/chosen": 0.09476061165332794, "rewards/margins": 0.12606163322925568, "rewards/rejected": -0.031301021575927734, "step": 168 }, { "epoch": 0.00908333557281449, "grad_norm": 8.155011177062988, "learning_rate": 8.45e-07, "logits/chosen": -0.44168877601623535, "logits/rejected": -0.432107150554657, "logps/chosen": -81.66665649414062, "logps/rejected": -99.3573226928711, "loss": 1.2592, "rewards/accuracies": 0.875, "rewards/chosen": 0.07716693729162216, "rewards/margins": 0.13341554999351501, "rewards/rejected": -0.05624861642718315, "step": 169 }, { "epoch": 0.009137083120582623, "grad_norm": 8.040282249450684, "learning_rate": 8.499999999999999e-07, "logits/chosen": -0.6053702235221863, "logits/rejected": -0.9967564344406128, "logps/chosen": -67.93074035644531, "logps/rejected": -82.04415130615234, "loss": 1.3406, "rewards/accuracies": 0.625, "rewards/chosen": 0.07301287353038788, "rewards/margins": 0.048865459859371185, "rewards/rejected": 0.024147415533661842, "step": 170 }, { "epoch": 0.009190830668350757, "grad_norm": 7.169889450073242, "learning_rate": 8.55e-07, "logits/chosen": -0.5939246416091919, "logits/rejected": -1.0662728548049927, "logps/chosen": -66.86637115478516, "logps/rejected": -95.12235260009766, "loss": 1.3129, "rewards/accuracies": 0.75, "rewards/chosen": 0.03163104131817818, "rewards/margins": 0.07644934952259064, "rewards/rejected": -0.04481830447912216, "step": 171 }, { "epoch": 0.00924457821611889, "grad_norm": 8.840331077575684, "learning_rate": 8.599999999999999e-07, "logits/chosen": -0.5900052785873413, "logits/rejected": -0.7810946702957153, "logps/chosen": -101.5522232055664, "logps/rejected": -136.8179931640625, "loss": 1.3156, "rewards/accuracies": 0.875, "rewards/chosen": 0.015662528574466705, "rewards/margins": 0.07332368195056915, "rewards/rejected": -0.05766115337610245, "step": 172 }, { "epoch": 0.009298325763887023, "grad_norm": 7.705813407897949, "learning_rate": 8.65e-07, "logits/chosen": -0.6044250726699829, "logits/rejected": -0.8337146043777466, "logps/chosen": -88.77130126953125, "logps/rejected": -115.64469146728516, "loss": 1.2657, "rewards/accuracies": 1.0, "rewards/chosen": 0.0708250030875206, "rewards/margins": 0.12716206908226013, "rewards/rejected": -0.05633707344532013, "step": 173 }, { "epoch": 0.009352073311655155, "grad_norm": 7.6406049728393555, "learning_rate": 8.699999999999999e-07, "logits/chosen": -0.49487096071243286, "logits/rejected": -0.8147666454315186, "logps/chosen": -74.4522476196289, "logps/rejected": -101.7776107788086, "loss": 1.3044, "rewards/accuracies": 0.875, "rewards/chosen": 0.07781295478343964, "rewards/margins": 0.08579063415527344, "rewards/rejected": -0.007977676577866077, "step": 174 }, { "epoch": 0.009405820859423288, "grad_norm": 7.91442346572876, "learning_rate": 8.75e-07, "logits/chosen": -0.7507854700088501, "logits/rejected": -0.7276033163070679, "logps/chosen": -78.06298828125, "logps/rejected": -114.89868927001953, "loss": 1.2901, "rewards/accuracies": 1.0, "rewards/chosen": 0.049116089940071106, "rewards/margins": 0.09981104731559753, "rewards/rejected": -0.050694968551397324, "step": 175 }, { "epoch": 0.009459568407191423, "grad_norm": 8.849769592285156, "learning_rate": 8.799999999999999e-07, "logits/chosen": -0.5043718218803406, "logits/rejected": -0.7160993814468384, "logps/chosen": -90.03047180175781, "logps/rejected": -109.82713317871094, "loss": 1.3098, "rewards/accuracies": 0.875, "rewards/chosen": 0.05672178417444229, "rewards/margins": 0.08076897263526917, "rewards/rejected": -0.024047184735536575, "step": 176 }, { "epoch": 0.009513315954959555, "grad_norm": 8.023724555969238, "learning_rate": 8.85e-07, "logits/chosen": -0.6142200231552124, "logits/rejected": -1.2040032148361206, "logps/chosen": -63.30528259277344, "logps/rejected": -85.78144836425781, "loss": 1.2823, "rewards/accuracies": 1.0, "rewards/chosen": 0.06448278576135635, "rewards/margins": 0.10814742743968964, "rewards/rejected": -0.04366464912891388, "step": 177 }, { "epoch": 0.009567063502727688, "grad_norm": 8.422699928283691, "learning_rate": 8.9e-07, "logits/chosen": -0.7486603260040283, "logits/rejected": -1.0276074409484863, "logps/chosen": -70.70411682128906, "logps/rejected": -111.33332061767578, "loss": 1.288, "rewards/accuracies": 0.625, "rewards/chosen": 0.04261412471532822, "rewards/margins": 0.10824036598205566, "rewards/rejected": -0.06562624126672745, "step": 178 }, { "epoch": 0.009620811050495821, "grad_norm": 8.398097038269043, "learning_rate": 8.95e-07, "logits/chosen": -0.539382815361023, "logits/rejected": -0.6525092124938965, "logps/chosen": -74.09980010986328, "logps/rejected": -114.26619720458984, "loss": 1.2532, "rewards/accuracies": 0.875, "rewards/chosen": 0.07439222931861877, "rewards/margins": 0.14116787910461426, "rewards/rejected": -0.06677565723657608, "step": 179 }, { "epoch": 0.009674558598263954, "grad_norm": 8.578017234802246, "learning_rate": 9e-07, "logits/chosen": -0.6013672351837158, "logits/rejected": -0.6956964731216431, "logps/chosen": -79.50172424316406, "logps/rejected": -106.16706848144531, "loss": 1.204, "rewards/accuracies": 1.0, "rewards/chosen": 0.08045469224452972, "rewards/margins": 0.19394618272781372, "rewards/rejected": -0.113491490483284, "step": 180 }, { "epoch": 0.009728306146032087, "grad_norm": 7.8155951499938965, "learning_rate": 9.05e-07, "logits/chosen": -0.4921185374259949, "logits/rejected": -0.5469996929168701, "logps/chosen": -93.06249237060547, "logps/rejected": -98.62901306152344, "loss": 1.2811, "rewards/accuracies": 0.875, "rewards/chosen": 0.06854715943336487, "rewards/margins": 0.11277022957801819, "rewards/rejected": -0.04422307014465332, "step": 181 }, { "epoch": 0.009782053693800221, "grad_norm": 8.917503356933594, "learning_rate": 9.1e-07, "logits/chosen": -0.5223396420478821, "logits/rejected": -0.7636221647262573, "logps/chosen": -84.32308197021484, "logps/rejected": -111.67572784423828, "loss": 1.2357, "rewards/accuracies": 0.875, "rewards/chosen": 0.10809466987848282, "rewards/margins": 0.16041654348373413, "rewards/rejected": -0.052321866154670715, "step": 182 }, { "epoch": 0.009835801241568354, "grad_norm": 8.830840110778809, "learning_rate": 9.15e-07, "logits/chosen": -0.5822125673294067, "logits/rejected": -0.7964699268341064, "logps/chosen": -94.43169403076172, "logps/rejected": -128.41546630859375, "loss": 1.2422, "rewards/accuracies": 0.875, "rewards/chosen": 0.0731562152504921, "rewards/margins": 0.15465021133422852, "rewards/rejected": -0.08149400353431702, "step": 183 }, { "epoch": 0.009889548789336487, "grad_norm": 8.283319473266602, "learning_rate": 9.2e-07, "logits/chosen": -0.5485873222351074, "logits/rejected": -0.4649333953857422, "logps/chosen": -86.34204864501953, "logps/rejected": -122.15489196777344, "loss": 1.1982, "rewards/accuracies": 0.875, "rewards/chosen": 0.09246230125427246, "rewards/margins": 0.20780116319656372, "rewards/rejected": -0.11533886194229126, "step": 184 }, { "epoch": 0.00994329633710462, "grad_norm": 7.356119155883789, "learning_rate": 9.25e-07, "logits/chosen": -0.48729968070983887, "logits/rejected": -0.8840200304985046, "logps/chosen": -64.23764038085938, "logps/rejected": -86.2247314453125, "loss": 1.2654, "rewards/accuracies": 0.875, "rewards/chosen": 0.0745830088853836, "rewards/margins": 0.12811151146888733, "rewards/rejected": -0.053528498858213425, "step": 185 }, { "epoch": 0.009997043884872752, "grad_norm": 8.707176208496094, "learning_rate": 9.3e-07, "logits/chosen": -0.6096377372741699, "logits/rejected": -0.7591854333877563, "logps/chosen": -91.56690979003906, "logps/rejected": -112.47627258300781, "loss": 1.279, "rewards/accuracies": 0.75, "rewards/chosen": 0.0850636437535286, "rewards/margins": 0.11659006774425507, "rewards/rejected": -0.03152642399072647, "step": 186 }, { "epoch": 0.010050791432640887, "grad_norm": 8.459226608276367, "learning_rate": 9.35e-07, "logits/chosen": -0.7061234712600708, "logits/rejected": -0.8238375186920166, "logps/chosen": -96.10240173339844, "logps/rejected": -113.69047546386719, "loss": 1.2641, "rewards/accuracies": 0.875, "rewards/chosen": 0.03810086473822594, "rewards/margins": 0.12898865342140198, "rewards/rejected": -0.09088778495788574, "step": 187 }, { "epoch": 0.01010453898040902, "grad_norm": 8.724274635314941, "learning_rate": 9.399999999999999e-07, "logits/chosen": -0.529472291469574, "logits/rejected": -0.7259479761123657, "logps/chosen": -107.95203399658203, "logps/rejected": -118.2959213256836, "loss": 1.2877, "rewards/accuracies": 0.625, "rewards/chosen": 0.03994712606072426, "rewards/margins": 0.11241521686315536, "rewards/rejected": -0.07246808707714081, "step": 188 }, { "epoch": 0.010158286528177152, "grad_norm": 8.41250228881836, "learning_rate": 9.45e-07, "logits/chosen": -0.43037286400794983, "logits/rejected": -1.0149872303009033, "logps/chosen": -84.99284362792969, "logps/rejected": -115.60348510742188, "loss": 1.2505, "rewards/accuracies": 0.875, "rewards/chosen": 0.08281326293945312, "rewards/margins": 0.15064755082130432, "rewards/rejected": -0.0678342878818512, "step": 189 }, { "epoch": 0.010212034075945285, "grad_norm": 9.678362846374512, "learning_rate": 9.499999999999999e-07, "logits/chosen": -0.5572127103805542, "logits/rejected": -0.8318883180618286, "logps/chosen": -90.814697265625, "logps/rejected": -139.9611053466797, "loss": 1.2629, "rewards/accuracies": 0.75, "rewards/chosen": 0.10129576176404953, "rewards/margins": 0.13078927993774414, "rewards/rejected": -0.029493525624275208, "step": 190 }, { "epoch": 0.010265781623713418, "grad_norm": 8.513623237609863, "learning_rate": 9.55e-07, "logits/chosen": -0.6218719482421875, "logits/rejected": -0.7108089923858643, "logps/chosen": -66.75973510742188, "logps/rejected": -116.55987548828125, "loss": 1.3335, "rewards/accuracies": 0.5, "rewards/chosen": 0.0546051487326622, "rewards/margins": 0.06247849017381668, "rewards/rejected": -0.007873346097767353, "step": 191 }, { "epoch": 0.01031952917148155, "grad_norm": 9.881647109985352, "learning_rate": 9.6e-07, "logits/chosen": -0.674138069152832, "logits/rejected": -0.932457685470581, "logps/chosen": -92.65321350097656, "logps/rejected": -140.28085327148438, "loss": 1.198, "rewards/accuracies": 0.875, "rewards/chosen": 0.12886744737625122, "rewards/margins": 0.20477329194545746, "rewards/rejected": -0.07590584456920624, "step": 192 }, { "epoch": 0.010373276719249685, "grad_norm": 7.939401149749756, "learning_rate": 9.649999999999999e-07, "logits/chosen": -0.4290343225002289, "logits/rejected": -0.83338862657547, "logps/chosen": -76.35269165039062, "logps/rejected": -97.52641296386719, "loss": 1.2273, "rewards/accuracies": 1.0, "rewards/chosen": 0.09781437367200851, "rewards/margins": 0.1681661754846573, "rewards/rejected": -0.07035179436206818, "step": 193 }, { "epoch": 0.010427024267017818, "grad_norm": 7.355748653411865, "learning_rate": 9.7e-07, "logits/chosen": -0.6702528595924377, "logits/rejected": -0.8530929088592529, "logps/chosen": -67.63008880615234, "logps/rejected": -103.4015884399414, "loss": 1.2156, "rewards/accuracies": 1.0, "rewards/chosen": 0.13403668999671936, "rewards/margins": 0.18164294958114624, "rewards/rejected": -0.047606274485588074, "step": 194 }, { "epoch": 0.01048077181478595, "grad_norm": 8.380016326904297, "learning_rate": 9.75e-07, "logits/chosen": -0.5382170677185059, "logits/rejected": -0.9154505729675293, "logps/chosen": -85.98081970214844, "logps/rejected": -114.91854858398438, "loss": 1.2527, "rewards/accuracies": 1.0, "rewards/chosen": 0.08690200001001358, "rewards/margins": 0.14225682616233826, "rewards/rejected": -0.05535483360290527, "step": 195 }, { "epoch": 0.010534519362554083, "grad_norm": 8.293807029724121, "learning_rate": 9.8e-07, "logits/chosen": -0.3949047327041626, "logits/rejected": -0.7062586545944214, "logps/chosen": -94.71266174316406, "logps/rejected": -132.51934814453125, "loss": 1.2261, "rewards/accuracies": 0.875, "rewards/chosen": 0.0977863296866417, "rewards/margins": 0.17238861322402954, "rewards/rejected": -0.07460227608680725, "step": 196 }, { "epoch": 0.010588266910322216, "grad_norm": 7.918712615966797, "learning_rate": 9.849999999999999e-07, "logits/chosen": -0.6305364966392517, "logits/rejected": -0.7408871650695801, "logps/chosen": -72.30216217041016, "logps/rejected": -103.49208068847656, "loss": 1.2417, "rewards/accuracies": 0.625, "rewards/chosen": 0.07783713936805725, "rewards/margins": 0.16417288780212402, "rewards/rejected": -0.08633575588464737, "step": 197 }, { "epoch": 0.01064201445809035, "grad_norm": 9.348301887512207, "learning_rate": 9.9e-07, "logits/chosen": -0.6360346078872681, "logits/rejected": -0.6967631578445435, "logps/chosen": -85.20661926269531, "logps/rejected": -117.7959976196289, "loss": 1.2744, "rewards/accuracies": 0.75, "rewards/chosen": 0.02152261883020401, "rewards/margins": 0.1231936514377594, "rewards/rejected": -0.10167103260755539, "step": 198 }, { "epoch": 0.010695762005858483, "grad_norm": 8.181173324584961, "learning_rate": 9.95e-07, "logits/chosen": -0.49966248869895935, "logits/rejected": -0.7837533354759216, "logps/chosen": -96.90880584716797, "logps/rejected": -107.97472381591797, "loss": 1.2194, "rewards/accuracies": 0.875, "rewards/chosen": 0.14349670708179474, "rewards/margins": 0.17999544739723206, "rewards/rejected": -0.03649874031543732, "step": 199 }, { "epoch": 0.010749509553626616, "grad_norm": 12.409173011779785, "learning_rate": 1e-06, "logits/chosen": -0.5410706996917725, "logits/rejected": -0.7554242610931396, "logps/chosen": -94.81787109375, "logps/rejected": -114.16231536865234, "loss": 1.155, "rewards/accuracies": 1.0, "rewards/chosen": 0.12644006311893463, "rewards/margins": 0.255648672580719, "rewards/rejected": -0.12920860946178436, "step": 200 }, { "epoch": 0.010803257101394749, "grad_norm": 9.018311500549316, "learning_rate": 9.999961446907352e-07, "logits/chosen": -0.38394927978515625, "logits/rejected": -0.8213692903518677, "logps/chosen": -96.14962768554688, "logps/rejected": -125.2713623046875, "loss": 1.215, "rewards/accuracies": 1.0, "rewards/chosen": 0.13162073493003845, "rewards/margins": 0.18227419257164001, "rewards/rejected": -0.05065345764160156, "step": 201 }, { "epoch": 0.010857004649162881, "grad_norm": 9.991246223449707, "learning_rate": 9.999845788223948e-07, "logits/chosen": -0.6805392503738403, "logits/rejected": -0.36724549531936646, "logps/chosen": -96.92276763916016, "logps/rejected": -110.201416015625, "loss": 1.1747, "rewards/accuracies": 0.875, "rewards/chosen": 0.133368581533432, "rewards/margins": 0.23651599884033203, "rewards/rejected": -0.10314741730690002, "step": 202 }, { "epoch": 0.010910752196931016, "grad_norm": 8.22262191772461, "learning_rate": 9.999653025733385e-07, "logits/chosen": -0.6609019637107849, "logits/rejected": -0.6106843948364258, "logps/chosen": -97.962646484375, "logps/rejected": -123.79737854003906, "loss": 1.1606, "rewards/accuracies": 0.875, "rewards/chosen": 0.12442468106746674, "rewards/margins": 0.2542736530303955, "rewards/rejected": -0.12984895706176758, "step": 203 }, { "epoch": 0.010964499744699149, "grad_norm": 8.588205337524414, "learning_rate": 9.999383162408303e-07, "logits/chosen": -0.40745532512664795, "logits/rejected": -0.5700491070747375, "logps/chosen": -86.30723571777344, "logps/rejected": -121.02368927001953, "loss": 1.2478, "rewards/accuracies": 0.875, "rewards/chosen": 0.10299758613109589, "rewards/margins": 0.14931106567382812, "rewards/rejected": -0.04631347581744194, "step": 204 }, { "epoch": 0.011018247292467281, "grad_norm": 9.101134300231934, "learning_rate": 9.999036202410323e-07, "logits/chosen": -0.6683564186096191, "logits/rejected": -0.826968789100647, "logps/chosen": -80.37091064453125, "logps/rejected": -96.48709869384766, "loss": 1.2045, "rewards/accuracies": 1.0, "rewards/chosen": 0.12043257057666779, "rewards/margins": 0.19962263107299805, "rewards/rejected": -0.07919006049633026, "step": 205 }, { "epoch": 0.011071994840235414, "grad_norm": 8.764567375183105, "learning_rate": 9.998612151090002e-07, "logits/chosen": -0.4916820824146271, "logits/rejected": -0.6521626114845276, "logps/chosen": -90.51093292236328, "logps/rejected": -133.3006134033203, "loss": 1.1356, "rewards/accuracies": 0.75, "rewards/chosen": 0.15498992800712585, "rewards/margins": 0.2916005253791809, "rewards/rejected": -0.13661059737205505, "step": 206 }, { "epoch": 0.011125742388003547, "grad_norm": 8.882989883422852, "learning_rate": 9.998111014986734e-07, "logits/chosen": -0.34713417291641235, "logits/rejected": -0.6887156367301941, "logps/chosen": -116.66211700439453, "logps/rejected": -141.1327667236328, "loss": 1.2164, "rewards/accuracies": 1.0, "rewards/chosen": 0.12109574675559998, "rewards/margins": 0.18433637917041779, "rewards/rejected": -0.06324061751365662, "step": 207 }, { "epoch": 0.01117948993577168, "grad_norm": 8.962393760681152, "learning_rate": 9.997532801828658e-07, "logits/chosen": -0.45506107807159424, "logits/rejected": -0.7329839468002319, "logps/chosen": -102.06887817382812, "logps/rejected": -111.65015411376953, "loss": 1.2346, "rewards/accuracies": 0.875, "rewards/chosen": 0.09608335793018341, "rewards/margins": 0.162027508020401, "rewards/rejected": -0.06594415009021759, "step": 208 }, { "epoch": 0.011233237483539814, "grad_norm": 7.876529693603516, "learning_rate": 9.996877520532534e-07, "logits/chosen": -0.4077056050300598, "logits/rejected": -0.6110527515411377, "logps/chosen": -86.82136535644531, "logps/rejected": -108.43330383300781, "loss": 1.1838, "rewards/accuracies": 1.0, "rewards/chosen": 0.17611068487167358, "rewards/margins": 0.21694111824035645, "rewards/rejected": -0.040830425918102264, "step": 209 }, { "epoch": 0.011286985031307947, "grad_norm": 6.917963981628418, "learning_rate": 9.996145181203615e-07, "logits/chosen": -0.6416649222373962, "logits/rejected": -0.870232105255127, "logps/chosen": -74.03877258300781, "logps/rejected": -100.16549682617188, "loss": 1.2437, "rewards/accuracies": 0.75, "rewards/chosen": 0.18118181824684143, "rewards/margins": 0.15956702828407288, "rewards/rejected": 0.021614791825413704, "step": 210 }, { "epoch": 0.01134073257907608, "grad_norm": 7.945036888122559, "learning_rate": 9.995335795135475e-07, "logits/chosen": -0.35453060269355774, "logits/rejected": -0.6333085894584656, "logps/chosen": -89.64887237548828, "logps/rejected": -103.50331115722656, "loss": 1.198, "rewards/accuracies": 0.75, "rewards/chosen": 0.1395391970872879, "rewards/margins": 0.2095623016357422, "rewards/rejected": -0.07002310454845428, "step": 211 }, { "epoch": 0.011394480126844212, "grad_norm": 7.781386375427246, "learning_rate": 9.99444937480985e-07, "logits/chosen": -0.8427431583404541, "logits/rejected": -1.0646154880523682, "logps/chosen": -71.52605438232422, "logps/rejected": -101.33668518066406, "loss": 1.1841, "rewards/accuracies": 0.875, "rewards/chosen": 0.1986706256866455, "rewards/margins": 0.2231091856956482, "rewards/rejected": -0.02443857118487358, "step": 212 }, { "epoch": 0.011448227674612345, "grad_norm": 8.750081062316895, "learning_rate": 9.993485933896437e-07, "logits/chosen": -0.6073715090751648, "logits/rejected": -0.86269211769104, "logps/chosen": -97.16561126708984, "logps/rejected": -121.15423583984375, "loss": 1.2527, "rewards/accuracies": 0.75, "rewards/chosen": 0.06649146229028702, "rewards/margins": 0.145115464925766, "rewards/rejected": -0.07862401008605957, "step": 213 }, { "epoch": 0.01150197522238048, "grad_norm": 8.668082237243652, "learning_rate": 9.99244548725269e-07, "logits/chosen": -0.6057989597320557, "logits/rejected": -0.7912963628768921, "logps/chosen": -86.06175231933594, "logps/rejected": -133.17581176757812, "loss": 1.1756, "rewards/accuracies": 0.75, "rewards/chosen": 0.0874997079372406, "rewards/margins": 0.24749141931533813, "rewards/rejected": -0.15999169647693634, "step": 214 }, { "epoch": 0.011555722770148612, "grad_norm": 9.294448852539062, "learning_rate": 9.99132805092358e-07, "logits/chosen": -0.487201452255249, "logits/rejected": -0.7378743290901184, "logps/chosen": -109.46067810058594, "logps/rejected": -152.04519653320312, "loss": 1.0844, "rewards/accuracies": 0.875, "rewards/chosen": 0.1331624537706375, "rewards/margins": 0.3561188280582428, "rewards/rejected": -0.22295637428760529, "step": 215 }, { "epoch": 0.011609470317916745, "grad_norm": 14.424534797668457, "learning_rate": 9.990133642141357e-07, "logits/chosen": -0.5480656623840332, "logits/rejected": -0.9329161643981934, "logps/chosen": -81.80020904541016, "logps/rejected": -126.82513427734375, "loss": 1.1742, "rewards/accuracies": 0.75, "rewards/chosen": 0.14331316947937012, "rewards/margins": 0.2515562176704407, "rewards/rejected": -0.10824304819107056, "step": 216 }, { "epoch": 0.011663217865684878, "grad_norm": 7.97386360168457, "learning_rate": 9.988862279325284e-07, "logits/chosen": -0.46164822578430176, "logits/rejected": -1.1121896505355835, "logps/chosen": -82.22102355957031, "logps/rejected": -95.99891662597656, "loss": 1.1936, "rewards/accuracies": 0.875, "rewards/chosen": 0.0752662718296051, "rewards/margins": 0.21177086234092712, "rewards/rejected": -0.13650460541248322, "step": 217 }, { "epoch": 0.01171696541345301, "grad_norm": 7.369724273681641, "learning_rate": 9.98751398208135e-07, "logits/chosen": -0.5404901504516602, "logits/rejected": -0.790448784828186, "logps/chosen": -72.71309661865234, "logps/rejected": -108.60038757324219, "loss": 1.182, "rewards/accuracies": 0.875, "rewards/chosen": 0.2458677738904953, "rewards/margins": 0.23527422547340393, "rewards/rejected": 0.010593554005026817, "step": 218 }, { "epoch": 0.011770712961221143, "grad_norm": 7.535937786102295, "learning_rate": 9.986088771201963e-07, "logits/chosen": -0.5325116515159607, "logits/rejected": -0.7334097623825073, "logps/chosen": -66.14689636230469, "logps/rejected": -84.96763610839844, "loss": 1.3108, "rewards/accuracies": 0.875, "rewards/chosen": 0.10276611149311066, "rewards/margins": 0.08222854137420654, "rewards/rejected": 0.020537566393613815, "step": 219 }, { "epoch": 0.011824460508989278, "grad_norm": 8.013921737670898, "learning_rate": 9.98458666866564e-07, "logits/chosen": -0.42551806569099426, "logits/rejected": -1.0276601314544678, "logps/chosen": -80.58146667480469, "logps/rejected": -114.80330657958984, "loss": 1.071, "rewards/accuracies": 1.0, "rewards/chosen": 0.18189138174057007, "rewards/margins": 0.36933794617652893, "rewards/rejected": -0.18744659423828125, "step": 220 }, { "epoch": 0.01187820805675741, "grad_norm": 7.9433088302612305, "learning_rate": 9.983007697636658e-07, "logits/chosen": -0.5696620941162109, "logits/rejected": -0.822013258934021, "logps/chosen": -74.19483184814453, "logps/rejected": -113.86673736572266, "loss": 1.2464, "rewards/accuracies": 0.625, "rewards/chosen": 0.08465752750635147, "rewards/margins": 0.15495048463344574, "rewards/rejected": -0.07029294967651367, "step": 221 }, { "epoch": 0.011931955604525544, "grad_norm": 7.609518527984619, "learning_rate": 9.981351882464707e-07, "logits/chosen": -0.6711792945861816, "logits/rejected": -0.7068201303482056, "logps/chosen": -72.17758178710938, "logps/rejected": -85.7686767578125, "loss": 1.271, "rewards/accuracies": 1.0, "rewards/chosen": 0.11152859032154083, "rewards/margins": 0.12282519787549973, "rewards/rejected": -0.011296607553958893, "step": 222 }, { "epoch": 0.011985703152293676, "grad_norm": 8.004049301147461, "learning_rate": 9.979619248684501e-07, "logits/chosen": -0.724856436252594, "logits/rejected": -0.7913885712623596, "logps/chosen": -94.92695617675781, "logps/rejected": -129.05795288085938, "loss": 1.0848, "rewards/accuracies": 0.75, "rewards/chosen": 0.19596806168556213, "rewards/margins": 0.36456066370010376, "rewards/rejected": -0.16859260201454163, "step": 223 }, { "epoch": 0.012039450700061809, "grad_norm": 9.519268035888672, "learning_rate": 9.9778098230154e-07, "logits/chosen": -0.4081488847732544, "logits/rejected": -0.9913891553878784, "logps/chosen": -66.12991333007812, "logps/rejected": -126.65169525146484, "loss": 1.232, "rewards/accuracies": 0.75, "rewards/chosen": 0.19431248307228088, "rewards/margins": 0.19488951563835144, "rewards/rejected": -0.0005770251154899597, "step": 224 }, { "epoch": 0.012093198247829944, "grad_norm": 8.631113052368164, "learning_rate": 9.975923633360984e-07, "logits/chosen": -0.37461525201797485, "logits/rejected": -0.8124234080314636, "logps/chosen": -91.42263793945312, "logps/rejected": -120.60844421386719, "loss": 1.0942, "rewards/accuracies": 0.875, "rewards/chosen": 0.16796961426734924, "rewards/margins": 0.33035793900489807, "rewards/rejected": -0.16238832473754883, "step": 225 }, { "epoch": 0.012146945795598076, "grad_norm": 8.081117630004883, "learning_rate": 9.973960708808631e-07, "logits/chosen": -0.4398130774497986, "logits/rejected": -0.6970310211181641, "logps/chosen": -82.72227478027344, "logps/rejected": -120.73685455322266, "loss": 1.07, "rewards/accuracies": 0.875, "rewards/chosen": 0.13794194161891937, "rewards/margins": 0.38850194215774536, "rewards/rejected": -0.2505600154399872, "step": 226 }, { "epoch": 0.012200693343366209, "grad_norm": 7.0437750816345215, "learning_rate": 9.971921079629069e-07, "logits/chosen": -0.5556354522705078, "logits/rejected": -0.864974856376648, "logps/chosen": -73.24920654296875, "logps/rejected": -90.26593017578125, "loss": 1.1827, "rewards/accuracies": 0.875, "rewards/chosen": 0.1013791561126709, "rewards/margins": 0.22670727968215942, "rewards/rejected": -0.12532812356948853, "step": 227 }, { "epoch": 0.012254440891134342, "grad_norm": 8.556796073913574, "learning_rate": 9.969804777275898e-07, "logits/chosen": -0.35300546884536743, "logits/rejected": -0.6973484754562378, "logps/chosen": -95.61795806884766, "logps/rejected": -130.8153533935547, "loss": 1.0892, "rewards/accuracies": 0.875, "rewards/chosen": 0.12840043008327484, "rewards/margins": 0.3624500334262848, "rewards/rejected": -0.23404961824417114, "step": 228 }, { "epoch": 0.012308188438902475, "grad_norm": 7.119718551635742, "learning_rate": 9.967611834385122e-07, "logits/chosen": -0.3875174820423126, "logits/rejected": -0.5986741781234741, "logps/chosen": -67.04378509521484, "logps/rejected": -96.16484832763672, "loss": 1.1726, "rewards/accuracies": 0.875, "rewards/chosen": 0.1416163444519043, "rewards/margins": 0.23682399094104767, "rewards/rejected": -0.09520764648914337, "step": 229 }, { "epoch": 0.012361935986670607, "grad_norm": 7.879711151123047, "learning_rate": 9.965342284774631e-07, "logits/chosen": -0.48159751296043396, "logits/rejected": -0.6574340462684631, "logps/chosen": -80.20025634765625, "logps/rejected": -125.88029479980469, "loss": 1.1334, "rewards/accuracies": 1.0, "rewards/chosen": 0.11776607483625412, "rewards/margins": 0.2744428515434265, "rewards/rejected": -0.1566767692565918, "step": 230 }, { "epoch": 0.012415683534438742, "grad_norm": 9.587478637695312, "learning_rate": 9.962996163443688e-07, "logits/chosen": -0.37069642543792725, "logits/rejected": -0.8089166879653931, "logps/chosen": -98.1978988647461, "logps/rejected": -132.13845825195312, "loss": 1.1718, "rewards/accuracies": 0.875, "rewards/chosen": 0.18037399649620056, "rewards/margins": 0.26343971490859985, "rewards/rejected": -0.0830657035112381, "step": 231 }, { "epoch": 0.012469431082206875, "grad_norm": 9.129337310791016, "learning_rate": 9.960573506572389e-07, "logits/chosen": -0.5636274814605713, "logits/rejected": -1.1714890003204346, "logps/chosen": -93.2618408203125, "logps/rejected": -97.5640869140625, "loss": 1.1944, "rewards/accuracies": 0.75, "rewards/chosen": 0.12527954578399658, "rewards/margins": 0.21808025240898132, "rewards/rejected": -0.09280071407556534, "step": 232 }, { "epoch": 0.012523178629975007, "grad_norm": 8.361261367797852, "learning_rate": 9.958074351521096e-07, "logits/chosen": -0.47556164860725403, "logits/rejected": -0.7843296527862549, "logps/chosen": -75.67303466796875, "logps/rejected": -99.92214965820312, "loss": 1.1867, "rewards/accuracies": 0.875, "rewards/chosen": 0.20813600718975067, "rewards/margins": 0.21530234813690186, "rewards/rejected": -0.007166337221860886, "step": 233 }, { "epoch": 0.01257692617774314, "grad_norm": 8.374642372131348, "learning_rate": 9.955498736829874e-07, "logits/chosen": -0.6883563995361328, "logits/rejected": -1.486838459968567, "logps/chosen": -76.17497253417969, "logps/rejected": -96.84264373779297, "loss": 1.1193, "rewards/accuracies": 0.875, "rewards/chosen": 0.22805273532867432, "rewards/margins": 0.32633835077285767, "rewards/rejected": -0.09828563034534454, "step": 234 }, { "epoch": 0.012630673725511273, "grad_norm": 9.872260093688965, "learning_rate": 9.952846702217885e-07, "logits/chosen": -0.6219691038131714, "logits/rejected": -0.8194465637207031, "logps/chosen": -74.91395568847656, "logps/rejected": -106.57698822021484, "loss": 1.1828, "rewards/accuracies": 0.75, "rewards/chosen": 0.2802756428718567, "rewards/margins": 0.22285690903663635, "rewards/rejected": 0.05741872638463974, "step": 235 }, { "epoch": 0.012684421273279407, "grad_norm": 9.46263599395752, "learning_rate": 9.950118288582787e-07, "logits/chosen": -0.5394343137741089, "logits/rejected": -0.8429723381996155, "logps/chosen": -78.40309143066406, "logps/rejected": -119.71903991699219, "loss": 1.2401, "rewards/accuracies": 0.875, "rewards/chosen": 0.16660980880260468, "rewards/margins": 0.15729470551013947, "rewards/rejected": 0.009315107017755508, "step": 236 }, { "epoch": 0.01273816882104754, "grad_norm": 7.394505500793457, "learning_rate": 9.947313538000092e-07, "logits/chosen": -0.5725371837615967, "logits/rejected": -0.6441730260848999, "logps/chosen": -69.10836029052734, "logps/rejected": -93.74408721923828, "loss": 1.1444, "rewards/accuracies": 1.0, "rewards/chosen": 0.11338989436626434, "rewards/margins": 0.2632651925086975, "rewards/rejected": -0.14987531304359436, "step": 237 }, { "epoch": 0.012791916368815673, "grad_norm": 7.916663646697998, "learning_rate": 9.944432493722524e-07, "logits/chosen": -0.38268229365348816, "logits/rejected": -0.5627045035362244, "logps/chosen": -68.636474609375, "logps/rejected": -103.48701477050781, "loss": 1.1002, "rewards/accuracies": 0.875, "rewards/chosen": 0.22320647537708282, "rewards/margins": 0.32132166624069214, "rewards/rejected": -0.09811516106128693, "step": 238 }, { "epoch": 0.012845663916583806, "grad_norm": 8.479836463928223, "learning_rate": 9.941475200179346e-07, "logits/chosen": -0.5663607716560364, "logits/rejected": -0.7409512996673584, "logps/chosen": -89.14552307128906, "logps/rejected": -122.3517837524414, "loss": 1.1899, "rewards/accuracies": 0.75, "rewards/chosen": 0.13951130211353302, "rewards/margins": 0.23840561509132385, "rewards/rejected": -0.09889431297779083, "step": 239 }, { "epoch": 0.012899411464351938, "grad_norm": 8.626564979553223, "learning_rate": 9.938441702975689e-07, "logits/chosen": -0.47271132469177246, "logits/rejected": -0.7084850668907166, "logps/chosen": -98.41950225830078, "logps/rejected": -148.84288024902344, "loss": 1.1146, "rewards/accuracies": 0.75, "rewards/chosen": 0.10083074867725372, "rewards/margins": 0.3182566463947296, "rewards/rejected": -0.21742592751979828, "step": 240 }, { "epoch": 0.012953159012120073, "grad_norm": 8.52356243133545, "learning_rate": 9.935332048891826e-07, "logits/chosen": -0.5194116234779358, "logits/rejected": -0.6223669052124023, "logps/chosen": -90.4648666381836, "logps/rejected": -140.77615356445312, "loss": 1.1332, "rewards/accuracies": 1.0, "rewards/chosen": 0.15672340989112854, "rewards/margins": 0.27920961380004883, "rewards/rejected": -0.12248621881008148, "step": 241 }, { "epoch": 0.013006906559888206, "grad_norm": 9.13849925994873, "learning_rate": 9.932146285882476e-07, "logits/chosen": -0.43376827239990234, "logits/rejected": -0.9578706622123718, "logps/chosen": -96.71780395507812, "logps/rejected": -124.5494613647461, "loss": 1.1259, "rewards/accuracies": 0.75, "rewards/chosen": 0.20085996389389038, "rewards/margins": 0.29483500123023987, "rewards/rejected": -0.0939750224351883, "step": 242 }, { "epoch": 0.013060654107656338, "grad_norm": 8.863753318786621, "learning_rate": 9.928884463076043e-07, "logits/chosen": -0.37400951981544495, "logits/rejected": -0.6572523713111877, "logps/chosen": -86.31878662109375, "logps/rejected": -139.85658264160156, "loss": 0.9777, "rewards/accuracies": 1.0, "rewards/chosen": 0.28231894969940186, "rewards/margins": 0.5099464058876038, "rewards/rejected": -0.2276274710893631, "step": 243 }, { "epoch": 0.013114401655424471, "grad_norm": 8.157885551452637, "learning_rate": 9.925546630773868e-07, "logits/chosen": -0.5754011273384094, "logits/rejected": -0.7451348900794983, "logps/chosen": -68.57931518554688, "logps/rejected": -114.21456909179688, "loss": 1.0157, "rewards/accuracies": 1.0, "rewards/chosen": 0.31994327902793884, "rewards/margins": 0.436386376619339, "rewards/rejected": -0.11644310504198074, "step": 244 }, { "epoch": 0.013168149203192604, "grad_norm": 8.603972434997559, "learning_rate": 9.922132840449458e-07, "logits/chosen": -0.4437783360481262, "logits/rejected": -0.7457876801490784, "logps/chosen": -104.11923217773438, "logps/rejected": -125.06718444824219, "loss": 1.0947, "rewards/accuracies": 0.875, "rewards/chosen": 0.25450336933135986, "rewards/margins": 0.3329158425331116, "rewards/rejected": -0.0784124881029129, "step": 245 }, { "epoch": 0.013221896750960737, "grad_norm": 7.682473182678223, "learning_rate": 9.91864314474768e-07, "logits/chosen": -0.7631401419639587, "logits/rejected": -0.7120974659919739, "logps/chosen": -57.73262405395508, "logps/rejected": -72.64689636230469, "loss": 1.2067, "rewards/accuracies": 0.75, "rewards/chosen": 0.10729515552520752, "rewards/margins": 0.19412139058113098, "rewards/rejected": -0.08682622015476227, "step": 246 }, { "epoch": 0.013275644298728871, "grad_norm": 9.09015941619873, "learning_rate": 9.915077597483958e-07, "logits/chosen": -0.691004753112793, "logits/rejected": -0.8626288175582886, "logps/chosen": -90.16621398925781, "logps/rejected": -135.4027862548828, "loss": 1.2029, "rewards/accuracies": 0.75, "rewards/chosen": 0.18838340044021606, "rewards/margins": 0.21971102058887482, "rewards/rejected": -0.03132762759923935, "step": 247 }, { "epoch": 0.013329391846497004, "grad_norm": 6.7129621505737305, "learning_rate": 9.911436253643443e-07, "logits/chosen": -0.6010236740112305, "logits/rejected": -0.9694432020187378, "logps/chosen": -51.10399627685547, "logps/rejected": -74.51303100585938, "loss": 1.3418, "rewards/accuracies": 0.75, "rewards/chosen": 0.2066667526960373, "rewards/margins": 0.05391811951994896, "rewards/rejected": 0.15274862945079803, "step": 248 }, { "epoch": 0.013383139394265137, "grad_norm": 7.616147994995117, "learning_rate": 9.907719169380162e-07, "logits/chosen": -0.5555158853530884, "logits/rejected": -1.0062698125839233, "logps/chosen": -85.89315795898438, "logps/rejected": -127.45658874511719, "loss": 1.1302, "rewards/accuracies": 0.75, "rewards/chosen": 0.13865943253040314, "rewards/margins": 0.3135596811771393, "rewards/rejected": -0.17490024864673615, "step": 249 }, { "epoch": 0.01343688694203327, "grad_norm": 7.474664211273193, "learning_rate": 9.90392640201615e-07, "logits/chosen": -0.3878192603588104, "logits/rejected": -0.804552435874939, "logps/chosen": -80.22721862792969, "logps/rejected": -111.23281860351562, "loss": 1.08, "rewards/accuracies": 0.875, "rewards/chosen": 0.10335011780261993, "rewards/margins": 0.39268797636032104, "rewards/rejected": -0.2893378436565399, "step": 250 }, { "epoch": 0.013490634489801402, "grad_norm": 8.29160213470459, "learning_rate": 9.900058010040577e-07, "logits/chosen": -0.43254974484443665, "logits/rejected": -0.7992380261421204, "logps/chosen": -85.9269027709961, "logps/rejected": -113.69029235839844, "loss": 1.257, "rewards/accuracies": 0.75, "rewards/chosen": 0.14866304397583008, "rewards/margins": 0.16407746076583862, "rewards/rejected": -0.015414435416460037, "step": 251 }, { "epoch": 0.013544382037569537, "grad_norm": 7.953009605407715, "learning_rate": 9.89611405310883e-07, "logits/chosen": -0.43862664699554443, "logits/rejected": -0.7818647623062134, "logps/chosen": -88.93862915039062, "logps/rejected": -120.42898559570312, "loss": 0.9718, "rewards/accuracies": 1.0, "rewards/chosen": 0.20921139419078827, "rewards/margins": 0.5064070820808411, "rewards/rejected": -0.2971956729888916, "step": 252 }, { "epoch": 0.01359812958533767, "grad_norm": 10.189489364624023, "learning_rate": 9.8920945920416e-07, "logits/chosen": -0.5023062825202942, "logits/rejected": -0.982239842414856, "logps/chosen": -74.873779296875, "logps/rejected": -108.82559204101562, "loss": 1.0636, "rewards/accuracies": 0.875, "rewards/chosen": 0.2425529956817627, "rewards/margins": 0.3734268844127655, "rewards/rejected": -0.1308738738298416, "step": 253 }, { "epoch": 0.013651877133105802, "grad_norm": 11.489689826965332, "learning_rate": 9.887999688823954e-07, "logits/chosen": -0.30702269077301025, "logits/rejected": -0.6260428428649902, "logps/chosen": -92.19865417480469, "logps/rejected": -124.37353515625, "loss": 1.1542, "rewards/accuracies": 0.875, "rewards/chosen": 0.3126932382583618, "rewards/margins": 0.26121002435684204, "rewards/rejected": 0.05148324370384216, "step": 254 }, { "epoch": 0.013705624680873935, "grad_norm": 7.918270587921143, "learning_rate": 9.883829406604361e-07, "logits/chosen": -0.5766696929931641, "logits/rejected": -1.1000299453735352, "logps/chosen": -87.10360717773438, "logps/rejected": -154.98800659179688, "loss": 0.9432, "rewards/accuracies": 1.0, "rewards/chosen": 0.18990127742290497, "rewards/margins": 0.5358986854553223, "rewards/rejected": -0.3459973931312561, "step": 255 }, { "epoch": 0.013759372228642068, "grad_norm": 8.557303428649902, "learning_rate": 9.879583809693736e-07, "logits/chosen": -0.22856351733207703, "logits/rejected": -0.864416241645813, "logps/chosen": -103.28632354736328, "logps/rejected": -154.03309631347656, "loss": 0.8591, "rewards/accuracies": 0.875, "rewards/chosen": 0.2975628972053528, "rewards/margins": 0.6666656732559204, "rewards/rejected": -0.3691027760505676, "step": 256 }, { "epoch": 0.0138131197764102, "grad_norm": 8.320441246032715, "learning_rate": 9.875262963564435e-07, "logits/chosen": -0.5290526151657104, "logits/rejected": -0.7232424020767212, "logps/chosen": -82.62203979492188, "logps/rejected": -149.920654296875, "loss": 1.046, "rewards/accuracies": 1.0, "rewards/chosen": 0.16208958625793457, "rewards/margins": 0.39549511671066284, "rewards/rejected": -0.23340550065040588, "step": 257 }, { "epoch": 0.013866867324178335, "grad_norm": 7.353522777557373, "learning_rate": 9.870866934849246e-07, "logits/chosen": -0.6042935252189636, "logits/rejected": -0.9732577800750732, "logps/chosen": -67.07771301269531, "logps/rejected": -84.36785125732422, "loss": 1.1253, "rewards/accuracies": 0.875, "rewards/chosen": 0.16771726310253143, "rewards/margins": 0.3118589520454407, "rewards/rejected": -0.14414167404174805, "step": 258 }, { "epoch": 0.013920614871946468, "grad_norm": 9.230646133422852, "learning_rate": 9.866395791340374e-07, "logits/chosen": -0.5574221014976501, "logits/rejected": -0.6890866756439209, "logps/chosen": -87.2138442993164, "logps/rejected": -133.9090576171875, "loss": 1.1809, "rewards/accuracies": 0.75, "rewards/chosen": 0.09173579514026642, "rewards/margins": 0.2554128170013428, "rewards/rejected": -0.16367703676223755, "step": 259 }, { "epoch": 0.0139743624197146, "grad_norm": 7.673941612243652, "learning_rate": 9.861849601988383e-07, "logits/chosen": -0.4745372533798218, "logits/rejected": -0.5913327932357788, "logps/chosen": -82.71144104003906, "logps/rejected": -130.95370483398438, "loss": 0.9595, "rewards/accuracies": 0.875, "rewards/chosen": 0.32120776176452637, "rewards/margins": 0.5713176131248474, "rewards/rejected": -0.2501099109649658, "step": 260 }, { "epoch": 0.014028109967482733, "grad_norm": 5.969098091125488, "learning_rate": 9.857228436901134e-07, "logits/chosen": -0.5343005657196045, "logits/rejected": -0.44417738914489746, "logps/chosen": -58.944358825683594, "logps/rejected": -89.23226165771484, "loss": 1.1487, "rewards/accuracies": 0.875, "rewards/chosen": 0.13331182301044464, "rewards/margins": 0.26338741183280945, "rewards/rejected": -0.1300755888223648, "step": 261 }, { "epoch": 0.014081857515250866, "grad_norm": 8.642870903015137, "learning_rate": 9.852532367342712e-07, "logits/chosen": -0.5430492758750916, "logits/rejected": -0.919087290763855, "logps/chosen": -93.74098205566406, "logps/rejected": -122.88327026367188, "loss": 1.0657, "rewards/accuracies": 0.875, "rewards/chosen": 0.1623714417219162, "rewards/margins": 0.36680108308792114, "rewards/rejected": -0.20442962646484375, "step": 262 }, { "epoch": 0.014135605063019, "grad_norm": 8.33414077758789, "learning_rate": 9.847761465732316e-07, "logits/chosen": -0.4385722875595093, "logits/rejected": -0.6532965898513794, "logps/chosen": -79.09358215332031, "logps/rejected": -116.81510162353516, "loss": 1.0642, "rewards/accuracies": 0.875, "rewards/chosen": 0.22798743844032288, "rewards/margins": 0.4548647999763489, "rewards/rejected": -0.2268773764371872, "step": 263 }, { "epoch": 0.014189352610787133, "grad_norm": 7.47625732421875, "learning_rate": 9.842915805643156e-07, "logits/chosen": -0.48590952157974243, "logits/rejected": -0.7134944200515747, "logps/chosen": -83.9005126953125, "logps/rejected": -124.9341812133789, "loss": 0.9709, "rewards/accuracies": 1.0, "rewards/chosen": 0.2338734120130539, "rewards/margins": 0.4882698655128479, "rewards/rejected": -0.2543964385986328, "step": 264 }, { "epoch": 0.014243100158555266, "grad_norm": 8.187015533447266, "learning_rate": 9.837995461801299e-07, "logits/chosen": -0.23714786767959595, "logits/rejected": -0.674926221370697, "logps/chosen": -60.35591125488281, "logps/rejected": -77.57908630371094, "loss": 1.1285, "rewards/accuracies": 1.0, "rewards/chosen": 0.24997663497924805, "rewards/margins": 0.2878130078315735, "rewards/rejected": -0.037836357951164246, "step": 265 }, { "epoch": 0.014296847706323399, "grad_norm": 8.187094688415527, "learning_rate": 9.833000510084537e-07, "logits/chosen": -0.46092841029167175, "logits/rejected": -0.7724924087524414, "logps/chosen": -83.680419921875, "logps/rejected": -112.01083374023438, "loss": 1.1699, "rewards/accuracies": 0.875, "rewards/chosen": 0.14704155921936035, "rewards/margins": 0.2566852867603302, "rewards/rejected": -0.10964371263980865, "step": 266 }, { "epoch": 0.014350595254091532, "grad_norm": 8.889602661132812, "learning_rate": 9.827931027521203e-07, "logits/chosen": -0.5110130310058594, "logits/rejected": -0.5687258243560791, "logps/chosen": -108.56094360351562, "logps/rejected": -124.67503356933594, "loss": 1.0632, "rewards/accuracies": 0.875, "rewards/chosen": 0.18289077281951904, "rewards/margins": 0.39289993047714233, "rewards/rejected": -0.2100091576576233, "step": 267 }, { "epoch": 0.014404342801859666, "grad_norm": 9.36739730834961, "learning_rate": 9.82278709228899e-07, "logits/chosen": -0.4345582127571106, "logits/rejected": -0.5341508388519287, "logps/chosen": -86.82331085205078, "logps/rejected": -104.12142181396484, "loss": 1.1031, "rewards/accuracies": 1.0, "rewards/chosen": 0.2522311210632324, "rewards/margins": 0.3170914053916931, "rewards/rejected": -0.0648602545261383, "step": 268 }, { "epoch": 0.014458090349627799, "grad_norm": 7.332289695739746, "learning_rate": 9.817568783713743e-07, "logits/chosen": -0.768648087978363, "logits/rejected": -0.9154085516929626, "logps/chosen": -85.2851333618164, "logps/rejected": -93.83207702636719, "loss": 1.1129, "rewards/accuracies": 1.0, "rewards/chosen": 0.20519228279590607, "rewards/margins": 0.31278467178344727, "rewards/rejected": -0.1075923889875412, "step": 269 }, { "epoch": 0.014511837897395932, "grad_norm": 9.34799861907959, "learning_rate": 9.812276182268236e-07, "logits/chosen": -0.5562636256217957, "logits/rejected": -0.6497949361801147, "logps/chosen": -84.54762268066406, "logps/rejected": -142.047607421875, "loss": 0.9431, "rewards/accuracies": 1.0, "rewards/chosen": 0.12430868297815323, "rewards/margins": 0.5213713049888611, "rewards/rejected": -0.39706259965896606, "step": 270 }, { "epoch": 0.014565585445164064, "grad_norm": 7.788771629333496, "learning_rate": 9.80690936957093e-07, "logits/chosen": -0.5215786099433899, "logits/rejected": -0.6321144104003906, "logps/chosen": -102.1207275390625, "logps/rejected": -114.18605041503906, "loss": 1.147, "rewards/accuracies": 0.875, "rewards/chosen": 0.15436579287052155, "rewards/margins": 0.280081570148468, "rewards/rejected": -0.12571577727794647, "step": 271 }, { "epoch": 0.014619332992932197, "grad_norm": 6.995871067047119, "learning_rate": 9.801468428384716e-07, "logits/chosen": -0.4826238751411438, "logits/rejected": -1.045019507408142, "logps/chosen": -51.4791259765625, "logps/rejected": -64.26348114013672, "loss": 1.2279, "rewards/accuracies": 0.75, "rewards/chosen": 0.16225357353687286, "rewards/margins": 0.21734732389450073, "rewards/rejected": -0.05509376898407936, "step": 272 }, { "epoch": 0.01467308054070033, "grad_norm": 8.320442199707031, "learning_rate": 9.795953442615637e-07, "logits/chosen": -0.2795771360397339, "logits/rejected": -0.9269124269485474, "logps/chosen": -83.69010925292969, "logps/rejected": -132.68978881835938, "loss": 0.9347, "rewards/accuracies": 0.875, "rewards/chosen": 0.28460603952407837, "rewards/margins": 0.5781921744346619, "rewards/rejected": -0.2935860753059387, "step": 273 }, { "epoch": 0.014726828088468464, "grad_norm": 7.224923133850098, "learning_rate": 9.790364497311595e-07, "logits/chosen": -0.516793966293335, "logits/rejected": -0.5459295511245728, "logps/chosen": -71.00935363769531, "logps/rejected": -104.81021118164062, "loss": 0.9845, "rewards/accuracies": 1.0, "rewards/chosen": 0.29926228523254395, "rewards/margins": 0.5231636762619019, "rewards/rejected": -0.2239014208316803, "step": 274 }, { "epoch": 0.014780575636236597, "grad_norm": 8.644327163696289, "learning_rate": 9.784701678661044e-07, "logits/chosen": -0.29655879735946655, "logits/rejected": -0.696053147315979, "logps/chosen": -73.27018737792969, "logps/rejected": -96.52377319335938, "loss": 1.1951, "rewards/accuracies": 0.875, "rewards/chosen": 0.20520314574241638, "rewards/margins": 0.20678368210792542, "rewards/rejected": -0.001580517739057541, "step": 275 }, { "epoch": 0.01483432318400473, "grad_norm": 9.182957649230957, "learning_rate": 9.77896507399165e-07, "logits/chosen": -0.5673010349273682, "logits/rejected": -0.6784734725952148, "logps/chosen": -88.32586669921875, "logps/rejected": -138.0816192626953, "loss": 1.1084, "rewards/accuracies": 0.75, "rewards/chosen": 0.19880113005638123, "rewards/margins": 0.33900701999664307, "rewards/rejected": -0.14020586013793945, "step": 276 }, { "epoch": 0.014888070731772863, "grad_norm": 7.926046848297119, "learning_rate": 9.773154771768955e-07, "logits/chosen": -0.44852402806282043, "logits/rejected": -0.8911378979682922, "logps/chosen": -79.64956665039062, "logps/rejected": -105.65498352050781, "loss": 1.1516, "rewards/accuracies": 0.875, "rewards/chosen": 0.29726237058639526, "rewards/margins": 0.28044331073760986, "rewards/rejected": 0.016819097101688385, "step": 277 }, { "epoch": 0.014941818279540995, "grad_norm": 8.613622665405273, "learning_rate": 9.767270861595004e-07, "logits/chosen": -0.6433182954788208, "logits/rejected": -0.8262368440628052, "logps/chosen": -75.862060546875, "logps/rejected": -110.22479248046875, "loss": 1.113, "rewards/accuracies": 0.75, "rewards/chosen": 0.17685633897781372, "rewards/margins": 0.3377441465854645, "rewards/rejected": -0.16088780760765076, "step": 278 }, { "epoch": 0.01499556582730913, "grad_norm": 7.498295307159424, "learning_rate": 9.761313434206977e-07, "logits/chosen": -0.5290209054946899, "logits/rejected": -0.6595941781997681, "logps/chosen": -93.25931549072266, "logps/rejected": -114.23931884765625, "loss": 0.9943, "rewards/accuracies": 0.875, "rewards/chosen": 0.14882898330688477, "rewards/margins": 0.49634385108947754, "rewards/rejected": -0.3475148677825928, "step": 279 }, { "epoch": 0.015049313375077263, "grad_norm": 7.240914821624756, "learning_rate": 9.755282581475767e-07, "logits/chosen": -0.4989565908908844, "logits/rejected": -0.6776658296585083, "logps/chosen": -82.03950500488281, "logps/rejected": -100.86872863769531, "loss": 1.0541, "rewards/accuracies": 0.875, "rewards/chosen": 0.28837645053863525, "rewards/margins": 0.3993419408798218, "rewards/rejected": -0.11096549034118652, "step": 280 }, { "epoch": 0.015103060922845395, "grad_norm": 6.950934410095215, "learning_rate": 9.749178396404588e-07, "logits/chosen": -0.3202768564224243, "logits/rejected": -0.8115454912185669, "logps/chosen": -61.45012283325195, "logps/rejected": -85.72331237792969, "loss": 1.1402, "rewards/accuracies": 0.875, "rewards/chosen": 0.24857988953590393, "rewards/margins": 0.2943744361400604, "rewards/rejected": -0.04579455032944679, "step": 281 }, { "epoch": 0.015156808470613528, "grad_norm": 6.702651023864746, "learning_rate": 9.743000973127523e-07, "logits/chosen": -0.6097660064697266, "logits/rejected": -0.8681362867355347, "logps/chosen": -59.426475524902344, "logps/rejected": -101.32482147216797, "loss": 1.0391, "rewards/accuracies": 0.875, "rewards/chosen": 0.24413172900676727, "rewards/margins": 0.4027537405490875, "rewards/rejected": -0.15862202644348145, "step": 282 }, { "epoch": 0.01521055601838166, "grad_norm": 7.214409828186035, "learning_rate": 9.73675040690808e-07, "logits/chosen": -0.4117346405982971, "logits/rejected": -0.7110905647277832, "logps/chosen": -96.80953216552734, "logps/rejected": -143.1290283203125, "loss": 0.8017, "rewards/accuracies": 1.0, "rewards/chosen": 0.3847959339618683, "rewards/margins": 0.7773008942604065, "rewards/rejected": -0.39250487089157104, "step": 283 }, { "epoch": 0.015264303566149794, "grad_norm": 9.447858810424805, "learning_rate": 9.730426794137726e-07, "logits/chosen": -0.46513450145721436, "logits/rejected": -0.7564966678619385, "logps/chosen": -94.57246398925781, "logps/rejected": -120.65567016601562, "loss": 0.999, "rewards/accuracies": 1.0, "rewards/chosen": 0.25270184874534607, "rewards/margins": 0.4643063247203827, "rewards/rejected": -0.21160446107387543, "step": 284 }, { "epoch": 0.015318051113917928, "grad_norm": 8.516096115112305, "learning_rate": 9.72403023233439e-07, "logits/chosen": -0.16967588663101196, "logits/rejected": -0.6391971111297607, "logps/chosen": -82.84049987792969, "logps/rejected": -115.44432067871094, "loss": 0.8835, "rewards/accuracies": 1.0, "rewards/chosen": 0.24860167503356934, "rewards/margins": 0.6482660174369812, "rewards/rejected": -0.3996642827987671, "step": 285 }, { "epoch": 0.01537179866168606, "grad_norm": 10.064424514770508, "learning_rate": 9.717560820140968e-07, "logits/chosen": -0.3854745030403137, "logits/rejected": -0.6414456367492676, "logps/chosen": -75.41014099121094, "logps/rejected": -109.79766845703125, "loss": 1.1009, "rewards/accuracies": 0.75, "rewards/chosen": 0.2192663997411728, "rewards/margins": 0.3585866689682007, "rewards/rejected": -0.1393202841281891, "step": 286 }, { "epoch": 0.015425546209454194, "grad_norm": 6.992526054382324, "learning_rate": 9.711018657323798e-07, "logits/chosen": -0.6344618201255798, "logits/rejected": -0.9946074485778809, "logps/chosen": -68.52722930908203, "logps/rejected": -115.47061920166016, "loss": 1.0299, "rewards/accuracies": 0.875, "rewards/chosen": 0.3005034327507019, "rewards/margins": 0.4479685425758362, "rewards/rejected": -0.1474650800228119, "step": 287 }, { "epoch": 0.015479293757222326, "grad_norm": 9.140570640563965, "learning_rate": 9.704403844771127e-07, "logits/chosen": -0.411197304725647, "logits/rejected": -0.7791616916656494, "logps/chosen": -99.13587188720703, "logps/rejected": -147.218017578125, "loss": 1.0681, "rewards/accuracies": 0.75, "rewards/chosen": 0.1548137664794922, "rewards/margins": 0.39904481172561646, "rewards/rejected": -0.24423104524612427, "step": 288 }, { "epoch": 0.01553304130499046, "grad_norm": 7.424576282501221, "learning_rate": 9.697716484491545e-07, "logits/chosen": -0.5678108930587769, "logits/rejected": -0.7492731213569641, "logps/chosen": -90.81136322021484, "logps/rejected": -138.5076446533203, "loss": 0.7894, "rewards/accuracies": 1.0, "rewards/chosen": 0.2965516448020935, "rewards/margins": 0.7807052135467529, "rewards/rejected": -0.4841535985469818, "step": 289 }, { "epoch": 0.015586788852758594, "grad_norm": 7.465813636779785, "learning_rate": 9.69095667961242e-07, "logits/chosen": -0.40255221724510193, "logits/rejected": -0.658112645149231, "logps/chosen": -86.02987670898438, "logps/rejected": -108.27945709228516, "loss": 0.9936, "rewards/accuracies": 1.0, "rewards/chosen": 0.30489006638526917, "rewards/margins": 0.4739183485507965, "rewards/rejected": -0.16902829706668854, "step": 290 }, { "epoch": 0.015640536400526726, "grad_norm": 8.759163856506348, "learning_rate": 9.684124534378306e-07, "logits/chosen": -0.4031268060207367, "logits/rejected": -0.5718150734901428, "logps/chosen": -72.60466003417969, "logps/rejected": -90.86416625976562, "loss": 1.0946, "rewards/accuracies": 0.75, "rewards/chosen": 0.12894652783870697, "rewards/margins": 0.3612363934516907, "rewards/rejected": -0.2322898656129837, "step": 291 }, { "epoch": 0.01569428394829486, "grad_norm": 7.793913841247559, "learning_rate": 9.677220154149337e-07, "logits/chosen": -0.5084730386734009, "logits/rejected": -0.9328963756561279, "logps/chosen": -57.33200454711914, "logps/rejected": -103.25164794921875, "loss": 1.088, "rewards/accuracies": 1.0, "rewards/chosen": 0.28789860010147095, "rewards/margins": 0.3288136124610901, "rewards/rejected": -0.04091501235961914, "step": 292 }, { "epoch": 0.015748031496062992, "grad_norm": 7.355979919433594, "learning_rate": 9.670243645399592e-07, "logits/chosen": -0.23863175511360168, "logits/rejected": -0.6393299102783203, "logps/chosen": -73.53858184814453, "logps/rejected": -106.6146240234375, "loss": 1.1431, "rewards/accuracies": 0.75, "rewards/chosen": 0.17891855537891388, "rewards/margins": 0.29221975803375244, "rewards/rejected": -0.11330118775367737, "step": 293 }, { "epoch": 0.015801779043831125, "grad_norm": 9.70301342010498, "learning_rate": 9.66319511571547e-07, "logits/chosen": -0.5887746810913086, "logits/rejected": -0.8464258909225464, "logps/chosen": -72.31002807617188, "logps/rejected": -113.12400817871094, "loss": 1.0684, "rewards/accuracies": 1.0, "rewards/chosen": 0.18143503367900848, "rewards/margins": 0.38432347774505615, "rewards/rejected": -0.20288844406604767, "step": 294 }, { "epoch": 0.015855526591599257, "grad_norm": 8.012235641479492, "learning_rate": 9.656074673794017e-07, "logits/chosen": -0.48777371644973755, "logits/rejected": -0.7305865287780762, "logps/chosen": -76.85704040527344, "logps/rejected": -106.29739379882812, "loss": 1.1032, "rewards/accuracies": 0.75, "rewards/chosen": 0.23623672127723694, "rewards/margins": 0.348111093044281, "rewards/rejected": -0.11187433451414108, "step": 295 }, { "epoch": 0.01590927413936739, "grad_norm": 8.312561988830566, "learning_rate": 9.648882429441256e-07, "logits/chosen": -0.6219150424003601, "logits/rejected": -0.7722678184509277, "logps/chosen": -81.02972412109375, "logps/rejected": -108.74473571777344, "loss": 1.0751, "rewards/accuracies": 0.875, "rewards/chosen": 0.2990270256996155, "rewards/margins": 0.3803249001502991, "rewards/rejected": -0.0812978744506836, "step": 296 }, { "epoch": 0.015963021687135523, "grad_norm": 7.4272685050964355, "learning_rate": 9.641618493570494e-07, "logits/chosen": -0.610638439655304, "logits/rejected": -0.6773062944412231, "logps/chosen": -78.98297119140625, "logps/rejected": -98.52257537841797, "loss": 1.0557, "rewards/accuracies": 1.0, "rewards/chosen": 0.2879084348678589, "rewards/margins": 0.3793227970600128, "rewards/rejected": -0.09141434729099274, "step": 297 }, { "epoch": 0.01601676923490366, "grad_norm": 9.1967191696167, "learning_rate": 9.634282978200603e-07, "logits/chosen": -0.47958338260650635, "logits/rejected": -0.7445369958877563, "logps/chosen": -88.43070983886719, "logps/rejected": -117.46142578125, "loss": 1.0955, "rewards/accuracies": 0.75, "rewards/chosen": 0.21477727591991425, "rewards/margins": 0.353880912065506, "rewards/rejected": -0.13910359144210815, "step": 298 }, { "epoch": 0.016070516782671792, "grad_norm": 8.55931568145752, "learning_rate": 9.62687599645431e-07, "logits/chosen": -0.5550699830055237, "logits/rejected": -0.8242961168289185, "logps/chosen": -95.43873596191406, "logps/rejected": -119.18885803222656, "loss": 1.0174, "rewards/accuracies": 0.875, "rewards/chosen": 0.2723308205604553, "rewards/margins": 0.4446273446083069, "rewards/rejected": -0.17229653894901276, "step": 299 }, { "epoch": 0.016124264330439925, "grad_norm": 9.203601837158203, "learning_rate": 9.619397662556433e-07, "logits/chosen": -0.6323695182800293, "logits/rejected": -0.7377463579177856, "logps/chosen": -89.126220703125, "logps/rejected": -110.41032409667969, "loss": 0.8626, "rewards/accuracies": 1.0, "rewards/chosen": 0.28707414865493774, "rewards/margins": 0.6520431637763977, "rewards/rejected": -0.36496901512145996, "step": 300 }, { "epoch": 0.016178011878208057, "grad_norm": 6.706488132476807, "learning_rate": 9.611848091832133e-07, "logits/chosen": -0.5003226399421692, "logits/rejected": -0.47677677869796753, "logps/chosen": -73.65802764892578, "logps/rejected": -127.02882385253906, "loss": 0.8246, "rewards/accuracies": 1.0, "rewards/chosen": 0.4354723393917084, "rewards/margins": 0.6970818042755127, "rewards/rejected": -0.26160943508148193, "step": 301 }, { "epoch": 0.01623175942597619, "grad_norm": 8.04930305480957, "learning_rate": 9.604227400705132e-07, "logits/chosen": -0.46409064531326294, "logits/rejected": -0.8065653443336487, "logps/chosen": -81.9608383178711, "logps/rejected": -112.73027801513672, "loss": 1.0681, "rewards/accuracies": 0.875, "rewards/chosen": 0.400997132062912, "rewards/margins": 0.3888436555862427, "rewards/rejected": 0.01215343363583088, "step": 302 }, { "epoch": 0.016285506973744323, "grad_norm": 8.2164888381958, "learning_rate": 9.59653570669591e-07, "logits/chosen": -0.33537808060646057, "logits/rejected": -0.8108541965484619, "logps/chosen": -71.90696716308594, "logps/rejected": -102.60675811767578, "loss": 1.2689, "rewards/accuracies": 0.75, "rewards/chosen": 0.29837486147880554, "rewards/margins": 0.1307733654975891, "rewards/rejected": 0.16760149598121643, "step": 303 }, { "epoch": 0.016339254521512456, "grad_norm": 7.2594895362854, "learning_rate": 9.588773128419905e-07, "logits/chosen": -0.4910351634025574, "logits/rejected": -0.5714324116706848, "logps/chosen": -84.32911682128906, "logps/rejected": -108.97666931152344, "loss": 0.9245, "rewards/accuracies": 1.0, "rewards/chosen": 0.2599973678588867, "rewards/margins": 0.5889714360237122, "rewards/rejected": -0.32897406816482544, "step": 304 }, { "epoch": 0.01639300206928059, "grad_norm": 8.445978164672852, "learning_rate": 9.58093978558568e-07, "logits/chosen": -0.44651299715042114, "logits/rejected": -0.9653267860412598, "logps/chosen": -86.14595031738281, "logps/rejected": -141.41494750976562, "loss": 0.8677, "rewards/accuracies": 0.875, "rewards/chosen": 0.31153959035873413, "rewards/margins": 0.6685594916343689, "rewards/rejected": -0.35701990127563477, "step": 305 }, { "epoch": 0.01644674961704872, "grad_norm": 8.330660820007324, "learning_rate": 9.573035798993068e-07, "logits/chosen": -0.3970104455947876, "logits/rejected": -0.6319835186004639, "logps/chosen": -88.41896057128906, "logps/rejected": -110.83444213867188, "loss": 1.1999, "rewards/accuracies": 0.625, "rewards/chosen": 0.1607862412929535, "rewards/margins": 0.22908997535705566, "rewards/rejected": -0.06830373406410217, "step": 306 }, { "epoch": 0.016500497164816854, "grad_norm": 7.503106117248535, "learning_rate": 9.56506129053132e-07, "logits/chosen": -0.4842306673526764, "logits/rejected": -0.7397516965866089, "logps/chosen": -67.37020874023438, "logps/rejected": -109.20219421386719, "loss": 1.0329, "rewards/accuracies": 0.75, "rewards/chosen": 0.24541062116622925, "rewards/margins": 0.4451240301132202, "rewards/rejected": -0.19971340894699097, "step": 307 }, { "epoch": 0.016554244712584987, "grad_norm": 7.155852317810059, "learning_rate": 9.557016383177225e-07, "logits/chosen": -0.40336179733276367, "logits/rejected": -0.6598275899887085, "logps/chosen": -82.82121276855469, "logps/rejected": -103.10366821289062, "loss": 1.0492, "rewards/accuracies": 0.875, "rewards/chosen": 0.34067612886428833, "rewards/margins": 0.4065336287021637, "rewards/rejected": -0.06585750728845596, "step": 308 }, { "epoch": 0.016607992260353123, "grad_norm": 7.3789143562316895, "learning_rate": 9.548901200993204e-07, "logits/chosen": -0.4259952902793884, "logits/rejected": -0.8674143552780151, "logps/chosen": -84.60432434082031, "logps/rejected": -112.87922668457031, "loss": 1.0037, "rewards/accuracies": 0.875, "rewards/chosen": 0.3106076121330261, "rewards/margins": 0.4831881821155548, "rewards/rejected": -0.1725805401802063, "step": 309 }, { "epoch": 0.016661739808121256, "grad_norm": 7.783058166503906, "learning_rate": 9.540715869125407e-07, "logits/chosen": -0.3333371877670288, "logits/rejected": -0.635305643081665, "logps/chosen": -82.22999572753906, "logps/rejected": -113.31059265136719, "loss": 1.0562, "rewards/accuracies": 0.875, "rewards/chosen": 0.2867959141731262, "rewards/margins": 0.3969234824180603, "rewards/rejected": -0.11012755334377289, "step": 310 }, { "epoch": 0.01671548735588939, "grad_norm": 8.2703275680542, "learning_rate": 9.532460513801773e-07, "logits/chosen": -0.5109522342681885, "logits/rejected": -0.8237969875335693, "logps/chosen": -61.520729064941406, "logps/rejected": -108.67868041992188, "loss": 1.0741, "rewards/accuracies": 0.875, "rewards/chosen": 0.42062604427337646, "rewards/margins": 0.38874348998069763, "rewards/rejected": 0.031882528215646744, "step": 311 }, { "epoch": 0.01676923490365752, "grad_norm": 8.926196098327637, "learning_rate": 9.524135262330098e-07, "logits/chosen": -0.49644413590431213, "logits/rejected": -0.8699883818626404, "logps/chosen": -90.44168090820312, "logps/rejected": -110.64535522460938, "loss": 1.159, "rewards/accuracies": 0.75, "rewards/chosen": 0.24056756496429443, "rewards/margins": 0.28453004360198975, "rewards/rejected": -0.04396247863769531, "step": 312 }, { "epoch": 0.016822982451425654, "grad_norm": 8.574334144592285, "learning_rate": 9.515740243096055e-07, "logits/chosen": -0.3673998713493347, "logits/rejected": -0.6509386301040649, "logps/chosen": -77.79373931884766, "logps/rejected": -97.71434020996094, "loss": 1.04, "rewards/accuracies": 1.0, "rewards/chosen": 0.2698175609111786, "rewards/margins": 0.4351244866847992, "rewards/rejected": -0.165306955575943, "step": 313 }, { "epoch": 0.016876729999193787, "grad_norm": 9.868024826049805, "learning_rate": 9.507275585561227e-07, "logits/chosen": -0.5035576820373535, "logits/rejected": -0.5416494607925415, "logps/chosen": -96.24528503417969, "logps/rejected": -142.3846893310547, "loss": 1.0724, "rewards/accuracies": 0.75, "rewards/chosen": 0.20162230730056763, "rewards/margins": 0.4640471637248993, "rewards/rejected": -0.26242485642433167, "step": 314 }, { "epoch": 0.01693047754696192, "grad_norm": 7.351305961608887, "learning_rate": 9.498741420261108e-07, "logits/chosen": -0.5489093065261841, "logits/rejected": -0.7355036735534668, "logps/chosen": -74.57876586914062, "logps/rejected": -94.16769409179688, "loss": 1.0776, "rewards/accuracies": 0.75, "rewards/chosen": 0.2299034297466278, "rewards/margins": 0.37732845544815063, "rewards/rejected": -0.14742501080036163, "step": 315 }, { "epoch": 0.016984225094730052, "grad_norm": 10.219794273376465, "learning_rate": 9.490137878803078e-07, "logits/chosen": -0.46236085891723633, "logits/rejected": -0.7211093902587891, "logps/chosen": -82.19825744628906, "logps/rejected": -93.35449981689453, "loss": 1.2408, "rewards/accuracies": 0.75, "rewards/chosen": 0.1813945174217224, "rewards/margins": 0.1912529468536377, "rewards/rejected": -0.009858418256044388, "step": 316 }, { "epoch": 0.017037972642498185, "grad_norm": 6.636542797088623, "learning_rate": 9.481465093864393e-07, "logits/chosen": -0.4133875370025635, "logits/rejected": -0.8873512744903564, "logps/chosen": -70.28067779541016, "logps/rejected": -96.2901611328125, "loss": 0.9233, "rewards/accuracies": 0.875, "rewards/chosen": 0.34994277358055115, "rewards/margins": 0.6102002263069153, "rewards/rejected": -0.26025745272636414, "step": 317 }, { "epoch": 0.017091720190266318, "grad_norm": 8.731184005737305, "learning_rate": 9.472723199190125e-07, "logits/chosen": -0.5680752396583557, "logits/rejected": -0.6813143491744995, "logps/chosen": -90.6051025390625, "logps/rejected": -113.22433471679688, "loss": 0.9737, "rewards/accuracies": 0.875, "rewards/chosen": 0.2579983174800873, "rewards/margins": 0.502092719078064, "rewards/rejected": -0.2440943717956543, "step": 318 }, { "epoch": 0.01714546773803445, "grad_norm": 8.641715049743652, "learning_rate": 9.463912329591104e-07, "logits/chosen": -0.4551582634449005, "logits/rejected": -0.5271797180175781, "logps/chosen": -110.28755950927734, "logps/rejected": -124.72492218017578, "loss": 0.8914, "rewards/accuracies": 1.0, "rewards/chosen": 0.3462936282157898, "rewards/margins": 0.6532011032104492, "rewards/rejected": -0.3069074749946594, "step": 319 }, { "epoch": 0.017199215285802587, "grad_norm": 8.12834358215332, "learning_rate": 9.455032620941839e-07, "logits/chosen": -0.23924991488456726, "logits/rejected": -0.7052961587905884, "logps/chosen": -86.89447021484375, "logps/rejected": -106.80326843261719, "loss": 1.1401, "rewards/accuracies": 0.75, "rewards/chosen": 0.25094008445739746, "rewards/margins": 0.3170701861381531, "rewards/rejected": -0.06613011658191681, "step": 320 }, { "epoch": 0.01725296283357072, "grad_norm": 7.740055084228516, "learning_rate": 9.446084210178422e-07, "logits/chosen": -0.6330962181091309, "logits/rejected": -0.7946895360946655, "logps/chosen": -69.3128662109375, "logps/rejected": -115.51507568359375, "loss": 0.974, "rewards/accuracies": 1.0, "rewards/chosen": 0.22671371698379517, "rewards/margins": 0.48475176095962524, "rewards/rejected": -0.2580380439758301, "step": 321 }, { "epoch": 0.017306710381338852, "grad_norm": 8.120292663574219, "learning_rate": 9.437067235296417e-07, "logits/chosen": -0.47494494915008545, "logits/rejected": -0.7982616424560547, "logps/chosen": -67.48805236816406, "logps/rejected": -105.09391784667969, "loss": 1.1378, "rewards/accuracies": 0.75, "rewards/chosen": 0.22622764110565186, "rewards/margins": 0.30137771368026733, "rewards/rejected": -0.07515005767345428, "step": 322 }, { "epoch": 0.017360457929106985, "grad_norm": 7.675968647003174, "learning_rate": 9.427981835348728e-07, "logits/chosen": -0.6213728189468384, "logits/rejected": -0.7559995651245117, "logps/chosen": -91.69728088378906, "logps/rejected": -136.9296875, "loss": 0.9418, "rewards/accuracies": 0.875, "rewards/chosen": 0.41454753279685974, "rewards/margins": 0.644859254360199, "rewards/rejected": -0.23031169176101685, "step": 323 }, { "epoch": 0.017414205476875118, "grad_norm": 9.217921257019043, "learning_rate": 9.418828150443468e-07, "logits/chosen": -0.4274148643016815, "logits/rejected": -0.49069780111312866, "logps/chosen": -72.67945861816406, "logps/rejected": -102.41048431396484, "loss": 1.2364, "rewards/accuracies": 0.75, "rewards/chosen": 0.18960294127464294, "rewards/margins": 0.17513293027877808, "rewards/rejected": 0.014470003545284271, "step": 324 }, { "epoch": 0.01746795302464325, "grad_norm": 8.929125785827637, "learning_rate": 9.409606321741774e-07, "logits/chosen": -0.34309011697769165, "logits/rejected": -0.7771865129470825, "logps/chosen": -71.84932708740234, "logps/rejected": -103.17626953125, "loss": 0.9622, "rewards/accuracies": 1.0, "rewards/chosen": 0.44899290800094604, "rewards/margins": 0.49878984689712524, "rewards/rejected": -0.04979696124792099, "step": 325 }, { "epoch": 0.017521700572411383, "grad_norm": 9.137594223022461, "learning_rate": 9.40031649145566e-07, "logits/chosen": -0.45482856035232544, "logits/rejected": -0.7130982875823975, "logps/chosen": -97.09280395507812, "logps/rejected": -114.96577453613281, "loss": 0.9506, "rewards/accuracies": 0.875, "rewards/chosen": 0.27811431884765625, "rewards/margins": 0.5792925953865051, "rewards/rejected": -0.3011782765388489, "step": 326 }, { "epoch": 0.017575448120179516, "grad_norm": 6.072786808013916, "learning_rate": 9.390958802845795e-07, "logits/chosen": -0.25175967812538147, "logits/rejected": -0.9929987192153931, "logps/chosen": -88.61722564697266, "logps/rejected": -178.9363250732422, "loss": 0.6134, "rewards/accuracies": 1.0, "rewards/chosen": 0.40909138321876526, "rewards/margins": 1.1727215051651, "rewards/rejected": -0.7636301517486572, "step": 327 }, { "epoch": 0.01762919566794765, "grad_norm": 7.18405294418335, "learning_rate": 9.381533400219317e-07, "logits/chosen": -0.40931206941604614, "logits/rejected": -0.8035541772842407, "logps/chosen": -62.351600646972656, "logps/rejected": -111.9510498046875, "loss": 0.8971, "rewards/accuracies": 0.875, "rewards/chosen": 0.3583703339099884, "rewards/margins": 0.6043733358383179, "rewards/rejected": -0.24600300192832947, "step": 328 }, { "epoch": 0.01768294321571578, "grad_norm": 7.456970691680908, "learning_rate": 9.372040428927594e-07, "logits/chosen": -0.6722944974899292, "logits/rejected": -0.6279394030570984, "logps/chosen": -82.15357208251953, "logps/rejected": -117.021240234375, "loss": 0.9028, "rewards/accuracies": 0.875, "rewards/chosen": 0.18554429709911346, "rewards/margins": 0.6106836199760437, "rewards/rejected": -0.42513933777809143, "step": 329 }, { "epoch": 0.017736690763483914, "grad_norm": 9.382244110107422, "learning_rate": 9.362480035363985e-07, "logits/chosen": -0.5607323050498962, "logits/rejected": -0.696474015712738, "logps/chosen": -102.08384704589844, "logps/rejected": -125.13653564453125, "loss": 1.0511, "rewards/accuracies": 0.75, "rewards/chosen": 0.31362810730934143, "rewards/margins": 0.4244765639305115, "rewards/rejected": -0.11084847152233124, "step": 330 }, { "epoch": 0.01779043831125205, "grad_norm": 8.200080871582031, "learning_rate": 9.352852366961588e-07, "logits/chosen": -0.47812920808792114, "logits/rejected": -0.8245142698287964, "logps/chosen": -99.10797119140625, "logps/rejected": -126.0719223022461, "loss": 0.9982, "rewards/accuracies": 0.875, "rewards/chosen": 0.21901445090770721, "rewards/margins": 0.5600014328956604, "rewards/rejected": -0.3409869074821472, "step": 331 }, { "epoch": 0.017844185859020183, "grad_norm": 7.284940719604492, "learning_rate": 9.343157572190956e-07, "logits/chosen": -0.33464840054512024, "logits/rejected": -0.5610167384147644, "logps/chosen": -65.7305679321289, "logps/rejected": -81.48878479003906, "loss": 1.1071, "rewards/accuracies": 0.75, "rewards/chosen": 0.2005133330821991, "rewards/margins": 0.34054267406463623, "rewards/rejected": -0.14002934098243713, "step": 332 }, { "epoch": 0.017897933406788316, "grad_norm": 7.7859697341918945, "learning_rate": 9.333395800557819e-07, "logits/chosen": -0.6617114543914795, "logits/rejected": -1.0107908248901367, "logps/chosen": -71.58953094482422, "logps/rejected": -115.14591979980469, "loss": 1.0504, "rewards/accuracies": 0.875, "rewards/chosen": 0.25825947523117065, "rewards/margins": 0.4628601372241974, "rewards/rejected": -0.20460066199302673, "step": 333 }, { "epoch": 0.01795168095455645, "grad_norm": 7.946098327636719, "learning_rate": 9.323567202600775e-07, "logits/chosen": -0.5933762788772583, "logits/rejected": -0.858302116394043, "logps/chosen": -83.83665466308594, "logps/rejected": -110.91783142089844, "loss": 0.9553, "rewards/accuracies": 0.875, "rewards/chosen": 0.2598263919353485, "rewards/margins": 0.5297263264656067, "rewards/rejected": -0.26989996433258057, "step": 334 }, { "epoch": 0.01800542850232458, "grad_norm": 7.789535045623779, "learning_rate": 9.313671929888959e-07, "logits/chosen": -0.3802677392959595, "logits/rejected": -0.6834218502044678, "logps/chosen": -68.62245178222656, "logps/rejected": -91.51272583007812, "loss": 1.0436, "rewards/accuracies": 0.875, "rewards/chosen": 0.28926119208335876, "rewards/margins": 0.398712694644928, "rewards/rejected": -0.1094515323638916, "step": 335 }, { "epoch": 0.018059176050092714, "grad_norm": 8.293915748596191, "learning_rate": 9.303710135019717e-07, "logits/chosen": -0.48674172163009644, "logits/rejected": -0.7055850028991699, "logps/chosen": -103.59611511230469, "logps/rejected": -133.96832275390625, "loss": 1.0318, "rewards/accuracies": 0.75, "rewards/chosen": 0.2677727937698364, "rewards/margins": 0.4485499858856201, "rewards/rejected": -0.1807771623134613, "step": 336 }, { "epoch": 0.018112923597860847, "grad_norm": 7.22746467590332, "learning_rate": 9.293681971616252e-07, "logits/chosen": -0.5064332485198975, "logits/rejected": -0.5895745158195496, "logps/chosen": -86.99788665771484, "logps/rejected": -101.78475952148438, "loss": 1.0433, "rewards/accuracies": 0.75, "rewards/chosen": 0.32099419832229614, "rewards/margins": 0.4389004409313202, "rewards/rejected": -0.11790624260902405, "step": 337 }, { "epoch": 0.01816667114562898, "grad_norm": 6.921882152557373, "learning_rate": 9.283587594325249e-07, "logits/chosen": -0.47325241565704346, "logits/rejected": -1.083040475845337, "logps/chosen": -64.67233276367188, "logps/rejected": -100.1177978515625, "loss": 1.0272, "rewards/accuracies": 1.0, "rewards/chosen": 0.4748002886772156, "rewards/margins": 0.4122890830039978, "rewards/rejected": 0.0625111535191536, "step": 338 }, { "epoch": 0.018220418693397113, "grad_norm": 7.4565253257751465, "learning_rate": 9.273427158814489e-07, "logits/chosen": -0.4230821132659912, "logits/rejected": -0.6810831427574158, "logps/chosen": -91.70931243896484, "logps/rejected": -123.36559295654297, "loss": 0.8063, "rewards/accuracies": 1.0, "rewards/chosen": 0.31289011240005493, "rewards/margins": 0.7667005062103271, "rewards/rejected": -0.45381033420562744, "step": 339 }, { "epoch": 0.018274166241165245, "grad_norm": 6.2641754150390625, "learning_rate": 9.26320082177046e-07, "logits/chosen": -0.5639437437057495, "logits/rejected": -0.8480712175369263, "logps/chosen": -69.2354736328125, "logps/rejected": -100.78401184082031, "loss": 0.7737, "rewards/accuracies": 1.0, "rewards/chosen": 0.3755197525024414, "rewards/margins": 0.78319251537323, "rewards/rejected": -0.4076727628707886, "step": 340 }, { "epoch": 0.018327913788933378, "grad_norm": 7.690711975097656, "learning_rate": 9.252908740895931e-07, "logits/chosen": -0.549254834651947, "logits/rejected": -0.7352517247200012, "logps/chosen": -80.27323913574219, "logps/rejected": -93.53897857666016, "loss": 0.9846, "rewards/accuracies": 0.875, "rewards/chosen": 0.30005186796188354, "rewards/margins": 0.5079077482223511, "rewards/rejected": -0.20785585045814514, "step": 341 }, { "epoch": 0.018381661336701514, "grad_norm": 7.434743404388428, "learning_rate": 9.242551074907518e-07, "logits/chosen": -0.6576711535453796, "logits/rejected": -0.7092283368110657, "logps/chosen": -88.44308471679688, "logps/rejected": -120.91687774658203, "loss": 0.8656, "rewards/accuracies": 1.0, "rewards/chosen": 0.20998618006706238, "rewards/margins": 0.6326916217803955, "rewards/rejected": -0.42270541191101074, "step": 342 }, { "epoch": 0.018435408884469647, "grad_norm": 9.212821960449219, "learning_rate": 9.232127983533245e-07, "logits/chosen": -0.49279314279556274, "logits/rejected": -0.5707727670669556, "logps/chosen": -97.72673034667969, "logps/rejected": -106.02079010009766, "loss": 1.0803, "rewards/accuracies": 0.875, "rewards/chosen": 0.21697282791137695, "rewards/margins": 0.3692976236343384, "rewards/rejected": -0.15232476592063904, "step": 343 }, { "epoch": 0.01848915643223778, "grad_norm": 9.73297119140625, "learning_rate": 9.221639627510075e-07, "logits/chosen": -0.4621116518974304, "logits/rejected": -0.900585949420929, "logps/chosen": -129.44708251953125, "logps/rejected": -136.5024871826172, "loss": 1.0113, "rewards/accuracies": 1.0, "rewards/chosen": -0.012989431619644165, "rewards/margins": 0.4303017258644104, "rewards/rejected": -0.44329118728637695, "step": 344 }, { "epoch": 0.018542903980005913, "grad_norm": 6.764889240264893, "learning_rate": 9.211086168581432e-07, "logits/chosen": -0.34475788474082947, "logits/rejected": -0.625433087348938, "logps/chosen": -62.81744384765625, "logps/rejected": -97.51995086669922, "loss": 0.8245, "rewards/accuracies": 1.0, "rewards/chosen": 0.3769282400608063, "rewards/margins": 0.6920626759529114, "rewards/rejected": -0.3151344060897827, "step": 345 }, { "epoch": 0.018596651527774045, "grad_norm": 7.9778571128845215, "learning_rate": 9.200467769494708e-07, "logits/chosen": -0.37343400716781616, "logits/rejected": -0.6763725876808167, "logps/chosen": -92.33715057373047, "logps/rejected": -163.88687133789062, "loss": 0.7635, "rewards/accuracies": 1.0, "rewards/chosen": 0.33541736006736755, "rewards/margins": 0.8390800952911377, "rewards/rejected": -0.5036627650260925, "step": 346 }, { "epoch": 0.018650399075542178, "grad_norm": 6.802786350250244, "learning_rate": 9.189784593998755e-07, "logits/chosen": -0.409076988697052, "logits/rejected": -0.5759463906288147, "logps/chosen": -83.07672119140625, "logps/rejected": -118.94459533691406, "loss": 0.744, "rewards/accuracies": 1.0, "rewards/chosen": 0.09941766411066055, "rewards/margins": 0.839026689529419, "rewards/rejected": -0.7396090030670166, "step": 347 }, { "epoch": 0.01870414662331031, "grad_norm": 8.793573379516602, "learning_rate": 9.179036806841351e-07, "logits/chosen": -0.4031161069869995, "logits/rejected": -0.6356488466262817, "logps/chosen": -89.63672637939453, "logps/rejected": -112.58209228515625, "loss": 0.9548, "rewards/accuracies": 1.0, "rewards/chosen": 0.19551272690296173, "rewards/margins": 0.5091462731361389, "rewards/rejected": -0.313633531332016, "step": 348 }, { "epoch": 0.018757894171078444, "grad_norm": 8.722846031188965, "learning_rate": 9.168224573766672e-07, "logits/chosen": -0.6030442118644714, "logits/rejected": -0.8677682876586914, "logps/chosen": -74.79644775390625, "logps/rejected": -126.76045989990234, "loss": 0.979, "rewards/accuracies": 1.0, "rewards/chosen": 0.40735456347465515, "rewards/margins": 0.5401187539100647, "rewards/rejected": -0.13276419043540955, "step": 349 }, { "epoch": 0.018811641718846576, "grad_norm": 10.029742240905762, "learning_rate": 9.157348061512726e-07, "logits/chosen": -0.2794766426086426, "logits/rejected": -0.504019021987915, "logps/chosen": -92.84403991699219, "logps/rejected": -113.22787475585938, "loss": 1.1816, "rewards/accuracies": 0.625, "rewards/chosen": 0.2582140862941742, "rewards/margins": 0.2663654386997223, "rewards/rejected": -0.008151352405548096, "step": 350 }, { "epoch": 0.01886538926661471, "grad_norm": 5.169428825378418, "learning_rate": 9.146407437808787e-07, "logits/chosen": -0.308366596698761, "logits/rejected": -0.7040262222290039, "logps/chosen": -68.47492980957031, "logps/rejected": -118.28482055664062, "loss": 0.7274, "rewards/accuracies": 1.0, "rewards/chosen": 0.48221278190612793, "rewards/margins": 0.9575176239013672, "rewards/rejected": -0.4753047823905945, "step": 351 }, { "epoch": 0.018919136814382845, "grad_norm": 8.315790176391602, "learning_rate": 9.135402871372808e-07, "logits/chosen": -0.5178185105323792, "logits/rejected": -0.7006076574325562, "logps/chosen": -78.71610260009766, "logps/rejected": -116.53684997558594, "loss": 1.0107, "rewards/accuracies": 0.75, "rewards/chosen": 0.11287462711334229, "rewards/margins": 0.4822719693183899, "rewards/rejected": -0.36939728260040283, "step": 352 }, { "epoch": 0.018972884362150978, "grad_norm": 9.885313987731934, "learning_rate": 9.124334531908817e-07, "logits/chosen": -0.5213099122047424, "logits/rejected": -0.8198671340942383, "logps/chosen": -73.0559310913086, "logps/rejected": -93.35203552246094, "loss": 1.0078, "rewards/accuracies": 1.0, "rewards/chosen": 0.52909255027771, "rewards/margins": 0.43378207087516785, "rewards/rejected": 0.09531045705080032, "step": 353 }, { "epoch": 0.01902663190991911, "grad_norm": 6.961985111236572, "learning_rate": 9.113202590104299e-07, "logits/chosen": -0.48150402307510376, "logits/rejected": -0.927175760269165, "logps/chosen": -62.59891128540039, "logps/rejected": -94.87863159179688, "loss": 0.9065, "rewards/accuracies": 0.875, "rewards/chosen": 0.5268270969390869, "rewards/margins": 0.6159303188323975, "rewards/rejected": -0.0891033187508583, "step": 354 }, { "epoch": 0.019080379457687244, "grad_norm": 7.9682936668396, "learning_rate": 9.102007217627567e-07, "logits/chosen": -0.3602108359336853, "logits/rejected": -0.6214918494224548, "logps/chosen": -92.37991333007812, "logps/rejected": -121.85865020751953, "loss": 0.8391, "rewards/accuracies": 1.0, "rewards/chosen": 0.4225645065307617, "rewards/margins": 0.6889932155609131, "rewards/rejected": -0.2664286494255066, "step": 355 }, { "epoch": 0.019134127005455377, "grad_norm": 10.373743057250977, "learning_rate": 9.090748587125117e-07, "logits/chosen": -0.5158312320709229, "logits/rejected": -0.903644859790802, "logps/chosen": -117.84254455566406, "logps/rejected": -119.69650268554688, "loss": 1.0159, "rewards/accuracies": 0.875, "rewards/chosen": 0.314586341381073, "rewards/margins": 0.4619480073451996, "rewards/rejected": -0.1473616659641266, "step": 356 }, { "epoch": 0.01918787455322351, "grad_norm": 9.36589527130127, "learning_rate": 9.079426872218956e-07, "logits/chosen": -0.28104740381240845, "logits/rejected": -0.9476622343063354, "logps/chosen": -73.08204650878906, "logps/rejected": -101.39688110351562, "loss": 0.9865, "rewards/accuracies": 0.875, "rewards/chosen": 0.3287655711174011, "rewards/margins": 0.48933619260787964, "rewards/rejected": -0.16057062149047852, "step": 357 }, { "epoch": 0.019241622100991642, "grad_norm": 7.228388786315918, "learning_rate": 9.068042247503935e-07, "logits/chosen": -0.29313015937805176, "logits/rejected": -0.8026216626167297, "logps/chosen": -77.93582916259766, "logps/rejected": -125.86537170410156, "loss": 0.9152, "rewards/accuracies": 1.0, "rewards/chosen": 0.11715321242809296, "rewards/margins": 0.5755292773246765, "rewards/rejected": -0.45837607979774475, "step": 358 }, { "epoch": 0.019295369648759775, "grad_norm": 7.534264087677002, "learning_rate": 9.056594888545049e-07, "logits/chosen": -0.5003196001052856, "logits/rejected": -0.5688073635101318, "logps/chosen": -87.33366394042969, "logps/rejected": -158.01368713378906, "loss": 0.7733, "rewards/accuracies": 0.875, "rewards/chosen": 0.3040294349193573, "rewards/margins": 0.8915838599205017, "rewards/rejected": -0.5875544548034668, "step": 359 }, { "epoch": 0.019349117196527908, "grad_norm": 7.72684383392334, "learning_rate": 9.045084971874737e-07, "logits/chosen": -0.42792773246765137, "logits/rejected": -0.6755184531211853, "logps/chosen": -89.75845336914062, "logps/rejected": -152.08319091796875, "loss": 0.7996, "rewards/accuracies": 1.0, "rewards/chosen": 0.39130136370658875, "rewards/margins": 0.8142063617706299, "rewards/rejected": -0.42290496826171875, "step": 360 }, { "epoch": 0.01940286474429604, "grad_norm": 6.714195728302002, "learning_rate": 9.033512674990149e-07, "logits/chosen": -0.4252118170261383, "logits/rejected": -0.5028501749038696, "logps/chosen": -76.52574920654297, "logps/rejected": -87.88744354248047, "loss": 0.8134, "rewards/accuracies": 1.0, "rewards/chosen": 0.5984838008880615, "rewards/margins": 0.7088361978530884, "rewards/rejected": -0.11035237461328506, "step": 361 }, { "epoch": 0.019456612292064173, "grad_norm": 9.345548629760742, "learning_rate": 9.021878176350422e-07, "logits/chosen": -0.5577481985092163, "logits/rejected": -0.8298448324203491, "logps/chosen": -85.6584243774414, "logps/rejected": -122.9629898071289, "loss": 0.965, "rewards/accuracies": 1.0, "rewards/chosen": 0.31385958194732666, "rewards/margins": 0.49451518058776855, "rewards/rejected": -0.18065562844276428, "step": 362 }, { "epoch": 0.01951035983983231, "grad_norm": 8.462217330932617, "learning_rate": 9.010181655373917e-07, "logits/chosen": -0.6656879782676697, "logits/rejected": -0.7718024253845215, "logps/chosen": -84.70065307617188, "logps/rejected": -118.10316467285156, "loss": 0.8528, "rewards/accuracies": 1.0, "rewards/chosen": 0.11916843056678772, "rewards/margins": 0.6957011222839355, "rewards/rejected": -0.5765327215194702, "step": 363 }, { "epoch": 0.019564107387600442, "grad_norm": 7.462190628051758, "learning_rate": 8.998423292435453e-07, "logits/chosen": -0.49574899673461914, "logits/rejected": -0.7866228818893433, "logps/chosen": -74.5083999633789, "logps/rejected": -125.56399536132812, "loss": 0.9659, "rewards/accuracies": 0.75, "rewards/chosen": 0.40475600957870483, "rewards/margins": 0.5902150273323059, "rewards/rejected": -0.18545904755592346, "step": 364 }, { "epoch": 0.019617854935368575, "grad_norm": 6.652899742126465, "learning_rate": 8.986603268863535e-07, "logits/chosen": -0.5968937277793884, "logits/rejected": -0.7734584808349609, "logps/chosen": -74.63990020751953, "logps/rejected": -121.51779174804688, "loss": 0.7613, "rewards/accuracies": 1.0, "rewards/chosen": 0.5559960603713989, "rewards/margins": 0.899902880191803, "rewards/rejected": -0.3439067602157593, "step": 365 }, { "epoch": 0.019671602483136708, "grad_norm": 7.760425090789795, "learning_rate": 8.97472176693755e-07, "logits/chosen": -0.4557651877403259, "logits/rejected": -0.6948214769363403, "logps/chosen": -69.77723693847656, "logps/rejected": -101.38095092773438, "loss": 0.9736, "rewards/accuracies": 1.0, "rewards/chosen": 0.17243242263793945, "rewards/margins": 0.5057952404022217, "rewards/rejected": -0.333362877368927, "step": 366 }, { "epoch": 0.01972535003090484, "grad_norm": 7.182799339294434, "learning_rate": 8.962778969884955e-07, "logits/chosen": -0.4457637369632721, "logits/rejected": -0.514811098575592, "logps/chosen": -102.07685852050781, "logps/rejected": -147.8944091796875, "loss": 0.6978, "rewards/accuracies": 1.0, "rewards/chosen": 0.3295539915561676, "rewards/margins": 1.0587542057037354, "rewards/rejected": -0.7292001247406006, "step": 367 }, { "epoch": 0.019779097578672973, "grad_norm": 6.737385272979736, "learning_rate": 8.950775061878452e-07, "logits/chosen": -0.3600085973739624, "logits/rejected": -0.6264634132385254, "logps/chosen": -68.79764556884766, "logps/rejected": -114.21524047851562, "loss": 0.8988, "rewards/accuracies": 0.875, "rewards/chosen": 0.20890530943870544, "rewards/margins": 0.6407402753829956, "rewards/rejected": -0.4318349361419678, "step": 368 }, { "epoch": 0.019832845126441106, "grad_norm": 7.686861038208008, "learning_rate": 8.938710228033154e-07, "logits/chosen": -0.4901299476623535, "logits/rejected": -0.6336987614631653, "logps/chosen": -68.33500671386719, "logps/rejected": -78.46503448486328, "loss": 1.0481, "rewards/accuracies": 0.875, "rewards/chosen": 0.29523828625679016, "rewards/margins": 0.3884444534778595, "rewards/rejected": -0.09320616722106934, "step": 369 }, { "epoch": 0.01988659267420924, "grad_norm": 10.485249519348145, "learning_rate": 8.926584654403724e-07, "logits/chosen": -0.6570696830749512, "logits/rejected": -0.5259045958518982, "logps/chosen": -75.26362609863281, "logps/rejected": -110.57732391357422, "loss": 1.1919, "rewards/accuracies": 0.625, "rewards/chosen": 0.3428437411785126, "rewards/margins": 0.29282957315444946, "rewards/rejected": 0.05001416802406311, "step": 370 }, { "epoch": 0.01994034022197737, "grad_norm": 8.570562362670898, "learning_rate": 8.914398527981508e-07, "logits/chosen": -0.4347901940345764, "logits/rejected": -0.5957015156745911, "logps/chosen": -98.20342254638672, "logps/rejected": -120.43230438232422, "loss": 0.9165, "rewards/accuracies": 0.875, "rewards/chosen": 0.27293142676353455, "rewards/margins": 0.7086238265037537, "rewards/rejected": -0.4356924295425415, "step": 371 }, { "epoch": 0.019994087769745504, "grad_norm": 7.017946243286133, "learning_rate": 8.902152036691648e-07, "logits/chosen": -0.4152313768863678, "logits/rejected": -0.8007388114929199, "logps/chosen": -64.62852478027344, "logps/rejected": -90.38722229003906, "loss": 0.9696, "rewards/accuracies": 1.0, "rewards/chosen": 0.28432926535606384, "rewards/margins": 0.5223590731620789, "rewards/rejected": -0.23802980780601501, "step": 372 }, { "epoch": 0.020047835317513637, "grad_norm": 7.493007659912109, "learning_rate": 8.889845369390192e-07, "logits/chosen": -0.43159249424934387, "logits/rejected": -0.9276126623153687, "logps/chosen": -88.8349838256836, "logps/rejected": -119.80219268798828, "loss": 0.7735, "rewards/accuracies": 1.0, "rewards/chosen": 0.20574785768985748, "rewards/margins": 0.7665290832519531, "rewards/rejected": -0.5607811808586121, "step": 373 }, { "epoch": 0.020101582865281773, "grad_norm": 8.743884086608887, "learning_rate": 8.877478715861172e-07, "logits/chosen": -0.4221246838569641, "logits/rejected": -0.6599010825157166, "logps/chosen": -65.5552978515625, "logps/rejected": -103.28410339355469, "loss": 1.0346, "rewards/accuracies": 0.75, "rewards/chosen": 0.23170161247253418, "rewards/margins": 0.48611900210380554, "rewards/rejected": -0.25441738963127136, "step": 374 }, { "epoch": 0.020155330413049906, "grad_norm": 7.923236846923828, "learning_rate": 8.865052266813685e-07, "logits/chosen": -0.646773099899292, "logits/rejected": -0.7029209136962891, "logps/chosen": -80.42398834228516, "logps/rejected": -99.16622924804688, "loss": 0.9431, "rewards/accuracies": 1.0, "rewards/chosen": 0.28594493865966797, "rewards/margins": 0.5480276942253113, "rewards/rejected": -0.2620827257633209, "step": 375 }, { "epoch": 0.02020907796081804, "grad_norm": 11.012399673461914, "learning_rate": 8.852566213878946e-07, "logits/chosen": -0.5520120859146118, "logits/rejected": -0.723477840423584, "logps/chosen": -85.48550415039062, "logps/rejected": -116.47734069824219, "loss": 1.3854, "rewards/accuracies": 0.5, "rewards/chosen": -0.005046457052230835, "rewards/margins": 0.04289980232715607, "rewards/rejected": -0.0479462593793869, "step": 376 }, { "epoch": 0.02026282550858617, "grad_norm": 8.88310718536377, "learning_rate": 8.840020749607339e-07, "logits/chosen": -0.5578601360321045, "logits/rejected": -0.5346056222915649, "logps/chosen": -91.29450988769531, "logps/rejected": -108.26667022705078, "loss": 1.0248, "rewards/accuracies": 1.0, "rewards/chosen": 0.14164933562278748, "rewards/margins": 0.4241856336593628, "rewards/rejected": -0.28253626823425293, "step": 377 }, { "epoch": 0.020316573056354304, "grad_norm": 9.422719955444336, "learning_rate": 8.827416067465441e-07, "logits/chosen": -0.3142057955265045, "logits/rejected": -0.7104319334030151, "logps/chosen": -103.71488189697266, "logps/rejected": -127.78880310058594, "loss": 1.0385, "rewards/accuracies": 0.875, "rewards/chosen": 0.00987568311393261, "rewards/margins": 0.4320681691169739, "rewards/rejected": -0.4221924841403961, "step": 378 }, { "epoch": 0.020370320604122437, "grad_norm": 7.4446702003479, "learning_rate": 8.814752361833043e-07, "logits/chosen": -0.49699872732162476, "logits/rejected": -0.8372992277145386, "logps/chosen": -67.62881469726562, "logps/rejected": -77.88633728027344, "loss": 1.1111, "rewards/accuracies": 0.75, "rewards/chosen": 0.37484660744667053, "rewards/margins": 0.3662661612033844, "rewards/rejected": 0.008580446243286133, "step": 379 }, { "epoch": 0.02042406815189057, "grad_norm": 9.441752433776855, "learning_rate": 8.802029828000155e-07, "logits/chosen": -0.577965497970581, "logits/rejected": -0.8191964626312256, "logps/chosen": -108.28520202636719, "logps/rejected": -140.89817810058594, "loss": 0.8119, "rewards/accuracies": 0.875, "rewards/chosen": 0.13382163643836975, "rewards/margins": 0.8248190879821777, "rewards/rejected": -0.6909973621368408, "step": 380 }, { "epoch": 0.020477815699658702, "grad_norm": 9.104571342468262, "learning_rate": 8.789248662163983e-07, "logits/chosen": -0.37231436371803284, "logits/rejected": -0.7481340765953064, "logps/chosen": -108.39694213867188, "logps/rejected": -119.09007263183594, "loss": 0.9942, "rewards/accuracies": 0.875, "rewards/chosen": 0.08131428062915802, "rewards/margins": 0.535480260848999, "rewards/rejected": -0.4541659355163574, "step": 381 }, { "epoch": 0.020531563247426835, "grad_norm": 9.293783187866211, "learning_rate": 8.776409061425918e-07, "logits/chosen": -0.5111558437347412, "logits/rejected": -0.6643510460853577, "logps/chosen": -84.5043716430664, "logps/rejected": -135.84146118164062, "loss": 0.8417, "rewards/accuracies": 0.875, "rewards/chosen": 0.30806851387023926, "rewards/margins": 0.8033510446548462, "rewards/rejected": -0.49528253078460693, "step": 382 }, { "epoch": 0.020585310795194968, "grad_norm": 7.287946701049805, "learning_rate": 8.763511223788484e-07, "logits/chosen": -0.6668428182601929, "logits/rejected": -0.8134665489196777, "logps/chosen": -63.47019958496094, "logps/rejected": -111.78068542480469, "loss": 0.9481, "rewards/accuracies": 1.0, "rewards/chosen": 0.3802218437194824, "rewards/margins": 0.5273222923278809, "rewards/rejected": -0.14710040390491486, "step": 383 }, { "epoch": 0.0206390583429631, "grad_norm": 7.1462016105651855, "learning_rate": 8.750555348152298e-07, "logits/chosen": -0.4628410339355469, "logits/rejected": -0.5174292325973511, "logps/chosen": -69.53150939941406, "logps/rejected": -97.24769592285156, "loss": 0.9459, "rewards/accuracies": 1.0, "rewards/chosen": 0.443825900554657, "rewards/margins": 0.5207382440567017, "rewards/rejected": -0.07691235840320587, "step": 384 }, { "epoch": 0.020692805890731237, "grad_norm": 7.443724155426025, "learning_rate": 8.737541634312983e-07, "logits/chosen": -0.6505249738693237, "logits/rejected": -0.9157782793045044, "logps/chosen": -57.33557891845703, "logps/rejected": -89.24098205566406, "loss": 1.0038, "rewards/accuracies": 0.75, "rewards/chosen": 0.2683905363082886, "rewards/margins": 0.5652196407318115, "rewards/rejected": -0.29682910442352295, "step": 385 }, { "epoch": 0.02074655343849937, "grad_norm": 7.210507869720459, "learning_rate": 8.724470282958109e-07, "logits/chosen": -0.4549306035041809, "logits/rejected": -0.7724043726921082, "logps/chosen": -79.18142700195312, "logps/rejected": -139.1554718017578, "loss": 0.7154, "rewards/accuracies": 1.0, "rewards/chosen": 0.3741530179977417, "rewards/margins": 0.9290341138839722, "rewards/rejected": -0.5548810362815857, "step": 386 }, { "epoch": 0.020800300986267502, "grad_norm": 9.94113540649414, "learning_rate": 8.711341495664084e-07, "logits/chosen": -0.40272626280784607, "logits/rejected": -0.6836596727371216, "logps/chosen": -100.57734680175781, "logps/rejected": -114.07740783691406, "loss": 0.8408, "rewards/accuracies": 0.75, "rewards/chosen": 0.19298434257507324, "rewards/margins": 0.8004929423332214, "rewards/rejected": -0.6075085401535034, "step": 387 }, { "epoch": 0.020854048534035635, "grad_norm": 7.178126811981201, "learning_rate": 8.698155474893049e-07, "logits/chosen": -0.49386048316955566, "logits/rejected": -0.6808856725692749, "logps/chosen": -77.24993896484375, "logps/rejected": -123.29280090332031, "loss": 0.7995, "rewards/accuracies": 1.0, "rewards/chosen": 0.3755072057247162, "rewards/margins": 0.7467286586761475, "rewards/rejected": -0.3712214529514313, "step": 388 }, { "epoch": 0.020907796081803768, "grad_norm": 8.829160690307617, "learning_rate": 8.684912423989754e-07, "logits/chosen": -0.314528226852417, "logits/rejected": -0.6963111758232117, "logps/chosen": -96.59176635742188, "logps/rejected": -124.65097045898438, "loss": 0.8959, "rewards/accuracies": 0.875, "rewards/chosen": 0.041516613215208054, "rewards/margins": 0.6903085112571716, "rewards/rejected": -0.6487919092178345, "step": 389 }, { "epoch": 0.0209615436295719, "grad_norm": 6.522557735443115, "learning_rate": 8.671612547178427e-07, "logits/chosen": -0.37712377309799194, "logits/rejected": -0.4971196949481964, "logps/chosen": -71.90280151367188, "logps/rejected": -111.73298645019531, "loss": 0.7702, "rewards/accuracies": 0.875, "rewards/chosen": 0.3025549054145813, "rewards/margins": 0.8281311392784119, "rewards/rejected": -0.5255762338638306, "step": 390 }, { "epoch": 0.021015291177340033, "grad_norm": 7.137973785400391, "learning_rate": 8.658256049559624e-07, "logits/chosen": -0.4118209779262543, "logits/rejected": -0.7066264152526855, "logps/chosen": -89.8065185546875, "logps/rejected": -141.0252685546875, "loss": 0.7282, "rewards/accuracies": 1.0, "rewards/chosen": 0.19320768117904663, "rewards/margins": 0.8609300255775452, "rewards/rejected": -0.6677223443984985, "step": 391 }, { "epoch": 0.021069038725108166, "grad_norm": 9.751758575439453, "learning_rate": 8.644843137107057e-07, "logits/chosen": -0.3937058746814728, "logits/rejected": -0.5206411480903625, "logps/chosen": -65.04463958740234, "logps/rejected": -97.33838653564453, "loss": 1.1092, "rewards/accuracies": 0.75, "rewards/chosen": 0.09613165259361267, "rewards/margins": 0.403576523065567, "rewards/rejected": -0.30744487047195435, "step": 392 }, { "epoch": 0.0211227862728763, "grad_norm": 9.351462364196777, "learning_rate": 8.631374016664432e-07, "logits/chosen": -0.27877122163772583, "logits/rejected": -0.519374668598175, "logps/chosen": -87.41629028320312, "logps/rejected": -109.61636352539062, "loss": 1.1507, "rewards/accuracies": 0.875, "rewards/chosen": 0.1201963871717453, "rewards/margins": 0.3078806400299072, "rewards/rejected": -0.18768423795700073, "step": 393 }, { "epoch": 0.021176533820644432, "grad_norm": 6.791660308837891, "learning_rate": 8.617848895942246e-07, "logits/chosen": -0.534822940826416, "logits/rejected": -0.7197980880737305, "logps/chosen": -89.85694885253906, "logps/rejected": -132.60464477539062, "loss": 0.5798, "rewards/accuracies": 1.0, "rewards/chosen": 0.5418879985809326, "rewards/margins": 1.2317395210266113, "rewards/rejected": -0.6898514032363892, "step": 394 }, { "epoch": 0.021230281368412564, "grad_norm": 6.97799015045166, "learning_rate": 8.604267983514593e-07, "logits/chosen": -0.408166766166687, "logits/rejected": -0.691979169845581, "logps/chosen": -99.1039047241211, "logps/rejected": -127.22286987304688, "loss": 0.7334, "rewards/accuracies": 0.875, "rewards/chosen": 0.29382583498954773, "rewards/margins": 0.9812952876091003, "rewards/rejected": -0.687469482421875, "step": 395 }, { "epoch": 0.0212840289161807, "grad_norm": 7.510430812835693, "learning_rate": 8.590631488815943e-07, "logits/chosen": -0.6546182036399841, "logits/rejected": -0.7147387266159058, "logps/chosen": -85.51350402832031, "logps/rejected": -116.71488952636719, "loss": 0.7792, "rewards/accuracies": 1.0, "rewards/chosen": 0.370207279920578, "rewards/margins": 0.8345896005630493, "rewards/rejected": -0.4643822908401489, "step": 396 }, { "epoch": 0.021337776463948833, "grad_norm": 8.095248222351074, "learning_rate": 8.576939622137914e-07, "logits/chosen": -0.5054526925086975, "logits/rejected": -0.7931860685348511, "logps/chosen": -97.27056121826172, "logps/rejected": -116.05073547363281, "loss": 0.7269, "rewards/accuracies": 1.0, "rewards/chosen": 0.44370949268341064, "rewards/margins": 0.8652433156967163, "rewards/rejected": -0.42153388261795044, "step": 397 }, { "epoch": 0.021391524011716966, "grad_norm": 7.023025035858154, "learning_rate": 8.563192594626026e-07, "logits/chosen": -0.5063807964324951, "logits/rejected": -0.7560736536979675, "logps/chosen": -60.01344299316406, "logps/rejected": -96.10995483398438, "loss": 0.8052, "rewards/accuracies": 0.875, "rewards/chosen": 0.28881698846817017, "rewards/margins": 0.8561569452285767, "rewards/rejected": -0.5673400163650513, "step": 398 }, { "epoch": 0.0214452715594851, "grad_norm": 7.273676872253418, "learning_rate": 8.549390618276451e-07, "logits/chosen": -0.4955627918243408, "logits/rejected": -0.7727431058883667, "logps/chosen": -77.20539093017578, "logps/rejected": -111.53607940673828, "loss": 0.7628, "rewards/accuracies": 1.0, "rewards/chosen": 0.4461873471736908, "rewards/margins": 0.8450964689254761, "rewards/rejected": -0.3989090919494629, "step": 399 }, { "epoch": 0.021499019107253232, "grad_norm": 8.025479316711426, "learning_rate": 8.535533905932737e-07, "logits/chosen": -0.6039217114448547, "logits/rejected": -0.6724085211753845, "logps/chosen": -66.64635467529297, "logps/rejected": -88.4715805053711, "loss": 1.1488, "rewards/accuracies": 0.75, "rewards/chosen": 0.1793912798166275, "rewards/margins": 0.29325026273727417, "rewards/rejected": -0.11385899037122726, "step": 400 }, { "epoch": 0.021552766655021365, "grad_norm": 8.519718170166016, "learning_rate": 8.521622671282532e-07, "logits/chosen": -0.5670723915100098, "logits/rejected": -0.730586588382721, "logps/chosen": -100.56636047363281, "logps/rejected": -140.93875122070312, "loss": 0.8794, "rewards/accuracies": 1.0, "rewards/chosen": 0.2734277844429016, "rewards/margins": 0.626110315322876, "rewards/rejected": -0.352682501077652, "step": 401 }, { "epoch": 0.021606514202789497, "grad_norm": 6.963486671447754, "learning_rate": 8.507657128854279e-07, "logits/chosen": -0.5109165906906128, "logits/rejected": -0.7599716186523438, "logps/chosen": -102.60496520996094, "logps/rejected": -109.02903747558594, "loss": 0.7067, "rewards/accuracies": 1.0, "rewards/chosen": 0.35969337821006775, "rewards/margins": 0.8990400433540344, "rewards/rejected": -0.5393466353416443, "step": 402 }, { "epoch": 0.02166026175055763, "grad_norm": 10.259397506713867, "learning_rate": 8.493637494013922e-07, "logits/chosen": -0.50200355052948, "logits/rejected": -0.6871232986450195, "logps/chosen": -79.18869018554688, "logps/rejected": -115.22865295410156, "loss": 1.1256, "rewards/accuracies": 0.75, "rewards/chosen": 0.2410467565059662, "rewards/margins": 0.34936076402664185, "rewards/rejected": -0.10831403732299805, "step": 403 }, { "epoch": 0.021714009298325763, "grad_norm": 8.734382629394531, "learning_rate": 8.47956398296157e-07, "logits/chosen": -0.38818132877349854, "logits/rejected": -0.5967045426368713, "logps/chosen": -93.48192596435547, "logps/rejected": -125.08663940429688, "loss": 0.9043, "rewards/accuracies": 0.875, "rewards/chosen": 0.1366182416677475, "rewards/margins": 0.6212257146835327, "rewards/rejected": -0.48460739850997925, "step": 404 }, { "epoch": 0.021767756846093896, "grad_norm": 7.914803981781006, "learning_rate": 8.465436812728179e-07, "logits/chosen": -0.6245490312576294, "logits/rejected": -0.8476388454437256, "logps/chosen": -63.0573844909668, "logps/rejected": -106.95098876953125, "loss": 0.9413, "rewards/accuracies": 0.75, "rewards/chosen": 0.38270479440689087, "rewards/margins": 0.6590538024902344, "rewards/rejected": -0.2763490080833435, "step": 405 }, { "epoch": 0.021821504393862032, "grad_norm": 8.169920921325684, "learning_rate": 8.451256201172186e-07, "logits/chosen": -0.717471718788147, "logits/rejected": -0.8148710131645203, "logps/chosen": -85.54566192626953, "logps/rejected": -118.85932922363281, "loss": 0.7887, "rewards/accuracies": 1.0, "rewards/chosen": 0.32287904620170593, "rewards/margins": 0.7958505749702454, "rewards/rejected": -0.47297149896621704, "step": 406 }, { "epoch": 0.021875251941630165, "grad_norm": 7.288476467132568, "learning_rate": 8.437022366976163e-07, "logits/chosen": -0.43112078309059143, "logits/rejected": -0.6634670495986938, "logps/chosen": -81.10909271240234, "logps/rejected": -140.49014282226562, "loss": 0.7708, "rewards/accuracies": 1.0, "rewards/chosen": 0.3955495357513428, "rewards/margins": 0.8508970141410828, "rewards/rejected": -0.45534753799438477, "step": 407 }, { "epoch": 0.021928999489398297, "grad_norm": 8.557352066040039, "learning_rate": 8.422735529643443e-07, "logits/chosen": -0.44726234674453735, "logits/rejected": -0.624946117401123, "logps/chosen": -82.70333099365234, "logps/rejected": -109.97267150878906, "loss": 1.0791, "rewards/accuracies": 0.625, "rewards/chosen": 0.15813380479812622, "rewards/margins": 0.42620062828063965, "rewards/rejected": -0.2680668234825134, "step": 408 }, { "epoch": 0.02198274703716643, "grad_norm": 7.739879608154297, "learning_rate": 8.408395909494732e-07, "logits/chosen": -0.45716869831085205, "logits/rejected": -0.7675366997718811, "logps/chosen": -71.71652221679688, "logps/rejected": -79.29115295410156, "loss": 0.9641, "rewards/accuracies": 1.0, "rewards/chosen": 0.33620724081993103, "rewards/margins": 0.5150159001350403, "rewards/rejected": -0.17880865931510925, "step": 409 }, { "epoch": 0.022036494584934563, "grad_norm": 9.347245216369629, "learning_rate": 8.394003727664709e-07, "logits/chosen": -0.5022048950195312, "logits/rejected": -0.7082732319831848, "logps/chosen": -103.00680541992188, "logps/rejected": -132.04229736328125, "loss": 0.8635, "rewards/accuracies": 0.875, "rewards/chosen": 0.2674364149570465, "rewards/margins": 0.654800534248352, "rewards/rejected": -0.38736411929130554, "step": 410 }, { "epoch": 0.022090242132702696, "grad_norm": 6.921037197113037, "learning_rate": 8.379559206098623e-07, "logits/chosen": -0.4656442403793335, "logits/rejected": -0.744530200958252, "logps/chosen": -82.50672912597656, "logps/rejected": -115.61360168457031, "loss": 0.7298, "rewards/accuracies": 1.0, "rewards/chosen": 0.4850219488143921, "rewards/margins": 0.8518251180648804, "rewards/rejected": -0.36680319905281067, "step": 411 }, { "epoch": 0.02214398968047083, "grad_norm": 7.308769702911377, "learning_rate": 8.365062567548867e-07, "logits/chosen": -0.4788022041320801, "logits/rejected": -0.4485514163970947, "logps/chosen": -110.72388458251953, "logps/rejected": -175.50726318359375, "loss": 0.5679, "rewards/accuracies": 1.0, "rewards/chosen": 0.366916298866272, "rewards/margins": 1.1998602151870728, "rewards/rejected": -0.8329439163208008, "step": 412 }, { "epoch": 0.02219773722823896, "grad_norm": 5.328125476837158, "learning_rate": 8.350514035571539e-07, "logits/chosen": -0.5326926708221436, "logits/rejected": -0.7309978604316711, "logps/chosen": -74.29196166992188, "logps/rejected": -119.92567443847656, "loss": 0.7341, "rewards/accuracies": 0.875, "rewards/chosen": 0.3555259108543396, "rewards/margins": 0.962948203086853, "rewards/rejected": -0.6074223518371582, "step": 413 }, { "epoch": 0.022251484776007094, "grad_norm": 6.019349098205566, "learning_rate": 8.335913834522998e-07, "logits/chosen": -0.393079936504364, "logits/rejected": -0.7302870154380798, "logps/chosen": -61.59940719604492, "logps/rejected": -119.22950744628906, "loss": 0.6804, "rewards/accuracies": 1.0, "rewards/chosen": 0.31037062406539917, "rewards/margins": 1.0463714599609375, "rewards/rejected": -0.7360007762908936, "step": 414 }, { "epoch": 0.022305232323775227, "grad_norm": 6.970813751220703, "learning_rate": 8.321262189556409e-07, "logits/chosen": -0.6479988694190979, "logits/rejected": -1.0225071907043457, "logps/chosen": -71.14956665039062, "logps/rejected": -122.30042266845703, "loss": 0.7424, "rewards/accuracies": 0.875, "rewards/chosen": 0.2114604264497757, "rewards/margins": 0.9996956586837769, "rewards/rejected": -0.7882353067398071, "step": 415 }, { "epoch": 0.02235897987154336, "grad_norm": 8.393033981323242, "learning_rate": 8.306559326618259e-07, "logits/chosen": -0.5637806057929993, "logits/rejected": -0.7427411079406738, "logps/chosen": -85.45783996582031, "logps/rejected": -109.05714416503906, "loss": 0.8628, "rewards/accuracies": 0.875, "rewards/chosen": 0.1616324782371521, "rewards/margins": 0.7868552803993225, "rewards/rejected": -0.6252227425575256, "step": 416 }, { "epoch": 0.022412727419311496, "grad_norm": 8.664493560791016, "learning_rate": 8.291805472444886e-07, "logits/chosen": -0.4925822913646698, "logits/rejected": -0.7722136974334717, "logps/chosen": -82.40205383300781, "logps/rejected": -138.8902130126953, "loss": 0.8757, "rewards/accuracies": 0.875, "rewards/chosen": 0.28226977586746216, "rewards/margins": 0.9708297252655029, "rewards/rejected": -0.6885599493980408, "step": 417 }, { "epoch": 0.02246647496707963, "grad_norm": 5.876780033111572, "learning_rate": 8.277000854558969e-07, "logits/chosen": -0.3514257073402405, "logits/rejected": -0.8427184820175171, "logps/chosen": -80.08287811279297, "logps/rejected": -119.78553771972656, "loss": 0.8177, "rewards/accuracies": 0.75, "rewards/chosen": 0.3141867518424988, "rewards/margins": 0.9133978486061096, "rewards/rejected": -0.5992110967636108, "step": 418 }, { "epoch": 0.02252022251484776, "grad_norm": 7.562463283538818, "learning_rate": 8.262145701266033e-07, "logits/chosen": -0.3877696394920349, "logits/rejected": -0.7997497320175171, "logps/chosen": -79.00804138183594, "logps/rejected": -113.15538024902344, "loss": 0.774, "rewards/accuracies": 1.0, "rewards/chosen": 0.17658300697803497, "rewards/margins": 0.8457533121109009, "rewards/rejected": -0.6691702604293823, "step": 419 }, { "epoch": 0.022573970062615894, "grad_norm": 7.364825248718262, "learning_rate": 8.247240241650917e-07, "logits/chosen": -0.4685990810394287, "logits/rejected": -0.6723921298980713, "logps/chosen": -89.23411560058594, "logps/rejected": -108.93891143798828, "loss": 0.819, "rewards/accuracies": 1.0, "rewards/chosen": 0.19978682696819305, "rewards/margins": 0.7062132358551025, "rewards/rejected": -0.5064264535903931, "step": 420 }, { "epoch": 0.022627717610384027, "grad_norm": 7.043936729431152, "learning_rate": 8.232284705574249e-07, "logits/chosen": -0.4265397787094116, "logits/rejected": -0.6636292934417725, "logps/chosen": -58.268341064453125, "logps/rejected": -101.32440185546875, "loss": 0.8542, "rewards/accuracies": 0.875, "rewards/chosen": 0.3802594542503357, "rewards/margins": 0.7657908797264099, "rewards/rejected": -0.38553136587142944, "step": 421 }, { "epoch": 0.02268146515815216, "grad_norm": 7.640320777893066, "learning_rate": 8.217279323668895e-07, "logits/chosen": -0.4250754714012146, "logits/rejected": -0.7509661912918091, "logps/chosen": -86.7738037109375, "logps/rejected": -122.78596496582031, "loss": 0.792, "rewards/accuracies": 0.875, "rewards/chosen": 0.3204927146434784, "rewards/margins": 0.8607631921768188, "rewards/rejected": -0.5402705073356628, "step": 422 }, { "epoch": 0.022735212705920292, "grad_norm": 7.44843864440918, "learning_rate": 8.202224327336405e-07, "logits/chosen": -0.46798837184906006, "logits/rejected": -0.7443392872810364, "logps/chosen": -66.9738540649414, "logps/rejected": -101.98970031738281, "loss": 0.9405, "rewards/accuracies": 0.75, "rewards/chosen": 0.432750940322876, "rewards/margins": 0.6522971391677856, "rewards/rejected": -0.21954625844955444, "step": 423 }, { "epoch": 0.022788960253688425, "grad_norm": 7.2140655517578125, "learning_rate": 8.187119948743449e-07, "logits/chosen": -0.47923195362091064, "logits/rejected": -0.6018341779708862, "logps/chosen": -100.27091979980469, "logps/rejected": -125.55374145507812, "loss": 0.6948, "rewards/accuracies": 1.0, "rewards/chosen": 0.39575445652008057, "rewards/margins": 0.9974216818809509, "rewards/rejected": -0.6016672849655151, "step": 424 }, { "epoch": 0.022842707801456558, "grad_norm": 5.299772262573242, "learning_rate": 8.171966420818227e-07, "logits/chosen": -0.309378057718277, "logits/rejected": -0.5425280332565308, "logps/chosen": -74.81050872802734, "logps/rejected": -103.47669982910156, "loss": 0.5645, "rewards/accuracies": 1.0, "rewards/chosen": 0.3748512268066406, "rewards/margins": 1.2169536352157593, "rewards/rejected": -0.8421024084091187, "step": 425 }, { "epoch": 0.02289645534922469, "grad_norm": 6.921710014343262, "learning_rate": 8.156763977246889e-07, "logits/chosen": -0.47700586915016174, "logits/rejected": -0.6051280498504639, "logps/chosen": -89.62411499023438, "logps/rejected": -136.7732696533203, "loss": 0.6972, "rewards/accuracies": 1.0, "rewards/chosen": 0.239694744348526, "rewards/margins": 0.9625309705734253, "rewards/rejected": -0.7228362560272217, "step": 426 }, { "epoch": 0.022950202896992823, "grad_norm": 7.974945068359375, "learning_rate": 8.141512852469918e-07, "logits/chosen": -0.42335963249206543, "logits/rejected": -0.6819132566452026, "logps/chosen": -61.568214416503906, "logps/rejected": -79.38452911376953, "loss": 1.0196, "rewards/accuracies": 0.75, "rewards/chosen": 0.33415982127189636, "rewards/margins": 0.48223572969436646, "rewards/rejected": -0.1480759233236313, "step": 427 }, { "epoch": 0.02300395044476096, "grad_norm": 8.4947509765625, "learning_rate": 8.126213281678526e-07, "logits/chosen": -0.5197615623474121, "logits/rejected": -0.8187901973724365, "logps/chosen": -107.18109893798828, "logps/rejected": -134.470458984375, "loss": 0.7581, "rewards/accuracies": 1.0, "rewards/chosen": 0.0648244097828865, "rewards/margins": 0.8452411890029907, "rewards/rejected": -0.7804167866706848, "step": 428 }, { "epoch": 0.023057697992529092, "grad_norm": 9.03447151184082, "learning_rate": 8.11086550081102e-07, "logits/chosen": -0.45096445083618164, "logits/rejected": -0.6422216296195984, "logps/chosen": -97.41841125488281, "logps/rejected": -117.50894165039062, "loss": 0.8376, "rewards/accuracies": 0.875, "rewards/chosen": 0.3667028248310089, "rewards/margins": 0.8417689800262451, "rewards/rejected": -0.4750661849975586, "step": 429 }, { "epoch": 0.023111445540297225, "grad_norm": 9.27251148223877, "learning_rate": 8.095469746549171e-07, "logits/chosen": -0.5170062780380249, "logits/rejected": -0.7718736529350281, "logps/chosen": -74.64327239990234, "logps/rejected": -114.05581665039062, "loss": 1.0811, "rewards/accuracies": 0.875, "rewards/chosen": 0.08458875864744186, "rewards/margins": 0.3768150806427002, "rewards/rejected": -0.29222631454467773, "step": 430 }, { "epoch": 0.023165193088065358, "grad_norm": 6.024648666381836, "learning_rate": 8.080026256314549e-07, "logits/chosen": -0.4091903865337372, "logits/rejected": -0.7234742045402527, "logps/chosen": -74.6916275024414, "logps/rejected": -142.95448303222656, "loss": 0.5591, "rewards/accuracies": 1.0, "rewards/chosen": 0.29607635736465454, "rewards/margins": 1.2068874835968018, "rewards/rejected": -0.9108110666275024, "step": 431 }, { "epoch": 0.02321894063583349, "grad_norm": 6.5401201248168945, "learning_rate": 8.064535268264883e-07, "logits/chosen": -0.5211405158042908, "logits/rejected": -0.6714426279067993, "logps/chosen": -91.06954956054688, "logps/rejected": -143.86643981933594, "loss": 0.531, "rewards/accuracies": 1.0, "rewards/chosen": 0.39801526069641113, "rewards/margins": 1.3409204483032227, "rewards/rejected": -0.942905068397522, "step": 432 }, { "epoch": 0.023272688183601623, "grad_norm": 7.624337196350098, "learning_rate": 8.048997021290369e-07, "logits/chosen": -0.5306004881858826, "logits/rejected": -0.9551118612289429, "logps/chosen": -82.32575988769531, "logps/rejected": -101.46473693847656, "loss": 0.8687, "rewards/accuracies": 0.875, "rewards/chosen": 0.24746736884117126, "rewards/margins": 0.7155889868736267, "rewards/rejected": -0.4681216776371002, "step": 433 }, { "epoch": 0.023326435731369756, "grad_norm": 6.318516254425049, "learning_rate": 8.033411755009997e-07, "logits/chosen": -0.4339580833911896, "logits/rejected": -0.5364832878112793, "logps/chosen": -88.22373962402344, "logps/rejected": -113.12940216064453, "loss": 0.6644, "rewards/accuracies": 1.0, "rewards/chosen": 0.39737823605537415, "rewards/margins": 0.9828391671180725, "rewards/rejected": -0.5854609608650208, "step": 434 }, { "epoch": 0.02338018327913789, "grad_norm": 8.483478546142578, "learning_rate": 8.017779709767857e-07, "logits/chosen": -0.4805372357368469, "logits/rejected": -0.7934448719024658, "logps/chosen": -67.32972717285156, "logps/rejected": -88.32994079589844, "loss": 0.8108, "rewards/accuracies": 1.0, "rewards/chosen": 0.33126962184906006, "rewards/margins": 0.724446177482605, "rewards/rejected": -0.3931765556335449, "step": 435 }, { "epoch": 0.02343393082690602, "grad_norm": 8.51408863067627, "learning_rate": 8.00210112662942e-07, "logits/chosen": -0.5356080532073975, "logits/rejected": -0.7478476762771606, "logps/chosen": -77.8497085571289, "logps/rejected": -126.8206787109375, "loss": 0.9044, "rewards/accuracies": 0.75, "rewards/chosen": 0.23768869042396545, "rewards/margins": 0.8054761290550232, "rewards/rejected": -0.5677874684333801, "step": 436 }, { "epoch": 0.023487678374674154, "grad_norm": 7.612512111663818, "learning_rate": 7.986376247377836e-07, "logits/chosen": -0.44500088691711426, "logits/rejected": -0.5467145442962646, "logps/chosen": -82.58636474609375, "logps/rejected": -116.72201538085938, "loss": 0.8943, "rewards/accuracies": 0.75, "rewards/chosen": 0.33954787254333496, "rewards/margins": 0.7968809604644775, "rewards/rejected": -0.4573330879211426, "step": 437 }, { "epoch": 0.023541425922442287, "grad_norm": 8.608576774597168, "learning_rate": 7.970605314510192e-07, "logits/chosen": -0.5763580203056335, "logits/rejected": -0.8037289381027222, "logps/chosen": -98.38741302490234, "logps/rejected": -125.25379943847656, "loss": 0.7672, "rewards/accuracies": 1.0, "rewards/chosen": 0.022365771234035492, "rewards/margins": 0.8266461491584778, "rewards/rejected": -0.8042804002761841, "step": 438 }, { "epoch": 0.023595173470210423, "grad_norm": 8.916050910949707, "learning_rate": 7.954788571233786e-07, "logits/chosen": -0.46296143531799316, "logits/rejected": -0.7182796001434326, "logps/chosen": -92.67381286621094, "logps/rejected": -118.10578918457031, "loss": 0.7344, "rewards/accuracies": 1.0, "rewards/chosen": 0.22891993820667267, "rewards/margins": 0.8809258341789246, "rewards/rejected": -0.6520059108734131, "step": 439 }, { "epoch": 0.023648921017978556, "grad_norm": 7.936333179473877, "learning_rate": 7.938926261462365e-07, "logits/chosen": -0.3738171458244324, "logits/rejected": -0.6514754891395569, "logps/chosen": -79.20115661621094, "logps/rejected": -103.39704132080078, "loss": 0.9433, "rewards/accuracies": 1.0, "rewards/chosen": 0.429705947637558, "rewards/margins": 0.5538244843482971, "rewards/rejected": -0.12411852180957794, "step": 440 }, { "epoch": 0.02370266856574669, "grad_norm": 9.56844711303711, "learning_rate": 7.923018629812368e-07, "logits/chosen": -0.39676690101623535, "logits/rejected": -0.7574619054794312, "logps/chosen": -84.60523986816406, "logps/rejected": -105.73455810546875, "loss": 0.9566, "rewards/accuracies": 0.875, "rewards/chosen": 0.07461362332105637, "rewards/margins": 0.5628937482833862, "rewards/rejected": -0.48828011751174927, "step": 441 }, { "epoch": 0.02375641611351482, "grad_norm": 7.901853561401367, "learning_rate": 7.907065921599153e-07, "logits/chosen": -0.5040668249130249, "logits/rejected": -0.718947172164917, "logps/chosen": -96.41754150390625, "logps/rejected": -125.28307342529297, "loss": 0.8844, "rewards/accuracies": 0.875, "rewards/chosen": -0.056825827807188034, "rewards/margins": 0.747873067855835, "rewards/rejected": -0.8046988844871521, "step": 442 }, { "epoch": 0.023810163661282954, "grad_norm": 6.9084696769714355, "learning_rate": 7.891068382833214e-07, "logits/chosen": -0.5003049373626709, "logits/rejected": -0.7614659070968628, "logps/chosen": -82.88668823242188, "logps/rejected": -135.33819580078125, "loss": 0.6874, "rewards/accuracies": 1.0, "rewards/chosen": 0.02843533456325531, "rewards/margins": 1.0170345306396484, "rewards/rejected": -0.9885993003845215, "step": 443 }, { "epoch": 0.023863911209051087, "grad_norm": 7.334207534790039, "learning_rate": 7.875026260216393e-07, "logits/chosen": -0.3880821168422699, "logits/rejected": -0.6947299838066101, "logps/chosen": -89.82861328125, "logps/rejected": -124.42120361328125, "loss": 0.6256, "rewards/accuracies": 1.0, "rewards/chosen": 0.28500714898109436, "rewards/margins": 1.056867003440857, "rewards/rejected": -0.7718597650527954, "step": 444 }, { "epoch": 0.02391765875681922, "grad_norm": 8.842364311218262, "learning_rate": 7.85893980113806e-07, "logits/chosen": -0.2469213902950287, "logits/rejected": -0.548051118850708, "logps/chosen": -81.11711883544922, "logps/rejected": -108.83488464355469, "loss": 0.9627, "rewards/accuracies": 0.875, "rewards/chosen": 0.22066804766654968, "rewards/margins": 0.572004497051239, "rewards/rejected": -0.35133644938468933, "step": 445 }, { "epoch": 0.023971406304587353, "grad_norm": 8.249946594238281, "learning_rate": 7.842809253671319e-07, "logits/chosen": -0.5505584478378296, "logits/rejected": -0.5947058200836182, "logps/chosen": -77.19481658935547, "logps/rejected": -113.9437255859375, "loss": 0.6807, "rewards/accuracies": 1.0, "rewards/chosen": 0.3590509593486786, "rewards/margins": 0.9526396989822388, "rewards/rejected": -0.5935887098312378, "step": 446 }, { "epoch": 0.024025153852355485, "grad_norm": 6.780737400054932, "learning_rate": 7.826634866569164e-07, "logits/chosen": -0.44823703169822693, "logits/rejected": -0.5610001087188721, "logps/chosen": -77.51661682128906, "logps/rejected": -167.60617065429688, "loss": 0.5769, "rewards/accuracies": 0.875, "rewards/chosen": 0.2385588437318802, "rewards/margins": 1.3697946071624756, "rewards/rejected": -1.1312358379364014, "step": 447 }, { "epoch": 0.024078901400123618, "grad_norm": 6.394153118133545, "learning_rate": 7.810416889260653e-07, "logits/chosen": -0.5009193420410156, "logits/rejected": -0.5553930997848511, "logps/chosen": -88.69538116455078, "logps/rejected": -132.17935180664062, "loss": 0.6601, "rewards/accuracies": 1.0, "rewards/chosen": 0.4127521514892578, "rewards/margins": 1.1352932453155518, "rewards/rejected": -0.722541093826294, "step": 448 }, { "epoch": 0.02413264894789175, "grad_norm": 7.0401716232299805, "learning_rate": 7.794155571847057e-07, "logits/chosen": -0.2870066463947296, "logits/rejected": -0.7537978291511536, "logps/chosen": -82.2220458984375, "logps/rejected": -135.6904296875, "loss": 0.6871, "rewards/accuracies": 1.0, "rewards/chosen": 0.2678471505641937, "rewards/margins": 1.0321636199951172, "rewards/rejected": -0.7643164992332458, "step": 449 }, { "epoch": 0.024186396495659887, "grad_norm": 8.32508659362793, "learning_rate": 7.777851165098011e-07, "logits/chosen": -0.5504728555679321, "logits/rejected": -0.6927813291549683, "logps/chosen": -91.29072570800781, "logps/rejected": -157.31304931640625, "loss": 0.7222, "rewards/accuracies": 1.0, "rewards/chosen": -0.012720540165901184, "rewards/margins": 1.0396003723144531, "rewards/rejected": -1.052320957183838, "step": 450 }, { "epoch": 0.02424014404342802, "grad_norm": 11.343000411987305, "learning_rate": 7.761503920447634e-07, "logits/chosen": -0.4581787586212158, "logits/rejected": -0.7597699165344238, "logps/chosen": -86.64972686767578, "logps/rejected": -93.4219970703125, "loss": 1.102, "rewards/accuracies": 0.875, "rewards/chosen": -0.0009597092866897583, "rewards/margins": 0.36721888184547424, "rewards/rejected": -0.3681786060333252, "step": 451 }, { "epoch": 0.024293891591196153, "grad_norm": 12.91161060333252, "learning_rate": 7.745114089990659e-07, "logits/chosen": -0.749862551689148, "logits/rejected": -0.9248619675636292, "logps/chosen": -75.19427490234375, "logps/rejected": -90.80426788330078, "loss": 1.0865, "rewards/accuracies": 0.75, "rewards/chosen": 0.10096240043640137, "rewards/margins": 0.385442316532135, "rewards/rejected": -0.28447991609573364, "step": 452 }, { "epoch": 0.024347639138964285, "grad_norm": 6.932260990142822, "learning_rate": 7.728681926478549e-07, "logits/chosen": -0.590410590171814, "logits/rejected": -0.63494873046875, "logps/chosen": -82.29263305664062, "logps/rejected": -99.32011413574219, "loss": 0.8041, "rewards/accuracies": 1.0, "rewards/chosen": 0.44876039028167725, "rewards/margins": 0.7687848806381226, "rewards/rejected": -0.3200244903564453, "step": 453 }, { "epoch": 0.024401386686732418, "grad_norm": 6.669427871704102, "learning_rate": 7.712207683315594e-07, "logits/chosen": -0.6082048416137695, "logits/rejected": -0.7011226415634155, "logps/chosen": -64.87923431396484, "logps/rejected": -99.72596740722656, "loss": 0.7697, "rewards/accuracies": 0.875, "rewards/chosen": 0.3828639090061188, "rewards/margins": 0.8529906272888184, "rewards/rejected": -0.4701266884803772, "step": 454 }, { "epoch": 0.02445513423450055, "grad_norm": 9.07262134552002, "learning_rate": 7.695691614555002e-07, "logits/chosen": -0.4017524719238281, "logits/rejected": -0.6890518665313721, "logps/chosen": -98.58143615722656, "logps/rejected": -129.6182403564453, "loss": 0.7696, "rewards/accuracies": 1.0, "rewards/chosen": 0.19450616836547852, "rewards/margins": 0.8183708190917969, "rewards/rejected": -0.6238647103309631, "step": 455 }, { "epoch": 0.024508881782268684, "grad_norm": 8.309849739074707, "learning_rate": 7.679133974894982e-07, "logits/chosen": -0.5828233957290649, "logits/rejected": -0.9702988862991333, "logps/chosen": -95.46450805664062, "logps/rejected": -137.47592163085938, "loss": 0.8059, "rewards/accuracies": 1.0, "rewards/chosen": 0.18794265389442444, "rewards/margins": 0.8563880920410156, "rewards/rejected": -0.6684454679489136, "step": 456 }, { "epoch": 0.024562629330036816, "grad_norm": 7.82414436340332, "learning_rate": 7.662535019674826e-07, "logits/chosen": -0.3055247366428375, "logits/rejected": -0.762104868888855, "logps/chosen": -67.01690673828125, "logps/rejected": -88.92282104492188, "loss": 0.8324, "rewards/accuracies": 0.875, "rewards/chosen": 0.38684606552124023, "rewards/margins": 0.7781317234039307, "rewards/rejected": -0.39128565788269043, "step": 457 }, { "epoch": 0.02461637687780495, "grad_norm": 9.514270782470703, "learning_rate": 7.645895004870953e-07, "logits/chosen": -0.4206573963165283, "logits/rejected": -0.569348156452179, "logps/chosen": -68.7651596069336, "logps/rejected": -84.69754791259766, "loss": 0.996, "rewards/accuracies": 0.875, "rewards/chosen": 0.19300921261310577, "rewards/margins": 0.4951474666595459, "rewards/rejected": -0.3021382987499237, "step": 458 }, { "epoch": 0.024670124425573082, "grad_norm": 7.029499053955078, "learning_rate": 7.629214187092977e-07, "logits/chosen": -0.3734283447265625, "logits/rejected": -0.7163116931915283, "logps/chosen": -88.70042419433594, "logps/rejected": -145.3223419189453, "loss": 0.6766, "rewards/accuracies": 1.0, "rewards/chosen": 0.15387395024299622, "rewards/margins": 1.2952346801757812, "rewards/rejected": -1.1413606405258179, "step": 459 }, { "epoch": 0.024723871973341215, "grad_norm": 10.299975395202637, "learning_rate": 7.612492823579744e-07, "logits/chosen": -0.8413804769515991, "logits/rejected": -0.7508734464645386, "logps/chosen": -80.67174530029297, "logps/rejected": -103.15706634521484, "loss": 1.1193, "rewards/accuracies": 0.75, "rewards/chosen": 0.037558555603027344, "rewards/margins": 0.4408063590526581, "rewards/rejected": -0.40324777364730835, "step": 460 }, { "epoch": 0.02477761952110935, "grad_norm": 8.179410934448242, "learning_rate": 7.595731172195364e-07, "logits/chosen": -0.5068320035934448, "logits/rejected": -0.7594693303108215, "logps/chosen": -50.48099136352539, "logps/rejected": -79.52085876464844, "loss": 1.1466, "rewards/accuracies": 0.875, "rewards/chosen": 0.40050482749938965, "rewards/margins": 0.3625420928001404, "rewards/rejected": 0.03796273469924927, "step": 461 }, { "epoch": 0.024831367068877484, "grad_norm": 6.534915924072266, "learning_rate": 7.578929491425237e-07, "logits/chosen": -0.6611806154251099, "logits/rejected": -0.9871463775634766, "logps/chosen": -101.77427673339844, "logps/rejected": -158.85704040527344, "loss": 0.5018, "rewards/accuracies": 1.0, "rewards/chosen": 0.24278384447097778, "rewards/margins": 1.3774943351745605, "rewards/rejected": -1.1347105503082275, "step": 462 }, { "epoch": 0.024885114616645616, "grad_norm": 9.458695411682129, "learning_rate": 7.562088040372066e-07, "logits/chosen": -0.3810614347457886, "logits/rejected": -0.5115857720375061, "logps/chosen": -84.13137817382812, "logps/rejected": -95.31614685058594, "loss": 0.9239, "rewards/accuracies": 0.75, "rewards/chosen": 0.280815064907074, "rewards/margins": 0.6312701106071472, "rewards/rejected": -0.35045504570007324, "step": 463 }, { "epoch": 0.02493886216441375, "grad_norm": 8.473527908325195, "learning_rate": 7.545207078751857e-07, "logits/chosen": -0.5347457528114319, "logits/rejected": -0.813988447189331, "logps/chosen": -101.28292846679688, "logps/rejected": -149.531982421875, "loss": 0.8476, "rewards/accuracies": 0.75, "rewards/chosen": 0.08423547446727753, "rewards/margins": 0.8835664987564087, "rewards/rejected": -0.79933100938797, "step": 464 }, { "epoch": 0.024992609712181882, "grad_norm": 5.1330766677856445, "learning_rate": 7.528286866889924e-07, "logits/chosen": -0.5445493459701538, "logits/rejected": -0.5648778676986694, "logps/chosen": -91.71717834472656, "logps/rejected": -127.25589752197266, "loss": 0.4249, "rewards/accuracies": 1.0, "rewards/chosen": 0.2891707420349121, "rewards/margins": 1.5700838565826416, "rewards/rejected": -1.2809131145477295, "step": 465 }, { "epoch": 0.025046357259950015, "grad_norm": 8.919391632080078, "learning_rate": 7.511327665716862e-07, "logits/chosen": -0.43354088068008423, "logits/rejected": -0.7524920105934143, "logps/chosen": -75.79352569580078, "logps/rejected": -129.26162719726562, "loss": 0.8161, "rewards/accuracies": 0.875, "rewards/chosen": 0.032510802149772644, "rewards/margins": 0.7943386435508728, "rewards/rejected": -0.761827826499939, "step": 466 }, { "epoch": 0.025100104807718147, "grad_norm": 6.092318534851074, "learning_rate": 7.494329736764537e-07, "logits/chosen": -0.5970028042793274, "logits/rejected": -0.8237210512161255, "logps/chosen": -89.574462890625, "logps/rejected": -140.05569458007812, "loss": 0.5269, "rewards/accuracies": 1.0, "rewards/chosen": 0.3584826588630676, "rewards/margins": 1.4103718996047974, "rewards/rejected": -1.051889181137085, "step": 467 }, { "epoch": 0.02515385235548628, "grad_norm": 7.867772579193115, "learning_rate": 7.477293342162037e-07, "logits/chosen": -0.41183024644851685, "logits/rejected": -0.7290879487991333, "logps/chosen": -87.62870025634766, "logps/rejected": -112.23393249511719, "loss": 0.8413, "rewards/accuracies": 0.875, "rewards/chosen": 0.14950667321681976, "rewards/margins": 0.7003246545791626, "rewards/rejected": -0.5508179664611816, "step": 468 }, { "epoch": 0.025207599903254413, "grad_norm": 7.5362443923950195, "learning_rate": 7.460218744631645e-07, "logits/chosen": -0.47348126769065857, "logits/rejected": -0.7624895572662354, "logps/chosen": -65.3670883178711, "logps/rejected": -76.55612182617188, "loss": 0.917, "rewards/accuracies": 0.875, "rewards/chosen": 0.3563833236694336, "rewards/margins": 0.5964365005493164, "rewards/rejected": -0.2400531768798828, "step": 469 }, { "epoch": 0.025261347451022546, "grad_norm": 7.505491256713867, "learning_rate": 7.443106207484775e-07, "logits/chosen": -0.5772174596786499, "logits/rejected": -0.9514181017875671, "logps/chosen": -89.36795806884766, "logps/rejected": -134.03843688964844, "loss": 0.6608, "rewards/accuracies": 1.0, "rewards/chosen": 0.4210060238838196, "rewards/margins": 1.0392787456512451, "rewards/rejected": -0.6182727217674255, "step": 470 }, { "epoch": 0.025315094998790682, "grad_norm": 7.581161975860596, "learning_rate": 7.425955994617919e-07, "logits/chosen": -0.5843409299850464, "logits/rejected": -0.7788172960281372, "logps/chosen": -92.62406921386719, "logps/rejected": -129.4345703125, "loss": 0.6827, "rewards/accuracies": 1.0, "rewards/chosen": 0.06627655029296875, "rewards/margins": 0.9752134084701538, "rewards/rejected": -0.9089368581771851, "step": 471 }, { "epoch": 0.025368842546558815, "grad_norm": 6.725006580352783, "learning_rate": 7.408768370508576e-07, "logits/chosen": -0.40064018964767456, "logits/rejected": -0.5744050145149231, "logps/chosen": -83.78401184082031, "logps/rejected": -137.03866577148438, "loss": 0.6242, "rewards/accuracies": 1.0, "rewards/chosen": 0.14756722748279572, "rewards/margins": 1.1537494659423828, "rewards/rejected": -1.0061821937561035, "step": 472 }, { "epoch": 0.025422590094326947, "grad_norm": 8.20006275177002, "learning_rate": 7.391543600211172e-07, "logits/chosen": -0.5442217588424683, "logits/rejected": -0.6960563659667969, "logps/chosen": -70.46719360351562, "logps/rejected": -98.96620178222656, "loss": 0.982, "rewards/accuracies": 0.875, "rewards/chosen": 0.10727784037590027, "rewards/margins": 0.5132899284362793, "rewards/rejected": -0.40601205825805664, "step": 473 }, { "epoch": 0.02547633764209508, "grad_norm": 9.186453819274902, "learning_rate": 7.374281949352972e-07, "logits/chosen": -0.6935830116271973, "logits/rejected": -0.6092934012413025, "logps/chosen": -80.08625030517578, "logps/rejected": -109.4278335571289, "loss": 0.8741, "rewards/accuracies": 1.0, "rewards/chosen": -0.11736488342285156, "rewards/margins": 0.6939413547515869, "rewards/rejected": -0.8113062381744385, "step": 474 }, { "epoch": 0.025530085189863213, "grad_norm": 8.674727439880371, "learning_rate": 7.356983684129989e-07, "logits/chosen": -0.5663500428199768, "logits/rejected": -0.6749187707901001, "logps/chosen": -83.73207092285156, "logps/rejected": -97.9882583618164, "loss": 0.8924, "rewards/accuracies": 0.875, "rewards/chosen": 0.16002464294433594, "rewards/margins": 0.6297483444213867, "rewards/rejected": -0.4697237014770508, "step": 475 }, { "epoch": 0.025583832737631346, "grad_norm": 7.953119277954102, "learning_rate": 7.339649071302867e-07, "logits/chosen": -0.3881515860557556, "logits/rejected": -0.8665343523025513, "logps/chosen": -75.9450912475586, "logps/rejected": -110.69939422607422, "loss": 0.7956, "rewards/accuracies": 0.875, "rewards/chosen": 0.33890801668167114, "rewards/margins": 0.8893479108810425, "rewards/rejected": -0.5504398941993713, "step": 476 }, { "epoch": 0.02563758028539948, "grad_norm": 6.637617588043213, "learning_rate": 7.322278378192782e-07, "logits/chosen": -0.3631148338317871, "logits/rejected": -0.6472980976104736, "logps/chosen": -68.94700622558594, "logps/rejected": -92.21940612792969, "loss": 0.7101, "rewards/accuracies": 0.875, "rewards/chosen": 0.44815793633461, "rewards/margins": 1.0164051055908203, "rewards/rejected": -0.5682473182678223, "step": 477 }, { "epoch": 0.02569132783316761, "grad_norm": 7.109462261199951, "learning_rate": 7.304871872677312e-07, "logits/chosen": -0.34959179162979126, "logits/rejected": -0.7623578906059265, "logps/chosen": -78.83657836914062, "logps/rejected": -143.01109313964844, "loss": 0.6554, "rewards/accuracies": 1.0, "rewards/chosen": 0.08109798282384872, "rewards/margins": 1.0858430862426758, "rewards/rejected": -1.0047451257705688, "step": 478 }, { "epoch": 0.025745075380935744, "grad_norm": 6.600827217102051, "learning_rate": 7.287429823186301e-07, "logits/chosen": -0.5455724000930786, "logits/rejected": -0.7143433094024658, "logps/chosen": -49.720401763916016, "logps/rejected": -74.9632568359375, "loss": 1.0827, "rewards/accuracies": 0.75, "rewards/chosen": 0.16296392679214478, "rewards/margins": 0.4683421850204468, "rewards/rejected": -0.305378258228302, "step": 479 }, { "epoch": 0.025798822928703877, "grad_norm": 6.859341144561768, "learning_rate": 7.269952498697734e-07, "logits/chosen": -0.3306613564491272, "logits/rejected": -0.4592418372631073, "logps/chosen": -78.88465881347656, "logps/rejected": -103.05387878417969, "loss": 0.7378, "rewards/accuracies": 0.875, "rewards/chosen": 0.23738789558410645, "rewards/margins": 0.9443413019180298, "rewards/rejected": -0.7069534063339233, "step": 480 }, { "epoch": 0.02585257047647201, "grad_norm": 8.409367561340332, "learning_rate": 7.252440168733571e-07, "logits/chosen": -0.595878541469574, "logits/rejected": -0.712478518486023, "logps/chosen": -87.77911376953125, "logps/rejected": -135.6846923828125, "loss": 0.8404, "rewards/accuracies": 1.0, "rewards/chosen": 0.30059191584587097, "rewards/margins": 0.8410674333572388, "rewards/rejected": -0.5404755473136902, "step": 481 }, { "epoch": 0.025906318024240146, "grad_norm": 10.672789573669434, "learning_rate": 7.234893103355606e-07, "logits/chosen": -0.45865583419799805, "logits/rejected": -0.9492020606994629, "logps/chosen": -93.44950866699219, "logps/rejected": -109.82638549804688, "loss": 0.9023, "rewards/accuracies": 0.75, "rewards/chosen": 0.060170892626047134, "rewards/margins": 0.7522347569465637, "rewards/rejected": -0.6920639276504517, "step": 482 }, { "epoch": 0.02596006557200828, "grad_norm": 7.187308311462402, "learning_rate": 7.217311573161292e-07, "logits/chosen": -0.5119210481643677, "logits/rejected": -0.645018458366394, "logps/chosen": -74.66809844970703, "logps/rejected": -127.25604248046875, "loss": 0.7484, "rewards/accuracies": 0.875, "rewards/chosen": 0.36632248759269714, "rewards/margins": 0.9341345429420471, "rewards/rejected": -0.5678120255470276, "step": 483 }, { "epoch": 0.02601381311977641, "grad_norm": 11.154306411743164, "learning_rate": 7.199695849279576e-07, "logits/chosen": -0.48053497076034546, "logits/rejected": -0.7118101716041565, "logps/chosen": -88.28960418701172, "logps/rejected": -111.1166763305664, "loss": 1.0465, "rewards/accuracies": 0.75, "rewards/chosen": -0.04363512992858887, "rewards/margins": 0.5195378065109253, "rewards/rejected": -0.5631729364395142, "step": 484 }, { "epoch": 0.026067560667544544, "grad_norm": 5.9751996994018555, "learning_rate": 7.182046203366709e-07, "logits/chosen": -0.5394608378410339, "logits/rejected": -0.6812800168991089, "logps/chosen": -110.22039794921875, "logps/rejected": -145.2031707763672, "loss": 0.4978, "rewards/accuracies": 1.0, "rewards/chosen": 0.3083175718784332, "rewards/margins": 1.3282667398452759, "rewards/rejected": -1.019949197769165, "step": 485 }, { "epoch": 0.026121308215312677, "grad_norm": 7.638804912567139, "learning_rate": 7.164362907602071e-07, "logits/chosen": -0.6140745282173157, "logits/rejected": -0.6783902645111084, "logps/chosen": -90.57359313964844, "logps/rejected": -130.063232421875, "loss": 0.704, "rewards/accuracies": 1.0, "rewards/chosen": 0.11173401027917862, "rewards/margins": 1.0140730142593384, "rewards/rejected": -0.902338981628418, "step": 486 }, { "epoch": 0.02617505576308081, "grad_norm": 8.761343955993652, "learning_rate": 7.146646234683949e-07, "logits/chosen": -0.580095112323761, "logits/rejected": -0.6720125079154968, "logps/chosen": -90.40350341796875, "logps/rejected": -154.4210205078125, "loss": 0.7649, "rewards/accuracies": 0.875, "rewards/chosen": -0.04067101329565048, "rewards/margins": 1.050798773765564, "rewards/rejected": -1.0914697647094727, "step": 487 }, { "epoch": 0.026228803310848942, "grad_norm": 8.023530006408691, "learning_rate": 7.128896457825363e-07, "logits/chosen": -0.8425403237342834, "logits/rejected": -0.8102314472198486, "logps/chosen": -102.14181518554688, "logps/rejected": -137.13592529296875, "loss": 0.8394, "rewards/accuracies": 0.75, "rewards/chosen": 0.2727055549621582, "rewards/margins": 0.9216597080230713, "rewards/rejected": -0.6489542126655579, "step": 488 }, { "epoch": 0.026282550858617075, "grad_norm": 7.154997825622559, "learning_rate": 7.111113850749827e-07, "logits/chosen": -0.2786712050437927, "logits/rejected": -0.590296745300293, "logps/chosen": -74.93867492675781, "logps/rejected": -103.42362213134766, "loss": 0.7631, "rewards/accuracies": 0.875, "rewards/chosen": 0.12805700302124023, "rewards/margins": 0.8837379217147827, "rewards/rejected": -0.7556809186935425, "step": 489 }, { "epoch": 0.026336298406385208, "grad_norm": 6.591698169708252, "learning_rate": 7.09329868768714e-07, "logits/chosen": -0.5010906457901001, "logits/rejected": -0.7045717239379883, "logps/chosen": -93.52080535888672, "logps/rejected": -114.11204528808594, "loss": 0.7473, "rewards/accuracies": 0.875, "rewards/chosen": 0.12561318278312683, "rewards/margins": 0.9546626210212708, "rewards/rejected": -0.8290494084358215, "step": 490 }, { "epoch": 0.02639004595415334, "grad_norm": 7.47642183303833, "learning_rate": 7.075451243369156e-07, "logits/chosen": -0.3806176781654358, "logits/rejected": -0.5274667143821716, "logps/chosen": -75.42254638671875, "logps/rejected": -117.89915466308594, "loss": 0.8201, "rewards/accuracies": 0.875, "rewards/chosen": 0.22756896913051605, "rewards/margins": 0.928924560546875, "rewards/rejected": -0.7013556361198425, "step": 491 }, { "epoch": 0.026443793501921473, "grad_norm": 6.768543720245361, "learning_rate": 7.057571793025544e-07, "logits/chosen": -0.5376786589622498, "logits/rejected": -0.7344709038734436, "logps/chosen": -84.59709167480469, "logps/rejected": -128.1107177734375, "loss": 0.5961, "rewards/accuracies": 1.0, "rewards/chosen": 0.44526171684265137, "rewards/margins": 1.2997934818267822, "rewards/rejected": -0.8545318245887756, "step": 492 }, { "epoch": 0.02649754104968961, "grad_norm": 8.744667053222656, "learning_rate": 7.039660612379545e-07, "logits/chosen": -0.5493291616439819, "logits/rejected": -0.6788958311080933, "logps/chosen": -95.72891235351562, "logps/rejected": -130.69964599609375, "loss": 0.7675, "rewards/accuracies": 0.875, "rewards/chosen": 0.14955902099609375, "rewards/margins": 0.9225468039512634, "rewards/rejected": -0.7729878425598145, "step": 493 }, { "epoch": 0.026551288597457742, "grad_norm": 7.867975234985352, "learning_rate": 7.021717977643725e-07, "logits/chosen": -0.6415963768959045, "logits/rejected": -0.9035673141479492, "logps/chosen": -95.17317962646484, "logps/rejected": -124.81008911132812, "loss": 0.707, "rewards/accuracies": 1.0, "rewards/chosen": 0.05402769893407822, "rewards/margins": 0.9272803068161011, "rewards/rejected": -0.8732526302337646, "step": 494 }, { "epoch": 0.026605036145225875, "grad_norm": 7.490847110748291, "learning_rate": 7.003744165515703e-07, "logits/chosen": -0.5823894739151001, "logits/rejected": -0.623516321182251, "logps/chosen": -78.01838684082031, "logps/rejected": -86.815185546875, "loss": 0.8868, "rewards/accuracies": 0.875, "rewards/chosen": 0.16339904069900513, "rewards/margins": 0.6628296971321106, "rewards/rejected": -0.49943068623542786, "step": 495 }, { "epoch": 0.026658783692994008, "grad_norm": 9.398392677307129, "learning_rate": 6.985739453173902e-07, "logits/chosen": -0.38990455865859985, "logits/rejected": -0.7045729160308838, "logps/chosen": -93.80413818359375, "logps/rejected": -129.47933959960938, "loss": 0.8202, "rewards/accuracies": 0.75, "rewards/chosen": 0.08793163299560547, "rewards/margins": 0.9726498126983643, "rewards/rejected": -0.8847181797027588, "step": 496 }, { "epoch": 0.02671253124076214, "grad_norm": 6.71289587020874, "learning_rate": 6.967704118273256e-07, "logits/chosen": -0.5207402110099792, "logits/rejected": -0.6744440793991089, "logps/chosen": -75.00041961669922, "logps/rejected": -120.77678680419922, "loss": 0.7242, "rewards/accuracies": 1.0, "rewards/chosen": 0.31040215492248535, "rewards/margins": 0.9029518365859985, "rewards/rejected": -0.5925496816635132, "step": 497 }, { "epoch": 0.026766278788530273, "grad_norm": 7.229090690612793, "learning_rate": 6.949638438940941e-07, "logits/chosen": -0.6867992281913757, "logits/rejected": -0.799670934677124, "logps/chosen": -88.71637725830078, "logps/rejected": -153.31947326660156, "loss": 0.6894, "rewards/accuracies": 0.875, "rewards/chosen": 0.08890824019908905, "rewards/margins": 1.2164325714111328, "rewards/rejected": -1.1275243759155273, "step": 498 }, { "epoch": 0.026820026336298406, "grad_norm": 9.091512680053711, "learning_rate": 6.93154269377208e-07, "logits/chosen": -0.45814722776412964, "logits/rejected": -0.8121790885925293, "logps/chosen": -69.52001953125, "logps/rejected": -98.94236755371094, "loss": 1.0183, "rewards/accuracies": 0.875, "rewards/chosen": 0.08682761341333389, "rewards/margins": 0.5400620698928833, "rewards/rejected": -0.4532344937324524, "step": 499 }, { "epoch": 0.02687377388406654, "grad_norm": 9.86420726776123, "learning_rate": 6.913417161825449e-07, "logits/chosen": -0.554621696472168, "logits/rejected": -0.8649418950080872, "logps/chosen": -85.480712890625, "logps/rejected": -107.6649398803711, "loss": 0.8156, "rewards/accuracies": 0.875, "rewards/chosen": 0.17938730120658875, "rewards/margins": 0.8039389252662659, "rewards/rejected": -0.6245515942573547, "step": 500 }, { "epoch": 0.02692752143183467, "grad_norm": 8.650354385375977, "learning_rate": 6.895262122619173e-07, "logits/chosen": -0.5484771728515625, "logits/rejected": -0.8191328048706055, "logps/chosen": -77.82686614990234, "logps/rejected": -121.04710388183594, "loss": 0.8548, "rewards/accuracies": 0.875, "rewards/chosen": 0.05543418228626251, "rewards/margins": 0.7076231241226196, "rewards/rejected": -0.6521888971328735, "step": 501 }, { "epoch": 0.026981268979602804, "grad_norm": 7.082733154296875, "learning_rate": 6.877077856126415e-07, "logits/chosen": -0.4777858257293701, "logits/rejected": -0.6979163289070129, "logps/chosen": -84.3621826171875, "logps/rejected": -126.95653533935547, "loss": 0.6837, "rewards/accuracies": 1.0, "rewards/chosen": 0.3189496695995331, "rewards/margins": 0.9405217170715332, "rewards/rejected": -0.6215720176696777, "step": 502 }, { "epoch": 0.027035016527370937, "grad_norm": 5.857508659362793, "learning_rate": 6.858864642771061e-07, "logits/chosen": -0.33928152918815613, "logits/rejected": -0.7942571043968201, "logps/chosen": -74.85140991210938, "logps/rejected": -128.5996551513672, "loss": 0.5459, "rewards/accuracies": 0.875, "rewards/chosen": 0.32528847455978394, "rewards/margins": 1.4025118350982666, "rewards/rejected": -1.0772234201431274, "step": 503 }, { "epoch": 0.027088764075139073, "grad_norm": 10.088953971862793, "learning_rate": 6.840622763423391e-07, "logits/chosen": -0.2756340503692627, "logits/rejected": -0.6287195682525635, "logps/chosen": -74.62994384765625, "logps/rejected": -117.77538299560547, "loss": 0.9216, "rewards/accuracies": 0.75, "rewards/chosen": -0.07135909050703049, "rewards/margins": 0.7434082627296448, "rewards/rejected": -0.8147673606872559, "step": 504 }, { "epoch": 0.027142511622907206, "grad_norm": 5.423161029815674, "learning_rate": 6.82235249939575e-07, "logits/chosen": -0.5593537092208862, "logits/rejected": -0.66860032081604, "logps/chosen": -80.34014129638672, "logps/rejected": -130.43511962890625, "loss": 0.474, "rewards/accuracies": 1.0, "rewards/chosen": 0.39304691553115845, "rewards/margins": 1.448836088180542, "rewards/rejected": -1.0557892322540283, "step": 505 }, { "epoch": 0.02719625917067534, "grad_norm": 6.825314521789551, "learning_rate": 6.804054132438208e-07, "logits/chosen": -0.4704897999763489, "logits/rejected": -0.43631330132484436, "logps/chosen": -72.21434783935547, "logps/rejected": -99.04844665527344, "loss": 0.7584, "rewards/accuracies": 1.0, "rewards/chosen": 0.22173714637756348, "rewards/margins": 0.9700209498405457, "rewards/rejected": -0.748283863067627, "step": 506 }, { "epoch": 0.02725000671844347, "grad_norm": 8.456744194030762, "learning_rate": 6.785727944734227e-07, "logits/chosen": -0.6356399655342102, "logits/rejected": -0.645244300365448, "logps/chosen": -86.7821044921875, "logps/rejected": -118.35205841064453, "loss": 0.8157, "rewards/accuracies": 1.0, "rewards/chosen": 0.07077240943908691, "rewards/margins": 0.7798008918762207, "rewards/rejected": -0.7090285420417786, "step": 507 }, { "epoch": 0.027303754266211604, "grad_norm": 8.177644729614258, "learning_rate": 6.767374218896286e-07, "logits/chosen": -0.2821059823036194, "logits/rejected": -0.6565772294998169, "logps/chosen": -73.20730590820312, "logps/rejected": -104.72552490234375, "loss": 0.9238, "rewards/accuracies": 0.75, "rewards/chosen": 0.17446939647197723, "rewards/margins": 0.6810926198959351, "rewards/rejected": -0.5066232085227966, "step": 508 }, { "epoch": 0.027357501813979737, "grad_norm": 7.220467567443848, "learning_rate": 6.748993237961543e-07, "logits/chosen": -0.6457940340042114, "logits/rejected": -0.8538851737976074, "logps/chosen": -105.09932708740234, "logps/rejected": -146.75399780273438, "loss": 0.6744, "rewards/accuracies": 0.875, "rewards/chosen": -0.046125978231430054, "rewards/margins": 1.122889518737793, "rewards/rejected": -1.1690154075622559, "step": 509 }, { "epoch": 0.02741124936174787, "grad_norm": 9.870168685913086, "learning_rate": 6.730585285387465e-07, "logits/chosen": -0.5542341470718384, "logits/rejected": -0.8003448843955994, "logps/chosen": -66.39418029785156, "logps/rejected": -134.95315551757812, "loss": 0.8562, "rewards/accuracies": 0.875, "rewards/chosen": 0.13405819237232208, "rewards/margins": 0.8111207485198975, "rewards/rejected": -0.6770625710487366, "step": 510 }, { "epoch": 0.027464996909516003, "grad_norm": 6.395220756530762, "learning_rate": 6.71215064504745e-07, "logits/chosen": -0.4889827072620392, "logits/rejected": -0.7671954035758972, "logps/chosen": -73.71995544433594, "logps/rejected": -117.67768859863281, "loss": 0.6508, "rewards/accuracies": 0.875, "rewards/chosen": 0.4487582743167877, "rewards/margins": 1.2468342781066895, "rewards/rejected": -0.7980759739875793, "step": 511 }, { "epoch": 0.027518744457284135, "grad_norm": 7.399710655212402, "learning_rate": 6.693689601226458e-07, "logits/chosen": -0.35869088768959045, "logits/rejected": -0.7353401184082031, "logps/chosen": -74.69718933105469, "logps/rejected": -113.75350952148438, "loss": 0.7636, "rewards/accuracies": 0.875, "rewards/chosen": 0.26156359910964966, "rewards/margins": 1.188791036605835, "rewards/rejected": -0.9272273778915405, "step": 512 }, { "epoch": 0.027572492005052268, "grad_norm": 8.441719055175781, "learning_rate": 6.67520243861662e-07, "logits/chosen": -0.43929147720336914, "logits/rejected": -0.7573917508125305, "logps/chosen": -84.00564575195312, "logps/rejected": -112.37773132324219, "loss": 0.9129, "rewards/accuracies": 0.875, "rewards/chosen": 0.050905391573905945, "rewards/margins": 0.6889047622680664, "rewards/rejected": -0.6379993557929993, "step": 513 }, { "epoch": 0.0276262395528204, "grad_norm": 12.233419418334961, "learning_rate": 6.656689442312853e-07, "logits/chosen": -0.5240112543106079, "logits/rejected": -0.7147119045257568, "logps/chosen": -102.17632293701172, "logps/rejected": -125.13505554199219, "loss": 1.1838, "rewards/accuracies": 0.75, "rewards/chosen": -0.07867565006017685, "rewards/margins": 0.51581871509552, "rewards/rejected": -0.5944943428039551, "step": 514 }, { "epoch": 0.027679987100588537, "grad_norm": 6.995861053466797, "learning_rate": 6.638150897808467e-07, "logits/chosen": -0.5730359554290771, "logits/rejected": -0.6559311151504517, "logps/chosen": -102.10371398925781, "logps/rejected": -140.79962158203125, "loss": 0.5354, "rewards/accuracies": 0.875, "rewards/chosen": 0.27145683765411377, "rewards/margins": 1.399881362915039, "rewards/rejected": -1.1284246444702148, "step": 515 }, { "epoch": 0.02773373464835667, "grad_norm": 8.816736221313477, "learning_rate": 6.619587090990747e-07, "logits/chosen": -0.6292909383773804, "logits/rejected": -0.8012309670448303, "logps/chosen": -83.28184509277344, "logps/rejected": -124.6097640991211, "loss": 0.7389, "rewards/accuracies": 0.875, "rewards/chosen": 0.0721002072095871, "rewards/margins": 1.0362735986709595, "rewards/rejected": -0.9641733169555664, "step": 516 }, { "epoch": 0.027787482196124803, "grad_norm": 7.807778358459473, "learning_rate": 6.600998308136559e-07, "logits/chosen": -0.44258570671081543, "logits/rejected": -0.780799925327301, "logps/chosen": -89.3375244140625, "logps/rejected": -126.86802673339844, "loss": 0.8476, "rewards/accuracies": 0.75, "rewards/chosen": 0.0875643789768219, "rewards/margins": 0.9324407577514648, "rewards/rejected": -0.8448764085769653, "step": 517 }, { "epoch": 0.027841229743892935, "grad_norm": 7.325178623199463, "learning_rate": 6.58238483590793e-07, "logits/chosen": -0.4097304344177246, "logits/rejected": -0.6198008060455322, "logps/chosen": -95.20932006835938, "logps/rejected": -120.3585205078125, "loss": 0.7836, "rewards/accuracies": 0.75, "rewards/chosen": 0.2402416318655014, "rewards/margins": 0.9379483461380005, "rewards/rejected": -0.6977066993713379, "step": 518 }, { "epoch": 0.027894977291661068, "grad_norm": 8.757471084594727, "learning_rate": 6.563746961347629e-07, "logits/chosen": -0.24775367975234985, "logits/rejected": -0.7508536577224731, "logps/chosen": -85.83599090576172, "logps/rejected": -114.0818862915039, "loss": 0.9164, "rewards/accuracies": 0.875, "rewards/chosen": 0.12654882669448853, "rewards/margins": 0.625817060470581, "rewards/rejected": -0.4992683231830597, "step": 519 }, { "epoch": 0.0279487248394292, "grad_norm": 8.329408645629883, "learning_rate": 6.545084971874736e-07, "logits/chosen": -0.43366628885269165, "logits/rejected": -0.6247124075889587, "logps/chosen": -48.23228454589844, "logps/rejected": -65.39488220214844, "loss": 1.0962, "rewards/accuracies": 0.625, "rewards/chosen": 0.27831003069877625, "rewards/margins": 0.4701734185218811, "rewards/rejected": -0.19186341762542725, "step": 520 }, { "epoch": 0.028002472387197334, "grad_norm": 9.544438362121582, "learning_rate": 6.526399155280218e-07, "logits/chosen": -0.6073004007339478, "logits/rejected": -0.8793046474456787, "logps/chosen": -99.48709106445312, "logps/rejected": -151.00140380859375, "loss": 0.811, "rewards/accuracies": 0.875, "rewards/chosen": 0.009234759956598282, "rewards/margins": 0.9905223846435547, "rewards/rejected": -0.9812875986099243, "step": 521 }, { "epoch": 0.028056219934965466, "grad_norm": 10.85853099822998, "learning_rate": 6.507689799722478e-07, "logits/chosen": -0.6524690985679626, "logits/rejected": -0.7320886254310608, "logps/chosen": -114.58345794677734, "logps/rejected": -125.41224670410156, "loss": 0.8221, "rewards/accuracies": 1.0, "rewards/chosen": -0.04554262384772301, "rewards/margins": 0.7512316107749939, "rewards/rejected": -0.7967742085456848, "step": 522 }, { "epoch": 0.0281099674827336, "grad_norm": 9.901765823364258, "learning_rate": 6.488957193722927e-07, "logits/chosen": -0.3448493778705597, "logits/rejected": -0.5513025522232056, "logps/chosen": -94.8276596069336, "logps/rejected": -94.26264953613281, "loss": 0.9443, "rewards/accuracies": 1.0, "rewards/chosen": -0.03915099427103996, "rewards/margins": 0.514700174331665, "rewards/rejected": -0.5538511276245117, "step": 523 }, { "epoch": 0.028163715030501732, "grad_norm": 8.004623413085938, "learning_rate": 6.470201626161519e-07, "logits/chosen": -0.5676316022872925, "logits/rejected": -0.7968378067016602, "logps/chosen": -92.10704040527344, "logps/rejected": -121.9271011352539, "loss": 0.6571, "rewards/accuracies": 0.875, "rewards/chosen": 0.29392558336257935, "rewards/margins": 1.178324580192566, "rewards/rejected": -0.8843989372253418, "step": 524 }, { "epoch": 0.028217462578269865, "grad_norm": 8.33222484588623, "learning_rate": 6.451423386272311e-07, "logits/chosen": -0.32439637184143066, "logits/rejected": -0.9078592658042908, "logps/chosen": -90.86624145507812, "logps/rejected": -148.4439697265625, "loss": 0.9009, "rewards/accuracies": 0.75, "rewards/chosen": 0.024992510676383972, "rewards/margins": 0.8378036022186279, "rewards/rejected": -0.8128111362457275, "step": 525 }, { "epoch": 0.028271210126038, "grad_norm": 6.703718662261963, "learning_rate": 6.432622763638992e-07, "logits/chosen": -0.5668975114822388, "logits/rejected": -0.8486053943634033, "logps/chosen": -82.85316467285156, "logps/rejected": -141.0728759765625, "loss": 0.5605, "rewards/accuracies": 1.0, "rewards/chosen": 0.20304732024669647, "rewards/margins": 1.4600908756256104, "rewards/rejected": -1.2570436000823975, "step": 526 }, { "epoch": 0.028324957673806134, "grad_norm": 7.9669270515441895, "learning_rate": 6.413800048190417e-07, "logits/chosen": -0.5802762508392334, "logits/rejected": -0.5580863952636719, "logps/chosen": -105.87008666992188, "logps/rejected": -140.19677734375, "loss": 0.6581, "rewards/accuracies": 1.0, "rewards/chosen": -0.17872926592826843, "rewards/margins": 1.1024322509765625, "rewards/rejected": -1.2811615467071533, "step": 527 }, { "epoch": 0.028378705221574266, "grad_norm": 8.072198867797852, "learning_rate": 6.394955530196147e-07, "logits/chosen": -0.5197033882141113, "logits/rejected": -1.1893115043640137, "logps/chosen": -88.66725158691406, "logps/rejected": -156.08770751953125, "loss": 0.6991, "rewards/accuracies": 0.875, "rewards/chosen": 0.16271986067295074, "rewards/margins": 1.2934575080871582, "rewards/rejected": -1.130737543106079, "step": 528 }, { "epoch": 0.0284324527693424, "grad_norm": 10.940951347351074, "learning_rate": 6.376089500261958e-07, "logits/chosen": -0.7141159772872925, "logits/rejected": -0.5300204753875732, "logps/chosen": -80.11769104003906, "logps/rejected": -117.72384643554688, "loss": 1.0192, "rewards/accuracies": 0.875, "rewards/chosen": -0.15119963884353638, "rewards/margins": 0.44411033391952515, "rewards/rejected": -0.5953099727630615, "step": 529 }, { "epoch": 0.028486200317110532, "grad_norm": 6.179901599884033, "learning_rate": 6.357202249325371e-07, "logits/chosen": -0.5417790412902832, "logits/rejected": -0.6387176513671875, "logps/chosen": -66.44081115722656, "logps/rejected": -115.05770874023438, "loss": 0.5545, "rewards/accuracies": 1.0, "rewards/chosen": 0.36180976033210754, "rewards/margins": 1.2832229137420654, "rewards/rejected": -0.9214131832122803, "step": 530 }, { "epoch": 0.028539947864878665, "grad_norm": 9.274606704711914, "learning_rate": 6.338294068651162e-07, "logits/chosen": -0.4485885798931122, "logits/rejected": -0.8264050483703613, "logps/chosen": -71.27490234375, "logps/rejected": -116.75479125976562, "loss": 0.7051, "rewards/accuracies": 1.0, "rewards/chosen": 0.24189621210098267, "rewards/margins": 0.9592739343643188, "rewards/rejected": -0.717377781867981, "step": 531 }, { "epoch": 0.028593695412646798, "grad_norm": 10.440709114074707, "learning_rate": 6.319365249826864e-07, "logits/chosen": -0.5603377819061279, "logits/rejected": -0.8800411224365234, "logps/chosen": -83.66847229003906, "logps/rejected": -107.90161895751953, "loss": 0.8062, "rewards/accuracies": 0.75, "rewards/chosen": -0.0784912183880806, "rewards/margins": 1.2740273475646973, "rewards/rejected": -1.3525185585021973, "step": 532 }, { "epoch": 0.02864744296041493, "grad_norm": 13.19996452331543, "learning_rate": 6.300416084758283e-07, "logits/chosen": -0.5516321659088135, "logits/rejected": -0.7079360485076904, "logps/chosen": -73.462646484375, "logps/rejected": -100.67831420898438, "loss": 1.0407, "rewards/accuracies": 0.625, "rewards/chosen": 0.17155586183071136, "rewards/margins": 0.6954343318939209, "rewards/rejected": -0.5238784551620483, "step": 533 }, { "epoch": 0.028701190508183063, "grad_norm": 8.233198165893555, "learning_rate": 6.281446865664984e-07, "logits/chosen": -0.5119665861129761, "logits/rejected": -0.49625471234321594, "logps/chosen": -76.94454193115234, "logps/rejected": -111.19963836669922, "loss": 0.8029, "rewards/accuracies": 0.875, "rewards/chosen": 0.11132638156414032, "rewards/margins": 0.788280189037323, "rewards/rejected": -0.6769537925720215, "step": 534 }, { "epoch": 0.028754938055951196, "grad_norm": 7.799862861633301, "learning_rate": 6.262457885075789e-07, "logits/chosen": -0.5139655470848083, "logits/rejected": -0.8026143312454224, "logps/chosen": -76.98152160644531, "logps/rejected": -130.43460083007812, "loss": 0.6865, "rewards/accuracies": 1.0, "rewards/chosen": 0.21863514184951782, "rewards/margins": 1.1381441354751587, "rewards/rejected": -0.9195089936256409, "step": 535 }, { "epoch": 0.028808685603719332, "grad_norm": 8.220873832702637, "learning_rate": 6.243449435824276e-07, "logits/chosen": -0.5876587629318237, "logits/rejected": -0.7707182765007019, "logps/chosen": -72.68904113769531, "logps/rejected": -105.83596801757812, "loss": 0.8598, "rewards/accuracies": 0.875, "rewards/chosen": 0.008280038833618164, "rewards/margins": 0.670197606086731, "rewards/rejected": -0.6619175672531128, "step": 536 }, { "epoch": 0.028862433151487465, "grad_norm": 7.4581403732299805, "learning_rate": 6.224421811044237e-07, "logits/chosen": -0.5335758924484253, "logits/rejected": -0.9039856195449829, "logps/chosen": -95.86715698242188, "logps/rejected": -124.1130599975586, "loss": 0.7235, "rewards/accuracies": 0.875, "rewards/chosen": 0.10294431447982788, "rewards/margins": 1.006281852722168, "rewards/rejected": -0.9033374786376953, "step": 537 }, { "epoch": 0.028916180699255598, "grad_norm": 8.115388870239258, "learning_rate": 6.205375304165194e-07, "logits/chosen": -0.5914067625999451, "logits/rejected": -0.6609340906143188, "logps/chosen": -81.40691375732422, "logps/rejected": -128.21279907226562, "loss": 0.6707, "rewards/accuracies": 1.0, "rewards/chosen": 0.10149925202131271, "rewards/margins": 1.0724294185638428, "rewards/rejected": -0.970930278301239, "step": 538 }, { "epoch": 0.02896992824702373, "grad_norm": 7.014781475067139, "learning_rate": 6.186310208907839e-07, "logits/chosen": -0.4542465806007385, "logits/rejected": -0.5989331007003784, "logps/chosen": -71.59950256347656, "logps/rejected": -110.37400817871094, "loss": 0.7337, "rewards/accuracies": 0.875, "rewards/chosen": 0.36936455965042114, "rewards/margins": 1.0033270120620728, "rewards/rejected": -0.6339624524116516, "step": 539 }, { "epoch": 0.029023675794791863, "grad_norm": 8.688709259033203, "learning_rate": 6.167226819279527e-07, "logits/chosen": -0.4876874089241028, "logits/rejected": -0.7821437120437622, "logps/chosen": -58.55912780761719, "logps/rejected": -96.41452026367188, "loss": 0.9596, "rewards/accuracies": 0.75, "rewards/chosen": 0.006934314966201782, "rewards/margins": 0.5833888053894043, "rewards/rejected": -0.5764545202255249, "step": 540 }, { "epoch": 0.029077423342559996, "grad_norm": 6.381515026092529, "learning_rate": 6.148125429569734e-07, "logits/chosen": -0.6537259221076965, "logits/rejected": -0.8862186670303345, "logps/chosen": -133.06796264648438, "logps/rejected": -162.06036376953125, "loss": 0.4963, "rewards/accuracies": 1.0, "rewards/chosen": 0.2114505171775818, "rewards/margins": 1.3505122661590576, "rewards/rejected": -1.139061689376831, "step": 541 }, { "epoch": 0.02913117089032813, "grad_norm": 8.661445617675781, "learning_rate": 6.129006334345519e-07, "logits/chosen": -0.5483713150024414, "logits/rejected": -0.7865151762962341, "logps/chosen": -90.5474853515625, "logps/rejected": -101.54081726074219, "loss": 0.8369, "rewards/accuracies": 0.875, "rewards/chosen": 0.23122593760490417, "rewards/margins": 0.8454281091690063, "rewards/rejected": -0.6142022013664246, "step": 542 }, { "epoch": 0.02918491843809626, "grad_norm": 6.230299949645996, "learning_rate": 6.109869828446979e-07, "logits/chosen": -0.6602153778076172, "logits/rejected": -0.80539870262146, "logps/chosen": -101.82976531982422, "logps/rejected": -146.61972045898438, "loss": 0.51, "rewards/accuracies": 1.0, "rewards/chosen": 0.28992050886154175, "rewards/margins": 1.4114011526107788, "rewards/rejected": -1.1214807033538818, "step": 543 }, { "epoch": 0.029238665985864394, "grad_norm": 6.932173252105713, "learning_rate": 6.090716206982713e-07, "logits/chosen": -0.4665969908237457, "logits/rejected": -0.5309183597564697, "logps/chosen": -72.36742401123047, "logps/rejected": -102.4378662109375, "loss": 0.7081, "rewards/accuracies": 1.0, "rewards/chosen": 0.20157422125339508, "rewards/margins": 0.9442340135574341, "rewards/rejected": -0.7426597476005554, "step": 544 }, { "epoch": 0.029292413533632527, "grad_norm": 7.576010704040527, "learning_rate": 6.071545765325253e-07, "logits/chosen": -0.6146838068962097, "logits/rejected": -0.8233847618103027, "logps/chosen": -83.06092834472656, "logps/rejected": -101.1820297241211, "loss": 0.8119, "rewards/accuracies": 0.875, "rewards/chosen": 0.31339800357818604, "rewards/margins": 0.7695042490959167, "rewards/rejected": -0.4561062455177307, "step": 545 }, { "epoch": 0.02934616108140066, "grad_norm": 6.8824920654296875, "learning_rate": 6.052358799106527e-07, "logits/chosen": -0.4143807590007782, "logits/rejected": -0.7886111736297607, "logps/chosen": -74.7923583984375, "logps/rejected": -125.47184753417969, "loss": 0.6369, "rewards/accuracies": 0.875, "rewards/chosen": 0.31427788734436035, "rewards/margins": 1.2023985385894775, "rewards/rejected": -0.888120710849762, "step": 546 }, { "epoch": 0.029399908629168796, "grad_norm": 8.437735557556152, "learning_rate": 6.03315560421329e-07, "logits/chosen": -0.7289080023765564, "logits/rejected": -0.6920559406280518, "logps/chosen": -90.01176452636719, "logps/rejected": -120.0971908569336, "loss": 0.819, "rewards/accuracies": 0.875, "rewards/chosen": 0.1726226806640625, "rewards/margins": 0.9787912368774414, "rewards/rejected": -0.8061685562133789, "step": 547 }, { "epoch": 0.02945365617693693, "grad_norm": 6.750065326690674, "learning_rate": 6.013936476782561e-07, "logits/chosen": -0.39846155047416687, "logits/rejected": -0.6640790104866028, "logps/chosen": -91.8084945678711, "logps/rejected": -135.5098114013672, "loss": 0.6036, "rewards/accuracies": 1.0, "rewards/chosen": 0.17562279105186462, "rewards/margins": 1.292417049407959, "rewards/rejected": -1.116794228553772, "step": 548 }, { "epoch": 0.02950740372470506, "grad_norm": 8.024323463439941, "learning_rate": 5.994701713197063e-07, "logits/chosen": -0.5355954170227051, "logits/rejected": -0.6985470056533813, "logps/chosen": -86.89678955078125, "logps/rejected": -123.79656982421875, "loss": 0.6858, "rewards/accuracies": 0.875, "rewards/chosen": 0.03934570401906967, "rewards/margins": 1.2484444379806519, "rewards/rejected": -1.2090985774993896, "step": 549 }, { "epoch": 0.029561151272473194, "grad_norm": 8.410124778747559, "learning_rate": 5.975451610080642e-07, "logits/chosen": -0.4546431303024292, "logits/rejected": -0.7805744409561157, "logps/chosen": -93.40823364257812, "logps/rejected": -167.27215576171875, "loss": 0.5314, "rewards/accuracies": 0.875, "rewards/chosen": 0.3699999451637268, "rewards/margins": 1.4522724151611328, "rewards/rejected": -1.0822725296020508, "step": 550 }, { "epoch": 0.029614898820241327, "grad_norm": 7.814667224884033, "learning_rate": 5.956186464293703e-07, "logits/chosen": -0.7592798471450806, "logits/rejected": -0.736774206161499, "logps/chosen": -94.76161193847656, "logps/rejected": -142.85443115234375, "loss": 0.5821, "rewards/accuracies": 1.0, "rewards/chosen": 0.3468934893608093, "rewards/margins": 1.3956459760665894, "rewards/rejected": -1.0487524271011353, "step": 551 }, { "epoch": 0.02966864636800946, "grad_norm": 10.56902027130127, "learning_rate": 5.936906572928624e-07, "logits/chosen": -0.5512069463729858, "logits/rejected": -0.5848386287689209, "logps/chosen": -74.49495697021484, "logps/rejected": -96.79032897949219, "loss": 1.1729, "rewards/accuracies": 0.5, "rewards/chosen": 0.05541644245386124, "rewards/margins": 0.4656200408935547, "rewards/rejected": -0.41020357608795166, "step": 552 }, { "epoch": 0.029722393915777592, "grad_norm": 6.851023197174072, "learning_rate": 5.917612233305182e-07, "logits/chosen": -0.6474483013153076, "logits/rejected": -0.7154521942138672, "logps/chosen": -87.47154235839844, "logps/rejected": -136.83880615234375, "loss": 0.6294, "rewards/accuracies": 1.0, "rewards/chosen": -0.03645167499780655, "rewards/margins": 1.201732873916626, "rewards/rejected": -1.2381844520568848, "step": 553 }, { "epoch": 0.029776141463545725, "grad_norm": 7.131008148193359, "learning_rate": 5.898303742965963e-07, "logits/chosen": -0.5907453298568726, "logits/rejected": -0.670932412147522, "logps/chosen": -80.09772491455078, "logps/rejected": -166.36648559570312, "loss": 0.586, "rewards/accuracies": 0.875, "rewards/chosen": 0.0918668806552887, "rewards/margins": 1.4034724235534668, "rewards/rejected": -1.3116055727005005, "step": 554 }, { "epoch": 0.029829889011313858, "grad_norm": 6.4502716064453125, "learning_rate": 5.878981399671773e-07, "logits/chosen": -0.4208954870700836, "logits/rejected": -0.7615500688552856, "logps/chosen": -88.842041015625, "logps/rejected": -149.52590942382812, "loss": 0.542, "rewards/accuracies": 1.0, "rewards/chosen": 0.307108610868454, "rewards/margins": 1.4261794090270996, "rewards/rejected": -1.1190707683563232, "step": 555 }, { "epoch": 0.02988363655908199, "grad_norm": 9.677691459655762, "learning_rate": 5.859645501397047e-07, "logits/chosen": -0.8661776781082153, "logits/rejected": -1.0205659866333008, "logps/chosen": -105.76343536376953, "logps/rejected": -116.16854858398438, "loss": 0.7916, "rewards/accuracies": 0.75, "rewards/chosen": -0.0804298147559166, "rewards/margins": 0.9402463436126709, "rewards/rejected": -1.0206761360168457, "step": 556 }, { "epoch": 0.029937384106850123, "grad_norm": 10.660415649414062, "learning_rate": 5.84029634632526e-07, "logits/chosen": -0.2515828013420105, "logits/rejected": -0.6010385155677795, "logps/chosen": -77.72097778320312, "logps/rejected": -123.20417022705078, "loss": 0.9992, "rewards/accuracies": 0.75, "rewards/chosen": 0.05886353924870491, "rewards/margins": 0.7671805024147034, "rewards/rejected": -0.7083170413970947, "step": 557 }, { "epoch": 0.02999113165461826, "grad_norm": 8.077925682067871, "learning_rate": 5.820934232844314e-07, "logits/chosen": -0.3048606514930725, "logits/rejected": -0.48728615045547485, "logps/chosen": -76.05445861816406, "logps/rejected": -126.6664047241211, "loss": 0.641, "rewards/accuracies": 0.875, "rewards/chosen": 0.28463315963745117, "rewards/margins": 1.2918307781219482, "rewards/rejected": -1.007197618484497, "step": 558 }, { "epoch": 0.030044879202386392, "grad_norm": 8.066760063171387, "learning_rate": 5.801559459541956e-07, "logits/chosen": -0.44821956753730774, "logits/rejected": -0.7278016805648804, "logps/chosen": -101.71133422851562, "logps/rejected": -169.93521118164062, "loss": 0.5734, "rewards/accuracies": 1.0, "rewards/chosen": -0.16155366599559784, "rewards/margins": 1.430591106414795, "rewards/rejected": -1.5921447277069092, "step": 559 }, { "epoch": 0.030098626750154525, "grad_norm": 7.547898769378662, "learning_rate": 5.782172325201155e-07, "logits/chosen": -0.46802911162376404, "logits/rejected": -0.4323127269744873, "logps/chosen": -78.06983947753906, "logps/rejected": -119.07838439941406, "loss": 0.7212, "rewards/accuracies": 1.0, "rewards/chosen": 0.13490314781665802, "rewards/margins": 0.8903651237487793, "rewards/rejected": -0.7554619908332825, "step": 560 }, { "epoch": 0.030152374297922658, "grad_norm": 7.6085205078125, "learning_rate": 5.762773128795505e-07, "logits/chosen": -0.33417779207229614, "logits/rejected": -0.7516422271728516, "logps/chosen": -64.55521392822266, "logps/rejected": -88.79312133789062, "loss": 0.735, "rewards/accuracies": 0.875, "rewards/chosen": 0.2816033959388733, "rewards/margins": 1.0340321063995361, "rewards/rejected": -0.7524287700653076, "step": 561 }, { "epoch": 0.03020612184569079, "grad_norm": 8.935864448547363, "learning_rate": 5.743362169484616e-07, "logits/chosen": -0.6179488897323608, "logits/rejected": -0.9362776875495911, "logps/chosen": -84.29489135742188, "logps/rejected": -118.15515899658203, "loss": 0.7456, "rewards/accuracies": 1.0, "rewards/chosen": -0.050969645380973816, "rewards/margins": 0.9041745662689209, "rewards/rejected": -0.9551441669464111, "step": 562 }, { "epoch": 0.030259869393458923, "grad_norm": 11.490570068359375, "learning_rate": 5.723939746609489e-07, "logits/chosen": -0.638555645942688, "logits/rejected": -0.6851341724395752, "logps/chosen": -94.73426055908203, "logps/rejected": -120.36894226074219, "loss": 0.9949, "rewards/accuracies": 0.875, "rewards/chosen": -0.09476185590028763, "rewards/margins": 0.5439042448997498, "rewards/rejected": -0.6386661529541016, "step": 563 }, { "epoch": 0.030313616941227056, "grad_norm": 13.03736400604248, "learning_rate": 5.704506159687913e-07, "logits/chosen": -0.5972259640693665, "logits/rejected": -0.816208004951477, "logps/chosen": -83.51612854003906, "logps/rejected": -115.3983154296875, "loss": 0.8485, "rewards/accuracies": 0.875, "rewards/chosen": -0.050669148564338684, "rewards/margins": 0.8579112887382507, "rewards/rejected": -0.9085804224014282, "step": 564 }, { "epoch": 0.03036736448899519, "grad_norm": 6.1555399894714355, "learning_rate": 5.68506170840984e-07, "logits/chosen": -0.546023964881897, "logits/rejected": -0.9968788623809814, "logps/chosen": -94.26448059082031, "logps/rejected": -107.03096771240234, "loss": 0.5167, "rewards/accuracies": 1.0, "rewards/chosen": 0.39440011978149414, "rewards/margins": 1.4650413990020752, "rewards/rejected": -1.070641279220581, "step": 565 }, { "epoch": 0.03042111203676332, "grad_norm": 8.703760147094727, "learning_rate": 5.665606692632762e-07, "logits/chosen": -0.7715000510215759, "logits/rejected": -0.6901755332946777, "logps/chosen": -103.18514251708984, "logps/rejected": -130.1761016845703, "loss": 0.6745, "rewards/accuracies": 1.0, "rewards/chosen": 0.13113507628440857, "rewards/margins": 1.0377771854400635, "rewards/rejected": -0.9066420793533325, "step": 566 }, { "epoch": 0.030474859584531454, "grad_norm": 6.566905498504639, "learning_rate": 5.646141412377088e-07, "logits/chosen": -0.6601220369338989, "logits/rejected": -0.6915298104286194, "logps/chosen": -84.2286605834961, "logps/rejected": -132.83633422851562, "loss": 0.5053, "rewards/accuracies": 1.0, "rewards/chosen": 0.03054036945104599, "rewards/margins": 1.6031171083450317, "rewards/rejected": -1.5725767612457275, "step": 567 }, { "epoch": 0.030528607132299587, "grad_norm": 9.918318748474121, "learning_rate": 5.626666167821521e-07, "logits/chosen": -0.618608832359314, "logits/rejected": -0.8530561923980713, "logps/chosen": -104.23225402832031, "logps/rejected": -133.61318969726562, "loss": 0.8032, "rewards/accuracies": 1.0, "rewards/chosen": -0.21050767600536346, "rewards/margins": 0.7586933374404907, "rewards/rejected": -0.9692009687423706, "step": 568 }, { "epoch": 0.030582354680067723, "grad_norm": 9.725150108337402, "learning_rate": 5.607181259298424e-07, "logits/chosen": -0.558419406414032, "logits/rejected": -0.8098447322845459, "logps/chosen": -89.92630004882812, "logps/rejected": -129.73960876464844, "loss": 0.6731, "rewards/accuracies": 0.875, "rewards/chosen": 0.40352946519851685, "rewards/margins": 1.1149446964263916, "rewards/rejected": -0.7114152908325195, "step": 569 }, { "epoch": 0.030636102227835856, "grad_norm": 10.68249797821045, "learning_rate": 5.587686987289189e-07, "logits/chosen": -0.6082774996757507, "logits/rejected": -0.6101176738739014, "logps/chosen": -66.2979736328125, "logps/rejected": -95.95420837402344, "loss": 0.7854, "rewards/accuracies": 0.875, "rewards/chosen": 0.002001184970140457, "rewards/margins": 0.8938198685646057, "rewards/rejected": -0.891818642616272, "step": 570 }, { "epoch": 0.03068984977560399, "grad_norm": 3.949777841567993, "learning_rate": 5.568183652419606e-07, "logits/chosen": -0.4548388421535492, "logits/rejected": -0.8108513355255127, "logps/chosen": -74.27210998535156, "logps/rejected": -130.0594940185547, "loss": 0.319, "rewards/accuracies": 1.0, "rewards/chosen": 0.6271109580993652, "rewards/margins": 1.964177131652832, "rewards/rejected": -1.3370661735534668, "step": 571 }, { "epoch": 0.03074359732337212, "grad_norm": 7.037446975708008, "learning_rate": 5.548671555455225e-07, "logits/chosen": -0.7098451256752014, "logits/rejected": -0.7444288730621338, "logps/chosen": -84.98136901855469, "logps/rejected": -128.97634887695312, "loss": 0.5195, "rewards/accuracies": 1.0, "rewards/chosen": 0.2665165066719055, "rewards/margins": 1.3006370067596436, "rewards/rejected": -1.0341204404830933, "step": 572 }, { "epoch": 0.030797344871140254, "grad_norm": 6.503019332885742, "learning_rate": 5.529150997296724e-07, "logits/chosen": -0.5219157934188843, "logits/rejected": -1.139692783355713, "logps/chosen": -71.88064575195312, "logps/rejected": -150.67544555664062, "loss": 0.6171, "rewards/accuracies": 1.0, "rewards/chosen": 0.02127566933631897, "rewards/margins": 1.3662441968917847, "rewards/rejected": -1.344968557357788, "step": 573 }, { "epoch": 0.030851092418908387, "grad_norm": 8.26797866821289, "learning_rate": 5.50962227897525e-07, "logits/chosen": -0.5375006794929504, "logits/rejected": -0.7421151399612427, "logps/chosen": -93.94708251953125, "logps/rejected": -133.82107543945312, "loss": 0.5701, "rewards/accuracies": 0.875, "rewards/chosen": 0.24896803498268127, "rewards/margins": 1.3501321077346802, "rewards/rejected": -1.1011641025543213, "step": 574 }, { "epoch": 0.03090483996667652, "grad_norm": 7.518233776092529, "learning_rate": 5.490085701647804e-07, "logits/chosen": -0.5939725041389465, "logits/rejected": -0.8577125072479248, "logps/chosen": -80.7973403930664, "logps/rejected": -108.13873291015625, "loss": 0.7614, "rewards/accuracies": 1.0, "rewards/chosen": 0.22722667455673218, "rewards/margins": 1.0195845365524292, "rewards/rejected": -0.7923578023910522, "step": 575 }, { "epoch": 0.030958587514444653, "grad_norm": 9.990931510925293, "learning_rate": 5.470541566592572e-07, "logits/chosen": -0.48619240522384644, "logits/rejected": -0.6344749331474304, "logps/chosen": -83.81146240234375, "logps/rejected": -112.74518585205078, "loss": 0.88, "rewards/accuracies": 0.75, "rewards/chosen": 0.2929634153842926, "rewards/margins": 0.8759169578552246, "rewards/rejected": -0.5829535722732544, "step": 576 }, { "epoch": 0.031012335062212786, "grad_norm": 10.61060619354248, "learning_rate": 5.450990175204295e-07, "logits/chosen": -0.5865435004234314, "logits/rejected": -0.8856855630874634, "logps/chosen": -85.54669952392578, "logps/rejected": -113.70957946777344, "loss": 0.8479, "rewards/accuracies": 1.0, "rewards/chosen": -0.30616140365600586, "rewards/margins": 0.7420408725738525, "rewards/rejected": -1.0482022762298584, "step": 577 }, { "epoch": 0.03106608260998092, "grad_norm": 11.252848625183105, "learning_rate": 5.431431828989618e-07, "logits/chosen": -0.7227391004562378, "logits/rejected": -0.7543503046035767, "logps/chosen": -85.05533599853516, "logps/rejected": -114.85171508789062, "loss": 0.9148, "rewards/accuracies": 0.875, "rewards/chosen": -0.20511125028133392, "rewards/margins": 0.733572244644165, "rewards/rejected": -0.9386835694313049, "step": 578 }, { "epoch": 0.03111983015774905, "grad_norm": 7.835850238800049, "learning_rate": 5.411866829562428e-07, "logits/chosen": -0.8998408913612366, "logits/rejected": -0.8534300327301025, "logps/chosen": -97.14753723144531, "logps/rejected": -133.99685668945312, "loss": 0.5435, "rewards/accuracies": 1.0, "rewards/chosen": 0.24064484238624573, "rewards/margins": 1.4192345142364502, "rewards/rejected": -1.1785895824432373, "step": 579 }, { "epoch": 0.031173577705517187, "grad_norm": 6.718521595001221, "learning_rate": 5.392295478639225e-07, "logits/chosen": -0.6414810419082642, "logits/rejected": -0.43935221433639526, "logps/chosen": -73.19677734375, "logps/rejected": -93.02281951904297, "loss": 0.7361, "rewards/accuracies": 1.0, "rewards/chosen": 0.33040347695350647, "rewards/margins": 0.8903957009315491, "rewards/rejected": -0.5599921941757202, "step": 580 }, { "epoch": 0.03122732525328532, "grad_norm": 8.148469924926758, "learning_rate": 5.372718078034449e-07, "logits/chosen": -0.7383481860160828, "logits/rejected": -0.8980106115341187, "logps/chosen": -71.28256225585938, "logps/rejected": -99.52809143066406, "loss": 0.7706, "rewards/accuracies": 1.0, "rewards/chosen": 0.0670764371752739, "rewards/margins": 0.8876625895500183, "rewards/rejected": -0.8205861449241638, "step": 581 }, { "epoch": 0.03128107280105345, "grad_norm": 7.468225955963135, "learning_rate": 5.353134929655834e-07, "logits/chosen": -0.37122729420661926, "logits/rejected": -0.5954714417457581, "logps/chosen": -64.05928039550781, "logps/rejected": -126.9996109008789, "loss": 0.6821, "rewards/accuracies": 1.0, "rewards/chosen": 0.2693803906440735, "rewards/margins": 1.113133430480957, "rewards/rejected": -0.8437530994415283, "step": 582 }, { "epoch": 0.03133482034882158, "grad_norm": 7.793248176574707, "learning_rate": 5.333546335499755e-07, "logits/chosen": -0.6649636030197144, "logits/rejected": -0.7730270624160767, "logps/chosen": -99.165283203125, "logps/rejected": -134.04574584960938, "loss": 0.6685, "rewards/accuracies": 0.875, "rewards/chosen": -0.1829148381948471, "rewards/margins": 1.0466349124908447, "rewards/rejected": -1.2295498847961426, "step": 583 }, { "epoch": 0.03138856789658972, "grad_norm": 8.285371780395508, "learning_rate": 5.313952597646567e-07, "logits/chosen": -0.6731595993041992, "logits/rejected": -0.7298198342323303, "logps/chosen": -86.544189453125, "logps/rejected": -115.92572784423828, "loss": 0.638, "rewards/accuracies": 1.0, "rewards/chosen": -0.06069307029247284, "rewards/margins": 1.0392494201660156, "rewards/rejected": -1.0999424457550049, "step": 584 }, { "epoch": 0.031442315444357855, "grad_norm": 7.041245937347412, "learning_rate": 5.294354018255944e-07, "logits/chosen": -0.4457075595855713, "logits/rejected": -0.6810296773910522, "logps/chosen": -83.15467071533203, "logps/rejected": -148.55947875976562, "loss": 0.5854, "rewards/accuracies": 1.0, "rewards/chosen": 0.2563842833042145, "rewards/margins": 1.1430201530456543, "rewards/rejected": -0.8866358399391174, "step": 585 }, { "epoch": 0.031496062992125984, "grad_norm": 7.595248222351074, "learning_rate": 5.274750899562229e-07, "logits/chosen": -0.5377132892608643, "logits/rejected": -0.855532169342041, "logps/chosen": -66.07292938232422, "logps/rejected": -125.84947204589844, "loss": 0.7194, "rewards/accuracies": 0.75, "rewards/chosen": 0.2829950451850891, "rewards/margins": 1.2425754070281982, "rewards/rejected": -0.9595802426338196, "step": 586 }, { "epoch": 0.03154981053989412, "grad_norm": 6.300678253173828, "learning_rate": 5.255143543869758e-07, "logits/chosen": -0.43991732597351074, "logits/rejected": -0.6710031628608704, "logps/chosen": -78.75753784179688, "logps/rejected": -104.96415710449219, "loss": 0.6533, "rewards/accuracies": 0.875, "rewards/chosen": -0.01320343092083931, "rewards/margins": 1.2475306987762451, "rewards/rejected": -1.2607340812683105, "step": 587 }, { "epoch": 0.03160355808766225, "grad_norm": 9.102422714233398, "learning_rate": 5.235532253548213e-07, "logits/chosen": -0.7858498096466064, "logits/rejected": -0.9402498602867126, "logps/chosen": -68.1700210571289, "logps/rejected": -115.27298736572266, "loss": 0.8257, "rewards/accuracies": 0.75, "rewards/chosen": 0.06970210373401642, "rewards/margins": 0.8710458874702454, "rewards/rejected": -0.8013437986373901, "step": 588 }, { "epoch": 0.031657305635430386, "grad_norm": 7.998831272125244, "learning_rate": 5.215917331027952e-07, "logits/chosen": -0.4016479253768921, "logits/rejected": -0.7438238859176636, "logps/chosen": -74.18819427490234, "logps/rejected": -128.78982543945312, "loss": 0.7301, "rewards/accuracies": 1.0, "rewards/chosen": 0.12778206169605255, "rewards/margins": 1.1362459659576416, "rewards/rejected": -1.0084638595581055, "step": 589 }, { "epoch": 0.031711053183198515, "grad_norm": 8.533957481384277, "learning_rate": 5.196299078795343e-07, "logits/chosen": -0.6668070554733276, "logits/rejected": -0.8017994165420532, "logps/chosen": -87.37496948242188, "logps/rejected": -147.50045776367188, "loss": 0.607, "rewards/accuracies": 1.0, "rewards/chosen": -0.07228381931781769, "rewards/margins": 1.337658166885376, "rewards/rejected": -1.4099419116973877, "step": 590 }, { "epoch": 0.03176480073096665, "grad_norm": 11.027036666870117, "learning_rate": 5.176677799388106e-07, "logits/chosen": -0.6404227614402771, "logits/rejected": -0.6293964385986328, "logps/chosen": -77.40564727783203, "logps/rejected": -101.88495635986328, "loss": 0.8877, "rewards/accuracies": 0.875, "rewards/chosen": -0.08750424534082413, "rewards/margins": 0.8471838235855103, "rewards/rejected": -0.934688150882721, "step": 591 }, { "epoch": 0.03181854827873478, "grad_norm": 9.106761932373047, "learning_rate": 5.157053795390641e-07, "logits/chosen": -0.678102970123291, "logits/rejected": -0.7294355630874634, "logps/chosen": -97.13756561279297, "logps/rejected": -136.22305297851562, "loss": 0.6568, "rewards/accuracies": 1.0, "rewards/chosen": -0.08323364704847336, "rewards/margins": 1.0866106748580933, "rewards/rejected": -1.169844388961792, "step": 592 }, { "epoch": 0.03187229582650292, "grad_norm": 7.921453952789307, "learning_rate": 5.137427369429367e-07, "logits/chosen": -0.6154822111129761, "logits/rejected": -0.8332656621932983, "logps/chosen": -75.92434692382812, "logps/rejected": -86.71163940429688, "loss": 0.8018, "rewards/accuracies": 0.875, "rewards/chosen": 0.33826619386672974, "rewards/margins": 0.8034090995788574, "rewards/rejected": -0.46514296531677246, "step": 593 }, { "epoch": 0.031926043374271046, "grad_norm": 5.999557018280029, "learning_rate": 5.117798824168051e-07, "logits/chosen": -0.40914344787597656, "logits/rejected": -0.6369484663009644, "logps/chosen": -78.53263854980469, "logps/rejected": -112.00936889648438, "loss": 0.5941, "rewards/accuracies": 1.0, "rewards/chosen": 0.13590237498283386, "rewards/margins": 1.173100233078003, "rewards/rejected": -1.0371978282928467, "step": 594 }, { "epoch": 0.03197979092203918, "grad_norm": 7.856106281280518, "learning_rate": 5.098168462303141e-07, "logits/chosen": -0.4048411250114441, "logits/rejected": -0.793762743473053, "logps/chosen": -96.40618896484375, "logps/rejected": -110.63007354736328, "loss": 0.7868, "rewards/accuracies": 0.75, "rewards/chosen": 0.00406607985496521, "rewards/margins": 1.0200872421264648, "rewards/rejected": -1.0160212516784668, "step": 595 }, { "epoch": 0.03203353846980732, "grad_norm": 9.297659873962402, "learning_rate": 5.078536586559103e-07, "logits/chosen": -0.590066134929657, "logits/rejected": -0.6968763470649719, "logps/chosen": -80.29603576660156, "logps/rejected": -113.13834381103516, "loss": 0.9084, "rewards/accuracies": 0.875, "rewards/chosen": 0.15965582430362701, "rewards/margins": 0.8447661399841309, "rewards/rejected": -0.685110330581665, "step": 596 }, { "epoch": 0.03208728601757545, "grad_norm": 9.334245681762695, "learning_rate": 5.058903499683746e-07, "logits/chosen": -0.4816211462020874, "logits/rejected": -0.7550541758537292, "logps/chosen": -71.4161148071289, "logps/rejected": -105.97187805175781, "loss": 0.7866, "rewards/accuracies": 0.875, "rewards/chosen": 0.09871931374073029, "rewards/margins": 1.150578260421753, "rewards/rejected": -1.051858901977539, "step": 597 }, { "epoch": 0.032141033565343584, "grad_norm": 7.658260345458984, "learning_rate": 5.039269504443556e-07, "logits/chosen": -0.48446959257125854, "logits/rejected": -0.5650824308395386, "logps/chosen": -96.09785461425781, "logps/rejected": -108.189453125, "loss": 0.6409, "rewards/accuracies": 1.0, "rewards/chosen": 0.09476885199546814, "rewards/margins": 1.0423463582992554, "rewards/rejected": -0.9475774765014648, "step": 598 }, { "epoch": 0.03219478111311171, "grad_norm": 7.740126132965088, "learning_rate": 5.01963490361903e-07, "logits/chosen": -0.4582410752773285, "logits/rejected": -0.6890443563461304, "logps/chosen": -73.97208404541016, "logps/rejected": -136.49221801757812, "loss": 0.6339, "rewards/accuracies": 1.0, "rewards/chosen": 0.1915815770626068, "rewards/margins": 1.1993786096572876, "rewards/rejected": -1.0077970027923584, "step": 599 }, { "epoch": 0.03224852866087985, "grad_norm": 6.905500888824463, "learning_rate": 5e-07, "logits/chosen": -0.4806552231311798, "logits/rejected": -0.7579709887504578, "logps/chosen": -94.65138244628906, "logps/rejected": -168.84481811523438, "loss": 0.5871, "rewards/accuracies": 0.875, "rewards/chosen": -0.12233862280845642, "rewards/margins": 1.4139548540115356, "rewards/rejected": -1.5362935066223145, "step": 600 }, { "epoch": 0.03230227620864798, "grad_norm": 6.0192975997924805, "learning_rate": 4.98036509638097e-07, "logits/chosen": -0.6011431217193604, "logits/rejected": -0.7359423041343689, "logps/chosen": -62.32979202270508, "logps/rejected": -101.39447021484375, "loss": 0.5647, "rewards/accuracies": 1.0, "rewards/chosen": 0.18788591027259827, "rewards/margins": 1.2624963521957397, "rewards/rejected": -1.0746104717254639, "step": 601 }, { "epoch": 0.032356023756416115, "grad_norm": 9.68626594543457, "learning_rate": 4.960730495556445e-07, "logits/chosen": -0.5378775596618652, "logits/rejected": -0.7112482786178589, "logps/chosen": -87.8258285522461, "logps/rejected": -130.9329071044922, "loss": 0.8349, "rewards/accuracies": 0.625, "rewards/chosen": 0.032933756709098816, "rewards/margins": 1.0480756759643555, "rewards/rejected": -1.0151419639587402, "step": 602 }, { "epoch": 0.032409771304184244, "grad_norm": 10.51264762878418, "learning_rate": 4.941096500316253e-07, "logits/chosen": -0.5181289911270142, "logits/rejected": -0.6356170177459717, "logps/chosen": -83.44453430175781, "logps/rejected": -90.61045837402344, "loss": 0.9384, "rewards/accuracies": 0.625, "rewards/chosen": -0.043010905385017395, "rewards/margins": 0.8100171685218811, "rewards/rejected": -0.8530280590057373, "step": 603 }, { "epoch": 0.03246351885195238, "grad_norm": 7.336607933044434, "learning_rate": 4.921463413440898e-07, "logits/chosen": -0.5451788902282715, "logits/rejected": -0.7391940355300903, "logps/chosen": -91.16065979003906, "logps/rejected": -119.72543334960938, "loss": 0.7066, "rewards/accuracies": 1.0, "rewards/chosen": 0.1354527473449707, "rewards/margins": 1.0061503648757935, "rewards/rejected": -0.8706976771354675, "step": 604 }, { "epoch": 0.03251726639972051, "grad_norm": 10.031682014465332, "learning_rate": 4.901831537696859e-07, "logits/chosen": -0.7704207897186279, "logits/rejected": -0.6128878593444824, "logps/chosen": -86.63076782226562, "logps/rejected": -103.10627746582031, "loss": 0.8547, "rewards/accuracies": 0.875, "rewards/chosen": 0.04316642880439758, "rewards/margins": 0.7869977951049805, "rewards/rejected": -0.7438313961029053, "step": 605 }, { "epoch": 0.032571013947488646, "grad_norm": 7.167978286743164, "learning_rate": 4.88220117583195e-07, "logits/chosen": -0.623897910118103, "logits/rejected": -1.113237977027893, "logps/chosen": -79.14581298828125, "logps/rejected": -129.1562042236328, "loss": 0.6497, "rewards/accuracies": 0.875, "rewards/chosen": -0.12618502974510193, "rewards/margins": 1.3929439783096313, "rewards/rejected": -1.5191290378570557, "step": 606 }, { "epoch": 0.03262476149525678, "grad_norm": 8.778635025024414, "learning_rate": 4.862572630570632e-07, "logits/chosen": -0.4653218984603882, "logits/rejected": -0.6936641931533813, "logps/chosen": -88.52848815917969, "logps/rejected": -112.9065170288086, "loss": 0.7327, "rewards/accuracies": 0.875, "rewards/chosen": -0.24923157691955566, "rewards/margins": 1.013629674911499, "rewards/rejected": -1.2628613710403442, "step": 607 }, { "epoch": 0.03267850904302491, "grad_norm": 8.148833274841309, "learning_rate": 4.842946204609359e-07, "logits/chosen": -0.7179454565048218, "logits/rejected": -1.1879855394363403, "logps/chosen": -93.71632385253906, "logps/rejected": -189.02294921875, "loss": 0.483, "rewards/accuracies": 1.0, "rewards/chosen": -0.0041448622941970825, "rewards/margins": 1.5472955703735352, "rewards/rejected": -1.5514403581619263, "step": 608 }, { "epoch": 0.03273225659079305, "grad_norm": 9.826654434204102, "learning_rate": 4.823322200611894e-07, "logits/chosen": -0.3729318082332611, "logits/rejected": -0.9929598569869995, "logps/chosen": -92.39785766601562, "logps/rejected": -149.96786499023438, "loss": 0.8039, "rewards/accuracies": 0.75, "rewards/chosen": -0.11256924271583557, "rewards/margins": 0.9996153712272644, "rewards/rejected": -1.1121846437454224, "step": 609 }, { "epoch": 0.03278600413856118, "grad_norm": 6.784736633300781, "learning_rate": 4.803700921204658e-07, "logits/chosen": -0.4363518953323364, "logits/rejected": -0.5599923133850098, "logps/chosen": -64.25714874267578, "logps/rejected": -98.48246765136719, "loss": 0.6845, "rewards/accuracies": 1.0, "rewards/chosen": 0.1332513988018036, "rewards/margins": 1.1346909999847412, "rewards/rejected": -1.0014395713806152, "step": 610 }, { "epoch": 0.03283975168632931, "grad_norm": 10.885212898254395, "learning_rate": 4.784082668972048e-07, "logits/chosen": -0.6714180111885071, "logits/rejected": -0.8613975644111633, "logps/chosen": -94.35983276367188, "logps/rejected": -106.86113739013672, "loss": 0.826, "rewards/accuracies": 0.875, "rewards/chosen": 0.0002947300672531128, "rewards/margins": 0.9118413925170898, "rewards/rejected": -0.9115465879440308, "step": 611 }, { "epoch": 0.03289349923409744, "grad_norm": 6.491450786590576, "learning_rate": 4.764467746451787e-07, "logits/chosen": -0.6292083263397217, "logits/rejected": -0.7781423926353455, "logps/chosen": -97.101806640625, "logps/rejected": -159.37718200683594, "loss": 0.4611, "rewards/accuracies": 1.0, "rewards/chosen": 0.37886500358581543, "rewards/margins": 1.7291555404663086, "rewards/rejected": -1.3502906560897827, "step": 612 }, { "epoch": 0.03294724678186558, "grad_norm": 7.715190410614014, "learning_rate": 4.7448564561302425e-07, "logits/chosen": -0.6039716005325317, "logits/rejected": -0.7157342433929443, "logps/chosen": -80.086669921875, "logps/rejected": -127.81349182128906, "loss": 0.7238, "rewards/accuracies": 0.875, "rewards/chosen": 0.11800796538591385, "rewards/margins": 1.0396406650543213, "rewards/rejected": -0.9216327667236328, "step": 613 }, { "epoch": 0.03300099432963371, "grad_norm": 9.692184448242188, "learning_rate": 4.725249100437772e-07, "logits/chosen": -0.49190032482147217, "logits/rejected": -0.8127591013908386, "logps/chosen": -91.57392883300781, "logps/rejected": -131.26919555664062, "loss": 0.7534, "rewards/accuracies": 0.875, "rewards/chosen": 0.24450361728668213, "rewards/margins": 1.360818862915039, "rewards/rejected": -1.1163153648376465, "step": 614 }, { "epoch": 0.033054741877401844, "grad_norm": 8.885555267333984, "learning_rate": 4.705645981744054e-07, "logits/chosen": -0.7359647750854492, "logits/rejected": -0.7773431539535522, "logps/chosen": -100.98867797851562, "logps/rejected": -116.5601806640625, "loss": 0.664, "rewards/accuracies": 1.0, "rewards/chosen": 0.13070012629032135, "rewards/margins": 1.1288889646530151, "rewards/rejected": -0.998188853263855, "step": 615 }, { "epoch": 0.033108489425169974, "grad_norm": 5.74119758605957, "learning_rate": 4.686047402353433e-07, "logits/chosen": -0.4907291531562805, "logits/rejected": -0.6746664047241211, "logps/chosen": -96.84876251220703, "logps/rejected": -163.27586364746094, "loss": 0.4077, "rewards/accuracies": 1.0, "rewards/chosen": 0.1524963229894638, "rewards/margins": 1.884968638420105, "rewards/rejected": -1.73247230052948, "step": 616 }, { "epoch": 0.03316223697293811, "grad_norm": 9.402079582214355, "learning_rate": 4.666453664500245e-07, "logits/chosen": -0.713340699672699, "logits/rejected": -0.691076397895813, "logps/chosen": -83.80741119384766, "logps/rejected": -107.77182006835938, "loss": 0.772, "rewards/accuracies": 1.0, "rewards/chosen": 0.24850381910800934, "rewards/margins": 0.8224989771842957, "rewards/rejected": -0.5739951133728027, "step": 617 }, { "epoch": 0.033215984520706246, "grad_norm": 8.107818603515625, "learning_rate": 4.6468650703441674e-07, "logits/chosen": -0.7258659601211548, "logits/rejected": -0.7633432149887085, "logps/chosen": -83.16914367675781, "logps/rejected": -101.33570098876953, "loss": 0.9069, "rewards/accuracies": 0.875, "rewards/chosen": 0.21712824702262878, "rewards/margins": 0.764595091342926, "rewards/rejected": -0.5474668741226196, "step": 618 }, { "epoch": 0.033269732068474375, "grad_norm": 5.152230262756348, "learning_rate": 4.6272819219655513e-07, "logits/chosen": -0.6951889395713806, "logits/rejected": -0.8417446613311768, "logps/chosen": -85.37074279785156, "logps/rejected": -137.7667999267578, "loss": 0.4496, "rewards/accuracies": 1.0, "rewards/chosen": 0.36454686522483826, "rewards/margins": 1.5964701175689697, "rewards/rejected": -1.231923222541809, "step": 619 }, { "epoch": 0.03332347961624251, "grad_norm": 13.603208541870117, "learning_rate": 4.6077045213607755e-07, "logits/chosen": -0.5858896970748901, "logits/rejected": -0.6363260746002197, "logps/chosen": -104.4343032836914, "logps/rejected": -135.7537078857422, "loss": 0.7351, "rewards/accuracies": 1.0, "rewards/chosen": -0.001949496567249298, "rewards/margins": 0.916826605796814, "rewards/rejected": -0.9187760353088379, "step": 620 }, { "epoch": 0.03337722716401064, "grad_norm": 5.797398567199707, "learning_rate": 4.5881331704375717e-07, "logits/chosen": -0.49570104479789734, "logits/rejected": -0.8089923858642578, "logps/chosen": -105.8198013305664, "logps/rejected": -173.14901733398438, "loss": 0.4325, "rewards/accuracies": 1.0, "rewards/chosen": -0.0260557159781456, "rewards/margins": 1.5845141410827637, "rewards/rejected": -1.6105698347091675, "step": 621 }, { "epoch": 0.03343097471177878, "grad_norm": 7.601419925689697, "learning_rate": 4.5685681710103835e-07, "logits/chosen": -0.6959386467933655, "logits/rejected": -0.7639058828353882, "logps/chosen": -90.01693725585938, "logps/rejected": -126.60846710205078, "loss": 0.5734, "rewards/accuracies": 1.0, "rewards/chosen": -0.19543808698654175, "rewards/margins": 1.2715380191802979, "rewards/rejected": -1.4669761657714844, "step": 622 }, { "epoch": 0.033484722259546906, "grad_norm": 6.705808639526367, "learning_rate": 4.5490098247957035e-07, "logits/chosen": -0.7309533953666687, "logits/rejected": -0.7770398855209351, "logps/chosen": -89.1234130859375, "logps/rejected": -111.97673034667969, "loss": 0.6789, "rewards/accuracies": 1.0, "rewards/chosen": -0.0425875186920166, "rewards/margins": 1.0660780668258667, "rewards/rejected": -1.1086655855178833, "step": 623 }, { "epoch": 0.03353846980731504, "grad_norm": 8.225669860839844, "learning_rate": 4.529458433407428e-07, "logits/chosen": -0.4227210581302643, "logits/rejected": -0.5695346593856812, "logps/chosen": -69.95433044433594, "logps/rejected": -145.61685180664062, "loss": 0.5518, "rewards/accuracies": 0.875, "rewards/chosen": -0.09772466123104095, "rewards/margins": 1.7445323467254639, "rewards/rejected": -1.8422571420669556, "step": 624 }, { "epoch": 0.03359221735508317, "grad_norm": 6.308511257171631, "learning_rate": 4.5099142983521963e-07, "logits/chosen": -0.5502666234970093, "logits/rejected": -0.6669168472290039, "logps/chosen": -56.95618438720703, "logps/rejected": -107.61539459228516, "loss": 0.5823, "rewards/accuracies": 1.0, "rewards/chosen": 0.4001650810241699, "rewards/margins": 1.189414381980896, "rewards/rejected": -0.7892492413520813, "step": 625 }, { "epoch": 0.03364596490285131, "grad_norm": 8.276154518127441, "learning_rate": 4.4903777210247507e-07, "logits/chosen": -0.6363383531570435, "logits/rejected": -0.6784260869026184, "logps/chosen": -96.97117614746094, "logps/rejected": -149.93426513671875, "loss": 0.5181, "rewards/accuracies": 1.0, "rewards/chosen": 0.01801939308643341, "rewards/margins": 1.4707001447677612, "rewards/rejected": -1.4526807069778442, "step": 626 }, { "epoch": 0.03369971245061944, "grad_norm": 4.131911754608154, "learning_rate": 4.470849002703278e-07, "logits/chosen": -0.65470951795578, "logits/rejected": -0.7580817341804504, "logps/chosen": -88.43344116210938, "logps/rejected": -135.6339874267578, "loss": 0.3218, "rewards/accuracies": 1.0, "rewards/chosen": 0.3193442225456238, "rewards/margins": 1.858778715133667, "rewards/rejected": -1.5394344329833984, "step": 627 }, { "epoch": 0.033753459998387574, "grad_norm": 5.8456549644470215, "learning_rate": 4.4513284445447734e-07, "logits/chosen": -0.6032462120056152, "logits/rejected": -0.9764176607131958, "logps/chosen": -95.53583526611328, "logps/rejected": -159.828857421875, "loss": 0.3731, "rewards/accuracies": 1.0, "rewards/chosen": 0.30497580766677856, "rewards/margins": 1.7472496032714844, "rewards/rejected": -1.4422738552093506, "step": 628 }, { "epoch": 0.03380720754615571, "grad_norm": 9.581640243530273, "learning_rate": 4.4318163475803946e-07, "logits/chosen": -0.5667802095413208, "logits/rejected": -0.8898786306381226, "logps/chosen": -103.06565856933594, "logps/rejected": -137.29962158203125, "loss": 0.7003, "rewards/accuracies": 0.875, "rewards/chosen": -0.32374608516693115, "rewards/margins": 1.0436407327651978, "rewards/rejected": -1.367386817932129, "step": 629 }, { "epoch": 0.03386095509392384, "grad_norm": 7.109737396240234, "learning_rate": 4.412313012710812e-07, "logits/chosen": -0.5854474902153015, "logits/rejected": -0.9624674320220947, "logps/chosen": -97.74449920654297, "logps/rejected": -140.26058959960938, "loss": 0.5429, "rewards/accuracies": 1.0, "rewards/chosen": 0.11103461682796478, "rewards/margins": 1.299454689025879, "rewards/rejected": -1.188420057296753, "step": 630 }, { "epoch": 0.033914702641691975, "grad_norm": 8.876824378967285, "learning_rate": 4.392818740701578e-07, "logits/chosen": -0.7908871173858643, "logits/rejected": -0.7990108728408813, "logps/chosen": -80.28163146972656, "logps/rejected": -124.66183471679688, "loss": 0.7936, "rewards/accuracies": 1.0, "rewards/chosen": -0.20700569450855255, "rewards/margins": 0.8966859579086304, "rewards/rejected": -1.103691577911377, "step": 631 }, { "epoch": 0.033968450189460105, "grad_norm": 7.538534164428711, "learning_rate": 4.3733338321784777e-07, "logits/chosen": -0.6017280220985413, "logits/rejected": -0.6181896924972534, "logps/chosen": -76.98684692382812, "logps/rejected": -114.49388885498047, "loss": 0.6146, "rewards/accuracies": 1.0, "rewards/chosen": 0.2475624978542328, "rewards/margins": 1.1292306184768677, "rewards/rejected": -0.8816681504249573, "step": 632 }, { "epoch": 0.03402219773722824, "grad_norm": 8.26999568939209, "learning_rate": 4.353858587622912e-07, "logits/chosen": -0.5785642862319946, "logits/rejected": -0.7428444623947144, "logps/chosen": -103.74725341796875, "logps/rejected": -142.5939178466797, "loss": 0.5631, "rewards/accuracies": 1.0, "rewards/chosen": -0.06749778985977173, "rewards/margins": 1.5513582229614258, "rewards/rejected": -1.6188560724258423, "step": 633 }, { "epoch": 0.03407594528499637, "grad_norm": 8.719396591186523, "learning_rate": 4.334393307367239e-07, "logits/chosen": -0.5849472880363464, "logits/rejected": -0.7718167901039124, "logps/chosen": -105.94989013671875, "logps/rejected": -161.8540496826172, "loss": 0.6443, "rewards/accuracies": 1.0, "rewards/chosen": -0.11618242412805557, "rewards/margins": 1.2930874824523926, "rewards/rejected": -1.4092698097229004, "step": 634 }, { "epoch": 0.034129692832764506, "grad_norm": 7.574139595031738, "learning_rate": 4.3149382915901606e-07, "logits/chosen": -0.5907869338989258, "logits/rejected": -0.8032389879226685, "logps/chosen": -76.92984771728516, "logps/rejected": -117.54576110839844, "loss": 0.763, "rewards/accuracies": 0.75, "rewards/chosen": -0.05670052021741867, "rewards/margins": 0.9965586066246033, "rewards/rejected": -1.0532591342926025, "step": 635 }, { "epoch": 0.034183440380532636, "grad_norm": 5.709794044494629, "learning_rate": 4.295493840312087e-07, "logits/chosen": -0.6391381025314331, "logits/rejected": -0.7744686603546143, "logps/chosen": -77.88555145263672, "logps/rejected": -140.5464324951172, "loss": 0.4944, "rewards/accuracies": 1.0, "rewards/chosen": 0.2050856649875641, "rewards/margins": 1.7452037334442139, "rewards/rejected": -1.5401182174682617, "step": 636 }, { "epoch": 0.03423718792830077, "grad_norm": 8.265750885009766, "learning_rate": 4.276060253390511e-07, "logits/chosen": -0.5909054279327393, "logits/rejected": -0.7713615298271179, "logps/chosen": -78.48088836669922, "logps/rejected": -105.97406005859375, "loss": 0.8638, "rewards/accuracies": 0.75, "rewards/chosen": 0.1346690058708191, "rewards/margins": 0.8579265475273132, "rewards/rejected": -0.7232576012611389, "step": 637 }, { "epoch": 0.0342909354760689, "grad_norm": 8.994619369506836, "learning_rate": 4.256637830515385e-07, "logits/chosen": -0.5698118805885315, "logits/rejected": -0.8241775035858154, "logps/chosen": -102.56254577636719, "logps/rejected": -143.84881591796875, "loss": 0.7502, "rewards/accuracies": 0.875, "rewards/chosen": -0.11908753961324692, "rewards/margins": 0.940855085849762, "rewards/rejected": -1.059942603111267, "step": 638 }, { "epoch": 0.03434468302383704, "grad_norm": 8.460371971130371, "learning_rate": 4.2372268712044956e-07, "logits/chosen": -0.5662652254104614, "logits/rejected": -0.7969576120376587, "logps/chosen": -84.10491943359375, "logps/rejected": -119.32184600830078, "loss": 0.7695, "rewards/accuracies": 0.875, "rewards/chosen": 0.02846279740333557, "rewards/margins": 0.9155237674713135, "rewards/rejected": -0.8870609998703003, "step": 639 }, { "epoch": 0.034398430571605174, "grad_norm": 9.439326286315918, "learning_rate": 4.2178276747988444e-07, "logits/chosen": -0.9039698839187622, "logits/rejected": -1.0065993070602417, "logps/chosen": -94.28877258300781, "logps/rejected": -126.4561767578125, "loss": 0.69, "rewards/accuracies": 0.875, "rewards/chosen": -0.07726460695266724, "rewards/margins": 1.1309573650360107, "rewards/rejected": -1.2082219123840332, "step": 640 }, { "epoch": 0.0344521781193733, "grad_norm": 7.055704593658447, "learning_rate": 4.198440540458045e-07, "logits/chosen": -0.5508767366409302, "logits/rejected": -0.7557835578918457, "logps/chosen": -91.57740020751953, "logps/rejected": -125.13502502441406, "loss": 0.6686, "rewards/accuracies": 0.875, "rewards/chosen": -0.0639323741197586, "rewards/margins": 1.2123610973358154, "rewards/rejected": -1.276293396949768, "step": 641 }, { "epoch": 0.03450592566714144, "grad_norm": 9.051188468933105, "learning_rate": 4.1790657671556855e-07, "logits/chosen": -0.5714504718780518, "logits/rejected": -0.8736838102340698, "logps/chosen": -82.64602661132812, "logps/rejected": -122.87081909179688, "loss": 0.6329, "rewards/accuracies": 1.0, "rewards/chosen": -0.19215331971645355, "rewards/margins": 1.1106781959533691, "rewards/rejected": -1.3028314113616943, "step": 642 }, { "epoch": 0.03455967321490957, "grad_norm": 8.381659507751465, "learning_rate": 4.159703653674741e-07, "logits/chosen": -0.611016035079956, "logits/rejected": -0.8391165733337402, "logps/chosen": -95.28317260742188, "logps/rejected": -130.52720642089844, "loss": 0.7169, "rewards/accuracies": 0.75, "rewards/chosen": -0.22419142723083496, "rewards/margins": 1.167614459991455, "rewards/rejected": -1.39180588722229, "step": 643 }, { "epoch": 0.034613420762677705, "grad_norm": 5.736966133117676, "learning_rate": 4.1403544986029513e-07, "logits/chosen": -0.6335850954055786, "logits/rejected": -0.8716615438461304, "logps/chosen": -89.39505004882812, "logps/rejected": -142.301025390625, "loss": 0.4091, "rewards/accuracies": 1.0, "rewards/chosen": -0.03830995038151741, "rewards/margins": 1.5955379009246826, "rewards/rejected": -1.633847713470459, "step": 644 }, { "epoch": 0.034667168310445834, "grad_norm": 10.211275100708008, "learning_rate": 4.121018600328227e-07, "logits/chosen": -0.6273261904716492, "logits/rejected": -0.6800127029418945, "logps/chosen": -69.32307434082031, "logps/rejected": -95.77194213867188, "loss": 1.0811, "rewards/accuracies": 0.625, "rewards/chosen": -0.0941387340426445, "rewards/margins": 0.514427900314331, "rewards/rejected": -0.6085666418075562, "step": 645 }, { "epoch": 0.03472091585821397, "grad_norm": 7.409835338592529, "learning_rate": 4.101696257034037e-07, "logits/chosen": -0.7027994990348816, "logits/rejected": -0.8179329633712769, "logps/chosen": -67.29900360107422, "logps/rejected": -96.60245513916016, "loss": 0.6795, "rewards/accuracies": 1.0, "rewards/chosen": -0.12028437107801437, "rewards/margins": 1.0158188343048096, "rewards/rejected": -1.1361031532287598, "step": 646 }, { "epoch": 0.0347746634059821, "grad_norm": 10.50412368774414, "learning_rate": 4.0823877666948194e-07, "logits/chosen": -0.4664965271949768, "logits/rejected": -0.8070378303527832, "logps/chosen": -83.66603088378906, "logps/rejected": -115.29845428466797, "loss": 0.8074, "rewards/accuracies": 0.875, "rewards/chosen": -0.07762628048658371, "rewards/margins": 0.9623950719833374, "rewards/rejected": -1.040021300315857, "step": 647 }, { "epoch": 0.034828410953750236, "grad_norm": 4.698235034942627, "learning_rate": 4.0630934270713755e-07, "logits/chosen": -0.5954183340072632, "logits/rejected": -0.8449184894561768, "logps/chosen": -82.33883666992188, "logps/rejected": -153.775146484375, "loss": 0.4002, "rewards/accuracies": 1.0, "rewards/chosen": 0.1432419717311859, "rewards/margins": 1.787036418914795, "rewards/rejected": -1.6437945365905762, "step": 648 }, { "epoch": 0.034882158501518365, "grad_norm": 6.139332294464111, "learning_rate": 4.0438135357062985e-07, "logits/chosen": -0.48749905824661255, "logits/rejected": -0.8369503021240234, "logps/chosen": -74.27041625976562, "logps/rejected": -120.2864990234375, "loss": 0.5547, "rewards/accuracies": 0.875, "rewards/chosen": 0.09599432349205017, "rewards/margins": 1.6957619190216064, "rewards/rejected": -1.5997675657272339, "step": 649 }, { "epoch": 0.0349359060492865, "grad_norm": 6.7009124755859375, "learning_rate": 4.0245483899193586e-07, "logits/chosen": -0.6145637035369873, "logits/rejected": -0.7580772638320923, "logps/chosen": -80.97282409667969, "logps/rejected": -121.99165344238281, "loss": 0.5868, "rewards/accuracies": 1.0, "rewards/chosen": 0.03952017053961754, "rewards/margins": 1.306532382965088, "rewards/rejected": -1.267012357711792, "step": 650 }, { "epoch": 0.03498965359705464, "grad_norm": 9.469929695129395, "learning_rate": 4.0052982868029373e-07, "logits/chosen": -0.5144017934799194, "logits/rejected": -0.7887222766876221, "logps/chosen": -78.02767181396484, "logps/rejected": -116.68928527832031, "loss": 0.7925, "rewards/accuracies": 0.875, "rewards/chosen": 0.12410588562488556, "rewards/margins": 1.0171339511871338, "rewards/rejected": -0.8930279612541199, "step": 651 }, { "epoch": 0.03504340114482277, "grad_norm": 11.785784721374512, "learning_rate": 3.9860635232174387e-07, "logits/chosen": -0.6186634302139282, "logits/rejected": -1.0957224369049072, "logps/chosen": -95.82867431640625, "logps/rejected": -129.06671142578125, "loss": 0.8406, "rewards/accuracies": 0.875, "rewards/chosen": -0.08924400061368942, "rewards/margins": 0.784622848033905, "rewards/rejected": -0.873866856098175, "step": 652 }, { "epoch": 0.0350971486925909, "grad_norm": 8.4454927444458, "learning_rate": 3.966844395786708e-07, "logits/chosen": -0.6687749624252319, "logits/rejected": -0.8199905753135681, "logps/chosen": -85.86489868164062, "logps/rejected": -114.888671875, "loss": 0.8932, "rewards/accuracies": 0.875, "rewards/chosen": -0.33673200011253357, "rewards/margins": 0.8441847562789917, "rewards/rejected": -1.1809167861938477, "step": 653 }, { "epoch": 0.03515089624035903, "grad_norm": 6.7808756828308105, "learning_rate": 3.9476412008934724e-07, "logits/chosen": -0.5724535584449768, "logits/rejected": -0.6780301332473755, "logps/chosen": -64.57200622558594, "logps/rejected": -118.97002410888672, "loss": 0.5093, "rewards/accuracies": 1.0, "rewards/chosen": 0.10117088258266449, "rewards/margins": 1.4421573877334595, "rewards/rejected": -1.3409864902496338, "step": 654 }, { "epoch": 0.03520464378812717, "grad_norm": 6.727846145629883, "learning_rate": 3.9284542346747467e-07, "logits/chosen": -0.6693817377090454, "logits/rejected": -0.8534969687461853, "logps/chosen": -104.6119384765625, "logps/rejected": -129.17262268066406, "loss": 0.5594, "rewards/accuracies": 0.875, "rewards/chosen": 0.0574013777077198, "rewards/margins": 1.5501397848129272, "rewards/rejected": -1.4927384853363037, "step": 655 }, { "epoch": 0.0352583913358953, "grad_norm": 8.057894706726074, "learning_rate": 3.909283793017288e-07, "logits/chosen": -0.8541219234466553, "logits/rejected": -0.9958515167236328, "logps/chosen": -106.40361022949219, "logps/rejected": -108.03924560546875, "loss": 0.6536, "rewards/accuracies": 0.875, "rewards/chosen": 0.0009920168668031693, "rewards/margins": 1.1331787109375, "rewards/rejected": -1.1321866512298584, "step": 656 }, { "epoch": 0.035312138883663434, "grad_norm": 7.6631340980529785, "learning_rate": 3.89013017155302e-07, "logits/chosen": -0.5530163049697876, "logits/rejected": -0.45625489950180054, "logps/chosen": -89.87667846679688, "logps/rejected": -110.7386474609375, "loss": 0.7421, "rewards/accuracies": 1.0, "rewards/chosen": 0.23789778351783752, "rewards/margins": 1.0576244592666626, "rewards/rejected": -0.8197267055511475, "step": 657 }, { "epoch": 0.03536588643143156, "grad_norm": 6.91584587097168, "learning_rate": 3.8709936656544817e-07, "logits/chosen": -0.5753040909767151, "logits/rejected": -0.7113423347473145, "logps/chosen": -72.34064483642578, "logps/rejected": -124.19331359863281, "loss": 0.6358, "rewards/accuracies": 0.875, "rewards/chosen": -0.041884854435920715, "rewards/margins": 1.2404505014419556, "rewards/rejected": -1.2823352813720703, "step": 658 }, { "epoch": 0.0354196339791997, "grad_norm": 7.041658878326416, "learning_rate": 3.851874570430266e-07, "logits/chosen": -0.5655523538589478, "logits/rejected": -0.6918456554412842, "logps/chosen": -66.64675903320312, "logps/rejected": -81.06437683105469, "loss": 0.689, "rewards/accuracies": 0.875, "rewards/chosen": 0.09326935559511185, "rewards/margins": 1.0305993556976318, "rewards/rejected": -0.9373300075531006, "step": 659 }, { "epoch": 0.03547338152696783, "grad_norm": 5.780549049377441, "learning_rate": 3.8327731807204744e-07, "logits/chosen": -0.6310330629348755, "logits/rejected": -0.9282959699630737, "logps/chosen": -88.57028198242188, "logps/rejected": -144.2147979736328, "loss": 0.4005, "rewards/accuracies": 1.0, "rewards/chosen": 0.057380903512239456, "rewards/margins": 1.6213874816894531, "rewards/rejected": -1.5640066862106323, "step": 660 }, { "epoch": 0.035527129074735965, "grad_norm": 9.179677963256836, "learning_rate": 3.8136897910921604e-07, "logits/chosen": -0.5165460109710693, "logits/rejected": -0.8169257640838623, "logps/chosen": -91.19393157958984, "logps/rejected": -149.40589904785156, "loss": 0.6542, "rewards/accuracies": 0.875, "rewards/chosen": -0.19408783316612244, "rewards/margins": 1.1796660423278809, "rewards/rejected": -1.3737540245056152, "step": 661 }, { "epoch": 0.0355808766225041, "grad_norm": 6.491672992706299, "learning_rate": 3.794624695834807e-07, "logits/chosen": -0.5968656539916992, "logits/rejected": -0.7170764207839966, "logps/chosen": -88.97088623046875, "logps/rejected": -114.48873138427734, "loss": 0.615, "rewards/accuracies": 1.0, "rewards/chosen": 0.37049031257629395, "rewards/margins": 1.22767972946167, "rewards/rejected": -0.857189416885376, "step": 662 }, { "epoch": 0.03563462417027223, "grad_norm": 7.913340091705322, "learning_rate": 3.7755781889557627e-07, "logits/chosen": -0.5569940805435181, "logits/rejected": -0.7486147880554199, "logps/chosen": -94.75019836425781, "logps/rejected": -122.24205017089844, "loss": 0.6576, "rewards/accuracies": 0.875, "rewards/chosen": -0.22861573100090027, "rewards/margins": 1.2386844158172607, "rewards/rejected": -1.4673001766204834, "step": 663 }, { "epoch": 0.03568837171804037, "grad_norm": 5.8082051277160645, "learning_rate": 3.7565505641757266e-07, "logits/chosen": -0.5537548065185547, "logits/rejected": -0.7943853139877319, "logps/chosen": -89.43672180175781, "logps/rejected": -148.6561279296875, "loss": 0.3899, "rewards/accuracies": 1.0, "rewards/chosen": -0.06169487535953522, "rewards/margins": 2.0244851112365723, "rewards/rejected": -2.0861802101135254, "step": 664 }, { "epoch": 0.035742119265808496, "grad_norm": 7.460577964782715, "learning_rate": 3.7375421149242096e-07, "logits/chosen": -0.5787289142608643, "logits/rejected": -0.7111059427261353, "logps/chosen": -56.413551330566406, "logps/rejected": -98.48411560058594, "loss": 0.7973, "rewards/accuracies": 0.875, "rewards/chosen": -0.0637110248208046, "rewards/margins": 0.994929313659668, "rewards/rejected": -1.0586402416229248, "step": 665 }, { "epoch": 0.03579586681357663, "grad_norm": 6.22314977645874, "learning_rate": 3.718553134335016e-07, "logits/chosen": -0.49443745613098145, "logits/rejected": -0.8747332096099854, "logps/chosen": -65.65482330322266, "logps/rejected": -101.81959533691406, "loss": 0.5068, "rewards/accuracies": 1.0, "rewards/chosen": 0.15019601583480835, "rewards/margins": 1.3773282766342163, "rewards/rejected": -1.2271322011947632, "step": 666 }, { "epoch": 0.03584961436134476, "grad_norm": 11.79557991027832, "learning_rate": 3.699583915241717e-07, "logits/chosen": -0.6174644231796265, "logits/rejected": -0.8240375518798828, "logps/chosen": -106.94013977050781, "logps/rejected": -132.3618927001953, "loss": 0.9622, "rewards/accuracies": 0.75, "rewards/chosen": -0.2864145040512085, "rewards/margins": 0.7045012712478638, "rewards/rejected": -0.9909157752990723, "step": 667 }, { "epoch": 0.0359033619091129, "grad_norm": 9.207590103149414, "learning_rate": 3.6806347501731365e-07, "logits/chosen": -0.4732949137687683, "logits/rejected": -0.8099555969238281, "logps/chosen": -104.8673095703125, "logps/rejected": -152.33590698242188, "loss": 0.619, "rewards/accuracies": 1.0, "rewards/chosen": -0.1925300806760788, "rewards/margins": 1.448000431060791, "rewards/rejected": -1.6405305862426758, "step": 668 }, { "epoch": 0.03595710945688103, "grad_norm": 8.990251541137695, "learning_rate": 3.6617059313488373e-07, "logits/chosen": -0.6564855575561523, "logits/rejected": -0.9770410060882568, "logps/chosen": -85.83119201660156, "logps/rejected": -132.65887451171875, "loss": 0.8577, "rewards/accuracies": 0.875, "rewards/chosen": -0.22936736047267914, "rewards/margins": 0.8962463736534119, "rewards/rejected": -1.1256136894226074, "step": 669 }, { "epoch": 0.03601085700464916, "grad_norm": 7.881577491760254, "learning_rate": 3.642797750674629e-07, "logits/chosen": -0.6014187335968018, "logits/rejected": -0.8813595771789551, "logps/chosen": -72.19586181640625, "logps/rejected": -121.5498275756836, "loss": 0.7554, "rewards/accuracies": 1.0, "rewards/chosen": -0.023863837122917175, "rewards/margins": 0.9704183340072632, "rewards/rejected": -0.9942822456359863, "step": 670 }, { "epoch": 0.03606460455241729, "grad_norm": 10.23489761352539, "learning_rate": 3.6239104997380424e-07, "logits/chosen": -0.5355775356292725, "logits/rejected": -0.6046326160430908, "logps/chosen": -76.18307495117188, "logps/rejected": -111.47815704345703, "loss": 0.828, "rewards/accuracies": 0.875, "rewards/chosen": 0.25667428970336914, "rewards/margins": 0.8507390022277832, "rewards/rejected": -0.5940647125244141, "step": 671 }, { "epoch": 0.03611835210018543, "grad_norm": 7.740089416503906, "learning_rate": 3.605044469803854e-07, "logits/chosen": -0.5416321754455566, "logits/rejected": -0.7857733964920044, "logps/chosen": -91.330078125, "logps/rejected": -180.77645874023438, "loss": 0.4767, "rewards/accuracies": 0.875, "rewards/chosen": 0.041738174855709076, "rewards/margins": 1.846373438835144, "rewards/rejected": -1.8046351671218872, "step": 672 }, { "epoch": 0.036172099647953565, "grad_norm": 7.836188316345215, "learning_rate": 3.586199951809582e-07, "logits/chosen": -0.6315485239028931, "logits/rejected": -0.9807299375534058, "logps/chosen": -73.31902313232422, "logps/rejected": -113.95877075195312, "loss": 0.6645, "rewards/accuracies": 1.0, "rewards/chosen": 0.12517745792865753, "rewards/margins": 1.2310590744018555, "rewards/rejected": -1.105881690979004, "step": 673 }, { "epoch": 0.036225847195721694, "grad_norm": 12.19095516204834, "learning_rate": 3.567377236361008e-07, "logits/chosen": -0.5273862481117249, "logits/rejected": -0.5911237001419067, "logps/chosen": -135.0470733642578, "logps/rejected": -165.9716339111328, "loss": 1.0645, "rewards/accuracies": 0.875, "rewards/chosen": -0.29765796661376953, "rewards/margins": 0.7253243923187256, "rewards/rejected": -1.0229823589324951, "step": 674 }, { "epoch": 0.03627959474348983, "grad_norm": 10.499970436096191, "learning_rate": 3.548576613727689e-07, "logits/chosen": -0.5959356427192688, "logits/rejected": -0.9460274577140808, "logps/chosen": -82.85147094726562, "logps/rejected": -81.02261352539062, "loss": 0.9601, "rewards/accuracies": 0.875, "rewards/chosen": -0.03176560625433922, "rewards/margins": 0.781395673751831, "rewards/rejected": -0.8131612539291382, "step": 675 }, { "epoch": 0.03633334229125796, "grad_norm": 8.276034355163574, "learning_rate": 3.529798373838481e-07, "logits/chosen": -0.4542352557182312, "logits/rejected": -0.9272215962409973, "logps/chosen": -82.55889129638672, "logps/rejected": -135.02520751953125, "loss": 0.7176, "rewards/accuracies": 0.875, "rewards/chosen": -0.06979916989803314, "rewards/margins": 1.3890080451965332, "rewards/rejected": -1.458807349205017, "step": 676 }, { "epoch": 0.036387089839026096, "grad_norm": 8.307058334350586, "learning_rate": 3.5110428062770745e-07, "logits/chosen": -0.749872624874115, "logits/rejected": -0.8263155221939087, "logps/chosen": -100.12751007080078, "logps/rejected": -115.29397583007812, "loss": 0.7591, "rewards/accuracies": 1.0, "rewards/chosen": -0.331951379776001, "rewards/margins": 0.8537285327911377, "rewards/rejected": -1.1856799125671387, "step": 677 }, { "epoch": 0.036440837386794225, "grad_norm": 7.524534225463867, "learning_rate": 3.492310200277522e-07, "logits/chosen": -0.5581697225570679, "logits/rejected": -0.8143494725227356, "logps/chosen": -80.24476623535156, "logps/rejected": -156.28604125976562, "loss": 0.6168, "rewards/accuracies": 0.875, "rewards/chosen": 0.25532442331314087, "rewards/margins": 1.4754148721694946, "rewards/rejected": -1.220090389251709, "step": 678 }, { "epoch": 0.03649458493456236, "grad_norm": 6.648694038391113, "learning_rate": 3.4736008447197824e-07, "logits/chosen": -0.5280447602272034, "logits/rejected": -0.7188539505004883, "logps/chosen": -69.12641906738281, "logps/rejected": -102.53292083740234, "loss": 0.5989, "rewards/accuracies": 1.0, "rewards/chosen": 0.05604182183742523, "rewards/margins": 1.2185702323913574, "rewards/rejected": -1.1625285148620605, "step": 679 }, { "epoch": 0.03654833248233049, "grad_norm": 11.746505737304688, "learning_rate": 3.454915028125263e-07, "logits/chosen": -0.7573858499526978, "logits/rejected": -0.8531477451324463, "logps/chosen": -81.75782775878906, "logps/rejected": -96.41738891601562, "loss": 1.1, "rewards/accuracies": 0.75, "rewards/chosen": -0.3346001207828522, "rewards/margins": 0.34472084045410156, "rewards/rejected": -0.6793209314346313, "step": 680 }, { "epoch": 0.03660208003009863, "grad_norm": 8.63896369934082, "learning_rate": 3.4362530386523724e-07, "logits/chosen": -0.7091917991638184, "logits/rejected": -0.7141443490982056, "logps/chosen": -64.84410095214844, "logps/rejected": -106.36604309082031, "loss": 0.7957, "rewards/accuracies": 1.0, "rewards/chosen": 0.14145919680595398, "rewards/margins": 0.9638069272041321, "rewards/rejected": -0.8223477602005005, "step": 681 }, { "epoch": 0.036655827577866756, "grad_norm": 9.021488189697266, "learning_rate": 3.417615164092069e-07, "logits/chosen": -0.6227965950965881, "logits/rejected": -0.785117506980896, "logps/chosen": -88.16488647460938, "logps/rejected": -124.53301239013672, "loss": 0.7881, "rewards/accuracies": 0.875, "rewards/chosen": 0.027843229472637177, "rewards/margins": 1.2882087230682373, "rewards/rejected": -1.260365605354309, "step": 682 }, { "epoch": 0.03670957512563489, "grad_norm": 8.721512794494629, "learning_rate": 3.399001691863441e-07, "logits/chosen": -0.863761305809021, "logits/rejected": -0.7986739277839661, "logps/chosen": -102.9293212890625, "logps/rejected": -159.22097778320312, "loss": 0.5618, "rewards/accuracies": 0.875, "rewards/chosen": -0.43021029233932495, "rewards/margins": 1.5832433700561523, "rewards/rejected": -2.013453483581543, "step": 683 }, { "epoch": 0.03676332267340303, "grad_norm": 9.210837364196777, "learning_rate": 3.3804129090092536e-07, "logits/chosen": -0.5095766186714172, "logits/rejected": -0.9213505983352661, "logps/chosen": -92.21206665039062, "logps/rejected": -161.9468536376953, "loss": 0.4766, "rewards/accuracies": 0.875, "rewards/chosen": -0.13881543278694153, "rewards/margins": 1.8954708576202393, "rewards/rejected": -2.0342862606048584, "step": 684 }, { "epoch": 0.03681707022117116, "grad_norm": 9.621092796325684, "learning_rate": 3.361849102191533e-07, "logits/chosen": -0.5784767270088196, "logits/rejected": -0.790291428565979, "logps/chosen": -85.18959045410156, "logps/rejected": -132.8776397705078, "loss": 0.793, "rewards/accuracies": 0.875, "rewards/chosen": -0.15184125304222107, "rewards/margins": 1.1342144012451172, "rewards/rejected": -1.2860556840896606, "step": 685 }, { "epoch": 0.036870817768939294, "grad_norm": 8.386796951293945, "learning_rate": 3.3433105576871445e-07, "logits/chosen": -0.5986143350601196, "logits/rejected": -0.7131360769271851, "logps/chosen": -92.72688293457031, "logps/rejected": -148.33071899414062, "loss": 0.6044, "rewards/accuracies": 0.875, "rewards/chosen": -0.2654241919517517, "rewards/margins": 1.4698600769042969, "rewards/rejected": -1.7352843284606934, "step": 686 }, { "epoch": 0.036924565316707424, "grad_norm": 7.0322394371032715, "learning_rate": 3.32479756138338e-07, "logits/chosen": -0.37500154972076416, "logits/rejected": -0.9101811647415161, "logps/chosen": -83.53804016113281, "logps/rejected": -140.19989013671875, "loss": 0.4855, "rewards/accuracies": 1.0, "rewards/chosen": 0.10695964097976685, "rewards/margins": 1.6283165216445923, "rewards/rejected": -1.5213568210601807, "step": 687 }, { "epoch": 0.03697831286447556, "grad_norm": 6.989006042480469, "learning_rate": 3.306310398773543e-07, "logits/chosen": -0.6927367448806763, "logits/rejected": -0.9989447593688965, "logps/chosen": -80.9221420288086, "logps/rejected": -122.96833801269531, "loss": 0.6416, "rewards/accuracies": 1.0, "rewards/chosen": 0.12916311621665955, "rewards/margins": 1.1928164958953857, "rewards/rejected": -1.0636533498764038, "step": 688 }, { "epoch": 0.03703206041224369, "grad_norm": 12.971725463867188, "learning_rate": 3.2878493549525515e-07, "logits/chosen": -0.660380482673645, "logits/rejected": -0.5516759157180786, "logps/chosen": -73.57266235351562, "logps/rejected": -77.35110473632812, "loss": 1.2892, "rewards/accuracies": 0.5, "rewards/chosen": -0.3140859603881836, "rewards/margins": 0.22846673429012299, "rewards/rejected": -0.542552649974823, "step": 689 }, { "epoch": 0.037085807960011825, "grad_norm": 9.573136329650879, "learning_rate": 3.269414714612534e-07, "logits/chosen": -0.690120279788971, "logits/rejected": -0.8802876472473145, "logps/chosen": -70.33718872070312, "logps/rejected": -108.16355895996094, "loss": 0.7915, "rewards/accuracies": 0.875, "rewards/chosen": -0.15967582166194916, "rewards/margins": 1.1418479681015015, "rewards/rejected": -1.3015236854553223, "step": 690 }, { "epoch": 0.037139555507779955, "grad_norm": 9.702142715454102, "learning_rate": 3.251006762038456e-07, "logits/chosen": -0.5904650092124939, "logits/rejected": -0.8295743465423584, "logps/chosen": -90.64139556884766, "logps/rejected": -141.24017333984375, "loss": 0.5753, "rewards/accuracies": 1.0, "rewards/chosen": -0.4416710436344147, "rewards/margins": 1.340391993522644, "rewards/rejected": -1.7820630073547363, "step": 691 }, { "epoch": 0.03719330305554809, "grad_norm": 11.586276054382324, "learning_rate": 3.232625781103715e-07, "logits/chosen": -0.6614329814910889, "logits/rejected": -0.7189191579818726, "logps/chosen": -103.09715270996094, "logps/rejected": -132.3131561279297, "loss": 0.7936, "rewards/accuracies": 0.75, "rewards/chosen": 0.05026690661907196, "rewards/margins": 1.0476772785186768, "rewards/rejected": -0.9974102973937988, "step": 692 }, { "epoch": 0.03724705060331623, "grad_norm": 7.046651363372803, "learning_rate": 3.214272055265774e-07, "logits/chosen": -0.7784832715988159, "logits/rejected": -0.7919198274612427, "logps/chosen": -84.33151245117188, "logps/rejected": -109.41844177246094, "loss": 0.6913, "rewards/accuracies": 0.875, "rewards/chosen": -0.32089605927467346, "rewards/margins": 1.101486325263977, "rewards/rejected": -1.4223823547363281, "step": 693 }, { "epoch": 0.037300798151084356, "grad_norm": 12.21971607208252, "learning_rate": 3.1959458675617906e-07, "logits/chosen": -0.7149659395217896, "logits/rejected": -0.8636602163314819, "logps/chosen": -106.56996154785156, "logps/rejected": -132.00833129882812, "loss": 0.9841, "rewards/accuracies": 0.75, "rewards/chosen": -0.17830370366573334, "rewards/margins": 0.9579786658287048, "rewards/rejected": -1.1362824440002441, "step": 694 }, { "epoch": 0.03735454569885249, "grad_norm": 6.711921215057373, "learning_rate": 3.1776475006042514e-07, "logits/chosen": -0.5980788469314575, "logits/rejected": -0.8050870895385742, "logps/chosen": -89.16564178466797, "logps/rejected": -142.779541015625, "loss": 0.4574, "rewards/accuracies": 1.0, "rewards/chosen": 0.1941465437412262, "rewards/margins": 1.6216249465942383, "rewards/rejected": -1.427478313446045, "step": 695 }, { "epoch": 0.03740829324662062, "grad_norm": 8.0258207321167, "learning_rate": 3.15937723657661e-07, "logits/chosen": -0.4970434308052063, "logits/rejected": -0.6868813037872314, "logps/chosen": -87.46186828613281, "logps/rejected": -146.61512756347656, "loss": 0.5093, "rewards/accuracies": 1.0, "rewards/chosen": -0.21566006541252136, "rewards/margins": 1.3468873500823975, "rewards/rejected": -1.5625474452972412, "step": 696 }, { "epoch": 0.03746204079438876, "grad_norm": 12.877750396728516, "learning_rate": 3.1411353572289403e-07, "logits/chosen": -0.5387152433395386, "logits/rejected": -0.8638031482696533, "logps/chosen": -99.54096984863281, "logps/rejected": -178.4119110107422, "loss": 0.6568, "rewards/accuracies": 0.875, "rewards/chosen": -0.27660855650901794, "rewards/margins": 1.4651000499725342, "rewards/rejected": -1.741708517074585, "step": 697 }, { "epoch": 0.03751578834215689, "grad_norm": 7.735604763031006, "learning_rate": 3.1229221438735837e-07, "logits/chosen": -0.806607723236084, "logits/rejected": -0.9402638077735901, "logps/chosen": -65.47804260253906, "logps/rejected": -112.54678344726562, "loss": 0.5903, "rewards/accuracies": 1.0, "rewards/chosen": -0.05872820317745209, "rewards/margins": 1.2286039590835571, "rewards/rejected": -1.2873320579528809, "step": 698 }, { "epoch": 0.037569535889925024, "grad_norm": 8.08509349822998, "learning_rate": 3.104737877380827e-07, "logits/chosen": -0.649471640586853, "logits/rejected": -0.5973415970802307, "logps/chosen": -71.55877685546875, "logps/rejected": -117.21534729003906, "loss": 0.7197, "rewards/accuracies": 1.0, "rewards/chosen": 0.2575012743473053, "rewards/margins": 0.8958107829093933, "rewards/rejected": -0.6383095383644104, "step": 699 }, { "epoch": 0.03762328343769315, "grad_norm": 7.937282562255859, "learning_rate": 3.086582838174551e-07, "logits/chosen": -0.6558883190155029, "logits/rejected": -0.6055994033813477, "logps/chosen": -68.35356140136719, "logps/rejected": -105.2479248046875, "loss": 0.7614, "rewards/accuracies": 0.875, "rewards/chosen": -0.01503719761967659, "rewards/margins": 1.0105513334274292, "rewards/rejected": -1.0255885124206543, "step": 700 }, { "epoch": 0.03767703098546129, "grad_norm": 8.252848625183105, "learning_rate": 3.068457306227921e-07, "logits/chosen": -0.5853729844093323, "logits/rejected": -0.7126668691635132, "logps/chosen": -107.05699157714844, "logps/rejected": -139.77407836914062, "loss": 0.6546, "rewards/accuracies": 0.875, "rewards/chosen": -0.10842657089233398, "rewards/margins": 1.1688017845153809, "rewards/rejected": -1.2772283554077148, "step": 701 }, { "epoch": 0.03773077853322942, "grad_norm": 9.366060256958008, "learning_rate": 3.05036156105906e-07, "logits/chosen": -0.6754028797149658, "logits/rejected": -0.8179764747619629, "logps/chosen": -84.41289520263672, "logps/rejected": -128.371337890625, "loss": 0.7689, "rewards/accuracies": 0.875, "rewards/chosen": -0.15177825093269348, "rewards/margins": 0.9816766381263733, "rewards/rejected": -1.1334547996520996, "step": 702 }, { "epoch": 0.037784526080997555, "grad_norm": 7.5273919105529785, "learning_rate": 3.0322958817267427e-07, "logits/chosen": -0.8072121143341064, "logits/rejected": -1.0360885858535767, "logps/chosen": -91.87400817871094, "logps/rejected": -112.41732025146484, "loss": 0.7061, "rewards/accuracies": 0.875, "rewards/chosen": -0.22343850135803223, "rewards/margins": 1.2803586721420288, "rewards/rejected": -1.503797173500061, "step": 703 }, { "epoch": 0.03783827362876569, "grad_norm": 7.574466705322266, "learning_rate": 3.014260546826097e-07, "logits/chosen": -0.5451156497001648, "logits/rejected": -0.9620851874351501, "logps/chosen": -92.04518127441406, "logps/rejected": -135.3177490234375, "loss": 0.6713, "rewards/accuracies": 1.0, "rewards/chosen": -0.13693809509277344, "rewards/margins": 1.2029224634170532, "rewards/rejected": -1.3398605585098267, "step": 704 }, { "epoch": 0.03789202117653382, "grad_norm": 9.32886791229248, "learning_rate": 2.996255834484296e-07, "logits/chosen": -0.6748923063278198, "logits/rejected": -0.5428451895713806, "logps/chosen": -90.72340393066406, "logps/rejected": -111.44511413574219, "loss": 0.7832, "rewards/accuracies": 0.875, "rewards/chosen": -0.0970517247915268, "rewards/margins": 0.8142352104187012, "rewards/rejected": -0.9112869501113892, "step": 705 }, { "epoch": 0.037945768724301956, "grad_norm": 7.839056968688965, "learning_rate": 2.978282022356275e-07, "logits/chosen": -0.628490149974823, "logits/rejected": -0.7360343933105469, "logps/chosen": -74.3869400024414, "logps/rejected": -99.3019027709961, "loss": 0.6529, "rewards/accuracies": 1.0, "rewards/chosen": 0.2986869215965271, "rewards/margins": 1.1105064153671265, "rewards/rejected": -0.8118195533752441, "step": 706 }, { "epoch": 0.037999516272070086, "grad_norm": 6.864739418029785, "learning_rate": 2.960339387620454e-07, "logits/chosen": -0.3905555009841919, "logits/rejected": -0.8723859786987305, "logps/chosen": -88.20437622070312, "logps/rejected": -141.89303588867188, "loss": 0.5524, "rewards/accuracies": 1.0, "rewards/chosen": -0.06316991150379181, "rewards/margins": 1.361868143081665, "rewards/rejected": -1.4250380992889404, "step": 707 }, { "epoch": 0.03805326381983822, "grad_norm": 5.293715953826904, "learning_rate": 2.942428206974456e-07, "logits/chosen": -0.561200737953186, "logits/rejected": -0.7700548768043518, "logps/chosen": -109.68231201171875, "logps/rejected": -146.9243621826172, "loss": 0.3547, "rewards/accuracies": 1.0, "rewards/chosen": -0.06075315177440643, "rewards/margins": 2.0531318187713623, "rewards/rejected": -2.113884925842285, "step": 708 }, { "epoch": 0.03810701136760635, "grad_norm": 4.59727144241333, "learning_rate": 2.924548756630844e-07, "logits/chosen": -0.543777585029602, "logits/rejected": -0.8105175495147705, "logps/chosen": -100.65045928955078, "logps/rejected": -188.03370666503906, "loss": 0.2781, "rewards/accuracies": 1.0, "rewards/chosen": 0.20384415984153748, "rewards/margins": 2.3779678344726562, "rewards/rejected": -2.174123764038086, "step": 709 }, { "epoch": 0.03816075891537449, "grad_norm": 6.603420734405518, "learning_rate": 2.906701312312861e-07, "logits/chosen": -0.7732895612716675, "logits/rejected": -0.8093118071556091, "logps/chosen": -98.18729400634766, "logps/rejected": -142.55169677734375, "loss": 0.5989, "rewards/accuracies": 0.875, "rewards/chosen": 0.04766872525215149, "rewards/margins": 1.353705883026123, "rewards/rejected": -1.3060370683670044, "step": 710 }, { "epoch": 0.03821450646314262, "grad_norm": 8.56801700592041, "learning_rate": 2.888886149250173e-07, "logits/chosen": -0.5945100784301758, "logits/rejected": -0.837255597114563, "logps/chosen": -83.21514892578125, "logps/rejected": -137.60191345214844, "loss": 0.5887, "rewards/accuracies": 0.875, "rewards/chosen": -0.00731147825717926, "rewards/margins": 1.5267313718795776, "rewards/rejected": -1.5340428352355957, "step": 711 }, { "epoch": 0.03826825401091075, "grad_norm": 20.529142379760742, "learning_rate": 2.8711035421746363e-07, "logits/chosen": -0.6902414560317993, "logits/rejected": -0.8655490279197693, "logps/chosen": -97.7506103515625, "logps/rejected": -114.50788116455078, "loss": 0.5948, "rewards/accuracies": 1.0, "rewards/chosen": -0.19427956640720367, "rewards/margins": 1.1301366090774536, "rewards/rejected": -1.324416160583496, "step": 712 }, { "epoch": 0.03832200155867888, "grad_norm": 8.702665328979492, "learning_rate": 2.853353765316051e-07, "logits/chosen": -0.6203235387802124, "logits/rejected": -1.0115509033203125, "logps/chosen": -77.09933471679688, "logps/rejected": -122.24383544921875, "loss": 0.5887, "rewards/accuracies": 1.0, "rewards/chosen": -0.02742178738117218, "rewards/margins": 1.173346757888794, "rewards/rejected": -1.2007684707641602, "step": 713 }, { "epoch": 0.03837574910644702, "grad_norm": 6.712912559509277, "learning_rate": 2.835637092397932e-07, "logits/chosen": -0.547156572341919, "logits/rejected": -0.7539544105529785, "logps/chosen": -80.2132339477539, "logps/rejected": -130.85606384277344, "loss": 0.5495, "rewards/accuracies": 1.0, "rewards/chosen": 0.07244092226028442, "rewards/margins": 1.4282433986663818, "rewards/rejected": -1.3558025360107422, "step": 714 }, { "epoch": 0.038429496654215155, "grad_norm": 7.7094268798828125, "learning_rate": 2.8179537966332883e-07, "logits/chosen": -0.6612792015075684, "logits/rejected": -0.7429506778717041, "logps/chosen": -75.95818328857422, "logps/rejected": -115.54781341552734, "loss": 0.5729, "rewards/accuracies": 1.0, "rewards/chosen": 0.21513444185256958, "rewards/margins": 1.1586519479751587, "rewards/rejected": -0.9435175061225891, "step": 715 }, { "epoch": 0.038483244201983284, "grad_norm": 9.824764251708984, "learning_rate": 2.800304150720424e-07, "logits/chosen": -0.7811596989631653, "logits/rejected": -0.7992464303970337, "logps/chosen": -95.9091567993164, "logps/rejected": -119.14420318603516, "loss": 0.8004, "rewards/accuracies": 1.0, "rewards/chosen": -0.3405529260635376, "rewards/margins": 0.750622570514679, "rewards/rejected": -1.0911755561828613, "step": 716 }, { "epoch": 0.03853699174975142, "grad_norm": 8.54819107055664, "learning_rate": 2.7826884268387084e-07, "logits/chosen": -0.6746514439582825, "logits/rejected": -0.7976412773132324, "logps/chosen": -92.34904479980469, "logps/rejected": -121.63672637939453, "loss": 0.7238, "rewards/accuracies": 0.875, "rewards/chosen": -0.06223855912685394, "rewards/margins": 1.0201053619384766, "rewards/rejected": -1.0823439359664917, "step": 717 }, { "epoch": 0.03859073929751955, "grad_norm": 9.010311126708984, "learning_rate": 2.765106896644395e-07, "logits/chosen": -0.5708978772163391, "logits/rejected": -0.7735472321510315, "logps/chosen": -101.73075866699219, "logps/rejected": -146.85704040527344, "loss": 0.7598, "rewards/accuracies": 0.875, "rewards/chosen": -0.36111128330230713, "rewards/margins": 1.1331956386566162, "rewards/rejected": -1.4943069219589233, "step": 718 }, { "epoch": 0.038644486845287686, "grad_norm": 10.858604431152344, "learning_rate": 2.7475598312664285e-07, "logits/chosen": -0.769136369228363, "logits/rejected": -0.7775800228118896, "logps/chosen": -112.32925415039062, "logps/rejected": -128.8840789794922, "loss": 0.7091, "rewards/accuracies": 0.75, "rewards/chosen": 0.31727781891822815, "rewards/margins": 1.312415599822998, "rewards/rejected": -0.9951378107070923, "step": 719 }, { "epoch": 0.038698234393055815, "grad_norm": 7.689519882202148, "learning_rate": 2.730047501302266e-07, "logits/chosen": -0.6376043558120728, "logits/rejected": -0.9069548845291138, "logps/chosen": -96.82869720458984, "logps/rejected": -146.0591583251953, "loss": 0.5485, "rewards/accuracies": 1.0, "rewards/chosen": -0.08713502436876297, "rewards/margins": 1.3983957767486572, "rewards/rejected": -1.485530972480774, "step": 720 }, { "epoch": 0.03875198194082395, "grad_norm": 6.2469611167907715, "learning_rate": 2.712570176813697e-07, "logits/chosen": -0.6831238269805908, "logits/rejected": -0.7461338639259338, "logps/chosen": -96.95396423339844, "logps/rejected": -174.4862518310547, "loss": 0.4618, "rewards/accuracies": 0.875, "rewards/chosen": 0.23872342705726624, "rewards/margins": 2.2016587257385254, "rewards/rejected": -1.962935209274292, "step": 721 }, { "epoch": 0.03880572948859208, "grad_norm": 8.32548999786377, "learning_rate": 2.695128127322689e-07, "logits/chosen": -0.5565392971038818, "logits/rejected": -0.6749974489212036, "logps/chosen": -69.10150146484375, "logps/rejected": -97.45648193359375, "loss": 0.8292, "rewards/accuracies": 0.875, "rewards/chosen": 0.08680516481399536, "rewards/margins": 0.9992325901985168, "rewards/rejected": -0.9124274253845215, "step": 722 }, { "epoch": 0.03885947703636022, "grad_norm": 9.229727745056152, "learning_rate": 2.677721621807217e-07, "logits/chosen": -0.6534993648529053, "logits/rejected": -0.7931410670280457, "logps/chosen": -92.54600524902344, "logps/rejected": -122.69889831542969, "loss": 0.8207, "rewards/accuracies": 0.875, "rewards/chosen": -0.6375648975372314, "rewards/margins": 0.8042935132980347, "rewards/rejected": -1.4418584108352661, "step": 723 }, { "epoch": 0.038913224584128346, "grad_norm": 9.547405242919922, "learning_rate": 2.6603509286971337e-07, "logits/chosen": -0.4963204264640808, "logits/rejected": -0.6493628025054932, "logps/chosen": -83.09684753417969, "logps/rejected": -94.8472900390625, "loss": 0.8092, "rewards/accuracies": 0.75, "rewards/chosen": 0.08488932251930237, "rewards/margins": 0.8403060436248779, "rewards/rejected": -0.755416750907898, "step": 724 }, { "epoch": 0.03896697213189648, "grad_norm": 8.75818920135498, "learning_rate": 2.6430163158700113e-07, "logits/chosen": -0.4469984769821167, "logits/rejected": -0.8085393905639648, "logps/chosen": -98.25666809082031, "logps/rejected": -113.67827606201172, "loss": 0.6987, "rewards/accuracies": 1.0, "rewards/chosen": -0.021126359701156616, "rewards/margins": 0.9741745591163635, "rewards/rejected": -0.9953008890151978, "step": 725 }, { "epoch": 0.03902071967966462, "grad_norm": 5.0522966384887695, "learning_rate": 2.6257180506470277e-07, "logits/chosen": -0.7254922389984131, "logits/rejected": -0.8092784881591797, "logps/chosen": -91.55499267578125, "logps/rejected": -149.31326293945312, "loss": 0.3303, "rewards/accuracies": 1.0, "rewards/chosen": -0.013098828494548798, "rewards/margins": 1.840981125831604, "rewards/rejected": -1.8540799617767334, "step": 726 }, { "epoch": 0.03907446722743275, "grad_norm": 11.014172554016113, "learning_rate": 2.6084563997888296e-07, "logits/chosen": -0.8169276714324951, "logits/rejected": -0.8161587715148926, "logps/chosen": -117.38027954101562, "logps/rejected": -175.8406982421875, "loss": 0.4881, "rewards/accuracies": 0.875, "rewards/chosen": -0.38252317905426025, "rewards/margins": 1.8902794122695923, "rewards/rejected": -2.2728025913238525, "step": 727 }, { "epoch": 0.039128214775200884, "grad_norm": 6.365426540374756, "learning_rate": 2.591231629491423e-07, "logits/chosen": -0.7599687576293945, "logits/rejected": -0.7243821620941162, "logps/chosen": -85.49930572509766, "logps/rejected": -152.4051971435547, "loss": 0.4613, "rewards/accuracies": 1.0, "rewards/chosen": 0.1814870834350586, "rewards/margins": 1.754408359527588, "rewards/rejected": -1.5729211568832397, "step": 728 }, { "epoch": 0.03918196232296901, "grad_norm": 6.961399078369141, "learning_rate": 2.574044005382081e-07, "logits/chosen": -0.7352026700973511, "logits/rejected": -0.7147886753082275, "logps/chosen": -80.35275268554688, "logps/rejected": -127.64649200439453, "loss": 0.5072, "rewards/accuracies": 1.0, "rewards/chosen": 0.16196995973587036, "rewards/margins": 1.4976074695587158, "rewards/rejected": -1.3356375694274902, "step": 729 }, { "epoch": 0.03923570987073715, "grad_norm": 7.649588584899902, "learning_rate": 2.556893792515227e-07, "logits/chosen": -0.40624117851257324, "logits/rejected": -0.49398404359817505, "logps/chosen": -96.33955383300781, "logps/rejected": -123.00117492675781, "loss": 0.5675, "rewards/accuracies": 0.875, "rewards/chosen": -0.02899809181690216, "rewards/margins": 1.4590017795562744, "rewards/rejected": -1.4879999160766602, "step": 730 }, { "epoch": 0.03928945741850528, "grad_norm": 7.748857021331787, "learning_rate": 2.539781255368355e-07, "logits/chosen": -0.6851269006729126, "logits/rejected": -0.8611475825309753, "logps/chosen": -69.375244140625, "logps/rejected": -106.30546569824219, "loss": 0.714, "rewards/accuracies": 0.875, "rewards/chosen": -0.11472135037183762, "rewards/margins": 1.0802119970321655, "rewards/rejected": -1.1949334144592285, "step": 731 }, { "epoch": 0.039343204966273415, "grad_norm": 11.204488754272461, "learning_rate": 2.5227066578379617e-07, "logits/chosen": -0.6677002906799316, "logits/rejected": -0.7722468376159668, "logps/chosen": -105.6826171875, "logps/rejected": -132.34567260742188, "loss": 0.8176, "rewards/accuracies": 0.875, "rewards/chosen": -0.4593377113342285, "rewards/margins": 0.9329068660736084, "rewards/rejected": -1.392244577407837, "step": 732 }, { "epoch": 0.039396952514041544, "grad_norm": 6.80488920211792, "learning_rate": 2.505670263235464e-07, "logits/chosen": -0.5943731069564819, "logits/rejected": -0.9544639587402344, "logps/chosen": -70.56122589111328, "logps/rejected": -124.44296264648438, "loss": 0.61, "rewards/accuracies": 1.0, "rewards/chosen": -0.2388138771057129, "rewards/margins": 1.1477375030517578, "rewards/rejected": -1.3865512609481812, "step": 733 }, { "epoch": 0.03945070006180968, "grad_norm": 6.338500022888184, "learning_rate": 2.4886723342831375e-07, "logits/chosen": -0.4494349956512451, "logits/rejected": -0.8252532482147217, "logps/chosen": -84.60494995117188, "logps/rejected": -129.79150390625, "loss": 0.5303, "rewards/accuracies": 1.0, "rewards/chosen": -0.019162073731422424, "rewards/margins": 1.49580717086792, "rewards/rejected": -1.5149692296981812, "step": 734 }, { "epoch": 0.03950444760957781, "grad_norm": 8.7371187210083, "learning_rate": 2.4717131331100774e-07, "logits/chosen": -0.7468314170837402, "logits/rejected": -0.7121862173080444, "logps/chosen": -82.73765563964844, "logps/rejected": -115.15592956542969, "loss": 0.7233, "rewards/accuracies": 1.0, "rewards/chosen": 0.2907719612121582, "rewards/margins": 0.9356023669242859, "rewards/rejected": -0.6448303461074829, "step": 735 }, { "epoch": 0.039558195157345946, "grad_norm": 14.324700355529785, "learning_rate": 2.4547929212481435e-07, "logits/chosen": -0.7230560779571533, "logits/rejected": -0.6542279720306396, "logps/chosen": -72.34066009521484, "logps/rejected": -107.50325012207031, "loss": 0.9734, "rewards/accuracies": 0.625, "rewards/chosen": -0.11403907835483551, "rewards/margins": 0.9392410516738892, "rewards/rejected": -1.0532801151275635, "step": 736 }, { "epoch": 0.03961194270511408, "grad_norm": 8.586572647094727, "learning_rate": 2.4379119596279364e-07, "logits/chosen": -0.47067737579345703, "logits/rejected": -0.4720095694065094, "logps/chosen": -80.81477355957031, "logps/rejected": -117.63702392578125, "loss": 0.5737, "rewards/accuracies": 1.0, "rewards/chosen": 0.03301538527011871, "rewards/margins": 1.2072811126708984, "rewards/rejected": -1.174265742301941, "step": 737 }, { "epoch": 0.03966569025288221, "grad_norm": 7.990939617156982, "learning_rate": 2.421070508574763e-07, "logits/chosen": -0.6981499791145325, "logits/rejected": -0.8710757493972778, "logps/chosen": -91.7569351196289, "logps/rejected": -119.58995056152344, "loss": 0.7405, "rewards/accuracies": 1.0, "rewards/chosen": -0.01613597199320793, "rewards/margins": 1.1447652578353882, "rewards/rejected": -1.1609013080596924, "step": 738 }, { "epoch": 0.03971943780065035, "grad_norm": 6.775259017944336, "learning_rate": 2.404268827804637e-07, "logits/chosen": -0.718546986579895, "logits/rejected": -0.8627328872680664, "logps/chosen": -75.64596557617188, "logps/rejected": -96.53233337402344, "loss": 0.4742, "rewards/accuracies": 1.0, "rewards/chosen": 0.26878422498703003, "rewards/margins": 1.3637211322784424, "rewards/rejected": -1.0949368476867676, "step": 739 }, { "epoch": 0.03977318534841848, "grad_norm": 6.995737075805664, "learning_rate": 2.387507176420256e-07, "logits/chosen": -0.672360897064209, "logits/rejected": -0.7875985503196716, "logps/chosen": -94.52703857421875, "logps/rejected": -126.496826171875, "loss": 0.6328, "rewards/accuracies": 1.0, "rewards/chosen": 0.050301115959882736, "rewards/margins": 1.137545108795166, "rewards/rejected": -1.0872440338134766, "step": 740 }, { "epoch": 0.03982693289618661, "grad_norm": 8.672804832458496, "learning_rate": 2.3707858129070217e-07, "logits/chosen": -0.6607568264007568, "logits/rejected": -0.6581891179084778, "logps/chosen": -108.4710693359375, "logps/rejected": -139.23304748535156, "loss": 0.6852, "rewards/accuracies": 0.875, "rewards/chosen": -0.14192995429039001, "rewards/margins": 1.2771387100219727, "rewards/rejected": -1.419068694114685, "step": 741 }, { "epoch": 0.03988068044395474, "grad_norm": 10.163958549499512, "learning_rate": 2.3541049951290476e-07, "logits/chosen": -0.7237582802772522, "logits/rejected": -0.7803841829299927, "logps/chosen": -68.23587036132812, "logps/rejected": -106.08604431152344, "loss": 0.9347, "rewards/accuracies": 0.75, "rewards/chosen": -0.10141650587320328, "rewards/margins": 0.8714467883110046, "rewards/rejected": -0.9728633761405945, "step": 742 }, { "epoch": 0.03993442799172288, "grad_norm": 6.104794979095459, "learning_rate": 2.3374649803251756e-07, "logits/chosen": -0.6297526359558105, "logits/rejected": -0.5467948913574219, "logps/chosen": -90.19676208496094, "logps/rejected": -137.736083984375, "loss": 0.4469, "rewards/accuracies": 1.0, "rewards/chosen": 0.2702789306640625, "rewards/margins": 1.7160676717758179, "rewards/rejected": -1.4457887411117554, "step": 743 }, { "epoch": 0.03998817553949101, "grad_norm": 9.962902069091797, "learning_rate": 2.3208660251050156e-07, "logits/chosen": -0.706579864025116, "logits/rejected": -1.0785006284713745, "logps/chosen": -84.78997802734375, "logps/rejected": -128.2900848388672, "loss": 0.9357, "rewards/accuracies": 0.75, "rewards/chosen": -0.21949893236160278, "rewards/margins": 0.7860291004180908, "rewards/rejected": -1.0055280923843384, "step": 744 }, { "epoch": 0.040041923087259144, "grad_norm": 6.4992852210998535, "learning_rate": 2.3043083854449986e-07, "logits/chosen": -0.6702761650085449, "logits/rejected": -0.753881573677063, "logps/chosen": -89.30689239501953, "logps/rejected": -126.93154907226562, "loss": 0.5327, "rewards/accuracies": 1.0, "rewards/chosen": -0.0030112862586975098, "rewards/margins": 1.3845264911651611, "rewards/rejected": -1.3875377178192139, "step": 745 }, { "epoch": 0.040095670635027274, "grad_norm": 9.823094367980957, "learning_rate": 2.287792316684407e-07, "logits/chosen": -0.8790714144706726, "logits/rejected": -0.9885706901550293, "logps/chosen": -79.85736083984375, "logps/rejected": -118.66886901855469, "loss": 0.8538, "rewards/accuracies": 0.75, "rewards/chosen": -0.212977796792984, "rewards/margins": 1.0224056243896484, "rewards/rejected": -1.23538339138031, "step": 746 }, { "epoch": 0.04014941818279541, "grad_norm": 11.860604286193848, "learning_rate": 2.2713180735214504e-07, "logits/chosen": -0.894019365310669, "logits/rejected": -0.9072297811508179, "logps/chosen": -79.12437438964844, "logps/rejected": -85.81069946289062, "loss": 1.0635, "rewards/accuracies": 0.875, "rewards/chosen": -0.1724262535572052, "rewards/margins": 0.5364360213279724, "rewards/rejected": -0.7088623046875, "step": 747 }, { "epoch": 0.040203165730563546, "grad_norm": 6.627924919128418, "learning_rate": 2.2548859100093403e-07, "logits/chosen": -0.6606245040893555, "logits/rejected": -0.7072981595993042, "logps/chosen": -105.18218231201172, "logps/rejected": -169.38973999023438, "loss": 0.4074, "rewards/accuracies": 1.0, "rewards/chosen": -0.16881093382835388, "rewards/margins": 1.97557532787323, "rewards/rejected": -2.1443862915039062, "step": 748 }, { "epoch": 0.040256913278331675, "grad_norm": 6.956027984619141, "learning_rate": 2.238496079552367e-07, "logits/chosen": -0.664187490940094, "logits/rejected": -0.731914758682251, "logps/chosen": -116.1605224609375, "logps/rejected": -159.2670440673828, "loss": 0.4656, "rewards/accuracies": 1.0, "rewards/chosen": -0.17156267166137695, "rewards/margins": 1.480554223060608, "rewards/rejected": -1.6521168947219849, "step": 749 }, { "epoch": 0.04031066082609981, "grad_norm": 11.054577827453613, "learning_rate": 2.2221488349019902e-07, "logits/chosen": -0.48339325189590454, "logits/rejected": -0.8273731470108032, "logps/chosen": -107.74986267089844, "logps/rejected": -135.9376678466797, "loss": 0.8587, "rewards/accuracies": 0.875, "rewards/chosen": -0.40245431661605835, "rewards/margins": 0.8871314525604248, "rewards/rejected": -1.2895857095718384, "step": 750 }, { "epoch": 0.04036440837386794, "grad_norm": 5.0379157066345215, "learning_rate": 2.205844428152942e-07, "logits/chosen": -0.6862085461616516, "logits/rejected": -0.929879903793335, "logps/chosen": -79.9913330078125, "logps/rejected": -132.914306640625, "loss": 0.4051, "rewards/accuracies": 1.0, "rewards/chosen": 0.051264241337776184, "rewards/margins": 1.7373111248016357, "rewards/rejected": -1.686046838760376, "step": 751 }, { "epoch": 0.04041815592163608, "grad_norm": 8.974724769592285, "learning_rate": 2.1895831107393482e-07, "logits/chosen": -0.6849677562713623, "logits/rejected": -0.8281288146972656, "logps/chosen": -106.23727416992188, "logps/rejected": -130.7172088623047, "loss": 0.6103, "rewards/accuracies": 1.0, "rewards/chosen": -0.02682015299797058, "rewards/margins": 1.2523826360702515, "rewards/rejected": -1.2792026996612549, "step": 752 }, { "epoch": 0.040471903469404207, "grad_norm": 6.217437267303467, "learning_rate": 2.1733651334308362e-07, "logits/chosen": -0.6633888483047485, "logits/rejected": -0.8611729145050049, "logps/chosen": -80.74871826171875, "logps/rejected": -132.0811767578125, "loss": 0.5389, "rewards/accuracies": 1.0, "rewards/chosen": -0.33083677291870117, "rewards/margins": 1.3223114013671875, "rewards/rejected": -1.6531481742858887, "step": 753 }, { "epoch": 0.04052565101717234, "grad_norm": 8.1962890625, "learning_rate": 2.1571907463286798e-07, "logits/chosen": -0.7560558319091797, "logits/rejected": -0.8755959868431091, "logps/chosen": -88.15702819824219, "logps/rejected": -127.85890197753906, "loss": 0.6221, "rewards/accuracies": 1.0, "rewards/chosen": 0.008878998458385468, "rewards/margins": 1.315895915031433, "rewards/rejected": -1.3070168495178223, "step": 754 }, { "epoch": 0.04057939856494047, "grad_norm": 11.445975303649902, "learning_rate": 2.1410601988619392e-07, "logits/chosen": -0.7236574292182922, "logits/rejected": -0.8574732542037964, "logps/chosen": -91.7083740234375, "logps/rejected": -117.82135009765625, "loss": 0.7799, "rewards/accuracies": 0.875, "rewards/chosen": -0.05091686546802521, "rewards/margins": 0.9790334701538086, "rewards/rejected": -1.0299503803253174, "step": 755 }, { "epoch": 0.04063314611270861, "grad_norm": 9.218972206115723, "learning_rate": 2.1249737397836088e-07, "logits/chosen": -0.49474814534187317, "logits/rejected": -0.7600416541099548, "logps/chosen": -83.51551818847656, "logps/rejected": -127.68470764160156, "loss": 0.7336, "rewards/accuracies": 0.875, "rewards/chosen": -0.0253116637468338, "rewards/margins": 0.9774875044822693, "rewards/rejected": -1.0027992725372314, "step": 756 }, { "epoch": 0.04068689366047674, "grad_norm": 9.73753833770752, "learning_rate": 2.1089316171667837e-07, "logits/chosen": -0.6559625267982483, "logits/rejected": -0.9304879903793335, "logps/chosen": -97.44245910644531, "logps/rejected": -114.62492370605469, "loss": 0.6598, "rewards/accuracies": 0.875, "rewards/chosen": 0.028372079133987427, "rewards/margins": 1.1279242038726807, "rewards/rejected": -1.0995521545410156, "step": 757 }, { "epoch": 0.040740641208244874, "grad_norm": 10.483987808227539, "learning_rate": 2.092934078400847e-07, "logits/chosen": -0.5298672318458557, "logits/rejected": -0.7872850894927979, "logps/chosen": -73.68215942382812, "logps/rejected": -96.79808044433594, "loss": 0.8845, "rewards/accuracies": 0.75, "rewards/chosen": -0.12978212535381317, "rewards/margins": 0.6918063163757324, "rewards/rejected": -0.8215884566307068, "step": 758 }, { "epoch": 0.04079438875601301, "grad_norm": 9.00783634185791, "learning_rate": 2.0769813701876332e-07, "logits/chosen": -0.6158140897750854, "logits/rejected": -0.639676570892334, "logps/chosen": -80.99070739746094, "logps/rejected": -132.4852294921875, "loss": 0.7793, "rewards/accuracies": 1.0, "rewards/chosen": 0.15527528524398804, "rewards/margins": 0.9577383995056152, "rewards/rejected": -0.8024630546569824, "step": 759 }, { "epoch": 0.04084813630378114, "grad_norm": 8.80483341217041, "learning_rate": 2.0610737385376348e-07, "logits/chosen": -0.6051195859909058, "logits/rejected": -0.6256772875785828, "logps/chosen": -66.02762603759766, "logps/rejected": -102.23220825195312, "loss": 0.7652, "rewards/accuracies": 1.0, "rewards/chosen": -0.00941435992717743, "rewards/margins": 1.086291790008545, "rewards/rejected": -1.0957062244415283, "step": 760 }, { "epoch": 0.040901883851549276, "grad_norm": 6.531935214996338, "learning_rate": 2.0452114287662126e-07, "logits/chosen": -0.8495103120803833, "logits/rejected": -0.9723453521728516, "logps/chosen": -86.06903076171875, "logps/rejected": -128.5280303955078, "loss": 0.4468, "rewards/accuracies": 1.0, "rewards/chosen": 0.06953192502260208, "rewards/margins": 1.421574592590332, "rewards/rejected": -1.3520426750183105, "step": 761 }, { "epoch": 0.040955631399317405, "grad_norm": 7.278002738952637, "learning_rate": 2.0293946854898076e-07, "logits/chosen": -0.6776783466339111, "logits/rejected": -0.7068448066711426, "logps/chosen": -83.78226470947266, "logps/rejected": -135.93556213378906, "loss": 0.5164, "rewards/accuracies": 1.0, "rewards/chosen": 0.17833323776721954, "rewards/margins": 1.3872275352478027, "rewards/rejected": -1.2088942527770996, "step": 762 }, { "epoch": 0.04100937894708554, "grad_norm": 9.292208671569824, "learning_rate": 2.0136237526221645e-07, "logits/chosen": -0.7927431464195251, "logits/rejected": -0.8521476984024048, "logps/chosen": -104.9405517578125, "logps/rejected": -130.06947326660156, "loss": 0.6408, "rewards/accuracies": 0.875, "rewards/chosen": -0.059896670281887054, "rewards/margins": 1.3218350410461426, "rewards/rejected": -1.381731629371643, "step": 763 }, { "epoch": 0.04106312649485367, "grad_norm": 12.382944107055664, "learning_rate": 1.9978988733705803e-07, "logits/chosen": -0.6960416436195374, "logits/rejected": -0.8311112523078918, "logps/chosen": -79.1255111694336, "logps/rejected": -99.9970474243164, "loss": 1.0249, "rewards/accuracies": 0.75, "rewards/chosen": -0.22159019112586975, "rewards/margins": 0.6299751400947571, "rewards/rejected": -0.8515653610229492, "step": 764 }, { "epoch": 0.04111687404262181, "grad_norm": 8.905945777893066, "learning_rate": 1.9822202902321427e-07, "logits/chosen": -0.5381357073783875, "logits/rejected": -0.6292402744293213, "logps/chosen": -90.88026428222656, "logps/rejected": -155.60797119140625, "loss": 0.5253, "rewards/accuracies": 1.0, "rewards/chosen": 0.14555931091308594, "rewards/margins": 1.7371885776519775, "rewards/rejected": -1.5916292667388916, "step": 765 }, { "epoch": 0.041170621590389936, "grad_norm": 12.934619903564453, "learning_rate": 1.9665882449900022e-07, "logits/chosen": -0.6883718371391296, "logits/rejected": -0.9205670356750488, "logps/chosen": -92.49539947509766, "logps/rejected": -99.02030944824219, "loss": 0.8557, "rewards/accuracies": 0.875, "rewards/chosen": -0.44033122062683105, "rewards/margins": 0.7193205952644348, "rewards/rejected": -1.159651756286621, "step": 766 }, { "epoch": 0.04122436913815807, "grad_norm": 4.9216485023498535, "learning_rate": 1.951002978709631e-07, "logits/chosen": -0.7320770025253296, "logits/rejected": -0.7826344966888428, "logps/chosen": -102.69308471679688, "logps/rejected": -155.98753356933594, "loss": 0.3217, "rewards/accuracies": 1.0, "rewards/chosen": 0.03503851592540741, "rewards/margins": 2.0056965351104736, "rewards/rejected": -1.9706580638885498, "step": 767 }, { "epoch": 0.0412781166859262, "grad_norm": 9.495551109313965, "learning_rate": 1.9354647317351187e-07, "logits/chosen": -0.6473127603530884, "logits/rejected": -0.8320600986480713, "logps/chosen": -82.3733139038086, "logps/rejected": -104.32769012451172, "loss": 0.884, "rewards/accuracies": 0.875, "rewards/chosen": 0.005017995834350586, "rewards/margins": 0.7225644588470459, "rewards/rejected": -0.7175464630126953, "step": 768 }, { "epoch": 0.04133186423369434, "grad_norm": 11.787629127502441, "learning_rate": 1.9199737436854512e-07, "logits/chosen": -0.812034010887146, "logits/rejected": -0.84006667137146, "logps/chosen": -103.72234344482422, "logps/rejected": -142.36610412597656, "loss": 0.7894, "rewards/accuracies": 0.875, "rewards/chosen": -0.1840866059064865, "rewards/margins": 1.1640076637268066, "rewards/rejected": -1.3480943441390991, "step": 769 }, { "epoch": 0.041385611781462474, "grad_norm": 9.549795150756836, "learning_rate": 1.9045302534508295e-07, "logits/chosen": -0.722746729850769, "logits/rejected": -0.8457269668579102, "logps/chosen": -112.07904052734375, "logps/rejected": -161.46241760253906, "loss": 0.6599, "rewards/accuracies": 0.875, "rewards/chosen": -0.526552677154541, "rewards/margins": 1.2315235137939453, "rewards/rejected": -1.7580761909484863, "step": 770 }, { "epoch": 0.0414393593292306, "grad_norm": 14.094355583190918, "learning_rate": 1.8891344991889796e-07, "logits/chosen": -0.8485637903213501, "logits/rejected": -0.8959760665893555, "logps/chosen": -102.66665649414062, "logps/rejected": -120.41519927978516, "loss": 1.0252, "rewards/accuracies": 0.75, "rewards/chosen": -0.410696417093277, "rewards/margins": 0.6315237283706665, "rewards/rejected": -1.042220115661621, "step": 771 }, { "epoch": 0.04149310687699874, "grad_norm": 12.905317306518555, "learning_rate": 1.8737867183214756e-07, "logits/chosen": -0.5519200563430786, "logits/rejected": -0.7324067950248718, "logps/chosen": -90.91896057128906, "logps/rejected": -118.41909790039062, "loss": 0.9694, "rewards/accuracies": 0.75, "rewards/chosen": -0.35865291953086853, "rewards/margins": 0.8625617027282715, "rewards/rejected": -1.2212145328521729, "step": 772 }, { "epoch": 0.04154685442476687, "grad_norm": 8.228156089782715, "learning_rate": 1.8584871475300811e-07, "logits/chosen": -0.7101948857307434, "logits/rejected": -0.9043569564819336, "logps/chosen": -77.53511810302734, "logps/rejected": -125.19535827636719, "loss": 0.6653, "rewards/accuracies": 1.0, "rewards/chosen": -0.21376252174377441, "rewards/margins": 1.3221904039382935, "rewards/rejected": -1.5359528064727783, "step": 773 }, { "epoch": 0.041600601972535005, "grad_norm": 7.290024280548096, "learning_rate": 1.8432360227531113e-07, "logits/chosen": -0.612587571144104, "logits/rejected": -0.5847752094268799, "logps/chosen": -98.63694763183594, "logps/rejected": -111.63262939453125, "loss": 0.6265, "rewards/accuracies": 0.875, "rewards/chosen": 0.11111493408679962, "rewards/margins": 1.2687486410140991, "rewards/rejected": -1.1576337814331055, "step": 774 }, { "epoch": 0.041654349520303134, "grad_norm": 5.711800575256348, "learning_rate": 1.828033579181773e-07, "logits/chosen": -0.5377753973007202, "logits/rejected": -0.7969902157783508, "logps/chosen": -80.65196228027344, "logps/rejected": -108.97067260742188, "loss": 0.4637, "rewards/accuracies": 1.0, "rewards/chosen": 0.0439884215593338, "rewards/margins": 1.440993070602417, "rewards/rejected": -1.3970046043395996, "step": 775 }, { "epoch": 0.04170809706807127, "grad_norm": 7.454909324645996, "learning_rate": 1.812880051256551e-07, "logits/chosen": -0.636047899723053, "logits/rejected": -0.7850624322891235, "logps/chosen": -95.42554473876953, "logps/rejected": -139.06573486328125, "loss": 0.5782, "rewards/accuracies": 0.875, "rewards/chosen": -0.00018759071826934814, "rewards/margins": 1.4025014638900757, "rewards/rejected": -1.402688980102539, "step": 776 }, { "epoch": 0.0417618446158394, "grad_norm": 5.392938613891602, "learning_rate": 1.7977756726635957e-07, "logits/chosen": -0.5641781091690063, "logits/rejected": -0.48685941100120544, "logps/chosen": -74.94424438476562, "logps/rejected": -124.49942016601562, "loss": 0.4021, "rewards/accuracies": 1.0, "rewards/chosen": 0.25797563791275024, "rewards/margins": 1.6695250272750854, "rewards/rejected": -1.4115493297576904, "step": 777 }, { "epoch": 0.041815592163607536, "grad_norm": 6.904516220092773, "learning_rate": 1.7827206763311054e-07, "logits/chosen": -0.8065742254257202, "logits/rejected": -0.7508752346038818, "logps/chosen": -93.78004455566406, "logps/rejected": -151.7648468017578, "loss": 0.4645, "rewards/accuracies": 0.875, "rewards/chosen": -0.045453302562236786, "rewards/margins": 1.799658179283142, "rewards/rejected": -1.845111608505249, "step": 778 }, { "epoch": 0.041869339711375665, "grad_norm": 8.814549446105957, "learning_rate": 1.7677152944257511e-07, "logits/chosen": -0.6457135677337646, "logits/rejected": -1.0086586475372314, "logps/chosen": -77.23780822753906, "logps/rejected": -122.76145935058594, "loss": 0.5954, "rewards/accuracies": 0.875, "rewards/chosen": 0.04643264040350914, "rewards/margins": 1.447402000427246, "rewards/rejected": -1.4009692668914795, "step": 779 }, { "epoch": 0.0419230872591438, "grad_norm": 8.680941581726074, "learning_rate": 1.7527597583490823e-07, "logits/chosen": -0.6688162088394165, "logits/rejected": -0.7194143533706665, "logps/chosen": -102.35894775390625, "logps/rejected": -146.42787170410156, "loss": 0.6339, "rewards/accuracies": 1.0, "rewards/chosen": -0.39258289337158203, "rewards/margins": 1.2327580451965332, "rewards/rejected": -1.6253408193588257, "step": 780 }, { "epoch": 0.04197683480691194, "grad_norm": 6.057084560394287, "learning_rate": 1.7378542987339673e-07, "logits/chosen": -0.8462555408477783, "logits/rejected": -0.8595598936080933, "logps/chosen": -88.06340026855469, "logps/rejected": -153.61024475097656, "loss": 0.4025, "rewards/accuracies": 1.0, "rewards/chosen": 0.06655742228031158, "rewards/margins": 1.755646824836731, "rewards/rejected": -1.6890895366668701, "step": 781 }, { "epoch": 0.04203058235468007, "grad_norm": 9.910270690917969, "learning_rate": 1.7229991454410308e-07, "logits/chosen": -0.6037763953208923, "logits/rejected": -0.7760944366455078, "logps/chosen": -91.40938568115234, "logps/rejected": -128.02890014648438, "loss": 0.6628, "rewards/accuracies": 0.875, "rewards/chosen": -0.008961960673332214, "rewards/margins": 1.2442078590393066, "rewards/rejected": -1.2531697750091553, "step": 782 }, { "epoch": 0.0420843299024482, "grad_norm": 6.559506893157959, "learning_rate": 1.708194527555114e-07, "logits/chosen": -0.7718608379364014, "logits/rejected": -0.8073055744171143, "logps/chosen": -75.9872817993164, "logps/rejected": -121.38804626464844, "loss": 0.6673, "rewards/accuracies": 1.0, "rewards/chosen": 0.049701713025569916, "rewards/margins": 1.4952752590179443, "rewards/rejected": -1.4455735683441162, "step": 783 }, { "epoch": 0.04213807745021633, "grad_norm": 7.880884170532227, "learning_rate": 1.6934406733817413e-07, "logits/chosen": -0.5035495162010193, "logits/rejected": -0.5217552185058594, "logps/chosen": -81.76063537597656, "logps/rejected": -119.80155944824219, "loss": 0.6653, "rewards/accuracies": 0.875, "rewards/chosen": 0.24052682518959045, "rewards/margins": 1.204782485961914, "rewards/rejected": -0.9642558097839355, "step": 784 }, { "epoch": 0.04219182499798447, "grad_norm": 9.494074821472168, "learning_rate": 1.6787378104435929e-07, "logits/chosen": -0.39203301072120667, "logits/rejected": -0.7699752449989319, "logps/chosen": -90.14427185058594, "logps/rejected": -119.53788757324219, "loss": 0.8109, "rewards/accuracies": 0.875, "rewards/chosen": -0.14762234687805176, "rewards/margins": 0.968752384185791, "rewards/rejected": -1.1163747310638428, "step": 785 }, { "epoch": 0.0422455725457526, "grad_norm": 4.707596302032471, "learning_rate": 1.6640861654770005e-07, "logits/chosen": -0.562144935131073, "logits/rejected": -0.6327670812606812, "logps/chosen": -88.89228057861328, "logps/rejected": -151.77142333984375, "loss": 0.3237, "rewards/accuracies": 1.0, "rewards/chosen": 0.17785638570785522, "rewards/margins": 2.139118194580078, "rewards/rejected": -1.9612617492675781, "step": 786 }, { "epoch": 0.042299320093520734, "grad_norm": 8.249420166015625, "learning_rate": 1.649485964428462e-07, "logits/chosen": -0.5297073125839233, "logits/rejected": -0.9008538722991943, "logps/chosen": -82.14012908935547, "logps/rejected": -106.90983581542969, "loss": 0.5595, "rewards/accuracies": 1.0, "rewards/chosen": -0.013055041432380676, "rewards/margins": 1.2959339618682861, "rewards/rejected": -1.3089890480041504, "step": 787 }, { "epoch": 0.042353067641288863, "grad_norm": 7.5422868728637695, "learning_rate": 1.6349374324511346e-07, "logits/chosen": -0.8036482334136963, "logits/rejected": -0.757236897945404, "logps/chosen": -84.73725891113281, "logps/rejected": -121.5020751953125, "loss": 0.5455, "rewards/accuracies": 1.0, "rewards/chosen": -0.12349863350391388, "rewards/margins": 1.1936731338500977, "rewards/rejected": -1.3171716928482056, "step": 788 }, { "epoch": 0.042406815189057, "grad_norm": 7.401148796081543, "learning_rate": 1.620440793901377e-07, "logits/chosen": -0.6493305563926697, "logits/rejected": -0.9544890522956848, "logps/chosen": -76.6766357421875, "logps/rejected": -147.06057739257812, "loss": 0.5729, "rewards/accuracies": 1.0, "rewards/chosen": 0.2123546302318573, "rewards/margins": 1.6216256618499756, "rewards/rejected": -1.409271001815796, "step": 789 }, { "epoch": 0.04246056273682513, "grad_norm": 6.95987606048584, "learning_rate": 1.605996272335291e-07, "logits/chosen": -0.7036823630332947, "logits/rejected": -0.7662244439125061, "logps/chosen": -91.12955474853516, "logps/rejected": -124.01072692871094, "loss": 0.5475, "rewards/accuracies": 1.0, "rewards/chosen": -0.10143748670816422, "rewards/margins": 1.2852132320404053, "rewards/rejected": -1.386650800704956, "step": 790 }, { "epoch": 0.042514310284593265, "grad_norm": 5.245736598968506, "learning_rate": 1.5916040905052692e-07, "logits/chosen": -0.7547256946563721, "logits/rejected": -0.8914695978164673, "logps/chosen": -105.06613159179688, "logps/rejected": -149.74188232421875, "loss": 0.2963, "rewards/accuracies": 1.0, "rewards/chosen": 0.20455770194530487, "rewards/margins": 2.177342176437378, "rewards/rejected": -1.9727845191955566, "step": 791 }, { "epoch": 0.0425680578323614, "grad_norm": 8.579684257507324, "learning_rate": 1.5772644703565564e-07, "logits/chosen": -0.4905208349227905, "logits/rejected": -0.691032886505127, "logps/chosen": -88.70614624023438, "logps/rejected": -126.30595397949219, "loss": 0.6618, "rewards/accuracies": 1.0, "rewards/chosen": -0.019372841343283653, "rewards/margins": 1.209036111831665, "rewards/rejected": -1.228408932685852, "step": 792 }, { "epoch": 0.04262180538012953, "grad_norm": 8.709781646728516, "learning_rate": 1.562977633023837e-07, "logits/chosen": -0.5709748864173889, "logits/rejected": -0.8440839052200317, "logps/chosen": -78.98991394042969, "logps/rejected": -113.58998107910156, "loss": 0.6824, "rewards/accuracies": 0.875, "rewards/chosen": -0.31041979789733887, "rewards/margins": 1.3595646619796753, "rewards/rejected": -1.6699843406677246, "step": 793 }, { "epoch": 0.04267555292789767, "grad_norm": 8.65339183807373, "learning_rate": 1.548743798827814e-07, "logits/chosen": -0.6976636648178101, "logits/rejected": -0.7981129288673401, "logps/chosen": -111.41394805908203, "logps/rejected": -152.5538787841797, "loss": 0.526, "rewards/accuracies": 1.0, "rewards/chosen": -0.09131629765033722, "rewards/margins": 1.5127274990081787, "rewards/rejected": -1.6040438413619995, "step": 794 }, { "epoch": 0.042729300475665796, "grad_norm": 7.517888069152832, "learning_rate": 1.534563187271821e-07, "logits/chosen": -0.6796112656593323, "logits/rejected": -0.7636511325836182, "logps/chosen": -101.79129028320312, "logps/rejected": -142.10092163085938, "loss": 0.5433, "rewards/accuracies": 1.0, "rewards/chosen": -0.09343726933002472, "rewards/margins": 1.6396392583847046, "rewards/rejected": -1.733076572418213, "step": 795 }, { "epoch": 0.04278304802343393, "grad_norm": 12.087640762329102, "learning_rate": 1.5204360170384284e-07, "logits/chosen": -0.5779902338981628, "logits/rejected": -0.9241939187049866, "logps/chosen": -88.50374603271484, "logps/rejected": -136.03533935546875, "loss": 0.757, "rewards/accuracies": 0.875, "rewards/chosen": -0.4299134314060211, "rewards/margins": 1.2201014757156372, "rewards/rejected": -1.650014877319336, "step": 796 }, { "epoch": 0.04283679557120206, "grad_norm": 8.15462589263916, "learning_rate": 1.5063625059860796e-07, "logits/chosen": -0.6898058652877808, "logits/rejected": -0.8636894226074219, "logps/chosen": -77.84471130371094, "logps/rejected": -127.77925109863281, "loss": 0.7362, "rewards/accuracies": 0.875, "rewards/chosen": 0.09291724860668182, "rewards/margins": 1.3533587455749512, "rewards/rejected": -1.260441541671753, "step": 797 }, { "epoch": 0.0428905431189702, "grad_norm": 7.077314853668213, "learning_rate": 1.4923428711457215e-07, "logits/chosen": -0.46082955598831177, "logits/rejected": -0.7680069208145142, "logps/chosen": -75.48269653320312, "logps/rejected": -147.4001007080078, "loss": 0.614, "rewards/accuracies": 0.875, "rewards/chosen": -0.14132261276245117, "rewards/margins": 1.6272555589675903, "rewards/rejected": -1.768578290939331, "step": 798 }, { "epoch": 0.04294429066673833, "grad_norm": 8.566364288330078, "learning_rate": 1.4783773287174684e-07, "logits/chosen": -0.8981176614761353, "logits/rejected": -0.9236620664596558, "logps/chosen": -84.2950439453125, "logps/rejected": -123.73194885253906, "loss": 0.606, "rewards/accuracies": 0.875, "rewards/chosen": -0.07914933562278748, "rewards/margins": 1.4035353660583496, "rewards/rejected": -1.4826847314834595, "step": 799 }, { "epoch": 0.042998038214506464, "grad_norm": 5.636478424072266, "learning_rate": 1.4644660940672627e-07, "logits/chosen": -0.7270197868347168, "logits/rejected": -0.8622840642929077, "logps/chosen": -94.80131530761719, "logps/rejected": -138.4375, "loss": 0.4228, "rewards/accuracies": 1.0, "rewards/chosen": -0.08075408637523651, "rewards/margins": 1.6637669801712036, "rewards/rejected": -1.744521141052246, "step": 800 }, { "epoch": 0.04305178576227459, "grad_norm": 10.22067642211914, "learning_rate": 1.4506093817235492e-07, "logits/chosen": -0.7612448930740356, "logits/rejected": -0.9169006943702698, "logps/chosen": -94.57559204101562, "logps/rejected": -129.1352081298828, "loss": 0.7855, "rewards/accuracies": 0.875, "rewards/chosen": -0.5829235315322876, "rewards/margins": 1.0542347431182861, "rewards/rejected": -1.6371581554412842, "step": 801 }, { "epoch": 0.04310553331004273, "grad_norm": 8.184894561767578, "learning_rate": 1.436807405373973e-07, "logits/chosen": -0.6455401182174683, "logits/rejected": -0.8337315320968628, "logps/chosen": -76.52290344238281, "logps/rejected": -115.4096450805664, "loss": 0.5936, "rewards/accuracies": 1.0, "rewards/chosen": 0.16489064693450928, "rewards/margins": 1.3316378593444824, "rewards/rejected": -1.1667472124099731, "step": 802 }, { "epoch": 0.043159280857810865, "grad_norm": 9.81550407409668, "learning_rate": 1.4230603778620852e-07, "logits/chosen": -0.81175297498703, "logits/rejected": -0.9541151523590088, "logps/chosen": -82.7430419921875, "logps/rejected": -129.85885620117188, "loss": 0.768, "rewards/accuracies": 0.875, "rewards/chosen": -0.07158079743385315, "rewards/margins": 1.2380086183547974, "rewards/rejected": -1.3095893859863281, "step": 803 }, { "epoch": 0.043213028405578995, "grad_norm": 6.948380470275879, "learning_rate": 1.4093685111840565e-07, "logits/chosen": -0.5378615856170654, "logits/rejected": -0.8594182729721069, "logps/chosen": -75.70240020751953, "logps/rejected": -140.3169708251953, "loss": 0.5425, "rewards/accuracies": 1.0, "rewards/chosen": 0.11253901571035385, "rewards/margins": 1.2408199310302734, "rewards/rejected": -1.128280758857727, "step": 804 }, { "epoch": 0.04326677595334713, "grad_norm": 7.197903633117676, "learning_rate": 1.3957320164854059e-07, "logits/chosen": -0.6320145130157471, "logits/rejected": -0.8845992088317871, "logps/chosen": -72.27452087402344, "logps/rejected": -129.891357421875, "loss": 0.4984, "rewards/accuracies": 1.0, "rewards/chosen": 0.2827944755554199, "rewards/margins": 1.6825041770935059, "rewards/rejected": -1.399709701538086, "step": 805 }, { "epoch": 0.04332052350111526, "grad_norm": 7.119307994842529, "learning_rate": 1.382151104057754e-07, "logits/chosen": -0.6298428773880005, "logits/rejected": -0.5671582818031311, "logps/chosen": -86.71846008300781, "logps/rejected": -147.65133666992188, "loss": 0.517, "rewards/accuracies": 1.0, "rewards/chosen": 0.06335914134979248, "rewards/margins": 1.4575594663619995, "rewards/rejected": -1.394200325012207, "step": 806 }, { "epoch": 0.043374271048883396, "grad_norm": 7.702904224395752, "learning_rate": 1.3686259833355678e-07, "logits/chosen": -0.700762927532196, "logits/rejected": -0.8786563873291016, "logps/chosen": -93.29704284667969, "logps/rejected": -152.09317016601562, "loss": 0.4413, "rewards/accuracies": 1.0, "rewards/chosen": 0.05422419309616089, "rewards/margins": 1.701130986213684, "rewards/rejected": -1.6469067335128784, "step": 807 }, { "epoch": 0.043428018596651526, "grad_norm": 8.58197021484375, "learning_rate": 1.3551568628929432e-07, "logits/chosen": -0.5673668384552002, "logits/rejected": -0.7246526479721069, "logps/chosen": -79.49383544921875, "logps/rejected": -101.97599792480469, "loss": 0.8306, "rewards/accuracies": 1.0, "rewards/chosen": -0.15435445308685303, "rewards/margins": 0.7302920818328857, "rewards/rejected": -0.8846465349197388, "step": 808 }, { "epoch": 0.04348176614441966, "grad_norm": 8.8089599609375, "learning_rate": 1.3417439504403766e-07, "logits/chosen": -0.8026152849197388, "logits/rejected": -0.9042845964431763, "logps/chosen": -79.97138977050781, "logps/rejected": -124.31282043457031, "loss": 0.6417, "rewards/accuracies": 1.0, "rewards/chosen": 0.10176515579223633, "rewards/margins": 1.0441250801086426, "rewards/rejected": -0.942359983921051, "step": 809 }, { "epoch": 0.04353551369218779, "grad_norm": 10.705148696899414, "learning_rate": 1.3283874528215733e-07, "logits/chosen": -0.6149063110351562, "logits/rejected": -0.82048499584198, "logps/chosen": -77.50799560546875, "logps/rejected": -122.70699310302734, "loss": 0.6471, "rewards/accuracies": 0.875, "rewards/chosen": 0.0005831122398376465, "rewards/margins": 1.27958083152771, "rewards/rejected": -1.2789976596832275, "step": 810 }, { "epoch": 0.04358926123995593, "grad_norm": 8.403605461120605, "learning_rate": 1.3150875760102465e-07, "logits/chosen": -0.8394116759300232, "logits/rejected": -0.8234984874725342, "logps/chosen": -81.11840057373047, "logps/rejected": -99.7491226196289, "loss": 0.8155, "rewards/accuracies": 0.875, "rewards/chosen": 0.055725935846567154, "rewards/margins": 0.8716539144515991, "rewards/rejected": -0.8159279227256775, "step": 811 }, { "epoch": 0.043643008787724064, "grad_norm": 6.507322788238525, "learning_rate": 1.3018445251069508e-07, "logits/chosen": -0.5855860710144043, "logits/rejected": -0.82449871301651, "logps/chosen": -70.74617004394531, "logps/rejected": -107.79280853271484, "loss": 0.5571, "rewards/accuracies": 1.0, "rewards/chosen": 0.38337039947509766, "rewards/margins": 1.3861950635910034, "rewards/rejected": -1.0028245449066162, "step": 812 }, { "epoch": 0.04369675633549219, "grad_norm": 5.838150501251221, "learning_rate": 1.2886585043359156e-07, "logits/chosen": -0.6114562153816223, "logits/rejected": -0.7347598671913147, "logps/chosen": -103.73982238769531, "logps/rejected": -166.82363891601562, "loss": 0.3369, "rewards/accuracies": 1.0, "rewards/chosen": 0.3510482907295227, "rewards/margins": 1.921353816986084, "rewards/rejected": -1.570305347442627, "step": 813 }, { "epoch": 0.04375050388326033, "grad_norm": 9.76150894165039, "learning_rate": 1.275529717041891e-07, "logits/chosen": -0.5136087536811829, "logits/rejected": -0.8707386255264282, "logps/chosen": -91.97799682617188, "logps/rejected": -114.1183853149414, "loss": 0.7844, "rewards/accuracies": 1.0, "rewards/chosen": -0.10539579391479492, "rewards/margins": 0.9271670579910278, "rewards/rejected": -1.0325628519058228, "step": 814 }, { "epoch": 0.04380425143102846, "grad_norm": 9.825204849243164, "learning_rate": 1.262458365687015e-07, "logits/chosen": -0.7382162809371948, "logits/rejected": -0.9008877873420715, "logps/chosen": -75.17744445800781, "logps/rejected": -128.24026489257812, "loss": 0.7613, "rewards/accuracies": 0.75, "rewards/chosen": -0.010768696665763855, "rewards/margins": 1.0809811353683472, "rewards/rejected": -1.091749906539917, "step": 815 }, { "epoch": 0.043857998978796595, "grad_norm": 7.9242658615112305, "learning_rate": 1.249444651847702e-07, "logits/chosen": -0.7520813941955566, "logits/rejected": -0.7305682897567749, "logps/chosen": -79.63063049316406, "logps/rejected": -126.90387725830078, "loss": 0.658, "rewards/accuracies": 1.0, "rewards/chosen": 0.025029372423887253, "rewards/margins": 1.146191954612732, "rewards/rejected": -1.1211626529693604, "step": 816 }, { "epoch": 0.043911746526564724, "grad_norm": 5.679788589477539, "learning_rate": 1.236488776211515e-07, "logits/chosen": -0.8256125450134277, "logits/rejected": -0.9586639404296875, "logps/chosen": -101.70716094970703, "logps/rejected": -166.8810272216797, "loss": 0.2916, "rewards/accuracies": 1.0, "rewards/chosen": -0.10849381238222122, "rewards/margins": 2.011967182159424, "rewards/rejected": -2.1204609870910645, "step": 817 }, { "epoch": 0.04396549407433286, "grad_norm": 7.0654520988464355, "learning_rate": 1.223590938574082e-07, "logits/chosen": -0.6898108720779419, "logits/rejected": -0.7593812942504883, "logps/chosen": -86.4976577758789, "logps/rejected": -113.95950317382812, "loss": 0.6088, "rewards/accuracies": 1.0, "rewards/chosen": -0.3372691571712494, "rewards/margins": 1.2481459379196167, "rewards/rejected": -1.5854151248931885, "step": 818 }, { "epoch": 0.04401924162210099, "grad_norm": 12.088933944702148, "learning_rate": 1.210751337836016e-07, "logits/chosen": -0.6032209396362305, "logits/rejected": -0.7372310757637024, "logps/chosen": -126.31684875488281, "logps/rejected": -137.4420928955078, "loss": 0.9968, "rewards/accuracies": 0.75, "rewards/chosen": -0.37162071466445923, "rewards/margins": 1.1659446954727173, "rewards/rejected": -1.5375653505325317, "step": 819 }, { "epoch": 0.044072989169869126, "grad_norm": 10.266844749450684, "learning_rate": 1.1979701719998454e-07, "logits/chosen": -0.6576359272003174, "logits/rejected": -0.7660357356071472, "logps/chosen": -90.71223449707031, "logps/rejected": -114.9363784790039, "loss": 0.7194, "rewards/accuracies": 0.875, "rewards/chosen": -0.10476990789175034, "rewards/margins": 1.0130244493484497, "rewards/rejected": -1.1177945137023926, "step": 820 }, { "epoch": 0.044126736717637255, "grad_norm": 6.478957176208496, "learning_rate": 1.1852476381669557e-07, "logits/chosen": -0.7204630374908447, "logits/rejected": -0.7950636744499207, "logps/chosen": -79.72686004638672, "logps/rejected": -133.87551879882812, "loss": 0.4916, "rewards/accuracies": 1.0, "rewards/chosen": -0.08696846663951874, "rewards/margins": 1.5816200971603394, "rewards/rejected": -1.6685885190963745, "step": 821 }, { "epoch": 0.04418048426540539, "grad_norm": 6.345378875732422, "learning_rate": 1.1725839325345599e-07, "logits/chosen": -0.6713749170303345, "logits/rejected": -0.8440518975257874, "logps/chosen": -90.02011108398438, "logps/rejected": -112.00395202636719, "loss": 0.5078, "rewards/accuracies": 1.0, "rewards/chosen": -0.14732596278190613, "rewards/margins": 1.311824917793274, "rewards/rejected": -1.459150791168213, "step": 822 }, { "epoch": 0.04423423181317353, "grad_norm": 8.387128829956055, "learning_rate": 1.1599792503926609e-07, "logits/chosen": -0.46146267652511597, "logits/rejected": -0.5318410396575928, "logps/chosen": -90.51431274414062, "logps/rejected": -101.27820587158203, "loss": 0.6136, "rewards/accuracies": 1.0, "rewards/chosen": 0.13633108139038086, "rewards/margins": 1.1235429048538208, "rewards/rejected": -0.9872118234634399, "step": 823 }, { "epoch": 0.04428797936094166, "grad_norm": 6.622624397277832, "learning_rate": 1.1474337861210543e-07, "logits/chosen": -0.5487193465232849, "logits/rejected": -0.7153528928756714, "logps/chosen": -88.19334411621094, "logps/rejected": -112.94762420654297, "loss": 0.5078, "rewards/accuracies": 1.0, "rewards/chosen": 0.06982622295618057, "rewards/margins": 1.4251680374145508, "rewards/rejected": -1.3553416728973389, "step": 824 }, { "epoch": 0.04434172690870979, "grad_norm": 5.568865776062012, "learning_rate": 1.134947733186315e-07, "logits/chosen": -0.6248857975006104, "logits/rejected": -0.6935945749282837, "logps/chosen": -80.16374206542969, "logps/rejected": -133.22552490234375, "loss": 0.3725, "rewards/accuracies": 1.0, "rewards/chosen": 0.1347309947013855, "rewards/margins": 1.731189250946045, "rewards/rejected": -1.5964581966400146, "step": 825 }, { "epoch": 0.04439547445647792, "grad_norm": 7.960962772369385, "learning_rate": 1.1225212841388282e-07, "logits/chosen": -0.34982380270957947, "logits/rejected": -0.6288244724273682, "logps/chosen": -82.6952896118164, "logps/rejected": -120.71294403076172, "loss": 0.6433, "rewards/accuracies": 0.875, "rewards/chosen": 0.1924315094947815, "rewards/margins": 1.5423195362091064, "rewards/rejected": -1.3498880863189697, "step": 826 }, { "epoch": 0.04444922200424606, "grad_norm": 6.210901737213135, "learning_rate": 1.1101546306098092e-07, "logits/chosen": -0.5640324354171753, "logits/rejected": -0.6944502592086792, "logps/chosen": -106.69379425048828, "logps/rejected": -157.324462890625, "loss": 0.4327, "rewards/accuracies": 1.0, "rewards/chosen": 0.04612545669078827, "rewards/margins": 1.6892296075820923, "rewards/rejected": -1.643104076385498, "step": 827 }, { "epoch": 0.04450296955201419, "grad_norm": 9.06938648223877, "learning_rate": 1.097847963308351e-07, "logits/chosen": -0.7174173593521118, "logits/rejected": -0.7684688568115234, "logps/chosen": -92.61532592773438, "logps/rejected": -140.07203674316406, "loss": 0.5086, "rewards/accuracies": 1.0, "rewards/chosen": -0.010239750146865845, "rewards/margins": 1.6220171451568604, "rewards/rejected": -1.6322568655014038, "step": 828 }, { "epoch": 0.044556717099782324, "grad_norm": 7.910536766052246, "learning_rate": 1.0856014720184925e-07, "logits/chosen": -0.6064719557762146, "logits/rejected": -0.7963322401046753, "logps/chosen": -98.68523406982422, "logps/rejected": -159.86541748046875, "loss": 0.5261, "rewards/accuracies": 0.875, "rewards/chosen": -0.1890304684638977, "rewards/margins": 1.6180896759033203, "rewards/rejected": -1.8071202039718628, "step": 829 }, { "epoch": 0.04461046464755045, "grad_norm": 8.384634017944336, "learning_rate": 1.0734153455962763e-07, "logits/chosen": -0.8605486154556274, "logits/rejected": -0.8483513593673706, "logps/chosen": -89.44536590576172, "logps/rejected": -118.92450714111328, "loss": 0.5964, "rewards/accuracies": 1.0, "rewards/chosen": -0.11399110406637192, "rewards/margins": 1.135964035987854, "rewards/rejected": -1.249955177307129, "step": 830 }, { "epoch": 0.04466421219531859, "grad_norm": 6.804311752319336, "learning_rate": 1.0612897719668456e-07, "logits/chosen": -0.6316882967948914, "logits/rejected": -0.6306581497192383, "logps/chosen": -64.4083251953125, "logps/rejected": -94.20359802246094, "loss": 0.6938, "rewards/accuracies": 0.875, "rewards/chosen": -0.008869878947734833, "rewards/margins": 1.073202133178711, "rewards/rejected": -1.0820720195770264, "step": 831 }, { "epoch": 0.04471795974308672, "grad_norm": 7.927284240722656, "learning_rate": 1.0492249381215478e-07, "logits/chosen": -0.7985237836837769, "logits/rejected": -0.9932118654251099, "logps/chosen": -82.85612487792969, "logps/rejected": -125.09081268310547, "loss": 0.5903, "rewards/accuracies": 1.0, "rewards/chosen": 0.10381525754928589, "rewards/margins": 1.3811063766479492, "rewards/rejected": -1.277291178703308, "step": 832 }, { "epoch": 0.044771707290854855, "grad_norm": 6.228450775146484, "learning_rate": 1.0372210301150464e-07, "logits/chosen": -0.7725996971130371, "logits/rejected": -0.9320018887519836, "logps/chosen": -84.3681640625, "logps/rejected": -148.42596435546875, "loss": 0.4121, "rewards/accuracies": 1.0, "rewards/chosen": 0.06222972646355629, "rewards/margins": 1.8805807828903198, "rewards/rejected": -1.8183510303497314, "step": 833 }, { "epoch": 0.04482545483862299, "grad_norm": 7.563370704650879, "learning_rate": 1.0252782330624498e-07, "logits/chosen": -0.8957310914993286, "logits/rejected": -0.9512521028518677, "logps/chosen": -80.22821044921875, "logps/rejected": -118.1845932006836, "loss": 0.6443, "rewards/accuracies": 1.0, "rewards/chosen": 0.09481129795312881, "rewards/margins": 1.2728642225265503, "rewards/rejected": -1.1780529022216797, "step": 834 }, { "epoch": 0.04487920238639112, "grad_norm": 8.682588577270508, "learning_rate": 1.013396731136465e-07, "logits/chosen": -0.7981885075569153, "logits/rejected": -0.861862063407898, "logps/chosen": -89.45716857910156, "logps/rejected": -139.83583068847656, "loss": 0.7362, "rewards/accuracies": 0.875, "rewards/chosen": -0.2961158752441406, "rewards/margins": 1.1643284559249878, "rewards/rejected": -1.4604442119598389, "step": 835 }, { "epoch": 0.04493294993415926, "grad_norm": 8.621870040893555, "learning_rate": 1.0015767075645471e-07, "logits/chosen": -0.8035182952880859, "logits/rejected": -0.7927248477935791, "logps/chosen": -84.14013671875, "logps/rejected": -120.95443725585938, "loss": 0.6905, "rewards/accuracies": 1.0, "rewards/chosen": -0.13499541580677032, "rewards/margins": 1.2433909177780151, "rewards/rejected": -1.3783862590789795, "step": 836 }, { "epoch": 0.044986697481927386, "grad_norm": 9.804915428161621, "learning_rate": 9.89818344626085e-08, "logits/chosen": -0.7131041288375854, "logits/rejected": -0.7195731401443481, "logps/chosen": -109.23544311523438, "logps/rejected": -142.00115966796875, "loss": 0.6862, "rewards/accuracies": 0.875, "rewards/chosen": -0.49296632409095764, "rewards/margins": 1.2334587574005127, "rewards/rejected": -1.7264251708984375, "step": 837 }, { "epoch": 0.04504044502969552, "grad_norm": 8.907795906066895, "learning_rate": 9.781218236495775e-08, "logits/chosen": -0.9686172008514404, "logits/rejected": -0.9367592334747314, "logps/chosen": -79.4311752319336, "logps/rejected": -125.43509674072266, "loss": 0.8456, "rewards/accuracies": 0.75, "rewards/chosen": -0.11010538041591644, "rewards/margins": 1.0261242389678955, "rewards/rejected": -1.1362297534942627, "step": 838 }, { "epoch": 0.04509419257746365, "grad_norm": 8.331557273864746, "learning_rate": 9.66487325009851e-08, "logits/chosen": -0.6140229105949402, "logits/rejected": -0.7832940816879272, "logps/chosen": -83.87664794921875, "logps/rejected": -126.94097137451172, "loss": 0.6585, "rewards/accuracies": 0.875, "rewards/chosen": 0.051263049244880676, "rewards/margins": 1.0615553855895996, "rewards/rejected": -1.010292410850525, "step": 839 }, { "epoch": 0.04514794012523179, "grad_norm": 9.142289161682129, "learning_rate": 9.549150281252632e-08, "logits/chosen": -0.722920835018158, "logits/rejected": -0.8325307369232178, "logps/chosen": -69.37684631347656, "logps/rejected": -87.47014617919922, "loss": 0.7255, "rewards/accuracies": 1.0, "rewards/chosen": 0.19744764268398285, "rewards/margins": 0.9028087854385376, "rewards/rejected": -0.7053611278533936, "step": 840 }, { "epoch": 0.04520168767299992, "grad_norm": 6.3426899909973145, "learning_rate": 9.434051114549497e-08, "logits/chosen": -0.5011838674545288, "logits/rejected": -0.6405996084213257, "logps/chosen": -87.46165466308594, "logps/rejected": -150.52630615234375, "loss": 0.4378, "rewards/accuracies": 1.0, "rewards/chosen": -0.18897095322608948, "rewards/margins": 1.7678120136260986, "rewards/rejected": -1.9567828178405762, "step": 841 }, { "epoch": 0.04525543522076805, "grad_norm": 8.914499282836914, "learning_rate": 9.319577524960653e-08, "logits/chosen": -0.4721774756908417, "logits/rejected": -0.8111955523490906, "logps/chosen": -76.33541870117188, "logps/rejected": -102.73779296875, "loss": 0.7271, "rewards/accuracies": 0.875, "rewards/chosen": 0.01617303118109703, "rewards/margins": 1.078303575515747, "rewards/rejected": -1.0621304512023926, "step": 842 }, { "epoch": 0.04530918276853618, "grad_norm": 8.29342269897461, "learning_rate": 9.205731277810447e-08, "logits/chosen": -0.6015782356262207, "logits/rejected": -0.6645594239234924, "logps/chosen": -100.30278015136719, "logps/rejected": -128.43002319335938, "loss": 0.7466, "rewards/accuracies": 1.0, "rewards/chosen": -0.5267756581306458, "rewards/margins": 0.9662588834762573, "rewards/rejected": -1.4930346012115479, "step": 843 }, { "epoch": 0.04536293031630432, "grad_norm": 8.294228553771973, "learning_rate": 9.092514128748818e-08, "logits/chosen": -0.5984904766082764, "logits/rejected": -0.9610815048217773, "logps/chosen": -77.75392150878906, "logps/rejected": -129.50680541992188, "loss": 0.517, "rewards/accuracies": 0.875, "rewards/chosen": 0.20196545124053955, "rewards/margins": 1.559670329093933, "rewards/rejected": -1.3577048778533936, "step": 844 }, { "epoch": 0.045416677864072455, "grad_norm": 7.110414028167725, "learning_rate": 8.97992782372432e-08, "logits/chosen": -0.581181526184082, "logits/rejected": -0.8008157014846802, "logps/chosen": -50.5107421875, "logps/rejected": -99.7409439086914, "loss": 0.6556, "rewards/accuracies": 1.0, "rewards/chosen": 0.21731817722320557, "rewards/margins": 1.205211877822876, "rewards/rejected": -0.9878937602043152, "step": 845 }, { "epoch": 0.045470425411840584, "grad_norm": 12.541234016418457, "learning_rate": 8.867974098957015e-08, "logits/chosen": -0.5853449702262878, "logits/rejected": -0.7870502471923828, "logps/chosen": -91.19670867919922, "logps/rejected": -92.61776733398438, "loss": 1.1831, "rewards/accuracies": 0.75, "rewards/chosen": -0.487047016620636, "rewards/margins": 0.4765259027481079, "rewards/rejected": -0.9635729789733887, "step": 846 }, { "epoch": 0.04552417295960872, "grad_norm": 5.76936674118042, "learning_rate": 8.75665468091183e-08, "logits/chosen": -0.6183640956878662, "logits/rejected": -0.7900167107582092, "logps/chosen": -96.34318542480469, "logps/rejected": -128.5752410888672, "loss": 0.4572, "rewards/accuracies": 1.0, "rewards/chosen": -0.11989083141088486, "rewards/margins": 1.62062406539917, "rewards/rejected": -1.7405149936676025, "step": 847 }, { "epoch": 0.04557792050737685, "grad_norm": 6.867844104766846, "learning_rate": 8.645971286271903e-08, "logits/chosen": -0.6720746159553528, "logits/rejected": -0.8231372833251953, "logps/chosen": -68.17044067382812, "logps/rejected": -127.73812866210938, "loss": 0.5746, "rewards/accuracies": 0.875, "rewards/chosen": 0.09696295857429504, "rewards/margins": 1.4244593381881714, "rewards/rejected": -1.3274962902069092, "step": 848 }, { "epoch": 0.045631668055144986, "grad_norm": 8.073737144470215, "learning_rate": 8.535925621912121e-08, "logits/chosen": -0.5152043104171753, "logits/rejected": -0.8835150003433228, "logps/chosen": -90.40864562988281, "logps/rejected": -137.49581909179688, "loss": 0.6027, "rewards/accuracies": 1.0, "rewards/chosen": 0.12456990778446198, "rewards/margins": 1.3958216905593872, "rewards/rejected": -1.2712517976760864, "step": 849 }, { "epoch": 0.045685415602913115, "grad_norm": 8.02540111541748, "learning_rate": 8.426519384872732e-08, "logits/chosen": -0.536312997341156, "logits/rejected": -0.6592484712600708, "logps/chosen": -81.31676483154297, "logps/rejected": -125.92967224121094, "loss": 0.6556, "rewards/accuracies": 1.0, "rewards/chosen": 0.11499777436256409, "rewards/margins": 1.0726492404937744, "rewards/rejected": -0.9576514363288879, "step": 850 }, { "epoch": 0.04573916315068125, "grad_norm": 12.105794906616211, "learning_rate": 8.317754262333281e-08, "logits/chosen": -0.5538253784179688, "logits/rejected": -0.8808213472366333, "logps/chosen": -94.71160888671875, "logps/rejected": -128.93426513671875, "loss": 0.7047, "rewards/accuracies": 0.875, "rewards/chosen": -0.20016509294509888, "rewards/margins": 1.1382181644439697, "rewards/rejected": -1.338383436203003, "step": 851 }, { "epoch": 0.04579291069844938, "grad_norm": 7.351831912994385, "learning_rate": 8.209631931586497e-08, "logits/chosen": -0.37247058749198914, "logits/rejected": -0.7341089248657227, "logps/chosen": -84.75607299804688, "logps/rejected": -126.48013305664062, "loss": 0.5188, "rewards/accuracies": 1.0, "rewards/chosen": -0.23659133911132812, "rewards/margins": 1.3666536808013916, "rewards/rejected": -1.6032451391220093, "step": 852 }, { "epoch": 0.04584665824621752, "grad_norm": 6.808477878570557, "learning_rate": 8.102154060012456e-08, "logits/chosen": -0.7725784778594971, "logits/rejected": -0.9332031011581421, "logps/chosen": -78.4030990600586, "logps/rejected": -144.7857208251953, "loss": 0.5705, "rewards/accuracies": 0.875, "rewards/chosen": -0.04877962917089462, "rewards/margins": 1.707944631576538, "rewards/rejected": -1.7567241191864014, "step": 853 }, { "epoch": 0.045900405793985646, "grad_norm": 4.252152919769287, "learning_rate": 7.995322305052904e-08, "logits/chosen": -0.8454805016517639, "logits/rejected": -0.6972620487213135, "logps/chosen": -86.4189453125, "logps/rejected": -168.622314453125, "loss": 0.2831, "rewards/accuracies": 1.0, "rewards/chosen": 0.08060956001281738, "rewards/margins": 2.4024300575256348, "rewards/rejected": -2.3218204975128174, "step": 854 }, { "epoch": 0.04595415334175378, "grad_norm": 8.628849029541016, "learning_rate": 7.889138314185678e-08, "logits/chosen": -0.6968797445297241, "logits/rejected": -0.8129622936248779, "logps/chosen": -106.22327423095703, "logps/rejected": -132.73638916015625, "loss": 0.6323, "rewards/accuracies": 0.875, "rewards/chosen": -0.28716492652893066, "rewards/margins": 1.1235854625701904, "rewards/rejected": -1.410750389099121, "step": 855 }, { "epoch": 0.04600790088952192, "grad_norm": 8.574492454528809, "learning_rate": 7.783603724899257e-08, "logits/chosen": -0.5171170234680176, "logits/rejected": -0.690341591835022, "logps/chosen": -93.34660339355469, "logps/rejected": -129.01966857910156, "loss": 0.6592, "rewards/accuracies": 1.0, "rewards/chosen": 0.017640825361013412, "rewards/margins": 0.9803970456123352, "rewards/rejected": -0.9627562165260315, "step": 856 }, { "epoch": 0.04606164843729005, "grad_norm": 6.5475263595581055, "learning_rate": 7.67872016466754e-08, "logits/chosen": -0.6461421251296997, "logits/rejected": -0.8809307813644409, "logps/chosen": -93.44737243652344, "logps/rejected": -125.25419616699219, "loss": 0.4837, "rewards/accuracies": 1.0, "rewards/chosen": -0.03755383938550949, "rewards/margins": 1.564470887184143, "rewards/rejected": -1.6020246744155884, "step": 857 }, { "epoch": 0.046115395985058184, "grad_norm": 9.69187068939209, "learning_rate": 7.57448925092482e-08, "logits/chosen": -0.6484555006027222, "logits/rejected": -0.7741883397102356, "logps/chosen": -74.74447631835938, "logps/rejected": -103.09550476074219, "loss": 0.8549, "rewards/accuracies": 0.875, "rewards/chosen": 0.004470646381378174, "rewards/margins": 0.7978636026382446, "rewards/rejected": -0.7933928966522217, "step": 858 }, { "epoch": 0.046169143532826314, "grad_norm": 11.220030784606934, "learning_rate": 7.470912591040696e-08, "logits/chosen": -0.6914015412330627, "logits/rejected": -0.6978241205215454, "logps/chosen": -75.86946868896484, "logps/rejected": -108.17106628417969, "loss": 0.7224, "rewards/accuracies": 0.75, "rewards/chosen": -0.013397619128227234, "rewards/margins": 1.1164929866790771, "rewards/rejected": -1.1298906803131104, "step": 859 }, { "epoch": 0.04622289108059445, "grad_norm": 8.127026557922363, "learning_rate": 7.36799178229539e-08, "logits/chosen": -0.5766858458518982, "logits/rejected": -0.740463137626648, "logps/chosen": -96.87313079833984, "logps/rejected": -122.37490844726562, "loss": 0.6382, "rewards/accuracies": 0.875, "rewards/chosen": -0.3498982787132263, "rewards/margins": 1.0972483158111572, "rewards/rejected": -1.4471466541290283, "step": 860 }, { "epoch": 0.04627663862836258, "grad_norm": 7.307992935180664, "learning_rate": 7.265728411855105e-08, "logits/chosen": -0.7058729529380798, "logits/rejected": -0.8248885273933411, "logps/chosen": -77.36190032958984, "logps/rejected": -125.0450668334961, "loss": 0.681, "rewards/accuracies": 0.875, "rewards/chosen": -0.16981562972068787, "rewards/margins": 1.1199778318405151, "rewards/rejected": -1.2897934913635254, "step": 861 }, { "epoch": 0.046330386176130715, "grad_norm": 5.625227451324463, "learning_rate": 7.164124056747523e-08, "logits/chosen": -0.5580927729606628, "logits/rejected": -0.6868921518325806, "logps/chosen": -77.49059295654297, "logps/rejected": -124.27005004882812, "loss": 0.4997, "rewards/accuracies": 1.0, "rewards/chosen": 0.05645853281021118, "rewards/margins": 1.4878449440002441, "rewards/rejected": -1.4313864707946777, "step": 862 }, { "epoch": 0.046384133723898845, "grad_norm": 9.274627685546875, "learning_rate": 7.063180283837473e-08, "logits/chosen": -0.5974302887916565, "logits/rejected": -0.8451200723648071, "logps/chosen": -90.7562255859375, "logps/rejected": -126.52020263671875, "loss": 0.6461, "rewards/accuracies": 1.0, "rewards/chosen": -0.14597029983997345, "rewards/margins": 1.0392273664474487, "rewards/rejected": -1.1851977109909058, "step": 863 }, { "epoch": 0.04643788127166698, "grad_norm": 7.822931289672852, "learning_rate": 6.962898649802822e-08, "logits/chosen": -0.8737372159957886, "logits/rejected": -0.8909921050071716, "logps/chosen": -96.29176330566406, "logps/rejected": -138.62564086914062, "loss": 0.4916, "rewards/accuracies": 1.0, "rewards/chosen": 0.012439150363206863, "rewards/margins": 1.418925166130066, "rewards/rejected": -1.4064860343933105, "step": 864 }, { "epoch": 0.04649162881943511, "grad_norm": 8.896645545959473, "learning_rate": 6.863280701110408e-08, "logits/chosen": -0.6127061247825623, "logits/rejected": -0.779906153678894, "logps/chosen": -76.19710540771484, "logps/rejected": -129.0114288330078, "loss": 0.6372, "rewards/accuracies": 0.875, "rewards/chosen": -0.03224867582321167, "rewards/margins": 1.227088451385498, "rewards/rejected": -1.259337067604065, "step": 865 }, { "epoch": 0.046545376367203246, "grad_norm": 4.373945713043213, "learning_rate": 6.76432797399225e-08, "logits/chosen": -0.5156643390655518, "logits/rejected": -0.7312272787094116, "logps/chosen": -83.44233703613281, "logps/rejected": -136.37677001953125, "loss": 0.3079, "rewards/accuracies": 1.0, "rewards/chosen": 0.2851261496543884, "rewards/margins": 2.039963722229004, "rewards/rejected": -1.7548375129699707, "step": 866 }, { "epoch": 0.04659912391497138, "grad_norm": 5.913762092590332, "learning_rate": 6.666041994421795e-08, "logits/chosen": -0.48587462306022644, "logits/rejected": -0.7718603610992432, "logps/chosen": -80.08433532714844, "logps/rejected": -122.06869506835938, "loss": 0.4821, "rewards/accuracies": 1.0, "rewards/chosen": -0.024449113756418228, "rewards/margins": 1.513674020767212, "rewards/rejected": -1.5381231307983398, "step": 867 }, { "epoch": 0.04665287146273951, "grad_norm": 6.948429584503174, "learning_rate": 6.568424278090445e-08, "logits/chosen": -0.6107131242752075, "logits/rejected": -0.7280551195144653, "logps/chosen": -104.99918365478516, "logps/rejected": -135.58395385742188, "loss": 0.4947, "rewards/accuracies": 1.0, "rewards/chosen": -0.08733482658863068, "rewards/margins": 1.3494726419448853, "rewards/rejected": -1.43680739402771, "step": 868 }, { "epoch": 0.04670661901050765, "grad_norm": 8.158418655395508, "learning_rate": 6.471476330384129e-08, "logits/chosen": -0.7161338329315186, "logits/rejected": -0.6494104862213135, "logps/chosen": -91.50942993164062, "logps/rejected": -126.61354064941406, "loss": 0.6333, "rewards/accuracies": 0.875, "rewards/chosen": 0.0013249553740024567, "rewards/margins": 1.4290428161621094, "rewards/rejected": -1.427717924118042, "step": 869 }, { "epoch": 0.04676036655827578, "grad_norm": 4.783553123474121, "learning_rate": 6.375199646360141e-08, "logits/chosen": -0.661374568939209, "logits/rejected": -0.7916299104690552, "logps/chosen": -85.74851989746094, "logps/rejected": -153.36672973632812, "loss": 0.3671, "rewards/accuracies": 1.0, "rewards/chosen": 0.055647000670433044, "rewards/margins": 1.7553869485855103, "rewards/rejected": -1.699739933013916, "step": 870 }, { "epoch": 0.046814114106043914, "grad_norm": 10.298039436340332, "learning_rate": 6.279595710724061e-08, "logits/chosen": -0.850151777267456, "logits/rejected": -0.972399115562439, "logps/chosen": -100.39414978027344, "logps/rejected": -145.23033142089844, "loss": 0.7109, "rewards/accuracies": 0.875, "rewards/chosen": -0.2616676092147827, "rewards/margins": 1.1226513385772705, "rewards/rejected": -1.3843189477920532, "step": 871 }, { "epoch": 0.04686786165381204, "grad_norm": 7.487720966339111, "learning_rate": 6.184665997806831e-08, "logits/chosen": -0.5303115248680115, "logits/rejected": -0.6527265310287476, "logps/chosen": -68.92695617675781, "logps/rejected": -104.79911804199219, "loss": 0.6387, "rewards/accuracies": 1.0, "rewards/chosen": -0.24612277746200562, "rewards/margins": 1.180188775062561, "rewards/rejected": -1.4263114929199219, "step": 872 }, { "epoch": 0.04692160920158018, "grad_norm": 7.552265167236328, "learning_rate": 6.090411971542037e-08, "logits/chosen": -0.477422297000885, "logits/rejected": -0.6473179459571838, "logps/chosen": -70.88108825683594, "logps/rejected": -100.79731750488281, "loss": 0.6862, "rewards/accuracies": 0.75, "rewards/chosen": -0.005997858941555023, "rewards/margins": 1.1364610195159912, "rewards/rejected": -1.1424587965011597, "step": 873 }, { "epoch": 0.04697535674934831, "grad_norm": 9.351654052734375, "learning_rate": 5.996835085443403e-08, "logits/chosen": -0.648647665977478, "logits/rejected": -0.8782021999359131, "logps/chosen": -96.32368469238281, "logps/rejected": -132.32516479492188, "loss": 0.5568, "rewards/accuracies": 1.0, "rewards/chosen": -0.2720581591129303, "rewards/margins": 1.2770622968673706, "rewards/rejected": -1.5491204261779785, "step": 874 }, { "epoch": 0.047029104297116445, "grad_norm": 9.02763843536377, "learning_rate": 5.9039367825822526e-08, "logits/chosen": -0.7399516105651855, "logits/rejected": -0.7290601134300232, "logps/chosen": -88.96540832519531, "logps/rejected": -151.57037353515625, "loss": 0.6151, "rewards/accuracies": 0.875, "rewards/chosen": 0.06308995187282562, "rewards/margins": 1.5163968801498413, "rewards/rejected": -1.4533069133758545, "step": 875 }, { "epoch": 0.047082851844884574, "grad_norm": 7.673978805541992, "learning_rate": 5.8117184955653265e-08, "logits/chosen": -0.5964321494102478, "logits/rejected": -0.7791176438331604, "logps/chosen": -91.4417724609375, "logps/rejected": -117.26123046875, "loss": 0.6071, "rewards/accuracies": 1.0, "rewards/chosen": -0.45096659660339355, "rewards/margins": 1.1932052373886108, "rewards/rejected": -1.644171953201294, "step": 876 }, { "epoch": 0.04713659939265271, "grad_norm": 12.785576820373535, "learning_rate": 5.720181646512717e-08, "logits/chosen": -0.6293849945068359, "logits/rejected": -0.7832484245300293, "logps/chosen": -107.2771224975586, "logps/rejected": -135.0654754638672, "loss": 0.9941, "rewards/accuracies": 0.625, "rewards/chosen": -0.47838956117630005, "rewards/margins": 0.8348985910415649, "rewards/rejected": -1.3132882118225098, "step": 877 }, { "epoch": 0.047190346940420846, "grad_norm": 6.137625694274902, "learning_rate": 5.6293276470358417e-08, "logits/chosen": -0.7886516451835632, "logits/rejected": -0.8817706108093262, "logps/chosen": -89.70944213867188, "logps/rejected": -121.9573974609375, "loss": 0.4421, "rewards/accuracies": 1.0, "rewards/chosen": 0.1063896119594574, "rewards/margins": 1.5250434875488281, "rewards/rejected": -1.418653964996338, "step": 878 }, { "epoch": 0.047244094488188976, "grad_norm": 9.201836585998535, "learning_rate": 5.539157898215785e-08, "logits/chosen": -0.7883318662643433, "logits/rejected": -0.8404229283332825, "logps/chosen": -100.9067611694336, "logps/rejected": -123.74302673339844, "loss": 0.7223, "rewards/accuracies": 1.0, "rewards/chosen": -0.24442973732948303, "rewards/margins": 1.1183192729949951, "rewards/rejected": -1.3627490997314453, "step": 879 }, { "epoch": 0.04729784203595711, "grad_norm": 9.68266773223877, "learning_rate": 5.44967379058161e-08, "logits/chosen": -0.678221583366394, "logits/rejected": -1.0200773477554321, "logps/chosen": -84.21833801269531, "logps/rejected": -111.7186279296875, "loss": 0.7624, "rewards/accuracies": 0.875, "rewards/chosen": -0.38110384345054626, "rewards/margins": 0.9129441976547241, "rewards/rejected": -1.2940480709075928, "step": 880 }, { "epoch": 0.04735158958372524, "grad_norm": 8.634781837463379, "learning_rate": 5.3608767040889624e-08, "logits/chosen": -0.71517413854599, "logits/rejected": -0.8323066234588623, "logps/chosen": -89.33662414550781, "logps/rejected": -138.33322143554688, "loss": 0.5935, "rewards/accuracies": 1.0, "rewards/chosen": -0.4112567901611328, "rewards/margins": 1.2992780208587646, "rewards/rejected": -1.7105350494384766, "step": 881 }, { "epoch": 0.04740533713149338, "grad_norm": 7.602910995483398, "learning_rate": 5.272768008098749e-08, "logits/chosen": -0.7349193692207336, "logits/rejected": -0.8615543842315674, "logps/chosen": -89.70048522949219, "logps/rejected": -135.8232421875, "loss": 0.5903, "rewards/accuracies": 1.0, "rewards/chosen": 0.058967262506484985, "rewards/margins": 1.3995749950408936, "rewards/rejected": -1.3406076431274414, "step": 882 }, { "epoch": 0.04745908467926151, "grad_norm": 7.110910892486572, "learning_rate": 5.185349061356065e-08, "logits/chosen": -0.7360897064208984, "logits/rejected": -0.8581385612487793, "logps/chosen": -85.97222900390625, "logps/rejected": -148.82327270507812, "loss": 0.5028, "rewards/accuracies": 0.875, "rewards/chosen": -0.29651135206222534, "rewards/margins": 1.7185919284820557, "rewards/rejected": -2.015103340148926, "step": 883 }, { "epoch": 0.04751283222702964, "grad_norm": 6.500244617462158, "learning_rate": 5.0986212119692226e-08, "logits/chosen": -0.7650365233421326, "logits/rejected": -0.8067201375961304, "logps/chosen": -101.24542236328125, "logps/rejected": -136.92141723632812, "loss": 0.3815, "rewards/accuracies": 1.0, "rewards/chosen": 0.07148098945617676, "rewards/margins": 1.7925050258636475, "rewards/rejected": -1.7210240364074707, "step": 884 }, { "epoch": 0.04756657977479777, "grad_norm": 8.150625228881836, "learning_rate": 5.012585797388935e-08, "logits/chosen": -0.7985470294952393, "logits/rejected": -0.7693698406219482, "logps/chosen": -76.7236557006836, "logps/rejected": -103.49879455566406, "loss": 0.6855, "rewards/accuracies": 0.875, "rewards/chosen": -0.177934929728508, "rewards/margins": 1.1588122844696045, "rewards/rejected": -1.336747169494629, "step": 885 }, { "epoch": 0.04762032732256591, "grad_norm": 7.8426313400268555, "learning_rate": 4.92724414438771e-08, "logits/chosen": -0.6602977514266968, "logits/rejected": -0.7487387657165527, "logps/chosen": -67.87635803222656, "logps/rejected": -96.82499694824219, "loss": 0.7701, "rewards/accuracies": 0.875, "rewards/chosen": -0.10627837479114532, "rewards/margins": 0.9933789968490601, "rewards/rejected": -1.0996572971343994, "step": 886 }, { "epoch": 0.04767407487033404, "grad_norm": 7.058547496795654, "learning_rate": 4.8425975690394475e-08, "logits/chosen": -0.8019285202026367, "logits/rejected": -0.7898787260055542, "logps/chosen": -83.36396789550781, "logps/rejected": -119.62256622314453, "loss": 0.5537, "rewards/accuracies": 0.875, "rewards/chosen": 0.03314576297998428, "rewards/margins": 1.4334237575531006, "rewards/rejected": -1.400277853012085, "step": 887 }, { "epoch": 0.047727822418102174, "grad_norm": 9.888683319091797, "learning_rate": 4.758647376699032e-08, "logits/chosen": -0.621895432472229, "logits/rejected": -0.7203749418258667, "logps/chosen": -84.63848114013672, "logps/rejected": -112.84288787841797, "loss": 0.8666, "rewards/accuracies": 0.875, "rewards/chosen": -0.23547136783599854, "rewards/margins": 0.8792285919189453, "rewards/rejected": -1.1147000789642334, "step": 888 }, { "epoch": 0.04778156996587031, "grad_norm": 12.411364555358887, "learning_rate": 4.675394861982268e-08, "logits/chosen": -0.8163186311721802, "logits/rejected": -0.8603343963623047, "logps/chosen": -89.50066375732422, "logps/rejected": -155.93692016601562, "loss": 0.4104, "rewards/accuracies": 0.875, "rewards/chosen": -0.10982359945774078, "rewards/margins": 1.923937201499939, "rewards/rejected": -2.0337610244750977, "step": 889 }, { "epoch": 0.04783531751363844, "grad_norm": 7.032689571380615, "learning_rate": 4.592841308745932e-08, "logits/chosen": -0.7535769939422607, "logits/rejected": -0.8446379899978638, "logps/chosen": -97.45449829101562, "logps/rejected": -149.18576049804688, "loss": 0.5251, "rewards/accuracies": 0.875, "rewards/chosen": 0.11590814590454102, "rewards/margins": 1.7632943391799927, "rewards/rejected": -1.6473863124847412, "step": 890 }, { "epoch": 0.047889065061406576, "grad_norm": 6.9323930740356445, "learning_rate": 4.510987990067949e-08, "logits/chosen": -0.5844084620475769, "logits/rejected": -0.8423067331314087, "logps/chosen": -82.10266876220703, "logps/rejected": -105.35284423828125, "loss": 0.6586, "rewards/accuracies": 1.0, "rewards/chosen": 0.13786464929580688, "rewards/margins": 1.2123849391937256, "rewards/rejected": -1.074520230293274, "step": 891 }, { "epoch": 0.047942812609174705, "grad_norm": 10.589164733886719, "learning_rate": 4.429836168227735e-08, "logits/chosen": -0.7273870706558228, "logits/rejected": -0.7145615816116333, "logps/chosen": -83.76527404785156, "logps/rejected": -110.715087890625, "loss": 0.7818, "rewards/accuracies": 0.875, "rewards/chosen": -0.35771772265434265, "rewards/margins": 0.8348692655563354, "rewards/rejected": -1.1925870180130005, "step": 892 }, { "epoch": 0.04799656015694284, "grad_norm": 7.6075568199157715, "learning_rate": 4.349387094686785e-08, "logits/chosen": -0.5828322172164917, "logits/rejected": -0.7905491590499878, "logps/chosen": -83.30477905273438, "logps/rejected": -115.49839782714844, "loss": 0.638, "rewards/accuracies": 1.0, "rewards/chosen": -0.1525566130876541, "rewards/margins": 1.1395800113677979, "rewards/rejected": -1.2921366691589355, "step": 893 }, { "epoch": 0.04805030770471097, "grad_norm": 9.051214218139648, "learning_rate": 4.269642010069319e-08, "logits/chosen": -0.8039073944091797, "logits/rejected": -0.8848326802253723, "logps/chosen": -90.70014953613281, "logps/rejected": -126.57560729980469, "loss": 0.6906, "rewards/accuracies": 0.875, "rewards/chosen": -0.12397942692041397, "rewards/margins": 1.1005264520645142, "rewards/rejected": -1.22450590133667, "step": 894 }, { "epoch": 0.04810405525247911, "grad_norm": 7.52002477645874, "learning_rate": 4.190602144143207e-08, "logits/chosen": -0.4238978326320648, "logits/rejected": -0.7218887805938721, "logps/chosen": -89.08851623535156, "logps/rejected": -136.63844299316406, "loss": 0.5749, "rewards/accuracies": 1.0, "rewards/chosen": -0.0030064135789871216, "rewards/margins": 1.2766971588134766, "rewards/rejected": -1.2797034978866577, "step": 895 }, { "epoch": 0.048157802800247236, "grad_norm": 4.783623218536377, "learning_rate": 4.112268715800943e-08, "logits/chosen": -0.6798886656761169, "logits/rejected": -0.8850741386413574, "logps/chosen": -86.34455108642578, "logps/rejected": -132.83932495117188, "loss": 0.3406, "rewards/accuracies": 1.0, "rewards/chosen": 0.04320533201098442, "rewards/margins": 1.7897136211395264, "rewards/rejected": -1.7465083599090576, "step": 896 }, { "epoch": 0.04821155034801537, "grad_norm": 7.535660266876221, "learning_rate": 4.034642933040911e-08, "logits/chosen": -0.6185505986213684, "logits/rejected": -0.8641486167907715, "logps/chosen": -84.00444030761719, "logps/rejected": -128.46572875976562, "loss": 0.5789, "rewards/accuracies": 0.875, "rewards/chosen": -0.04029481112957001, "rewards/margins": 1.5363959074020386, "rewards/rejected": -1.5766907930374146, "step": 897 }, { "epoch": 0.0482652978957835, "grad_norm": 7.079602241516113, "learning_rate": 3.9577259929486904e-08, "logits/chosen": -0.5767539739608765, "logits/rejected": -0.7801718711853027, "logps/chosen": -97.33092498779297, "logps/rejected": -139.47494506835938, "loss": 0.5381, "rewards/accuracies": 1.0, "rewards/chosen": 0.06880532205104828, "rewards/margins": 1.3831195831298828, "rewards/rejected": -1.3143141269683838, "step": 898 }, { "epoch": 0.04831904544355164, "grad_norm": 5.471474647521973, "learning_rate": 3.881519081678658e-08, "logits/chosen": -0.7628575563430786, "logits/rejected": -0.925121545791626, "logps/chosen": -103.95771789550781, "logps/rejected": -153.28399658203125, "loss": 0.3083, "rewards/accuracies": 1.0, "rewards/chosen": -0.1262800246477127, "rewards/margins": 2.171527624130249, "rewards/rejected": -2.2978076934814453, "step": 899 }, { "epoch": 0.048372792991319774, "grad_norm": 6.157526969909668, "learning_rate": 3.806023374435663e-08, "logits/chosen": -0.855146050453186, "logits/rejected": -0.822509765625, "logps/chosen": -91.51908874511719, "logps/rejected": -140.55873107910156, "loss": 0.4336, "rewards/accuracies": 1.0, "rewards/chosen": -0.15910135209560394, "rewards/margins": 1.6824195384979248, "rewards/rejected": -1.8415207862854004, "step": 900 }, { "epoch": 0.0484265405390879, "grad_norm": 7.523366928100586, "learning_rate": 3.731240035456901e-08, "logits/chosen": -0.6810941696166992, "logits/rejected": -0.7939736247062683, "logps/chosen": -92.76551055908203, "logps/rejected": -150.35565185546875, "loss": 0.6289, "rewards/accuracies": 0.875, "rewards/chosen": -0.40784981846809387, "rewards/margins": 1.291502833366394, "rewards/rejected": -1.6993526220321655, "step": 901 }, { "epoch": 0.04848028808685604, "grad_norm": 4.925612449645996, "learning_rate": 3.65717021799396e-08, "logits/chosen": -0.7259070873260498, "logits/rejected": -0.8969805836677551, "logps/chosen": -108.62970733642578, "logps/rejected": -164.57354736328125, "loss": 0.3275, "rewards/accuracies": 1.0, "rewards/chosen": 0.03529471904039383, "rewards/margins": 2.030729293823242, "rewards/rejected": -1.9954346418380737, "step": 902 }, { "epoch": 0.04853403563462417, "grad_norm": 9.822469711303711, "learning_rate": 3.583815064295065e-08, "logits/chosen": -0.6081196069717407, "logits/rejected": -0.7775384187698364, "logps/chosen": -85.71968078613281, "logps/rejected": -110.79730224609375, "loss": 0.7423, "rewards/accuracies": 0.875, "rewards/chosen": 0.009656287729740143, "rewards/margins": 0.9088273644447327, "rewards/rejected": -0.8991711139678955, "step": 903 }, { "epoch": 0.048587783182392305, "grad_norm": 7.007907390594482, "learning_rate": 3.5111757055874326e-08, "logits/chosen": -0.660891056060791, "logits/rejected": -1.0302317142486572, "logps/chosen": -89.25492858886719, "logps/rejected": -142.77862548828125, "loss": 0.5099, "rewards/accuracies": 1.0, "rewards/chosen": -0.24384728074073792, "rewards/margins": 1.5477349758148193, "rewards/rejected": -1.7915823459625244, "step": 904 }, { "epoch": 0.048641530730160434, "grad_norm": 14.026386260986328, "learning_rate": 3.4392532620598216e-08, "logits/chosen": -0.5893158912658691, "logits/rejected": -0.764263391494751, "logps/chosen": -74.34800720214844, "logps/rejected": -109.1385726928711, "loss": 0.9759, "rewards/accuracies": 0.875, "rewards/chosen": -0.2484074980020523, "rewards/margins": 0.7746601700782776, "rewards/rejected": -1.0230677127838135, "step": 905 }, { "epoch": 0.04869527827792857, "grad_norm": 11.945779800415039, "learning_rate": 3.3680488428453e-08, "logits/chosen": -0.7860875129699707, "logits/rejected": -0.76237553358078, "logps/chosen": -104.75282287597656, "logps/rejected": -114.31653594970703, "loss": 1.0044, "rewards/accuracies": 0.75, "rewards/chosen": -0.20890790224075317, "rewards/margins": 0.7623636722564697, "rewards/rejected": -0.9712715744972229, "step": 906 }, { "epoch": 0.0487490258256967, "grad_norm": 7.952846527099609, "learning_rate": 3.297563546004073e-08, "logits/chosen": -0.6515728831291199, "logits/rejected": -0.8125324249267578, "logps/chosen": -79.64319610595703, "logps/rejected": -118.74800872802734, "loss": 0.5178, "rewards/accuracies": 1.0, "rewards/chosen": 0.0739743784070015, "rewards/margins": 1.4430304765701294, "rewards/rejected": -1.369056224822998, "step": 907 }, { "epoch": 0.048802773373464836, "grad_norm": 6.213957786560059, "learning_rate": 3.2277984585066364e-08, "logits/chosen": -0.5299290418624878, "logits/rejected": -1.0806506872177124, "logps/chosen": -76.51634979248047, "logps/rejected": -151.77488708496094, "loss": 0.3254, "rewards/accuracies": 1.0, "rewards/chosen": -0.026461556553840637, "rewards/margins": 1.95500648021698, "rewards/rejected": -1.9814679622650146, "step": 908 }, { "epoch": 0.048856520921232965, "grad_norm": 8.182555198669434, "learning_rate": 3.1587546562169274e-08, "logits/chosen": -0.7366210222244263, "logits/rejected": -0.8888530731201172, "logps/chosen": -90.2340316772461, "logps/rejected": -148.13116455078125, "loss": 0.5635, "rewards/accuracies": 0.875, "rewards/chosen": -0.2620033025741577, "rewards/margins": 1.5441524982452393, "rewards/rejected": -1.806155800819397, "step": 909 }, { "epoch": 0.0489102684690011, "grad_norm": 10.72977352142334, "learning_rate": 3.0904332038757974e-08, "logits/chosen": -0.8162565231323242, "logits/rejected": -1.0126348733901978, "logps/chosen": -91.06812286376953, "logps/rejected": -131.3672637939453, "loss": 0.9768, "rewards/accuracies": 0.625, "rewards/chosen": -0.6286998987197876, "rewards/margins": 0.9581747651100159, "rewards/rejected": -1.5868746042251587, "step": 910 }, { "epoch": 0.04896401601676924, "grad_norm": 7.862979412078857, "learning_rate": 3.0228351550845524e-08, "logits/chosen": -0.7662482261657715, "logits/rejected": -0.8324440717697144, "logps/chosen": -79.29299926757812, "logps/rejected": -123.58667755126953, "loss": 0.5837, "rewards/accuracies": 1.0, "rewards/chosen": 0.07468633353710175, "rewards/margins": 1.4660954475402832, "rewards/rejected": -1.391409158706665, "step": 911 }, { "epoch": 0.04901776356453737, "grad_norm": 8.47566032409668, "learning_rate": 2.955961552288727e-08, "logits/chosen": -0.7867745161056519, "logits/rejected": -0.8187644481658936, "logps/chosen": -76.71022033691406, "logps/rejected": -108.03132629394531, "loss": 0.8273, "rewards/accuracies": 0.875, "rewards/chosen": 0.033391546458005905, "rewards/margins": 0.8686684370040894, "rewards/rejected": -0.8352768421173096, "step": 912 }, { "epoch": 0.0490715111123055, "grad_norm": 7.036310195922852, "learning_rate": 2.889813426762011e-08, "logits/chosen": -0.7862410545349121, "logits/rejected": -0.9328353404998779, "logps/chosen": -87.64884948730469, "logps/rejected": -135.68423461914062, "loss": 0.5128, "rewards/accuracies": 0.875, "rewards/chosen": -0.024297188967466354, "rewards/margins": 1.5441159009933472, "rewards/rejected": -1.5684131383895874, "step": 913 }, { "epoch": 0.04912525866007363, "grad_norm": 7.50009298324585, "learning_rate": 2.8243917985903253e-08, "logits/chosen": -0.5451648235321045, "logits/rejected": -0.8596552014350891, "logps/chosen": -69.37652587890625, "logps/rejected": -108.30462646484375, "loss": 0.7176, "rewards/accuracies": 1.0, "rewards/chosen": 0.25373128056526184, "rewards/margins": 0.999255895614624, "rewards/rejected": -0.7455246448516846, "step": 914 }, { "epoch": 0.04917900620784177, "grad_norm": 9.767491340637207, "learning_rate": 2.7596976766560977e-08, "logits/chosen": -0.7584033012390137, "logits/rejected": -0.8134979605674744, "logps/chosen": -106.73832702636719, "logps/rejected": -144.49188232421875, "loss": 0.6611, "rewards/accuracies": 1.0, "rewards/chosen": -0.5251469612121582, "rewards/margins": 1.0825544595718384, "rewards/rejected": -1.6077014207839966, "step": 915 }, { "epoch": 0.0492327537556099, "grad_norm": 8.966424942016602, "learning_rate": 2.695732058622735e-08, "logits/chosen": -0.7051568627357483, "logits/rejected": -0.7797049283981323, "logps/chosen": -100.36602783203125, "logps/rejected": -134.832763671875, "loss": 0.5873, "rewards/accuracies": 1.0, "rewards/chosen": -0.29489395022392273, "rewards/margins": 1.2217159271240234, "rewards/rejected": -1.5166099071502686, "step": 916 }, { "epoch": 0.049286501303378034, "grad_norm": 12.375151634216309, "learning_rate": 2.6324959309191874e-08, "logits/chosen": -0.5989211797714233, "logits/rejected": -0.843718409538269, "logps/chosen": -104.11579132080078, "logps/rejected": -123.68159484863281, "loss": 0.758, "rewards/accuracies": 0.75, "rewards/chosen": -0.1572691947221756, "rewards/margins": 1.1768888235092163, "rewards/rejected": -1.334157943725586, "step": 917 }, { "epoch": 0.049340248851146164, "grad_norm": 8.020186424255371, "learning_rate": 2.56999026872477e-08, "logits/chosen": -0.6868534088134766, "logits/rejected": -0.8555862307548523, "logps/chosen": -100.22944641113281, "logps/rejected": -135.75497436523438, "loss": 0.4998, "rewards/accuracies": 1.0, "rewards/chosen": -0.04585418477654457, "rewards/margins": 1.5089757442474365, "rewards/rejected": -1.5548299551010132, "step": 918 }, { "epoch": 0.0493939963989143, "grad_norm": 8.240158081054688, "learning_rate": 2.5082160359541138e-08, "logits/chosen": -0.70988929271698, "logits/rejected": -0.9535796046257019, "logps/chosen": -62.98827362060547, "logps/rejected": -102.12547302246094, "loss": 0.7825, "rewards/accuracies": 0.875, "rewards/chosen": 0.020343497395515442, "rewards/margins": 0.9210437536239624, "rewards/rejected": -0.9007002711296082, "step": 919 }, { "epoch": 0.04944774394668243, "grad_norm": 5.688581943511963, "learning_rate": 2.4471741852423233e-08, "logits/chosen": -0.7335445880889893, "logits/rejected": -0.7960599660873413, "logps/chosen": -96.25944519042969, "logps/rejected": -162.0817108154297, "loss": 0.3631, "rewards/accuracies": 1.0, "rewards/chosen": 0.12910017371177673, "rewards/margins": 2.082399368286133, "rewards/rejected": -1.9532994031906128, "step": 920 }, { "epoch": 0.049501491494450565, "grad_norm": 7.4089179039001465, "learning_rate": 2.386865657930226e-08, "logits/chosen": -0.6525486707687378, "logits/rejected": -0.7321764826774597, "logps/chosen": -93.77760314941406, "logps/rejected": -131.76162719726562, "loss": 0.4926, "rewards/accuracies": 1.0, "rewards/chosen": -0.1876431405544281, "rewards/margins": 1.4185351133346558, "rewards/rejected": -1.6061782836914062, "step": 921 }, { "epoch": 0.0495552390422187, "grad_norm": 8.556584358215332, "learning_rate": 2.3272913840499396e-08, "logits/chosen": -0.4738423824310303, "logits/rejected": -0.645923376083374, "logps/chosen": -96.56430053710938, "logps/rejected": -144.44578552246094, "loss": 0.5993, "rewards/accuracies": 0.875, "rewards/chosen": -0.2443404197692871, "rewards/margins": 1.4304068088531494, "rewards/rejected": -1.6747472286224365, "step": 922 }, { "epoch": 0.04960898658998683, "grad_norm": 10.177748680114746, "learning_rate": 2.2684522823104456e-08, "logits/chosen": -0.8065897226333618, "logits/rejected": -0.8883717060089111, "logps/chosen": -109.3260498046875, "logps/rejected": -136.18043518066406, "loss": 0.6459, "rewards/accuracies": 0.875, "rewards/chosen": -0.48454540967941284, "rewards/margins": 1.4235481023788452, "rewards/rejected": -1.9080934524536133, "step": 923 }, { "epoch": 0.04966273413775497, "grad_norm": 11.693577766418457, "learning_rate": 2.2103492600834937e-08, "logits/chosen": -0.7894699573516846, "logits/rejected": -0.7462475895881653, "logps/chosen": -83.94677734375, "logps/rejected": -133.0740966796875, "loss": 0.9117, "rewards/accuracies": 0.75, "rewards/chosen": -0.5638355016708374, "rewards/margins": 1.0707875490188599, "rewards/rejected": -1.6346230506896973, "step": 924 }, { "epoch": 0.049716481685523096, "grad_norm": 9.95130729675293, "learning_rate": 2.1529832133895588e-08, "logits/chosen": -0.6013067960739136, "logits/rejected": -0.6655162572860718, "logps/chosen": -74.7082290649414, "logps/rejected": -82.73184204101562, "loss": 0.8516, "rewards/accuracies": 0.75, "rewards/chosen": -0.24484239518642426, "rewards/margins": 0.7615705728530884, "rewards/rejected": -1.0064129829406738, "step": 925 }, { "epoch": 0.04977022923329123, "grad_norm": 8.495624542236328, "learning_rate": 2.0963550268840446e-08, "logits/chosen": -0.6377952098846436, "logits/rejected": -0.769828200340271, "logps/chosen": -75.220703125, "logps/rejected": -130.43911743164062, "loss": 0.4773, "rewards/accuracies": 1.0, "rewards/chosen": -0.006972074508666992, "rewards/margins": 1.4706225395202637, "rewards/rejected": -1.4775946140289307, "step": 926 }, { "epoch": 0.04982397678105936, "grad_norm": 9.803694725036621, "learning_rate": 2.0404655738436417e-08, "logits/chosen": -0.7372729182243347, "logits/rejected": -0.8733534812927246, "logps/chosen": -89.67182922363281, "logps/rejected": -130.8540496826172, "loss": 0.6918, "rewards/accuracies": 1.0, "rewards/chosen": -0.2575630247592926, "rewards/margins": 1.0770283937454224, "rewards/rejected": -1.3345913887023926, "step": 927 }, { "epoch": 0.0498777243288275, "grad_norm": 8.706215858459473, "learning_rate": 1.9853157161528468e-08, "logits/chosen": -0.6463040709495544, "logits/rejected": -0.8237420916557312, "logps/chosen": -89.85963439941406, "logps/rejected": -122.45868682861328, "loss": 0.6263, "rewards/accuracies": 0.875, "rewards/chosen": -0.22657258808612823, "rewards/margins": 1.2033015489578247, "rewards/rejected": -1.4298741817474365, "step": 928 }, { "epoch": 0.04993147187659563, "grad_norm": 10.89100170135498, "learning_rate": 1.9309063042907024e-08, "logits/chosen": -0.7881485819816589, "logits/rejected": -0.7344053983688354, "logps/chosen": -70.12205505371094, "logps/rejected": -88.55856323242188, "loss": 1.0798, "rewards/accuracies": 0.75, "rewards/chosen": -0.32742124795913696, "rewards/margins": 0.574494481086731, "rewards/rejected": -0.9019157290458679, "step": 929 }, { "epoch": 0.049985219424363764, "grad_norm": 9.072222709655762, "learning_rate": 1.8772381773176416e-08, "logits/chosen": -0.6913076639175415, "logits/rejected": -0.7165856957435608, "logps/chosen": -96.47908782958984, "logps/rejected": -126.90711975097656, "loss": 0.6579, "rewards/accuracies": 1.0, "rewards/chosen": -0.09927182644605637, "rewards/margins": 1.1280517578125, "rewards/rejected": -1.2273235321044922, "step": 930 }, { "epoch": 0.05003896697213189, "grad_norm": 7.653972625732422, "learning_rate": 1.8243121628625623e-08, "logits/chosen": -0.7076081037521362, "logits/rejected": -0.8202123641967773, "logps/chosen": -66.38391876220703, "logps/rejected": -79.4378890991211, "loss": 0.8651, "rewards/accuracies": 0.875, "rewards/chosen": -0.05211373418569565, "rewards/margins": 0.752827525138855, "rewards/rejected": -0.8049411773681641, "step": 931 }, { "epoch": 0.05009271451990003, "grad_norm": 7.911796569824219, "learning_rate": 1.772129077110096e-08, "logits/chosen": -0.5450606346130371, "logits/rejected": -0.837932288646698, "logps/chosen": -86.55402374267578, "logps/rejected": -155.49900817871094, "loss": 0.5238, "rewards/accuracies": 0.875, "rewards/chosen": -0.16057553887367249, "rewards/margins": 1.6193888187408447, "rewards/rejected": -1.7799643278121948, "step": 932 }, { "epoch": 0.050146462067668165, "grad_norm": 8.687859535217285, "learning_rate": 1.7206897247879715e-08, "logits/chosen": -0.5580852031707764, "logits/rejected": -0.8791922926902771, "logps/chosen": -89.3305435180664, "logps/rejected": -121.73765563964844, "loss": 0.6588, "rewards/accuracies": 1.0, "rewards/chosen": -0.10477913916110992, "rewards/margins": 1.0025744438171387, "rewards/rejected": -1.107353687286377, "step": 933 }, { "epoch": 0.050200209615436295, "grad_norm": 7.949826717376709, "learning_rate": 1.6699948991546364e-08, "logits/chosen": -0.42492929100990295, "logits/rejected": -0.639462947845459, "logps/chosen": -76.40912628173828, "logps/rejected": -101.72725677490234, "loss": 0.5842, "rewards/accuracies": 1.0, "rewards/chosen": -0.01994180679321289, "rewards/margins": 1.1814507246017456, "rewards/rejected": -1.2013925313949585, "step": 934 }, { "epoch": 0.05025395716320443, "grad_norm": 7.7614288330078125, "learning_rate": 1.6200453819870118e-08, "logits/chosen": -0.7003356218338013, "logits/rejected": -0.7337514162063599, "logps/chosen": -79.7490463256836, "logps/rejected": -116.43782043457031, "loss": 0.6494, "rewards/accuracies": 0.875, "rewards/chosen": 0.06873016059398651, "rewards/margins": 1.2213689088821411, "rewards/rejected": -1.1526386737823486, "step": 935 }, { "epoch": 0.05030770471097256, "grad_norm": 8.497488021850586, "learning_rate": 1.570841943568446e-08, "logits/chosen": -0.49890369176864624, "logits/rejected": -0.7325989603996277, "logps/chosen": -112.7068862915039, "logps/rejected": -164.94798278808594, "loss": 0.5171, "rewards/accuracies": 1.0, "rewards/chosen": -0.4527997076511383, "rewards/margins": 1.4457454681396484, "rewards/rejected": -1.8985451459884644, "step": 936 }, { "epoch": 0.050361452258740697, "grad_norm": 6.582622051239014, "learning_rate": 1.522385342676824e-08, "logits/chosen": -0.824722945690155, "logits/rejected": -0.7563296556472778, "logps/chosen": -77.90245819091797, "logps/rejected": -132.32997131347656, "loss": 0.5186, "rewards/accuracies": 0.875, "rewards/chosen": 0.009331561625003815, "rewards/margins": 1.9550468921661377, "rewards/rejected": -1.9457151889801025, "step": 937 }, { "epoch": 0.050415199806508826, "grad_norm": 5.652670383453369, "learning_rate": 1.4746763265728768e-08, "logits/chosen": -0.7761021852493286, "logits/rejected": -0.8811581134796143, "logps/chosen": -112.06674194335938, "logps/rejected": -161.85897827148438, "loss": 0.3321, "rewards/accuracies": 1.0, "rewards/chosen": -0.2095741480588913, "rewards/margins": 1.80415940284729, "rewards/rejected": -2.013733386993408, "step": 938 }, { "epoch": 0.05046894735427696, "grad_norm": 4.3196516036987305, "learning_rate": 1.4277156309886573e-08, "logits/chosen": -0.662330687046051, "logits/rejected": -0.8447627425193787, "logps/chosen": -84.24441528320312, "logps/rejected": -139.28250122070312, "loss": 0.3202, "rewards/accuracies": 1.0, "rewards/chosen": -0.09775092452764511, "rewards/margins": 1.9046512842178345, "rewards/rejected": -2.0024023056030273, "step": 939 }, { "epoch": 0.05052269490204509, "grad_norm": 8.635978698730469, "learning_rate": 1.3815039801161722e-08, "logits/chosen": -0.8864489793777466, "logits/rejected": -0.8247860670089722, "logps/chosen": -96.34303283691406, "logps/rejected": -134.18740844726562, "loss": 0.5757, "rewards/accuracies": 1.0, "rewards/chosen": -0.19148650765419006, "rewards/margins": 1.2141015529632568, "rewards/rejected": -1.405588150024414, "step": 940 }, { "epoch": 0.05057644244981323, "grad_norm": 9.622781753540039, "learning_rate": 1.3360420865962507e-08, "logits/chosen": -0.594180703163147, "logits/rejected": -0.7774614095687866, "logps/chosen": -100.93305969238281, "logps/rejected": -128.44248962402344, "loss": 0.6128, "rewards/accuracies": 1.0, "rewards/chosen": -0.1988852620124817, "rewards/margins": 1.4143768548965454, "rewards/rejected": -1.6132620573043823, "step": 941 }, { "epoch": 0.050630189997581364, "grad_norm": 7.559035301208496, "learning_rate": 1.2913306515075328e-08, "logits/chosen": -0.8042731285095215, "logits/rejected": -0.8722237348556519, "logps/chosen": -86.87742614746094, "logps/rejected": -145.40304565429688, "loss": 0.5492, "rewards/accuracies": 1.0, "rewards/chosen": 0.061339519917964935, "rewards/margins": 1.7861509323120117, "rewards/rejected": -1.7248114347457886, "step": 942 }, { "epoch": 0.05068393754534949, "grad_norm": 7.291956901550293, "learning_rate": 1.24737036435566e-08, "logits/chosen": -0.48170995712280273, "logits/rejected": -0.8340456485748291, "logps/chosen": -81.59976959228516, "logps/rejected": -123.38209533691406, "loss": 0.5598, "rewards/accuracies": 1.0, "rewards/chosen": 0.05689731240272522, "rewards/margins": 1.4410947561264038, "rewards/rejected": -1.384197473526001, "step": 943 }, { "epoch": 0.05073768509311763, "grad_norm": 9.08354377746582, "learning_rate": 1.2041619030626282e-08, "logits/chosen": -0.86356520652771, "logits/rejected": -0.742080807685852, "logps/chosen": -90.81300354003906, "logps/rejected": -110.6649169921875, "loss": 0.6875, "rewards/accuracies": 0.875, "rewards/chosen": -0.20676416158676147, "rewards/margins": 1.072756052017212, "rewards/rejected": -1.2795201539993286, "step": 944 }, { "epoch": 0.05079143264088576, "grad_norm": 7.510627269744873, "learning_rate": 1.1617059339563807e-08, "logits/chosen": -0.8033379912376404, "logits/rejected": -0.9026014804840088, "logps/chosen": -89.30085754394531, "logps/rejected": -126.29446411132812, "loss": 0.6553, "rewards/accuracies": 1.0, "rewards/chosen": -0.21194520592689514, "rewards/margins": 1.206228494644165, "rewards/rejected": -1.4181736707687378, "step": 945 }, { "epoch": 0.050845180188653895, "grad_norm": 7.1064133644104, "learning_rate": 1.1200031117604702e-08, "logits/chosen": -0.8048015832901001, "logits/rejected": -0.7986891865730286, "logps/chosen": -76.4354248046875, "logps/rejected": -108.5704345703125, "loss": 0.6049, "rewards/accuracies": 1.0, "rewards/chosen": 0.16479402780532837, "rewards/margins": 1.2613508701324463, "rewards/rejected": -1.0965569019317627, "step": 946 }, { "epoch": 0.050898927736422024, "grad_norm": 6.487582683563232, "learning_rate": 1.0790540795840019e-08, "logits/chosen": -0.7146155834197998, "logits/rejected": -0.5822433233261108, "logps/chosen": -71.32400512695312, "logps/rejected": -115.32209777832031, "loss": 0.5875, "rewards/accuracies": 0.875, "rewards/chosen": 0.08383980393409729, "rewards/margins": 1.3017581701278687, "rewards/rejected": -1.2179183959960938, "step": 947 }, { "epoch": 0.05095267528419016, "grad_norm": 8.986845016479492, "learning_rate": 1.0388594689117069e-08, "logits/chosen": -0.711078941822052, "logits/rejected": -0.8210771083831787, "logps/chosen": -88.65773010253906, "logps/rejected": -102.2611083984375, "loss": 0.8318, "rewards/accuracies": 1.0, "rewards/chosen": 0.028139159083366394, "rewards/margins": 0.8480124473571777, "rewards/rejected": -0.8198733329772949, "step": 948 }, { "epoch": 0.05100642283195829, "grad_norm": 6.1574506759643555, "learning_rate": 9.994198995942226e-09, "logits/chosen": -0.5405627489089966, "logits/rejected": -0.8340440988540649, "logps/chosen": -87.42858123779297, "logps/rejected": -156.03677368164062, "loss": 0.4077, "rewards/accuracies": 1.0, "rewards/chosen": 0.2209634780883789, "rewards/margins": 1.984893798828125, "rewards/rejected": -1.763930320739746, "step": 949 }, { "epoch": 0.051060170379726426, "grad_norm": 6.135793209075928, "learning_rate": 9.607359798384784e-09, "logits/chosen": -0.714262843132019, "logits/rejected": -0.7415896058082581, "logps/chosen": -99.3916015625, "logps/rejected": -136.2452392578125, "loss": 0.4055, "rewards/accuracies": 1.0, "rewards/chosen": 0.1227726936340332, "rewards/margins": 1.7171645164489746, "rewards/rejected": -1.5943918228149414, "step": 950 }, { "epoch": 0.051113917927494555, "grad_norm": 8.182071685791016, "learning_rate": 9.228083061983805e-09, "logits/chosen": -0.5464996099472046, "logits/rejected": -0.8575695753097534, "logps/chosen": -82.7381820678711, "logps/rejected": -128.4928436279297, "loss": 0.5652, "rewards/accuracies": 0.875, "rewards/chosen": 0.09574887156486511, "rewards/margins": 1.5537567138671875, "rewards/rejected": -1.4580078125, "step": 951 }, { "epoch": 0.05116766547526269, "grad_norm": 7.6520891189575195, "learning_rate": 8.856374635655695e-09, "logits/chosen": -0.6674847602844238, "logits/rejected": -0.7716501951217651, "logps/chosen": -62.864723205566406, "logps/rejected": -98.92556762695312, "loss": 0.6999, "rewards/accuracies": 1.0, "rewards/chosen": 0.14843444526195526, "rewards/margins": 1.0606205463409424, "rewards/rejected": -0.9121860861778259, "step": 952 }, { "epoch": 0.05122141302303083, "grad_norm": 11.565359115600586, "learning_rate": 8.492240251604221e-09, "logits/chosen": -0.7821686267852783, "logits/rejected": -0.7723621129989624, "logps/chosen": -98.20672607421875, "logps/rejected": -119.41336059570312, "loss": 0.86, "rewards/accuracies": 0.875, "rewards/chosen": -0.3172934055328369, "rewards/margins": 0.9393529891967773, "rewards/rejected": -1.2566463947296143, "step": 953 }, { "epoch": 0.05127516057079896, "grad_norm": 8.987815856933594, "learning_rate": 8.135685525232028e-09, "logits/chosen": -0.6031497120857239, "logits/rejected": -0.8855668306350708, "logps/chosen": -80.30361938476562, "logps/rejected": -123.9595947265625, "loss": 0.7142, "rewards/accuracies": 1.0, "rewards/chosen": -0.23140469193458557, "rewards/margins": 1.0874253511428833, "rewards/rejected": -1.3188300132751465, "step": 954 }, { "epoch": 0.05132890811856709, "grad_norm": 5.3693461418151855, "learning_rate": 7.786715955054201e-09, "logits/chosen": -0.49632781744003296, "logits/rejected": -0.8469691276550293, "logps/chosen": -65.87211608886719, "logps/rejected": -96.89631652832031, "loss": 0.5161, "rewards/accuracies": 1.0, "rewards/chosen": 0.015157986432313919, "rewards/margins": 1.4984687566757202, "rewards/rejected": -1.4833106994628906, "step": 955 }, { "epoch": 0.05138265566633522, "grad_norm": 7.899611949920654, "learning_rate": 7.445336922613065e-09, "logits/chosen": -0.5060408115386963, "logits/rejected": -0.9457062482833862, "logps/chosen": -67.84479522705078, "logps/rejected": -121.63963317871094, "loss": 0.5892, "rewards/accuracies": 0.875, "rewards/chosen": 0.25328329205513, "rewards/margins": 1.5954015254974365, "rewards/rejected": -1.342118263244629, "step": 956 }, { "epoch": 0.05143640321410336, "grad_norm": 7.929161548614502, "learning_rate": 7.111553692395633e-09, "logits/chosen": -0.6453719139099121, "logits/rejected": -0.7846086025238037, "logps/chosen": -80.33673095703125, "logps/rejected": -117.30836486816406, "loss": 0.6833, "rewards/accuracies": 0.875, "rewards/chosen": 0.023862071335315704, "rewards/margins": 1.119260549545288, "rewards/rejected": -1.0953985452651978, "step": 957 }, { "epoch": 0.05149015076187149, "grad_norm": 5.244797229766846, "learning_rate": 6.785371411752283e-09, "logits/chosen": -0.6574623584747314, "logits/rejected": -0.866584837436676, "logps/chosen": -84.46086883544922, "logps/rejected": -141.43008422851562, "loss": 0.3586, "rewards/accuracies": 1.0, "rewards/chosen": 0.11052881181240082, "rewards/margins": 2.273510456085205, "rewards/rejected": -2.1629815101623535, "step": 958 }, { "epoch": 0.051543898309639624, "grad_norm": 7.068449020385742, "learning_rate": 6.466795110817214e-09, "logits/chosen": -0.46985089778900146, "logits/rejected": -0.9479564428329468, "logps/chosen": -67.56028747558594, "logps/rejected": -96.50946807861328, "loss": 0.7459, "rewards/accuracies": 1.0, "rewards/chosen": 0.11225695163011551, "rewards/margins": 1.0854178667068481, "rewards/rejected": -0.9731608629226685, "step": 959 }, { "epoch": 0.05159764585740775, "grad_norm": 7.0861382484436035, "learning_rate": 6.15582970243117e-09, "logits/chosen": -0.5449313521385193, "logits/rejected": -0.5870343446731567, "logps/chosen": -78.3323745727539, "logps/rejected": -123.13485717773438, "loss": 0.548, "rewards/accuracies": 1.0, "rewards/chosen": 0.1358247697353363, "rewards/margins": 1.476072072982788, "rewards/rejected": -1.3402472734451294, "step": 960 }, { "epoch": 0.05165139340517589, "grad_norm": 8.52079963684082, "learning_rate": 5.852479982065339e-09, "logits/chosen": -0.6145734786987305, "logits/rejected": -0.7161664962768555, "logps/chosen": -84.66669464111328, "logps/rejected": -107.05254364013672, "loss": 0.7072, "rewards/accuracies": 1.0, "rewards/chosen": 0.15589208900928497, "rewards/margins": 0.9213835000991821, "rewards/rejected": -0.7654914259910583, "step": 961 }, { "epoch": 0.05170514095294402, "grad_norm": 7.201258659362793, "learning_rate": 5.556750627747742e-09, "logits/chosen": -0.8308221697807312, "logits/rejected": -0.7943094968795776, "logps/chosen": -91.28129577636719, "logps/rejected": -150.6673583984375, "loss": 0.4879, "rewards/accuracies": 1.0, "rewards/chosen": -0.06382784247398376, "rewards/margins": 1.670295238494873, "rewards/rejected": -1.7341231107711792, "step": 962 }, { "epoch": 0.051758888500712155, "grad_norm": 11.43411922454834, "learning_rate": 5.26864619999079e-09, "logits/chosen": -0.7056839466094971, "logits/rejected": -0.9044938683509827, "logps/chosen": -98.81825256347656, "logps/rejected": -137.58766174316406, "loss": 0.764, "rewards/accuracies": 0.875, "rewards/chosen": -0.4771055579185486, "rewards/margins": 1.0386638641357422, "rewards/rejected": -1.5157694816589355, "step": 963 }, { "epoch": 0.05181263604848029, "grad_norm": 6.593680381774902, "learning_rate": 4.988171141721231e-09, "logits/chosen": -0.6778083443641663, "logits/rejected": -0.7334750890731812, "logps/chosen": -88.61833190917969, "logps/rejected": -148.7073974609375, "loss": 0.4041, "rewards/accuracies": 1.0, "rewards/chosen": -0.19675925374031067, "rewards/margins": 1.690980076789856, "rewards/rejected": -1.8877393007278442, "step": 964 }, { "epoch": 0.05186638359624842, "grad_norm": 6.143848419189453, "learning_rate": 4.715329778211374e-09, "logits/chosen": -0.6015793681144714, "logits/rejected": -0.7354377508163452, "logps/chosen": -71.1463623046875, "logps/rejected": -137.74179077148438, "loss": 0.5239, "rewards/accuracies": 1.0, "rewards/chosen": 0.09249283373355865, "rewards/margins": 1.4944913387298584, "rewards/rejected": -1.401998519897461, "step": 965 }, { "epoch": 0.05192013114401656, "grad_norm": 9.366235733032227, "learning_rate": 4.450126317012637e-09, "logits/chosen": -0.6335364580154419, "logits/rejected": -0.8824995756149292, "logps/chosen": -98.01614379882812, "logps/rejected": -129.677734375, "loss": 0.7757, "rewards/accuracies": 1.0, "rewards/chosen": -0.2525232434272766, "rewards/margins": 1.0033650398254395, "rewards/rejected": -1.2558883428573608, "step": 966 }, { "epoch": 0.051973878691784686, "grad_norm": 6.281034469604492, "learning_rate": 4.1925648478903786e-09, "logits/chosen": -0.5483347773551941, "logits/rejected": -0.9118915796279907, "logps/chosen": -73.55049896240234, "logps/rejected": -126.34773254394531, "loss": 0.416, "rewards/accuracies": 1.0, "rewards/chosen": 0.2705968916416168, "rewards/margins": 1.611229658126831, "rewards/rejected": -1.3406327962875366, "step": 967 }, { "epoch": 0.05202762623955282, "grad_norm": 10.963261604309082, "learning_rate": 3.9426493427611175e-09, "logits/chosen": -0.8877754211425781, "logits/rejected": -0.9199085235595703, "logps/chosen": -100.42781066894531, "logps/rejected": -124.91184997558594, "loss": 0.6944, "rewards/accuracies": 1.0, "rewards/chosen": -0.12364298105239868, "rewards/margins": 1.0063194036483765, "rewards/rejected": -1.12996244430542, "step": 968 }, { "epoch": 0.05208137378732095, "grad_norm": 7.927393436431885, "learning_rate": 3.7003836556310787e-09, "logits/chosen": -0.5941413640975952, "logits/rejected": -0.7020019292831421, "logps/chosen": -91.31095886230469, "logps/rejected": -133.13169860839844, "loss": 0.6384, "rewards/accuracies": 0.875, "rewards/chosen": -0.14844563603401184, "rewards/margins": 1.475653052330017, "rewards/rejected": -1.624098777770996, "step": 969 }, { "epoch": 0.05213512133508909, "grad_norm": 8.897116661071777, "learning_rate": 3.465771522536853e-09, "logits/chosen": -0.4810856580734253, "logits/rejected": -0.6693933010101318, "logps/chosen": -65.31549072265625, "logps/rejected": -88.0207290649414, "loss": 0.9388, "rewards/accuracies": 0.75, "rewards/chosen": -0.18000450730323792, "rewards/margins": 0.6706110835075378, "rewards/rejected": -0.8506155610084534, "step": 970 }, { "epoch": 0.05218886888285722, "grad_norm": 6.758164882659912, "learning_rate": 3.238816561487834e-09, "logits/chosen": -0.7675652503967285, "logits/rejected": -0.7848396301269531, "logps/chosen": -83.70218658447266, "logps/rejected": -124.62163543701172, "loss": 0.4937, "rewards/accuracies": 0.875, "rewards/chosen": -0.017577366903424263, "rewards/margins": 1.7750180959701538, "rewards/rejected": -1.792595386505127, "step": 971 }, { "epoch": 0.052242616430625353, "grad_norm": 7.815019607543945, "learning_rate": 3.019522272410202e-09, "logits/chosen": -0.7040433883666992, "logits/rejected": -0.9108592867851257, "logps/chosen": -114.53923797607422, "logps/rejected": -157.58255004882812, "loss": 0.473, "rewards/accuracies": 1.0, "rewards/chosen": -0.2246471643447876, "rewards/margins": 1.8350517749786377, "rewards/rejected": -2.059699058532715, "step": 972 }, { "epoch": 0.05229636397839348, "grad_norm": 9.282889366149902, "learning_rate": 2.8078920370931404e-09, "logits/chosen": -0.6506635546684265, "logits/rejected": -0.6554042100906372, "logps/chosen": -111.47503662109375, "logps/rejected": -134.6463623046875, "loss": 0.5755, "rewards/accuracies": 1.0, "rewards/chosen": -0.4857989251613617, "rewards/margins": 1.2082979679107666, "rewards/rejected": -1.6940969228744507, "step": 973 }, { "epoch": 0.05235011152616162, "grad_norm": 6.893423080444336, "learning_rate": 2.603929119136761e-09, "logits/chosen": -0.8245573043823242, "logits/rejected": -1.0465458631515503, "logps/chosen": -93.21940612792969, "logps/rejected": -143.75289916992188, "loss": 0.4285, "rewards/accuracies": 1.0, "rewards/chosen": -0.06930875778198242, "rewards/margins": 1.635469675064087, "rewards/rejected": -1.7047784328460693, "step": 974 }, { "epoch": 0.052403859073929755, "grad_norm": 7.243135452270508, "learning_rate": 2.407636663901591e-09, "logits/chosen": -0.7579708099365234, "logits/rejected": -0.9223552942276001, "logps/chosen": -90.08309936523438, "logps/rejected": -135.9847869873047, "loss": 0.5511, "rewards/accuracies": 1.0, "rewards/chosen": -0.1409289836883545, "rewards/margins": 1.6634019613265991, "rewards/rejected": -1.804330825805664, "step": 975 }, { "epoch": 0.052457606621697885, "grad_norm": 6.986074924468994, "learning_rate": 2.2190176984600016e-09, "logits/chosen": -0.9188860058784485, "logits/rejected": -0.9572144150733948, "logps/chosen": -100.83145141601562, "logps/rejected": -140.84078979492188, "loss": 0.4585, "rewards/accuracies": 1.0, "rewards/chosen": -0.08110465109348297, "rewards/margins": 1.7304433584213257, "rewards/rejected": -1.8115479946136475, "step": 976 }, { "epoch": 0.05251135416946602, "grad_norm": 8.182229042053223, "learning_rate": 2.0380751315498545e-09, "logits/chosen": -0.3945918083190918, "logits/rejected": -0.6619648933410645, "logps/chosen": -98.97474670410156, "logps/rejected": -140.43499755859375, "loss": 0.5395, "rewards/accuracies": 1.0, "rewards/chosen": -0.07172183692455292, "rewards/margins": 1.4074766635894775, "rewards/rejected": -1.4791984558105469, "step": 977 }, { "epoch": 0.05256510171723415, "grad_norm": 9.352846145629883, "learning_rate": 1.8648117535293717e-09, "logits/chosen": -0.7320584058761597, "logits/rejected": -0.7948988676071167, "logps/chosen": -74.54985046386719, "logps/rejected": -120.84281921386719, "loss": 0.7596, "rewards/accuracies": 0.75, "rewards/chosen": -0.06485749781131744, "rewards/margins": 1.2898125648498535, "rewards/rejected": -1.3546701669692993, "step": 978 }, { "epoch": 0.052618849265002286, "grad_norm": 7.260446548461914, "learning_rate": 1.6992302363341704e-09, "logits/chosen": -0.44648218154907227, "logits/rejected": -1.217307209968567, "logps/chosen": -84.5563735961914, "logps/rejected": -125.43209075927734, "loss": 0.5488, "rewards/accuracies": 0.875, "rewards/chosen": 0.003940872848033905, "rewards/margins": 1.419123888015747, "rewards/rejected": -1.4151828289031982, "step": 979 }, { "epoch": 0.052672596812770416, "grad_norm": 8.06701374053955, "learning_rate": 1.541333133436018e-09, "logits/chosen": -0.8114326596260071, "logits/rejected": -0.854230523109436, "logps/chosen": -73.72830200195312, "logps/rejected": -91.02816772460938, "loss": 0.8833, "rewards/accuracies": 0.875, "rewards/chosen": -0.028093863278627396, "rewards/margins": 0.688884437084198, "rewards/rejected": -0.7169783115386963, "step": 980 }, { "epoch": 0.05272634436053855, "grad_norm": 9.996211051940918, "learning_rate": 1.3911228798036412e-09, "logits/chosen": -0.562085747718811, "logits/rejected": -0.7453323006629944, "logps/chosen": -93.26527404785156, "logps/rejected": -139.08746337890625, "loss": 0.7853, "rewards/accuracies": 0.75, "rewards/chosen": 0.0073968395590782166, "rewards/margins": 1.1481304168701172, "rewards/rejected": -1.1407335996627808, "step": 981 }, { "epoch": 0.05278009190830668, "grad_norm": 10.70457649230957, "learning_rate": 1.2486017918649783e-09, "logits/chosen": -0.6483954191207886, "logits/rejected": -0.9585478901863098, "logps/chosen": -88.88848876953125, "logps/rejected": -148.72268676757812, "loss": 0.7223, "rewards/accuracies": 0.75, "rewards/chosen": -0.07351746410131454, "rewards/margins": 1.3939001560211182, "rewards/rejected": -1.4674174785614014, "step": 982 }, { "epoch": 0.05283383945607482, "grad_norm": 8.314146041870117, "learning_rate": 1.1137720674714301e-09, "logits/chosen": -0.7447327971458435, "logits/rejected": -0.8045557737350464, "logps/chosen": -70.65321350097656, "logps/rejected": -86.91586303710938, "loss": 0.8339, "rewards/accuracies": 0.875, "rewards/chosen": 0.0366002693772316, "rewards/margins": 0.7728639841079712, "rewards/rejected": -0.7362637519836426, "step": 983 }, { "epoch": 0.05288758700384295, "grad_norm": 13.401081085205078, "learning_rate": 9.866357858642205e-10, "logits/chosen": -0.660984456539154, "logits/rejected": -0.7600001096725464, "logps/chosen": -63.35674285888672, "logps/rejected": -67.97764587402344, "loss": 1.162, "rewards/accuracies": 0.75, "rewards/chosen": -0.46583035588264465, "rewards/margins": 0.389530211687088, "rewards/rejected": -0.8553605079650879, "step": 984 }, { "epoch": 0.05294133455161108, "grad_norm": 7.4787821769714355, "learning_rate": 8.671949076420881e-10, "logits/chosen": -0.9439188241958618, "logits/rejected": -0.8261070847511292, "logps/chosen": -100.11639404296875, "logps/rejected": -177.27474975585938, "loss": 0.337, "rewards/accuracies": 1.0, "rewards/chosen": -0.3358500599861145, "rewards/margins": 2.0192954540252686, "rewards/rejected": -2.3551454544067383, "step": 985 }, { "epoch": 0.05299508209937922, "grad_norm": 6.971595287322998, "learning_rate": 7.554512747310338e-10, "logits/chosen": -0.7997468709945679, "logits/rejected": -0.6261577010154724, "logps/chosen": -85.90911865234375, "logps/rejected": -114.58788299560547, "loss": 0.7002, "rewards/accuracies": 0.875, "rewards/chosen": -0.1808246672153473, "rewards/margins": 1.165442943572998, "rewards/rejected": -1.346267580986023, "step": 986 }, { "epoch": 0.05304882964714735, "grad_norm": 6.702908039093018, "learning_rate": 6.51406610356231e-10, "logits/chosen": -0.7664769887924194, "logits/rejected": -0.9270436763763428, "logps/chosen": -90.60813903808594, "logps/rejected": -158.26040649414062, "loss": 0.4324, "rewards/accuracies": 1.0, "rewards/chosen": 0.032196372747421265, "rewards/margins": 1.9036833047866821, "rewards/rejected": -1.8714869022369385, "step": 987 }, { "epoch": 0.053102577194915485, "grad_norm": 6.807716369628906, "learning_rate": 5.550625190150482e-10, "logits/chosen": -0.7439359426498413, "logits/rejected": -0.8750110864639282, "logps/chosen": -86.10722351074219, "logps/rejected": -150.8014373779297, "loss": 0.4715, "rewards/accuracies": 1.0, "rewards/chosen": -0.13370347023010254, "rewards/margins": 1.6160330772399902, "rewards/rejected": -1.7497365474700928, "step": 988 }, { "epoch": 0.053156324742683614, "grad_norm": 9.913330078125, "learning_rate": 4.664204864525123e-10, "logits/chosen": -0.7873706221580505, "logits/rejected": -0.953166127204895, "logps/chosen": -105.70438385009766, "logps/rejected": -156.42587280273438, "loss": 0.5164, "rewards/accuracies": 1.0, "rewards/chosen": -0.5507638454437256, "rewards/margins": 1.474498987197876, "rewards/rejected": -2.0252628326416016, "step": 989 }, { "epoch": 0.05321007229045175, "grad_norm": 5.959756851196289, "learning_rate": 3.854818796385495e-10, "logits/chosen": -0.6816747784614563, "logits/rejected": -1.0337260961532593, "logps/chosen": -88.51079559326172, "logps/rejected": -131.30088806152344, "loss": 0.5362, "rewards/accuracies": 1.0, "rewards/chosen": -0.1873810887336731, "rewards/margins": 1.305304765701294, "rewards/rejected": -1.4926856756210327, "step": 990 }, { "epoch": 0.05326381983821988, "grad_norm": 8.25664234161377, "learning_rate": 3.1224794674650225e-10, "logits/chosen": -0.6520593166351318, "logits/rejected": -0.7987700700759888, "logps/chosen": -71.85877990722656, "logps/rejected": -108.78142547607422, "loss": 0.9606, "rewards/accuracies": 0.875, "rewards/chosen": -0.05954772233963013, "rewards/margins": 0.8478366136550903, "rewards/rejected": -0.9073843955993652, "step": 991 }, { "epoch": 0.053317567385988016, "grad_norm": 7.128324031829834, "learning_rate": 2.4671981713419997e-10, "logits/chosen": -0.6963656544685364, "logits/rejected": -0.8854733109474182, "logps/chosen": -77.96942138671875, "logps/rejected": -126.25082397460938, "loss": 0.5986, "rewards/accuracies": 1.0, "rewards/chosen": -0.04366188496351242, "rewards/margins": 1.3107225894927979, "rewards/rejected": -1.3543845415115356, "step": 992 }, { "epoch": 0.053371314933756145, "grad_norm": 6.389100551605225, "learning_rate": 1.8889850132658424e-10, "logits/chosen": -0.6212677955627441, "logits/rejected": -0.7949864864349365, "logps/chosen": -94.80404663085938, "logps/rejected": -113.58343505859375, "loss": 0.5483, "rewards/accuracies": 1.0, "rewards/chosen": 0.14776155352592468, "rewards/margins": 1.291280746459961, "rewards/rejected": -1.1435191631317139, "step": 993 }, { "epoch": 0.05342506248152428, "grad_norm": 9.829010009765625, "learning_rate": 1.3878489099972134e-10, "logits/chosen": -0.7753487825393677, "logits/rejected": -0.8770256638526917, "logps/chosen": -109.01293182373047, "logps/rejected": -140.13206481933594, "loss": 0.6519, "rewards/accuracies": 0.75, "rewards/chosen": -0.31612417101860046, "rewards/margins": 1.4228726625442505, "rewards/rejected": -1.7389967441558838, "step": 994 }, { "epoch": 0.05347881002929241, "grad_norm": 6.780810832977295, "learning_rate": 9.637975896759076e-11, "logits/chosen": -0.518179178237915, "logits/rejected": -0.8447328209877014, "logps/chosen": -95.39848327636719, "logps/rejected": -140.66976928710938, "loss": 0.5064, "rewards/accuracies": 1.0, "rewards/chosen": -0.20970793068408966, "rewards/margins": 1.622942566871643, "rewards/rejected": -1.8326504230499268, "step": 995 }, { "epoch": 0.05353255757706055, "grad_norm": 9.213013648986816, "learning_rate": 6.168375916970614e-11, "logits/chosen": -0.7195215225219727, "logits/rejected": -0.9501616358757019, "logps/chosen": -103.54450225830078, "logps/rejected": -171.6224365234375, "loss": 0.5208, "rewards/accuracies": 0.875, "rewards/chosen": -0.5024901628494263, "rewards/margins": 1.5275843143463135, "rewards/rejected": -2.03007435798645, "step": 996 }, { "epoch": 0.05358630512482868, "grad_norm": 7.375332355499268, "learning_rate": 3.4697426661345344e-11, "logits/chosen": -0.6305344700813293, "logits/rejected": -0.8296784162521362, "logps/chosen": -64.22871398925781, "logps/rejected": -107.68180084228516, "loss": 0.5958, "rewards/accuracies": 1.0, "rewards/chosen": 0.08493919670581818, "rewards/margins": 1.2428326606750488, "rewards/rejected": -1.157893419265747, "step": 997 }, { "epoch": 0.05364005267259681, "grad_norm": 8.552000045776367, "learning_rate": 1.5421177605168255e-11, "logits/chosen": -0.7244608402252197, "logits/rejected": -0.8031644225120544, "logps/chosen": -99.22880554199219, "logps/rejected": -176.34869384765625, "loss": 0.4568, "rewards/accuracies": 0.875, "rewards/chosen": -0.1341504007577896, "rewards/margins": 2.2987401485443115, "rewards/rejected": -2.4328904151916504, "step": 998 }, { "epoch": 0.05369380022036495, "grad_norm": 8.966371536254883, "learning_rate": 3.855309264721995e-12, "logits/chosen": -0.7010620832443237, "logits/rejected": -0.83616042137146, "logps/chosen": -86.00282287597656, "logps/rejected": -130.06333923339844, "loss": 0.5859, "rewards/accuracies": 1.0, "rewards/chosen": -0.12561750411987305, "rewards/margins": 1.2360445261001587, "rewards/rejected": -1.3616620302200317, "step": 999 }, { "epoch": 0.05374754776813308, "grad_norm": 9.37076473236084, "learning_rate": 0.0, "logits/chosen": -0.6493831872940063, "logits/rejected": -0.8236144781112671, "logps/chosen": -93.28886413574219, "logps/rejected": -141.2165985107422, "loss": 0.6991, "rewards/accuracies": 0.875, "rewards/chosen": -0.255878746509552, "rewards/margins": 1.2940934896469116, "rewards/rejected": -1.5499721765518188, "step": 1000 } ], "logging_steps": 1, "max_steps": 1000, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }