{ "best_metric": 0.6825469136238098, "best_model_checkpoint": "/mnt/data/shesj/Trained/RL4CoT/DPO/Parallel_Iter2_numglueCorrect_iter2_10lang.json/checkpoint-200", "epoch": 0.050327126321087066, "eval_steps": 100, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 5e-08, "logits/chosen": -0.7881901264190674, "logits/rejected": -0.7754368782043457, "logps/chosen": -5.556678295135498, "logps/rejected": -8.082754135131836, "loss": 0.693, "rewards/accuracies": 0.3187499940395355, "rewards/chosen": 0.0005767763941548765, "rewards/margins": -0.000614482443779707, "rewards/rejected": 0.0011912587797269225, "step": 5 }, { "epoch": 0.0, "learning_rate": 1e-07, "logits/chosen": -0.7774807214736938, "logits/rejected": -0.7521709203720093, "logps/chosen": -6.2856526374816895, "logps/rejected": -7.786572456359863, "loss": 0.6935, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0011454308405518532, "rewards/margins": 0.002339282538741827, "rewards/rejected": -0.003484714310616255, "step": 10 }, { "epoch": 0.0, "learning_rate": 1.5e-07, "logits/chosen": -0.7695692777633667, "logits/rejected": -0.7617800831794739, "logps/chosen": -5.672076225280762, "logps/rejected": -7.90362548828125, "loss": 0.6935, "rewards/accuracies": 0.4124999940395355, "rewards/chosen": -0.004858463071286678, "rewards/margins": -0.00798516534268856, "rewards/rejected": 0.003126702504232526, "step": 15 }, { "epoch": 0.01, "learning_rate": 2e-07, "logits/chosen": -0.8169188499450684, "logits/rejected": -0.8234481811523438, "logps/chosen": -5.951030731201172, "logps/rejected": -7.665135383605957, "loss": 0.6924, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.0009545508655719459, "rewards/margins": -0.0022635911591351032, "rewards/rejected": 0.00130904046818614, "step": 20 }, { "epoch": 0.01, "learning_rate": 2.5e-07, "logits/chosen": -0.7988893389701843, "logits/rejected": -0.7831005454063416, "logps/chosen": -4.960128307342529, "logps/rejected": -7.793705940246582, "loss": 0.6929, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.0017494624480605125, "rewards/margins": 0.0019936964381486177, "rewards/rejected": -0.0002442340482957661, "step": 25 }, { "epoch": 0.01, "learning_rate": 3e-07, "logits/chosen": -0.7896796464920044, "logits/rejected": -0.7605875730514526, "logps/chosen": -6.406218528747559, "logps/rejected": -8.445697784423828, "loss": 0.6923, "rewards/accuracies": 0.46875, "rewards/chosen": -0.0006792292697355151, "rewards/margins": -0.0016146342968568206, "rewards/rejected": 0.0009354048524983227, "step": 30 }, { "epoch": 0.01, "learning_rate": 3.5e-07, "logits/chosen": -0.8104821443557739, "logits/rejected": -0.7983841896057129, "logps/chosen": -6.952303409576416, "logps/rejected": -8.65689754486084, "loss": 0.6926, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": 0.0003856793628074229, "rewards/margins": 0.0037465274799615145, "rewards/rejected": -0.0033608481753617525, "step": 35 }, { "epoch": 0.01, "learning_rate": 4e-07, "logits/chosen": -0.8198621869087219, "logits/rejected": -0.8019220232963562, "logps/chosen": -6.161223888397217, "logps/rejected": -7.956850528717041, "loss": 0.6927, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.001837434945628047, "rewards/margins": 0.005904150195419788, "rewards/rejected": -0.004066715482622385, "step": 40 }, { "epoch": 0.01, "learning_rate": 4.5e-07, "logits/chosen": -0.7631333470344543, "logits/rejected": -0.7561143636703491, "logps/chosen": -5.855575084686279, "logps/rejected": -7.01950740814209, "loss": 0.6913, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.0016343919560313225, "rewards/margins": 0.005311951506882906, "rewards/rejected": -0.0036775595508515835, "step": 45 }, { "epoch": 0.01, "learning_rate": 5e-07, "logits/chosen": -0.7467092871665955, "logits/rejected": -0.7552592754364014, "logps/chosen": -7.219940185546875, "logps/rejected": -7.984251976013184, "loss": 0.6907, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.0006344284047372639, "rewards/margins": 0.0040216282941401005, "rewards/rejected": -0.003387199714779854, "step": 50 }, { "epoch": 0.01, "learning_rate": 5.5e-07, "logits/chosen": -0.8183493614196777, "logits/rejected": -0.8048542737960815, "logps/chosen": -5.986401557922363, "logps/rejected": -7.050605773925781, "loss": 0.6903, "rewards/accuracies": 0.625, "rewards/chosen": 0.0044477893970906734, "rewards/margins": 0.013396045193076134, "rewards/rejected": -0.008948257192969322, "step": 55 }, { "epoch": 0.02, "learning_rate": 6e-07, "logits/chosen": -0.7246443033218384, "logits/rejected": -0.7153327465057373, "logps/chosen": -6.37067985534668, "logps/rejected": -7.855441093444824, "loss": 0.69, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0028912366833537817, "rewards/margins": 0.0029723027255386114, "rewards/rejected": -0.005863540340214968, "step": 60 }, { "epoch": 0.02, "learning_rate": 6.5e-07, "logits/chosen": -0.7883706092834473, "logits/rejected": -0.7892045974731445, "logps/chosen": -5.0366129875183105, "logps/rejected": -6.685678005218506, "loss": 0.689, "rewards/accuracies": 0.5625, "rewards/chosen": 0.003989654593169689, "rewards/margins": 0.0065727815963327885, "rewards/rejected": -0.002583127235993743, "step": 65 }, { "epoch": 0.02, "learning_rate": 7e-07, "logits/chosen": -0.7610381245613098, "logits/rejected": -0.767534613609314, "logps/chosen": -6.8763604164123535, "logps/rejected": -8.272597312927246, "loss": 0.687, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0005852003814652562, "rewards/margins": 0.012984293513000011, "rewards/rejected": -0.013569491915404797, "step": 70 }, { "epoch": 0.02, "learning_rate": 7.5e-07, "logits/chosen": -0.7938845753669739, "logits/rejected": -0.7884698510169983, "logps/chosen": -6.220009803771973, "logps/rejected": -7.81838321685791, "loss": 0.685, "rewards/accuracies": 0.625, "rewards/chosen": 0.008494245819747448, "rewards/margins": 0.02036314085125923, "rewards/rejected": -0.011868895962834358, "step": 75 }, { "epoch": 0.02, "learning_rate": 8e-07, "logits/chosen": -0.760898232460022, "logits/rejected": -0.7529922127723694, "logps/chosen": -6.070019245147705, "logps/rejected": -8.474264144897461, "loss": 0.6809, "rewards/accuracies": 0.625, "rewards/chosen": -0.0008083779248408973, "rewards/margins": 0.0290432907640934, "rewards/rejected": -0.029851669445633888, "step": 80 }, { "epoch": 0.02, "learning_rate": 8.499999999999999e-07, "logits/chosen": -0.8255828619003296, "logits/rejected": -0.8029024004936218, "logps/chosen": -5.739585876464844, "logps/rejected": -8.894620895385742, "loss": 0.681, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.0006468339124694467, "rewards/margins": 0.039313118904829025, "rewards/rejected": -0.03866628557443619, "step": 85 }, { "epoch": 0.02, "learning_rate": 9e-07, "logits/chosen": -0.8031052350997925, "logits/rejected": -0.7612560987472534, "logps/chosen": -6.660666465759277, "logps/rejected": -10.91639232635498, "loss": 0.677, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.008988827466964722, "rewards/margins": 0.03577885776758194, "rewards/rejected": -0.04476768523454666, "step": 90 }, { "epoch": 0.02, "learning_rate": 9.499999999999999e-07, "logits/chosen": -0.8087406158447266, "logits/rejected": -0.7717125415802002, "logps/chosen": -6.990227699279785, "logps/rejected": -10.181965827941895, "loss": 0.6766, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.017899103462696075, "rewards/margins": 0.03709184005856514, "rewards/rejected": -0.054990947246551514, "step": 95 }, { "epoch": 0.03, "learning_rate": 1e-06, "logits/chosen": -0.7920883297920227, "logits/rejected": -0.7615999579429626, "logps/chosen": -7.010110378265381, "logps/rejected": -8.589981079101562, "loss": 0.6742, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.01662164181470871, "rewards/margins": 0.04322618246078491, "rewards/rejected": -0.05984782055020332, "step": 100 }, { "epoch": 0.03, "eval_logits/chosen": -1.2141907215118408, "eval_logits/rejected": -1.2049294710159302, "eval_logps/chosen": -6.552766799926758, "eval_logps/rejected": -8.47075366973877, "eval_loss": 0.6869122385978699, "eval_rewards/accuracies": 0.5723472833633423, "eval_rewards/chosen": -0.021150289103388786, "eval_rewards/margins": 0.02127229794859886, "eval_rewards/rejected": -0.0424225889146328, "eval_runtime": 628.2123, "eval_samples_per_second": 31.588, "eval_steps_per_second": 0.495, "step": 100 }, { "epoch": 0.03, "learning_rate": 9.999829128320873e-07, "logits/chosen": -0.7386836409568787, "logits/rejected": -0.7065194845199585, "logps/chosen": -7.015887260437012, "logps/rejected": -8.969260215759277, "loss": 0.6691, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.02153836190700531, "rewards/margins": 0.05296989530324936, "rewards/rejected": -0.07450826466083527, "step": 105 }, { "epoch": 0.03, "learning_rate": 9.999316524962345e-07, "logits/chosen": -0.8299457430839539, "logits/rejected": -0.8254146575927734, "logps/chosen": -6.386677265167236, "logps/rejected": -8.159158706665039, "loss": 0.6626, "rewards/accuracies": 0.625, "rewards/chosen": -0.019821835681796074, "rewards/margins": 0.08776978403329849, "rewards/rejected": -0.10759161412715912, "step": 110 }, { "epoch": 0.03, "learning_rate": 9.998462224960173e-07, "logits/chosen": -0.7512461543083191, "logits/rejected": -0.7053896188735962, "logps/chosen": -7.265576362609863, "logps/rejected": -10.415300369262695, "loss": 0.6565, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.04903126507997513, "rewards/margins": 0.10108338296413422, "rewards/rejected": -0.15011465549468994, "step": 115 }, { "epoch": 0.03, "learning_rate": 9.99726628670463e-07, "logits/chosen": -0.8122448921203613, "logits/rejected": -0.7945531010627747, "logps/chosen": -6.279524326324463, "logps/rejected": -8.167196273803711, "loss": 0.6583, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.04174378514289856, "rewards/margins": 0.07122843712568283, "rewards/rejected": -0.11297222226858139, "step": 120 }, { "epoch": 0.03, "learning_rate": 9.995728791936505e-07, "logits/chosen": -0.7480685114860535, "logits/rejected": -0.7061656713485718, "logps/chosen": -6.890868186950684, "logps/rejected": -10.21199893951416, "loss": 0.651, "rewards/accuracies": 0.625, "rewards/chosen": -0.07250909507274628, "rewards/margins": 0.08439986407756805, "rewards/rejected": -0.15690895915031433, "step": 125 }, { "epoch": 0.03, "learning_rate": 9.993849845741523e-07, "logits/chosen": -0.7156326174736023, "logits/rejected": -0.7210611701011658, "logps/chosen": -7.967876434326172, "logps/rejected": -11.226155281066895, "loss": 0.6563, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.0788055881857872, "rewards/margins": 0.14646434783935547, "rewards/rejected": -0.22526994347572327, "step": 130 }, { "epoch": 0.03, "learning_rate": 9.991629576543163e-07, "logits/chosen": -0.8028038740158081, "logits/rejected": -0.7880641222000122, "logps/chosen": -7.9293036460876465, "logps/rejected": -12.446617126464844, "loss": 0.6393, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.07287696748971939, "rewards/margins": 0.16767558455467224, "rewards/rejected": -0.24055257439613342, "step": 135 }, { "epoch": 0.04, "learning_rate": 9.989068136093872e-07, "logits/chosen": -0.6804400682449341, "logits/rejected": -0.6678518056869507, "logps/chosen": -7.790997505187988, "logps/rejected": -10.521781921386719, "loss": 0.6429, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.10002864897251129, "rewards/margins": 0.16521799564361572, "rewards/rejected": -0.2652466297149658, "step": 140 }, { "epoch": 0.04, "learning_rate": 9.986165699464705e-07, "logits/chosen": -0.7420132160186768, "logits/rejected": -0.7359737157821655, "logps/chosen": -7.692935943603516, "logps/rejected": -11.80825138092041, "loss": 0.6275, "rewards/accuracies": 0.71875, "rewards/chosen": -0.12237729877233505, "rewards/margins": 0.2165246307849884, "rewards/rejected": -0.33890193700790405, "step": 145 }, { "epoch": 0.04, "learning_rate": 9.982922465033348e-07, "logits/chosen": -0.6650699377059937, "logits/rejected": -0.6643859148025513, "logps/chosen": -8.27735710144043, "logps/rejected": -11.08592700958252, "loss": 0.6316, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.18646416068077087, "rewards/margins": 0.17933328449726105, "rewards/rejected": -0.3657974600791931, "step": 150 }, { "epoch": 0.04, "learning_rate": 9.979338654470567e-07, "logits/chosen": -0.6874249577522278, "logits/rejected": -0.6583540439605713, "logps/chosen": -8.387764930725098, "logps/rejected": -10.597826957702637, "loss": 0.6355, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.15453846752643585, "rewards/margins": 0.15605905652046204, "rewards/rejected": -0.3105975389480591, "step": 155 }, { "epoch": 0.04, "learning_rate": 9.975414512725056e-07, "logits/chosen": -0.6604259610176086, "logits/rejected": -0.654133677482605, "logps/chosen": -8.139281272888184, "logps/rejected": -11.677125930786133, "loss": 0.6281, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.1922483593225479, "rewards/margins": 0.17367199063301086, "rewards/rejected": -0.36592036485671997, "step": 160 }, { "epoch": 0.04, "learning_rate": 9.971150308006687e-07, "logits/chosen": -0.6868435144424438, "logits/rejected": -0.6831103563308716, "logps/chosen": -7.650822639465332, "logps/rejected": -13.520294189453125, "loss": 0.6184, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.1557883322238922, "rewards/margins": 0.309310644865036, "rewards/rejected": -0.4650990068912506, "step": 165 }, { "epoch": 0.04, "learning_rate": 9.966546331768192e-07, "logits/chosen": -0.6930921673774719, "logits/rejected": -0.656936764717102, "logps/chosen": -7.236788749694824, "logps/rejected": -12.021242141723633, "loss": 0.617, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.16039922833442688, "rewards/margins": 0.18796458840370178, "rewards/rejected": -0.34836381673812866, "step": 170 }, { "epoch": 0.04, "learning_rate": 9.961602898685223e-07, "logits/chosen": -0.6678417921066284, "logits/rejected": -0.6494520306587219, "logps/chosen": -8.197237014770508, "logps/rejected": -13.004777908325195, "loss": 0.6192, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.16785219311714172, "rewards/margins": 0.2895987629890442, "rewards/rejected": -0.4574509561061859, "step": 175 }, { "epoch": 0.05, "learning_rate": 9.956320346634875e-07, "logits/chosen": -0.6635026931762695, "logits/rejected": -0.6535638570785522, "logps/chosen": -8.445914268493652, "logps/rejected": -14.642396926879883, "loss": 0.6054, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.2178197205066681, "rewards/margins": 0.33262819051742554, "rewards/rejected": -0.5504478812217712, "step": 180 }, { "epoch": 0.05, "learning_rate": 9.95069903667256e-07, "logits/chosen": -0.6291212439537048, "logits/rejected": -0.5968618392944336, "logps/chosen": -8.441099166870117, "logps/rejected": -13.59777545928955, "loss": 0.6019, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.262218177318573, "rewards/margins": 0.2801818251609802, "rewards/rejected": -0.5424000024795532, "step": 185 }, { "epoch": 0.05, "learning_rate": 9.944739353007341e-07, "logits/chosen": -0.6783192753791809, "logits/rejected": -0.6327847242355347, "logps/chosen": -8.718297004699707, "logps/rejected": -15.599523544311523, "loss": 0.5953, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.2528052031993866, "rewards/margins": 0.31637701392173767, "rewards/rejected": -0.569182276725769, "step": 190 }, { "epoch": 0.05, "learning_rate": 9.938441702975689e-07, "logits/chosen": -0.6378843784332275, "logits/rejected": -0.6440542936325073, "logps/chosen": -9.802359580993652, "logps/rejected": -14.98701286315918, "loss": 0.5907, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.3599366247653961, "rewards/margins": 0.2988061010837555, "rewards/rejected": -0.6587426066398621, "step": 195 }, { "epoch": 0.05, "learning_rate": 9.931806517013612e-07, "logits/chosen": -0.6049096584320068, "logits/rejected": -0.6118007302284241, "logps/chosen": -7.990042686462402, "logps/rejected": -13.457636833190918, "loss": 0.5959, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.28124743700027466, "rewards/margins": 0.39520224928855896, "rewards/rejected": -0.6764496564865112, "step": 200 }, { "epoch": 0.05, "eval_logits/chosen": -1.100651502609253, "eval_logits/rejected": -1.090796947479248, "eval_logps/chosen": -9.126388549804688, "eval_logps/rejected": -11.862701416015625, "eval_loss": 0.6825469136238098, "eval_rewards/accuracies": 0.5799839496612549, "eval_rewards/chosen": -0.27851250767707825, "eval_rewards/margins": 0.10310473293066025, "eval_rewards/rejected": -0.3816172480583191, "eval_runtime": 646.4588, "eval_samples_per_second": 30.696, "eval_steps_per_second": 0.481, "step": 200 } ], "logging_steps": 5, "max_steps": 2000, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "trial_name": null, "trial_params": null }