MathOctopus-MAPO-DPO-7B / trainer_state.json
VincentVioletLx
commit from VincentLx
f05cfb9
{
"best_metric": 0.6825469136238098,
"best_model_checkpoint": "/mnt/data/shesj/Trained/RL4CoT/DPO/Parallel_Iter2_numglueCorrect_iter2_10lang.json/checkpoint-200",
"epoch": 0.050327126321087066,
"eval_steps": 100,
"global_step": 200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"learning_rate": 5e-08,
"logits/chosen": -0.7881901264190674,
"logits/rejected": -0.7754368782043457,
"logps/chosen": -5.556678295135498,
"logps/rejected": -8.082754135131836,
"loss": 0.693,
"rewards/accuracies": 0.3187499940395355,
"rewards/chosen": 0.0005767763941548765,
"rewards/margins": -0.000614482443779707,
"rewards/rejected": 0.0011912587797269225,
"step": 5
},
{
"epoch": 0.0,
"learning_rate": 1e-07,
"logits/chosen": -0.7774807214736938,
"logits/rejected": -0.7521709203720093,
"logps/chosen": -6.2856526374816895,
"logps/rejected": -7.786572456359863,
"loss": 0.6935,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.0011454308405518532,
"rewards/margins": 0.002339282538741827,
"rewards/rejected": -0.003484714310616255,
"step": 10
},
{
"epoch": 0.0,
"learning_rate": 1.5e-07,
"logits/chosen": -0.7695692777633667,
"logits/rejected": -0.7617800831794739,
"logps/chosen": -5.672076225280762,
"logps/rejected": -7.90362548828125,
"loss": 0.6935,
"rewards/accuracies": 0.4124999940395355,
"rewards/chosen": -0.004858463071286678,
"rewards/margins": -0.00798516534268856,
"rewards/rejected": 0.003126702504232526,
"step": 15
},
{
"epoch": 0.01,
"learning_rate": 2e-07,
"logits/chosen": -0.8169188499450684,
"logits/rejected": -0.8234481811523438,
"logps/chosen": -5.951030731201172,
"logps/rejected": -7.665135383605957,
"loss": 0.6924,
"rewards/accuracies": 0.4749999940395355,
"rewards/chosen": -0.0009545508655719459,
"rewards/margins": -0.0022635911591351032,
"rewards/rejected": 0.00130904046818614,
"step": 20
},
{
"epoch": 0.01,
"learning_rate": 2.5e-07,
"logits/chosen": -0.7988893389701843,
"logits/rejected": -0.7831005454063416,
"logps/chosen": -4.960128307342529,
"logps/rejected": -7.793705940246582,
"loss": 0.6929,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": 0.0017494624480605125,
"rewards/margins": 0.0019936964381486177,
"rewards/rejected": -0.0002442340482957661,
"step": 25
},
{
"epoch": 0.01,
"learning_rate": 3e-07,
"logits/chosen": -0.7896796464920044,
"logits/rejected": -0.7605875730514526,
"logps/chosen": -6.406218528747559,
"logps/rejected": -8.445697784423828,
"loss": 0.6923,
"rewards/accuracies": 0.46875,
"rewards/chosen": -0.0006792292697355151,
"rewards/margins": -0.0016146342968568206,
"rewards/rejected": 0.0009354048524983227,
"step": 30
},
{
"epoch": 0.01,
"learning_rate": 3.5e-07,
"logits/chosen": -0.8104821443557739,
"logits/rejected": -0.7983841896057129,
"logps/chosen": -6.952303409576416,
"logps/rejected": -8.65689754486084,
"loss": 0.6926,
"rewards/accuracies": 0.5062500238418579,
"rewards/chosen": 0.0003856793628074229,
"rewards/margins": 0.0037465274799615145,
"rewards/rejected": -0.0033608481753617525,
"step": 35
},
{
"epoch": 0.01,
"learning_rate": 4e-07,
"logits/chosen": -0.8198621869087219,
"logits/rejected": -0.8019220232963562,
"logps/chosen": -6.161223888397217,
"logps/rejected": -7.956850528717041,
"loss": 0.6927,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": 0.001837434945628047,
"rewards/margins": 0.005904150195419788,
"rewards/rejected": -0.004066715482622385,
"step": 40
},
{
"epoch": 0.01,
"learning_rate": 4.5e-07,
"logits/chosen": -0.7631333470344543,
"logits/rejected": -0.7561143636703491,
"logps/chosen": -5.855575084686279,
"logps/rejected": -7.01950740814209,
"loss": 0.6913,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": 0.0016343919560313225,
"rewards/margins": 0.005311951506882906,
"rewards/rejected": -0.0036775595508515835,
"step": 45
},
{
"epoch": 0.01,
"learning_rate": 5e-07,
"logits/chosen": -0.7467092871665955,
"logits/rejected": -0.7552592754364014,
"logps/chosen": -7.219940185546875,
"logps/rejected": -7.984251976013184,
"loss": 0.6907,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": 0.0006344284047372639,
"rewards/margins": 0.0040216282941401005,
"rewards/rejected": -0.003387199714779854,
"step": 50
},
{
"epoch": 0.01,
"learning_rate": 5.5e-07,
"logits/chosen": -0.8183493614196777,
"logits/rejected": -0.8048542737960815,
"logps/chosen": -5.986401557922363,
"logps/rejected": -7.050605773925781,
"loss": 0.6903,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.0044477893970906734,
"rewards/margins": 0.013396045193076134,
"rewards/rejected": -0.008948257192969322,
"step": 55
},
{
"epoch": 0.02,
"learning_rate": 6e-07,
"logits/chosen": -0.7246443033218384,
"logits/rejected": -0.7153327465057373,
"logps/chosen": -6.37067985534668,
"logps/rejected": -7.855441093444824,
"loss": 0.69,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.0028912366833537817,
"rewards/margins": 0.0029723027255386114,
"rewards/rejected": -0.005863540340214968,
"step": 60
},
{
"epoch": 0.02,
"learning_rate": 6.5e-07,
"logits/chosen": -0.7883706092834473,
"logits/rejected": -0.7892045974731445,
"logps/chosen": -5.0366129875183105,
"logps/rejected": -6.685678005218506,
"loss": 0.689,
"rewards/accuracies": 0.5625,
"rewards/chosen": 0.003989654593169689,
"rewards/margins": 0.0065727815963327885,
"rewards/rejected": -0.002583127235993743,
"step": 65
},
{
"epoch": 0.02,
"learning_rate": 7e-07,
"logits/chosen": -0.7610381245613098,
"logits/rejected": -0.767534613609314,
"logps/chosen": -6.8763604164123535,
"logps/rejected": -8.272597312927246,
"loss": 0.687,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.0005852003814652562,
"rewards/margins": 0.012984293513000011,
"rewards/rejected": -0.013569491915404797,
"step": 70
},
{
"epoch": 0.02,
"learning_rate": 7.5e-07,
"logits/chosen": -0.7938845753669739,
"logits/rejected": -0.7884698510169983,
"logps/chosen": -6.220009803771973,
"logps/rejected": -7.81838321685791,
"loss": 0.685,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.008494245819747448,
"rewards/margins": 0.02036314085125923,
"rewards/rejected": -0.011868895962834358,
"step": 75
},
{
"epoch": 0.02,
"learning_rate": 8e-07,
"logits/chosen": -0.760898232460022,
"logits/rejected": -0.7529922127723694,
"logps/chosen": -6.070019245147705,
"logps/rejected": -8.474264144897461,
"loss": 0.6809,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.0008083779248408973,
"rewards/margins": 0.0290432907640934,
"rewards/rejected": -0.029851669445633888,
"step": 80
},
{
"epoch": 0.02,
"learning_rate": 8.499999999999999e-07,
"logits/chosen": -0.8255828619003296,
"logits/rejected": -0.8029024004936218,
"logps/chosen": -5.739585876464844,
"logps/rejected": -8.894620895385742,
"loss": 0.681,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": 0.0006468339124694467,
"rewards/margins": 0.039313118904829025,
"rewards/rejected": -0.03866628557443619,
"step": 85
},
{
"epoch": 0.02,
"learning_rate": 9e-07,
"logits/chosen": -0.8031052350997925,
"logits/rejected": -0.7612560987472534,
"logps/chosen": -6.660666465759277,
"logps/rejected": -10.91639232635498,
"loss": 0.677,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.008988827466964722,
"rewards/margins": 0.03577885776758194,
"rewards/rejected": -0.04476768523454666,
"step": 90
},
{
"epoch": 0.02,
"learning_rate": 9.499999999999999e-07,
"logits/chosen": -0.8087406158447266,
"logits/rejected": -0.7717125415802002,
"logps/chosen": -6.990227699279785,
"logps/rejected": -10.181965827941895,
"loss": 0.6766,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -0.017899103462696075,
"rewards/margins": 0.03709184005856514,
"rewards/rejected": -0.054990947246551514,
"step": 95
},
{
"epoch": 0.03,
"learning_rate": 1e-06,
"logits/chosen": -0.7920883297920227,
"logits/rejected": -0.7615999579429626,
"logps/chosen": -7.010110378265381,
"logps/rejected": -8.589981079101562,
"loss": 0.6742,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -0.01662164181470871,
"rewards/margins": 0.04322618246078491,
"rewards/rejected": -0.05984782055020332,
"step": 100
},
{
"epoch": 0.03,
"eval_logits/chosen": -1.2141907215118408,
"eval_logits/rejected": -1.2049294710159302,
"eval_logps/chosen": -6.552766799926758,
"eval_logps/rejected": -8.47075366973877,
"eval_loss": 0.6869122385978699,
"eval_rewards/accuracies": 0.5723472833633423,
"eval_rewards/chosen": -0.021150289103388786,
"eval_rewards/margins": 0.02127229794859886,
"eval_rewards/rejected": -0.0424225889146328,
"eval_runtime": 628.2123,
"eval_samples_per_second": 31.588,
"eval_steps_per_second": 0.495,
"step": 100
},
{
"epoch": 0.03,
"learning_rate": 9.999829128320873e-07,
"logits/chosen": -0.7386836409568787,
"logits/rejected": -0.7065194845199585,
"logps/chosen": -7.015887260437012,
"logps/rejected": -8.969260215759277,
"loss": 0.6691,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -0.02153836190700531,
"rewards/margins": 0.05296989530324936,
"rewards/rejected": -0.07450826466083527,
"step": 105
},
{
"epoch": 0.03,
"learning_rate": 9.999316524962345e-07,
"logits/chosen": -0.8299457430839539,
"logits/rejected": -0.8254146575927734,
"logps/chosen": -6.386677265167236,
"logps/rejected": -8.159158706665039,
"loss": 0.6626,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.019821835681796074,
"rewards/margins": 0.08776978403329849,
"rewards/rejected": -0.10759161412715912,
"step": 110
},
{
"epoch": 0.03,
"learning_rate": 9.998462224960173e-07,
"logits/chosen": -0.7512461543083191,
"logits/rejected": -0.7053896188735962,
"logps/chosen": -7.265576362609863,
"logps/rejected": -10.415300369262695,
"loss": 0.6565,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -0.04903126507997513,
"rewards/margins": 0.10108338296413422,
"rewards/rejected": -0.15011465549468994,
"step": 115
},
{
"epoch": 0.03,
"learning_rate": 9.99726628670463e-07,
"logits/chosen": -0.8122448921203613,
"logits/rejected": -0.7945531010627747,
"logps/chosen": -6.279524326324463,
"logps/rejected": -8.167196273803711,
"loss": 0.6583,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -0.04174378514289856,
"rewards/margins": 0.07122843712568283,
"rewards/rejected": -0.11297222226858139,
"step": 120
},
{
"epoch": 0.03,
"learning_rate": 9.995728791936505e-07,
"logits/chosen": -0.7480685114860535,
"logits/rejected": -0.7061656713485718,
"logps/chosen": -6.890868186950684,
"logps/rejected": -10.21199893951416,
"loss": 0.651,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.07250909507274628,
"rewards/margins": 0.08439986407756805,
"rewards/rejected": -0.15690895915031433,
"step": 125
},
{
"epoch": 0.03,
"learning_rate": 9.993849845741523e-07,
"logits/chosen": -0.7156326174736023,
"logits/rejected": -0.7210611701011658,
"logps/chosen": -7.967876434326172,
"logps/rejected": -11.226155281066895,
"loss": 0.6563,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.0788055881857872,
"rewards/margins": 0.14646434783935547,
"rewards/rejected": -0.22526994347572327,
"step": 130
},
{
"epoch": 0.03,
"learning_rate": 9.991629576543163e-07,
"logits/chosen": -0.8028038740158081,
"logits/rejected": -0.7880641222000122,
"logps/chosen": -7.9293036460876465,
"logps/rejected": -12.446617126464844,
"loss": 0.6393,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -0.07287696748971939,
"rewards/margins": 0.16767558455467224,
"rewards/rejected": -0.24055257439613342,
"step": 135
},
{
"epoch": 0.04,
"learning_rate": 9.989068136093872e-07,
"logits/chosen": -0.6804400682449341,
"logits/rejected": -0.6678518056869507,
"logps/chosen": -7.790997505187988,
"logps/rejected": -10.521781921386719,
"loss": 0.6429,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.10002864897251129,
"rewards/margins": 0.16521799564361572,
"rewards/rejected": -0.2652466297149658,
"step": 140
},
{
"epoch": 0.04,
"learning_rate": 9.986165699464705e-07,
"logits/chosen": -0.7420132160186768,
"logits/rejected": -0.7359737157821655,
"logps/chosen": -7.692935943603516,
"logps/rejected": -11.80825138092041,
"loss": 0.6275,
"rewards/accuracies": 0.71875,
"rewards/chosen": -0.12237729877233505,
"rewards/margins": 0.2165246307849884,
"rewards/rejected": -0.33890193700790405,
"step": 145
},
{
"epoch": 0.04,
"learning_rate": 9.982922465033348e-07,
"logits/chosen": -0.6650699377059937,
"logits/rejected": -0.6643859148025513,
"logps/chosen": -8.27735710144043,
"logps/rejected": -11.08592700958252,
"loss": 0.6316,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.18646416068077087,
"rewards/margins": 0.17933328449726105,
"rewards/rejected": -0.3657974600791931,
"step": 150
},
{
"epoch": 0.04,
"learning_rate": 9.979338654470567e-07,
"logits/chosen": -0.6874249577522278,
"logits/rejected": -0.6583540439605713,
"logps/chosen": -8.387764930725098,
"logps/rejected": -10.597826957702637,
"loss": 0.6355,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -0.15453846752643585,
"rewards/margins": 0.15605905652046204,
"rewards/rejected": -0.3105975389480591,
"step": 155
},
{
"epoch": 0.04,
"learning_rate": 9.975414512725056e-07,
"logits/chosen": -0.6604259610176086,
"logits/rejected": -0.654133677482605,
"logps/chosen": -8.139281272888184,
"logps/rejected": -11.677125930786133,
"loss": 0.6281,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -0.1922483593225479,
"rewards/margins": 0.17367199063301086,
"rewards/rejected": -0.36592036485671997,
"step": 160
},
{
"epoch": 0.04,
"learning_rate": 9.971150308006687e-07,
"logits/chosen": -0.6868435144424438,
"logits/rejected": -0.6831103563308716,
"logps/chosen": -7.650822639465332,
"logps/rejected": -13.520294189453125,
"loss": 0.6184,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.1557883322238922,
"rewards/margins": 0.309310644865036,
"rewards/rejected": -0.4650990068912506,
"step": 165
},
{
"epoch": 0.04,
"learning_rate": 9.966546331768192e-07,
"logits/chosen": -0.6930921673774719,
"logits/rejected": -0.656936764717102,
"logps/chosen": -7.236788749694824,
"logps/rejected": -12.021242141723633,
"loss": 0.617,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.16039922833442688,
"rewards/margins": 0.18796458840370178,
"rewards/rejected": -0.34836381673812866,
"step": 170
},
{
"epoch": 0.04,
"learning_rate": 9.961602898685223e-07,
"logits/chosen": -0.6678417921066284,
"logits/rejected": -0.6494520306587219,
"logps/chosen": -8.197237014770508,
"logps/rejected": -13.004777908325195,
"loss": 0.6192,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -0.16785219311714172,
"rewards/margins": 0.2895987629890442,
"rewards/rejected": -0.4574509561061859,
"step": 175
},
{
"epoch": 0.05,
"learning_rate": 9.956320346634875e-07,
"logits/chosen": -0.6635026931762695,
"logits/rejected": -0.6535638570785522,
"logps/chosen": -8.445914268493652,
"logps/rejected": -14.642396926879883,
"loss": 0.6054,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": -0.2178197205066681,
"rewards/margins": 0.33262819051742554,
"rewards/rejected": -0.5504478812217712,
"step": 180
},
{
"epoch": 0.05,
"learning_rate": 9.95069903667256e-07,
"logits/chosen": -0.6291212439537048,
"logits/rejected": -0.5968618392944336,
"logps/chosen": -8.441099166870117,
"logps/rejected": -13.59777545928955,
"loss": 0.6019,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -0.262218177318573,
"rewards/margins": 0.2801818251609802,
"rewards/rejected": -0.5424000024795532,
"step": 185
},
{
"epoch": 0.05,
"learning_rate": 9.944739353007341e-07,
"logits/chosen": -0.6783192753791809,
"logits/rejected": -0.6327847242355347,
"logps/chosen": -8.718297004699707,
"logps/rejected": -15.599523544311523,
"loss": 0.5953,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -0.2528052031993866,
"rewards/margins": 0.31637701392173767,
"rewards/rejected": -0.569182276725769,
"step": 190
},
{
"epoch": 0.05,
"learning_rate": 9.938441702975689e-07,
"logits/chosen": -0.6378843784332275,
"logits/rejected": -0.6440542936325073,
"logps/chosen": -9.802359580993652,
"logps/rejected": -14.98701286315918,
"loss": 0.5907,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -0.3599366247653961,
"rewards/margins": 0.2988061010837555,
"rewards/rejected": -0.6587426066398621,
"step": 195
},
{
"epoch": 0.05,
"learning_rate": 9.931806517013612e-07,
"logits/chosen": -0.6049096584320068,
"logits/rejected": -0.6118007302284241,
"logps/chosen": -7.990042686462402,
"logps/rejected": -13.457636833190918,
"loss": 0.5959,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.28124743700027466,
"rewards/margins": 0.39520224928855896,
"rewards/rejected": -0.6764496564865112,
"step": 200
},
{
"epoch": 0.05,
"eval_logits/chosen": -1.100651502609253,
"eval_logits/rejected": -1.090796947479248,
"eval_logps/chosen": -9.126388549804688,
"eval_logps/rejected": -11.862701416015625,
"eval_loss": 0.6825469136238098,
"eval_rewards/accuracies": 0.5799839496612549,
"eval_rewards/chosen": -0.27851250767707825,
"eval_rewards/margins": 0.10310473293066025,
"eval_rewards/rejected": -0.3816172480583191,
"eval_runtime": 646.4588,
"eval_samples_per_second": 30.696,
"eval_steps_per_second": 0.481,
"step": 200
}
],
"logging_steps": 5,
"max_steps": 2000,
"num_train_epochs": 1,
"save_steps": 100,
"total_flos": 0.0,
"trial_name": null,
"trial_params": null
}