diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4801 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 100, + "global_step": 2632, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 0.361328125, + "learning_rate": 1.893939393939394e-09, + "logits/chosen": -1.4715663194656372, + "logits/rejected": -0.9266279935836792, + "logps/chosen": -194.24078369140625, + "logps/rejected": -198.9897003173828, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/margins_max": 0.0, + "rewards/margins_min": 0.0, + "rewards/margins_std": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 0.29296875, + "learning_rate": 1.893939393939394e-08, + "logits/chosen": -1.477033019065857, + "logits/rejected": -1.0564019680023193, + "logps/chosen": -209.50955200195312, + "logps/rejected": -199.38864135742188, + "loss": 0.693, + "rewards/accuracies": 0.4027777910232544, + "rewards/chosen": 0.00017503734852652997, + "rewards/margins": -0.0007337426068261266, + "rewards/margins_max": 0.0013889693655073643, + "rewards/margins_min": -0.002856454811990261, + "rewards/margins_std": 0.003001968376338482, + "rewards/rejected": 0.0009087801445275545, + "step": 10 + }, + { + "epoch": 0.01, + "grad_norm": 0.291015625, + "learning_rate": 3.787878787878788e-08, + "logits/chosen": -1.4086185693740845, + "logits/rejected": -0.9495820999145508, + "logps/chosen": -248.8169403076172, + "logps/rejected": -228.78634643554688, + "loss": 0.6933, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.0003385419549886137, + "rewards/margins": -0.0008140860009007156, + "rewards/margins_max": 0.0013392677064985037, + "rewards/margins_min": -0.002967439591884613, + "rewards/margins_std": 0.0030453018844127655, + "rewards/rejected": 0.0004755440168082714, + "step": 20 + }, + { + "epoch": 0.01, + "grad_norm": 0.341796875, + "learning_rate": 5.6818181818181815e-08, + "logits/chosen": -1.3442885875701904, + "logits/rejected": -1.0605539083480835, + "logps/chosen": -199.19622802734375, + "logps/rejected": -209.0339813232422, + "loss": 0.6934, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": -2.300643791386392e-05, + "rewards/margins": -0.0006394138326868415, + "rewards/margins_max": 0.0013273811200633645, + "rewards/margins_min": -0.0026062086690217257, + "rewards/margins_std": 0.0027814677450805902, + "rewards/rejected": 0.0006164073711261153, + "step": 30 + }, + { + "epoch": 0.02, + "grad_norm": 0.326171875, + "learning_rate": 7.575757575757576e-08, + "logits/chosen": -1.3370082378387451, + "logits/rejected": -1.105715036392212, + "logps/chosen": -209.1099090576172, + "logps/rejected": -232.79086303710938, + "loss": 0.6931, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.00016671940102241933, + "rewards/margins": -0.0002557897532824427, + "rewards/margins_max": 0.0017397021874785423, + "rewards/margins_min": -0.0022512818686664104, + "rewards/margins_std": 0.002822051988914609, + "rewards/rejected": 0.000422509154304862, + "step": 40 + }, + { + "epoch": 0.02, + "grad_norm": 0.2734375, + "learning_rate": 9.469696969696969e-08, + "logits/chosen": -1.3629438877105713, + "logits/rejected": -1.057975172996521, + "logps/chosen": -231.24758911132812, + "logps/rejected": -239.5913543701172, + "loss": 0.6928, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.0008590269717387855, + "rewards/margins": 0.0009242333471775055, + "rewards/margins_max": 0.003584180725738406, + "rewards/margins_min": -0.0017357139149680734, + "rewards/margins_std": 0.0037617336492985487, + "rewards/rejected": -6.520649912999943e-05, + "step": 50 + }, + { + "epoch": 0.02, + "grad_norm": 0.28515625, + "learning_rate": 1.1363636363636363e-07, + "logits/chosen": -1.3622150421142578, + "logits/rejected": -1.0315033197402954, + "logps/chosen": -199.6304473876953, + "logps/rejected": -212.83926391601562, + "loss": 0.693, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -0.0002507457393221557, + "rewards/margins": -0.0001059188725776039, + "rewards/margins_max": 0.002144067781046033, + "rewards/margins_min": -0.0023559057153761387, + "rewards/margins_std": 0.0031819615978747606, + "rewards/rejected": -0.0001448269176762551, + "step": 60 + }, + { + "epoch": 0.03, + "grad_norm": 0.28125, + "learning_rate": 1.3257575757575757e-07, + "logits/chosen": -1.379127860069275, + "logits/rejected": -1.1307542324066162, + "logps/chosen": -185.8495635986328, + "logps/rejected": -207.99673461914062, + "loss": 0.6929, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.000694580958224833, + "rewards/margins": 0.00036811447353102267, + "rewards/margins_max": 0.00218455889262259, + "rewards/margins_min": -0.0014483298873528838, + "rewards/margins_std": 0.002568840514868498, + "rewards/rejected": 0.0003264665720053017, + "step": 70 + }, + { + "epoch": 0.03, + "grad_norm": 0.25390625, + "learning_rate": 1.5151515151515152e-07, + "logits/chosen": -1.3772165775299072, + "logits/rejected": -1.1073375940322876, + "logps/chosen": -199.511474609375, + "logps/rejected": -206.6280059814453, + "loss": 0.6925, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.0008196959970518947, + "rewards/margins": 0.0012880888534709811, + "rewards/margins_max": 0.003329794155433774, + "rewards/margins_min": -0.000753616273868829, + "rewards/margins_std": 0.002887406852096319, + "rewards/rejected": -0.00046839285641908646, + "step": 80 + }, + { + "epoch": 0.03, + "grad_norm": 0.259765625, + "learning_rate": 1.7045454545454543e-07, + "logits/chosen": -1.4906318187713623, + "logits/rejected": -1.1894948482513428, + "logps/chosen": -204.3669891357422, + "logps/rejected": -220.7799072265625, + "loss": 0.692, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.0020121335983276367, + "rewards/margins": 0.002966915722936392, + "rewards/margins_max": 0.005120046902447939, + "rewards/margins_min": 0.0008137855911627412, + "rewards/margins_std": 0.0030449863988906145, + "rewards/rejected": -0.0009547824738547206, + "step": 90 + }, + { + "epoch": 0.04, + "grad_norm": 0.318359375, + "learning_rate": 1.8939393939393938e-07, + "logits/chosen": -1.436471939086914, + "logits/rejected": -1.124845266342163, + "logps/chosen": -206.4337615966797, + "logps/rejected": -215.1757049560547, + "loss": 0.6926, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.0012693710159510374, + "rewards/margins": 0.0008539790287613869, + "rewards/margins_max": 0.0031065032817423344, + "rewards/margins_min": -0.0013985451078042388, + "rewards/margins_std": 0.0031855504494160414, + "rewards/rejected": 0.0004153919289819896, + "step": 100 + }, + { + "epoch": 0.04, + "grad_norm": 0.244140625, + "learning_rate": 2.0833333333333333e-07, + "logits/chosen": -1.4335253238677979, + "logits/rejected": -1.1322035789489746, + "logps/chosen": -193.65826416015625, + "logps/rejected": -200.38711547851562, + "loss": 0.6924, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.0013359611621126533, + "rewards/margins": 0.0015128880040720105, + "rewards/margins_max": 0.004125602543354034, + "rewards/margins_min": -0.001099826768040657, + "rewards/margins_std": 0.003694936167448759, + "rewards/rejected": -0.00017692662368062884, + "step": 110 + }, + { + "epoch": 0.05, + "grad_norm": 0.2431640625, + "learning_rate": 2.2727272727272726e-07, + "logits/chosen": -1.3395394086837769, + "logits/rejected": -1.0315684080123901, + "logps/chosen": -203.34072875976562, + "logps/rejected": -192.33340454101562, + "loss": 0.6922, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.0007492561126127839, + "rewards/margins": 0.001645550481043756, + "rewards/margins_max": 0.0036673967260867357, + "rewards/margins_min": -0.00037629506550729275, + "rewards/margins_std": 0.002859321655705571, + "rewards/rejected": -0.0008962946012616158, + "step": 120 + }, + { + "epoch": 0.05, + "grad_norm": 0.3203125, + "learning_rate": 2.462121212121212e-07, + "logits/chosen": -1.4741287231445312, + "logits/rejected": -1.0452475547790527, + "logps/chosen": -252.58242797851562, + "logps/rejected": -234.0312042236328, + "loss": 0.6915, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.002638085512444377, + "rewards/margins": 0.0036562737077474594, + "rewards/margins_max": 0.0064610885456204414, + "rewards/margins_min": 0.0008514595101587474, + "rewards/margins_std": 0.003966606222093105, + "rewards/rejected": -0.0010181884281337261, + "step": 130 + }, + { + "epoch": 0.05, + "grad_norm": 0.28125, + "learning_rate": 2.6515151515151514e-07, + "logits/chosen": -1.3656947612762451, + "logits/rejected": -1.044594407081604, + "logps/chosen": -206.4570770263672, + "logps/rejected": -211.5678253173828, + "loss": 0.6914, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.0019431791733950377, + "rewards/margins": 0.0030987593345344067, + "rewards/margins_max": 0.0053262365981936455, + "rewards/margins_min": 0.000871282652951777, + "rewards/margins_std": 0.0031501282937824726, + "rewards/rejected": -0.0011555805103853345, + "step": 140 + }, + { + "epoch": 0.06, + "grad_norm": 0.267578125, + "learning_rate": 2.840909090909091e-07, + "logits/chosen": -1.3692163228988647, + "logits/rejected": -1.0668622255325317, + "logps/chosen": -197.52001953125, + "logps/rejected": -231.03829956054688, + "loss": 0.6911, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.0020190435461699963, + "rewards/margins": 0.004240790382027626, + "rewards/margins_max": 0.007745341397821903, + "rewards/margins_min": 0.0007362383184954524, + "rewards/margins_std": 0.004956183955073357, + "rewards/rejected": -0.0022217463701963425, + "step": 150 + }, + { + "epoch": 0.06, + "grad_norm": 0.3359375, + "learning_rate": 3.0303030303030305e-07, + "logits/chosen": -1.3538802862167358, + "logits/rejected": -1.2184001207351685, + "logps/chosen": -175.86048889160156, + "logps/rejected": -229.12576293945312, + "loss": 0.6912, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.0017691183602437377, + "rewards/margins": 0.0037867154460400343, + "rewards/margins_max": 0.006128127686679363, + "rewards/margins_min": 0.0014453029725700617, + "rewards/margins_std": 0.0033112571109086275, + "rewards/rejected": -0.0020175972022116184, + "step": 160 + }, + { + "epoch": 0.06, + "grad_norm": 0.439453125, + "learning_rate": 3.2196969696969695e-07, + "logits/chosen": -1.4180030822753906, + "logits/rejected": -1.0590273141860962, + "logps/chosen": -199.34068298339844, + "logps/rejected": -224.4752197265625, + "loss": 0.6906, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": 0.0037923683412373066, + "rewards/margins": 0.005361521150916815, + "rewards/margins_max": 0.008897420018911362, + "rewards/margins_min": 0.0018256225157529116, + "rewards/margins_std": 0.005000515840947628, + "rewards/rejected": -0.0015691530425101519, + "step": 170 + }, + { + "epoch": 0.07, + "grad_norm": 0.42578125, + "learning_rate": 3.4090909090909085e-07, + "logits/chosen": -1.4597827196121216, + "logits/rejected": -1.124783992767334, + "logps/chosen": -210.7503204345703, + "logps/rejected": -233.1248779296875, + "loss": 0.6901, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.003537885844707489, + "rewards/margins": 0.0069324844516813755, + "rewards/margins_max": 0.009725996293127537, + "rewards/margins_min": 0.004138973541557789, + "rewards/margins_std": 0.0039506214670836926, + "rewards/rejected": -0.00339459883980453, + "step": 180 + }, + { + "epoch": 0.07, + "grad_norm": 0.34375, + "learning_rate": 3.5984848484848486e-07, + "logits/chosen": -1.431032419204712, + "logits/rejected": -1.0481399297714233, + "logps/chosen": -221.9123077392578, + "logps/rejected": -223.14517211914062, + "loss": 0.6896, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": 0.002624739659950137, + "rewards/margins": 0.006167866289615631, + "rewards/margins_max": 0.009970493614673615, + "rewards/margins_min": 0.002365240128710866, + "rewards/margins_std": 0.005377725698053837, + "rewards/rejected": -0.0035431268624961376, + "step": 190 + }, + { + "epoch": 0.08, + "grad_norm": 0.310546875, + "learning_rate": 3.7878787878787876e-07, + "logits/chosen": -1.3933680057525635, + "logits/rejected": -1.0791442394256592, + "logps/chosen": -216.5106964111328, + "logps/rejected": -220.7644500732422, + "loss": 0.6894, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": 0.0041386038064956665, + "rewards/margins": 0.00709198322147131, + "rewards/margins_max": 0.011054454371333122, + "rewards/margins_min": 0.0031295125372707844, + "rewards/margins_std": 0.005603780038654804, + "rewards/rejected": -0.002953379647806287, + "step": 200 + }, + { + "epoch": 0.08, + "grad_norm": 0.296875, + "learning_rate": 3.977272727272727e-07, + "logits/chosen": -1.3440656661987305, + "logits/rejected": -1.1712138652801514, + "logps/chosen": -197.05728149414062, + "logps/rejected": -211.1956787109375, + "loss": 0.6893, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 0.005659022368490696, + "rewards/margins": 0.008139314129948616, + "rewards/margins_max": 0.011830927804112434, + "rewards/margins_min": 0.004447699058800936, + "rewards/margins_std": 0.005220732185989618, + "rewards/rejected": -0.0024802912957966328, + "step": 210 + }, + { + "epoch": 0.08, + "grad_norm": 0.28515625, + "learning_rate": 4.1666666666666667e-07, + "logits/chosen": -1.4657204151153564, + "logits/rejected": -1.0755724906921387, + "logps/chosen": -217.37124633789062, + "logps/rejected": -216.32089233398438, + "loss": 0.6889, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.005143741611391306, + "rewards/margins": 0.009008055552840233, + "rewards/margins_max": 0.015119703486561775, + "rewards/margins_min": 0.0028964090161025524, + "rewards/margins_std": 0.008643174543976784, + "rewards/rejected": -0.0038643144071102142, + "step": 220 + }, + { + "epoch": 0.09, + "grad_norm": 0.267578125, + "learning_rate": 4.3560606060606057e-07, + "logits/chosen": -1.480959177017212, + "logits/rejected": -1.1383044719696045, + "logps/chosen": -206.03359985351562, + "logps/rejected": -221.08889770507812, + "loss": 0.6882, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.005945052020251751, + "rewards/margins": 0.009998776949942112, + "rewards/margins_max": 0.013563087210059166, + "rewards/margins_min": 0.006434465758502483, + "rewards/margins_std": 0.0050406972877681255, + "rewards/rejected": -0.004053723998367786, + "step": 230 + }, + { + "epoch": 0.09, + "grad_norm": 0.3046875, + "learning_rate": 4.545454545454545e-07, + "logits/chosen": -1.4084275960922241, + "logits/rejected": -1.198710560798645, + "logps/chosen": -195.56588745117188, + "logps/rejected": -214.75173950195312, + "loss": 0.6881, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.0056480844505131245, + "rewards/margins": 0.010742878541350365, + "rewards/margins_max": 0.016357477754354477, + "rewards/margins_min": 0.005128280725330114, + "rewards/margins_std": 0.007940240204334259, + "rewards/rejected": -0.00509479409083724, + "step": 240 + }, + { + "epoch": 0.09, + "grad_norm": 0.294921875, + "learning_rate": 4.734848484848485e-07, + "logits/chosen": -1.439551830291748, + "logits/rejected": -1.1524641513824463, + "logps/chosen": -189.01382446289062, + "logps/rejected": -229.9130859375, + "loss": 0.6866, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.005949309095740318, + "rewards/margins": 0.012400015257298946, + "rewards/margins_max": 0.018142709508538246, + "rewards/margins_min": 0.006657321937382221, + "rewards/margins_std": 0.008121393620967865, + "rewards/rejected": -0.0064507052302360535, + "step": 250 + }, + { + "epoch": 0.1, + "grad_norm": 0.25390625, + "learning_rate": 4.924242424242424e-07, + "logits/chosen": -1.529597520828247, + "logits/rejected": -1.2286248207092285, + "logps/chosen": -176.6389617919922, + "logps/rejected": -206.6012725830078, + "loss": 0.6865, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.007012097630649805, + "rewards/margins": 0.012566355988383293, + "rewards/margins_max": 0.01809503510594368, + "rewards/margins_min": 0.007037677802145481, + "rewards/margins_std": 0.007818731479346752, + "rewards/rejected": -0.005554257892072201, + "step": 260 + }, + { + "epoch": 0.1, + "grad_norm": 0.298828125, + "learning_rate": 4.999920796099437e-07, + "logits/chosen": -1.4192304611206055, + "logits/rejected": -1.127290964126587, + "logps/chosen": -220.34561157226562, + "logps/rejected": -223.0558624267578, + "loss": 0.6848, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.00964759849011898, + "rewards/margins": 0.01655970886349678, + "rewards/margins_max": 0.023178983479738235, + "rewards/margins_min": 0.0099404351785779, + "rewards/margins_std": 0.009361067786812782, + "rewards/rejected": -0.006912109907716513, + "step": 270 + }, + { + "epoch": 0.11, + "grad_norm": 0.296875, + "learning_rate": 4.999436790436923e-07, + "logits/chosen": -1.4525351524353027, + "logits/rejected": -1.1418185234069824, + "logps/chosen": -198.59756469726562, + "logps/rejected": -224.8717041015625, + "loss": 0.6854, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.006603570189327002, + "rewards/margins": 0.014784199185669422, + "rewards/margins_max": 0.022250749170780182, + "rewards/margins_min": 0.00731764966621995, + "rewards/margins_std": 0.010559295304119587, + "rewards/rejected": -0.008180629462003708, + "step": 280 + }, + { + "epoch": 0.11, + "grad_norm": 0.322265625, + "learning_rate": 4.998512866364003e-07, + "logits/chosen": -1.3750841617584229, + "logits/rejected": -1.1162524223327637, + "logps/chosen": -196.96287536621094, + "logps/rejected": -234.20751953125, + "loss": 0.6844, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.00851917453110218, + "rewards/margins": 0.01636343263089657, + "rewards/margins_max": 0.024043789133429527, + "rewards/margins_min": 0.008683075197041035, + "rewards/margins_std": 0.010861665941774845, + "rewards/rejected": -0.007844258099794388, + "step": 290 + }, + { + "epoch": 0.11, + "grad_norm": 0.244140625, + "learning_rate": 4.997149186497795e-07, + "logits/chosen": -1.3377901315689087, + "logits/rejected": -1.0969083309173584, + "logps/chosen": -211.07064819335938, + "logps/rejected": -216.2685546875, + "loss": 0.6834, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.007954636588692665, + "rewards/margins": 0.01811050996184349, + "rewards/margins_max": 0.0279003344476223, + "rewards/margins_min": 0.008320683613419533, + "rewards/margins_std": 0.013844907283782959, + "rewards/rejected": -0.0101558743044734, + "step": 300 + }, + { + "epoch": 0.12, + "grad_norm": 0.294921875, + "learning_rate": 4.995345990855521e-07, + "logits/chosen": -1.3734376430511475, + "logits/rejected": -1.094660997390747, + "logps/chosen": -209.6073760986328, + "logps/rejected": -222.2782745361328, + "loss": 0.6834, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.010066825896501541, + "rewards/margins": 0.02010076865553856, + "rewards/margins_max": 0.029314884915947914, + "rewards/margins_min": 0.01088665146380663, + "rewards/margins_std": 0.013030730187892914, + "rewards/rejected": -0.010033941827714443, + "step": 310 + }, + { + "epoch": 0.12, + "grad_norm": 0.28515625, + "learning_rate": 4.993103596812268e-07, + "logits/chosen": -1.3820875883102417, + "logits/rejected": -1.1262398958206177, + "logps/chosen": -197.0897674560547, + "logps/rejected": -210.62136840820312, + "loss": 0.6821, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.011968965642154217, + "rewards/margins": 0.024391215294599533, + "rewards/margins_max": 0.03280986472964287, + "rewards/margins_min": 0.015972565859556198, + "rewards/margins_std": 0.011905769817531109, + "rewards/rejected": -0.012422251515090466, + "step": 320 + }, + { + "epoch": 0.13, + "grad_norm": 0.330078125, + "learning_rate": 4.990422399045117e-07, + "logits/chosen": -1.4338971376419067, + "logits/rejected": -1.1740996837615967, + "logps/chosen": -192.77890014648438, + "logps/rejected": -227.9630126953125, + "loss": 0.6817, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.012633833102881908, + "rewards/margins": 0.0226894561201334, + "rewards/margins_max": 0.03381115198135376, + "rewards/margins_min": 0.01156776025891304, + "rewards/margins_std": 0.015728455036878586, + "rewards/rejected": -0.010055623017251492, + "step": 330 + }, + { + "epoch": 0.13, + "grad_norm": 0.291015625, + "learning_rate": 4.987302869463687e-07, + "logits/chosen": -1.3690948486328125, + "logits/rejected": -1.02110755443573, + "logps/chosen": -216.27938842773438, + "logps/rejected": -252.06924438476562, + "loss": 0.6806, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.013316566124558449, + "rewards/margins": 0.026786301285028458, + "rewards/margins_max": 0.03776715695858002, + "rewards/margins_min": 0.015805436298251152, + "rewards/margins_std": 0.01552928052842617, + "rewards/rejected": -0.01346973329782486, + "step": 340 + }, + { + "epoch": 0.13, + "grad_norm": 0.3515625, + "learning_rate": 4.98374555712707e-07, + "logits/chosen": -1.3813257217407227, + "logits/rejected": -1.0808042287826538, + "logps/chosen": -202.20233154296875, + "logps/rejected": -234.57485961914062, + "loss": 0.6797, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.012134606018662453, + "rewards/margins": 0.031434256583452225, + "rewards/margins_max": 0.04257182776927948, + "rewards/margins_min": 0.020296679809689522, + "rewards/margins_std": 0.015750911086797714, + "rewards/rejected": -0.019299646839499474, + "step": 350 + }, + { + "epoch": 0.14, + "grad_norm": 0.29296875, + "learning_rate": 4.979751088147191e-07, + "logits/chosen": -1.3914369344711304, + "logits/rejected": -1.2104722261428833, + "logps/chosen": -229.1161651611328, + "logps/rejected": -233.09030151367188, + "loss": 0.6786, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.012462800368666649, + "rewards/margins": 0.027766436338424683, + "rewards/margins_max": 0.03918435052037239, + "rewards/margins_min": 0.016348522156476974, + "rewards/margins_std": 0.016147367656230927, + "rewards/rejected": -0.015303634107112885, + "step": 360 + }, + { + "epoch": 0.14, + "grad_norm": 0.2734375, + "learning_rate": 4.97532016557862e-07, + "logits/chosen": -1.4021486043930054, + "logits/rejected": -1.1357289552688599, + "logps/chosen": -194.69229125976562, + "logps/rejected": -206.661865234375, + "loss": 0.6797, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.010312230326235294, + "rewards/margins": 0.025726106017827988, + "rewards/margins_max": 0.036852724850177765, + "rewards/margins_min": 0.01459948904812336, + "rewards/margins_std": 0.015735412016510963, + "rewards/rejected": -0.015413874760270119, + "step": 370 + }, + { + "epoch": 0.14, + "grad_norm": 0.326171875, + "learning_rate": 4.970453569294811e-07, + "logits/chosen": -1.336107611656189, + "logits/rejected": -1.0358856916427612, + "logps/chosen": -199.49639892578125, + "logps/rejected": -212.7289276123047, + "loss": 0.6787, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.013159600086510181, + "rewards/margins": 0.030392413958907127, + "rewards/margins_max": 0.04219727963209152, + "rewards/margins_min": 0.018587548285722733, + "rewards/margins_std": 0.016694601625204086, + "rewards/rejected": -0.01723281480371952, + "step": 380 + }, + { + "epoch": 0.15, + "grad_norm": 0.25390625, + "learning_rate": 4.965152155850854e-07, + "logits/chosen": -1.447141170501709, + "logits/rejected": -1.1050375699996948, + "logps/chosen": -208.603271484375, + "logps/rejected": -216.28231811523438, + "loss": 0.6774, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.012095400132238865, + "rewards/margins": 0.02805733121931553, + "rewards/margins_max": 0.043057795614004135, + "rewards/margins_min": 0.013056864961981773, + "rewards/margins_std": 0.02121386118233204, + "rewards/rejected": -0.01596193201839924, + "step": 390 + }, + { + "epoch": 0.15, + "grad_norm": 0.26171875, + "learning_rate": 4.959416858332709e-07, + "logits/chosen": -1.3536365032196045, + "logits/rejected": -1.0935043096542358, + "logps/chosen": -178.9142608642578, + "logps/rejected": -206.5353546142578, + "loss": 0.6742, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.015787001699209213, + "rewards/margins": 0.03439975157380104, + "rewards/margins_max": 0.047203429043293, + "rewards/margins_min": 0.021596072241663933, + "rewards/margins_std": 0.018107129260897636, + "rewards/rejected": -0.01861274614930153, + "step": 400 + }, + { + "epoch": 0.16, + "grad_norm": 0.2734375, + "learning_rate": 4.953248686192974e-07, + "logits/chosen": -1.404972791671753, + "logits/rejected": -1.0352896451950073, + "logps/chosen": -212.09304809570312, + "logps/rejected": -224.814697265625, + "loss": 0.6772, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.014694595709443092, + "rewards/margins": 0.032047249376773834, + "rewards/margins_max": 0.04682258516550064, + "rewards/margins_min": 0.01727190613746643, + "rewards/margins_std": 0.020895490422844887, + "rewards/rejected": -0.017352653667330742, + "step": 410 + }, + { + "epoch": 0.16, + "grad_norm": 0.294921875, + "learning_rate": 4.946648725073222e-07, + "logits/chosen": -1.4838746786117554, + "logits/rejected": -1.2245352268218994, + "logps/chosen": -212.3232421875, + "logps/rejected": -218.89013671875, + "loss": 0.6757, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.016488736495375633, + "rewards/margins": 0.03431572765111923, + "rewards/margins_max": 0.04685080051422119, + "rewards/margins_min": 0.02178065851330757, + "rewards/margins_std": 0.017727266997098923, + "rewards/rejected": -0.017826993018388748, + "step": 420 + }, + { + "epoch": 0.16, + "grad_norm": 0.29296875, + "learning_rate": 4.93961813661291e-07, + "logits/chosen": -1.5144537687301636, + "logits/rejected": -1.2026809453964233, + "logps/chosen": -191.7454071044922, + "logps/rejected": -203.25830078125, + "loss": 0.676, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.014662086963653564, + "rewards/margins": 0.03749576956033707, + "rewards/margins_max": 0.05218175798654556, + "rewards/margins_min": 0.022809788584709167, + "rewards/margins_std": 0.020769115537405014, + "rewards/rejected": -0.022833682596683502, + "step": 430 + }, + { + "epoch": 0.17, + "grad_norm": 0.31640625, + "learning_rate": 4.932158158244936e-07, + "logits/chosen": -1.460267186164856, + "logits/rejected": -1.102550745010376, + "logps/chosen": -219.0205841064453, + "logps/rejected": -230.55673217773438, + "loss": 0.6748, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.01621498540043831, + "rewards/margins": 0.03951702266931534, + "rewards/margins_max": 0.05593367666006088, + "rewards/margins_min": 0.023100370541214943, + "rewards/margins_std": 0.023216653615236282, + "rewards/rejected": -0.02330203540623188, + "step": 440 + }, + { + "epoch": 0.17, + "grad_norm": 0.287109375, + "learning_rate": 4.924270102977827e-07, + "logits/chosen": -1.4851996898651123, + "logits/rejected": -1.1021556854248047, + "logps/chosen": -212.36428833007812, + "logps/rejected": -234.95278930664062, + "loss": 0.6729, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.01592712476849556, + "rewards/margins": 0.040354181081056595, + "rewards/margins_max": 0.05639289692044258, + "rewards/margins_min": 0.02431546524167061, + "rewards/margins_std": 0.022682171314954758, + "rewards/rejected": -0.024427054449915886, + "step": 450 + }, + { + "epoch": 0.17, + "grad_norm": 0.298828125, + "learning_rate": 4.915955359164651e-07, + "logits/chosen": -1.4281965494155884, + "logits/rejected": -1.1299288272857666, + "logps/chosen": -203.64056396484375, + "logps/rejected": -209.16561889648438, + "loss": 0.6725, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.014464335516095161, + "rewards/margins": 0.03879351541399956, + "rewards/margins_max": 0.05169098451733589, + "rewards/margins_min": 0.025896048173308372, + "rewards/margins_std": 0.01823977194726467, + "rewards/rejected": -0.024329179897904396, + "step": 460 + }, + { + "epoch": 0.18, + "grad_norm": 0.3203125, + "learning_rate": 4.907215390258652e-07, + "logits/chosen": -1.3715511560440063, + "logits/rejected": -1.1071712970733643, + "logps/chosen": -186.81532287597656, + "logps/rejected": -208.74130249023438, + "loss": 0.6714, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.01985536888241768, + "rewards/margins": 0.04831348732113838, + "rewards/margins_max": 0.07152094691991806, + "rewards/margins_min": 0.025106023997068405, + "rewards/margins_std": 0.03282030671834946, + "rewards/rejected": -0.028458122164011, + "step": 470 + }, + { + "epoch": 0.18, + "grad_norm": 0.294921875, + "learning_rate": 4.898051734555674e-07, + "logits/chosen": -1.520582675933838, + "logits/rejected": -1.1530619859695435, + "logps/chosen": -226.6740264892578, + "logps/rejected": -220.93807983398438, + "loss": 0.6713, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.01769147627055645, + "rewards/margins": 0.04557216167449951, + "rewards/margins_max": 0.06148766726255417, + "rewards/margins_min": 0.029656657949090004, + "rewards/margins_std": 0.0225079283118248, + "rewards/rejected": -0.02788068726658821, + "step": 480 + }, + { + "epoch": 0.19, + "grad_norm": 0.349609375, + "learning_rate": 4.888466004923412e-07, + "logits/chosen": -1.4570837020874023, + "logits/rejected": -1.154343605041504, + "logps/chosen": -241.1246795654297, + "logps/rejected": -217.9856719970703, + "loss": 0.6717, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.01621456816792488, + "rewards/margins": 0.039181966334581375, + "rewards/margins_max": 0.056494325399398804, + "rewards/margins_min": 0.021869609132409096, + "rewards/margins_std": 0.024483371526002884, + "rewards/rejected": -0.022967400029301643, + "step": 490 + }, + { + "epoch": 0.19, + "grad_norm": 0.322265625, + "learning_rate": 4.878459888517532e-07, + "logits/chosen": -1.3344757556915283, + "logits/rejected": -1.1008737087249756, + "logps/chosen": -204.79550170898438, + "logps/rejected": -226.4160614013672, + "loss": 0.6695, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.01746196672320366, + "rewards/margins": 0.0481671541929245, + "rewards/margins_max": 0.06977338343858719, + "rewards/margins_min": 0.02656092867255211, + "rewards/margins_std": 0.03055582009255886, + "rewards/rejected": -0.03070518746972084, + "step": 500 + }, + { + "epoch": 0.19, + "grad_norm": 0.296875, + "learning_rate": 4.86803514648472e-07, + "logits/chosen": -1.5273072719573975, + "logits/rejected": -1.1597423553466797, + "logps/chosen": -220.5069580078125, + "logps/rejected": -226.74734497070312, + "loss": 0.6702, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.02024765871465206, + "rewards/margins": 0.042053740471601486, + "rewards/margins_max": 0.062348585575819016, + "rewards/margins_min": 0.02175888977944851, + "rewards/margins_std": 0.02870125137269497, + "rewards/rejected": -0.021806079894304276, + "step": 510 + }, + { + "epoch": 0.2, + "grad_norm": 0.326171875, + "learning_rate": 4.85719361365271e-07, + "logits/chosen": -1.3967102766036987, + "logits/rejected": -1.3165340423583984, + "logps/chosen": -196.89349365234375, + "logps/rejected": -268.110595703125, + "loss": 0.6671, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.015946678817272186, + "rewards/margins": 0.05510186403989792, + "rewards/margins_max": 0.07911844551563263, + "rewards/margins_min": 0.03108527697622776, + "rewards/margins_std": 0.03396458178758621, + "rewards/rejected": -0.03915518522262573, + "step": 520 + }, + { + "epoch": 0.2, + "grad_norm": 0.306640625, + "learning_rate": 4.845937198207342e-07, + "logits/chosen": -1.3531572818756104, + "logits/rejected": -1.0980560779571533, + "logps/chosen": -193.45533752441406, + "logps/rejected": -219.8807830810547, + "loss": 0.6693, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.020253274589776993, + "rewards/margins": 0.052356742322444916, + "rewards/margins_max": 0.07621929049491882, + "rewards/margins_min": 0.02849418856203556, + "rewards/margins_std": 0.03374674171209335, + "rewards/rejected": -0.03210346773266792, + "step": 530 + }, + { + "epoch": 0.21, + "grad_norm": 0.28515625, + "learning_rate": 4.834267881356707e-07, + "logits/chosen": -1.4308512210845947, + "logits/rejected": -1.106432557106018, + "logps/chosen": -192.52944946289062, + "logps/rejected": -235.0094757080078, + "loss": 0.6655, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.0184812992811203, + "rewards/margins": 0.05686504766345024, + "rewards/margins_max": 0.08549021184444427, + "rewards/margins_min": 0.028239887207746506, + "rewards/margins_std": 0.04048209637403488, + "rewards/rejected": -0.03838375210762024, + "step": 540 + }, + { + "epoch": 0.21, + "grad_norm": 0.318359375, + "learning_rate": 4.822187716982439e-07, + "logits/chosen": -1.4813454151153564, + "logits/rejected": -1.1552975177764893, + "logps/chosen": -209.415283203125, + "logps/rejected": -203.3141632080078, + "loss": 0.6691, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.015650704503059387, + "rewards/margins": 0.052040062844753265, + "rewards/margins_max": 0.07295672595500946, + "rewards/margins_min": 0.031123405322432518, + "rewards/margins_std": 0.0295806173235178, + "rewards/rejected": -0.03638935834169388, + "step": 550 + }, + { + "epoch": 0.21, + "grad_norm": 0.271484375, + "learning_rate": 4.809698831278217e-07, + "logits/chosen": -1.3834329843521118, + "logits/rejected": -1.0720294713974, + "logps/chosen": -212.3925018310547, + "logps/rejected": -222.80795288085938, + "loss": 0.6658, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.02054109051823616, + "rewards/margins": 0.05672860145568848, + "rewards/margins_max": 0.07977689057588577, + "rewards/margins_min": 0.033680304884910583, + "rewards/margins_std": 0.032595209777355194, + "rewards/rejected": -0.03618750721216202, + "step": 560 + }, + { + "epoch": 0.22, + "grad_norm": 0.294921875, + "learning_rate": 4.796803422375544e-07, + "logits/chosen": -1.4147670269012451, + "logits/rejected": -1.0929642915725708, + "logps/chosen": -210.1922607421875, + "logps/rejected": -212.7117462158203, + "loss": 0.6672, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.018465718254446983, + "rewards/margins": 0.04878971725702286, + "rewards/margins_max": 0.06830336898565292, + "rewards/margins_min": 0.029276061803102493, + "rewards/margins_std": 0.027596473693847656, + "rewards/rejected": -0.030323997139930725, + "step": 570 + }, + { + "epoch": 0.22, + "grad_norm": 0.26953125, + "learning_rate": 4.783503759956858e-07, + "logits/chosen": -1.4458619356155396, + "logits/rejected": -1.0822944641113281, + "logps/chosen": -219.31643676757812, + "logps/rejected": -230.5920867919922, + "loss": 0.6654, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.01647743210196495, + "rewards/margins": 0.058020107448101044, + "rewards/margins_max": 0.08135350048542023, + "rewards/margins_min": 0.03468669205904007, + "rewards/margins_std": 0.03299842029809952, + "rewards/rejected": -0.041542667895555496, + "step": 580 + }, + { + "epoch": 0.22, + "grad_norm": 0.330078125, + "learning_rate": 4.769802184856049e-07, + "logits/chosen": -1.3940141201019287, + "logits/rejected": -1.0562171936035156, + "logps/chosen": -200.2158660888672, + "logps/rejected": -233.1265106201172, + "loss": 0.6672, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.020147614181041718, + "rewards/margins": 0.05223570391535759, + "rewards/margins_max": 0.07756931334733963, + "rewards/margins_min": 0.026902100071310997, + "rewards/margins_std": 0.03582713380455971, + "rewards/rejected": -0.03208809345960617, + "step": 590 + }, + { + "epoch": 0.23, + "grad_norm": 0.3046875, + "learning_rate": 4.7557011086464625e-07, + "logits/chosen": -1.4117199182510376, + "logits/rejected": -1.1517552137374878, + "logps/chosen": -195.2522735595703, + "logps/rejected": -215.0863800048828, + "loss": 0.6662, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.022823641076683998, + "rewards/margins": 0.05536242574453354, + "rewards/margins_max": 0.07616210728883743, + "rewards/margins_min": 0.03456275910139084, + "rewards/margins_std": 0.029415175318717957, + "rewards/rejected": -0.03253878653049469, + "step": 600 + }, + { + "epoch": 0.23, + "grad_norm": 0.322265625, + "learning_rate": 4.74120301321644e-07, + "logits/chosen": -1.4161319732666016, + "logits/rejected": -1.0721873044967651, + "logps/chosen": -213.9852752685547, + "logps/rejected": -222.2463836669922, + "loss": 0.6634, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.020179515704512596, + "rewards/margins": 0.06596876680850983, + "rewards/margins_max": 0.08490391820669174, + "rewards/margins_min": 0.047033630311489105, + "rewards/margins_std": 0.0267783310264349, + "rewards/rejected": -0.04578925296664238, + "step": 610 + }, + { + "epoch": 0.24, + "grad_norm": 0.279296875, + "learning_rate": 4.7263104503324927e-07, + "logits/chosen": -1.4241613149642944, + "logits/rejected": -1.0511661767959595, + "logps/chosen": -229.7281036376953, + "logps/rejected": -221.2598419189453, + "loss": 0.6662, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.022109119221568108, + "rewards/margins": 0.05698208883404732, + "rewards/margins_max": 0.08112742006778717, + "rewards/margins_min": 0.03283676132559776, + "rewards/margins_std": 0.03414664790034294, + "rewards/rejected": -0.03487296774983406, + "step": 620 + }, + { + "epoch": 0.24, + "grad_norm": 0.26953125, + "learning_rate": 4.711026041190167e-07, + "logits/chosen": -1.4864065647125244, + "logits/rejected": -1.1408494710922241, + "logps/chosen": -190.9454345703125, + "logps/rejected": -185.9245147705078, + "loss": 0.6632, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.024640116840600967, + "rewards/margins": 0.0617794506251812, + "rewards/margins_max": 0.08624964207410812, + "rewards/margins_min": 0.03730924800038338, + "rewards/margins_std": 0.03460608795285225, + "rewards/rejected": -0.03713933378458023, + "step": 630 + }, + { + "epoch": 0.24, + "grad_norm": 0.318359375, + "learning_rate": 4.6953524759527053e-07, + "logits/chosen": -1.3665571212768555, + "logits/rejected": -1.0415217876434326, + "logps/chosen": -206.030029296875, + "logps/rejected": -218.22738647460938, + "loss": 0.6626, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.02321634814143181, + "rewards/margins": 0.06289292871952057, + "rewards/margins_max": 0.08443962037563324, + "rewards/margins_min": 0.041346240788698196, + "rewards/margins_std": 0.030471617355942726, + "rewards/rejected": -0.03967657685279846, + "step": 640 + }, + { + "epoch": 0.25, + "grad_norm": 0.35546875, + "learning_rate": 4.6792925132775486e-07, + "logits/chosen": -1.3912900686264038, + "logits/rejected": -1.2024117708206177, + "logps/chosen": -189.44032287597656, + "logps/rejected": -203.79727172851562, + "loss": 0.6643, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.0144128929823637, + "rewards/margins": 0.05512215569615364, + "rewards/margins_max": 0.08231507241725922, + "rewards/margins_min": 0.02792922779917717, + "rewards/margins_std": 0.03845660015940666, + "rewards/rejected": -0.04070926457643509, + "step": 650 + }, + { + "epoch": 0.25, + "grad_norm": 0.3515625, + "learning_rate": 4.6628489798308004e-07, + "logits/chosen": -1.3954074382781982, + "logits/rejected": -1.1058709621429443, + "logps/chosen": -217.58493041992188, + "logps/rejected": -220.1082000732422, + "loss": 0.6622, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.015280501917004585, + "rewards/margins": 0.058963824063539505, + "rewards/margins_max": 0.0854538083076477, + "rewards/margins_min": 0.032473836094141006, + "rewards/margins_std": 0.037462495267391205, + "rewards/rejected": -0.04368331655859947, + "step": 660 + }, + { + "epoch": 0.25, + "grad_norm": 0.3046875, + "learning_rate": 4.64602476978971e-07, + "logits/chosen": -1.3883717060089111, + "logits/rejected": -1.0562824010849, + "logps/chosen": -209.17697143554688, + "logps/rejected": -229.61672973632812, + "loss": 0.6604, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.018770989030599594, + "rewards/margins": 0.0635194405913353, + "rewards/margins_max": 0.09025418758392334, + "rewards/margins_min": 0.03678469732403755, + "rewards/margins_std": 0.03780863806605339, + "rewards/rejected": -0.044748447835445404, + "step": 670 + }, + { + "epoch": 0.26, + "grad_norm": 0.306640625, + "learning_rate": 4.6288228443332776e-07, + "logits/chosen": -1.4112727642059326, + "logits/rejected": -1.1717571020126343, + "logps/chosen": -183.66964721679688, + "logps/rejected": -186.0308380126953, + "loss": 0.6619, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.01850176230072975, + "rewards/margins": 0.05745549127459526, + "rewards/margins_max": 0.08195947110652924, + "rewards/margins_min": 0.03295152261853218, + "rewards/margins_std": 0.034653857350349426, + "rewards/rejected": -0.038953740149736404, + "step": 680 + }, + { + "epoch": 0.26, + "grad_norm": 0.306640625, + "learning_rate": 4.6112462311210685e-07, + "logits/chosen": -1.3851698637008667, + "logits/rejected": -0.9565486907958984, + "logps/chosen": -208.18746948242188, + "logps/rejected": -205.7491455078125, + "loss": 0.6615, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.021183792501688004, + "rewards/margins": 0.06673257052898407, + "rewards/margins_max": 0.09535098820924759, + "rewards/margins_min": 0.038114141672849655, + "rewards/margins_std": 0.04047255963087082, + "rewards/rejected": -0.04554877430200577, + "step": 690 + }, + { + "epoch": 0.27, + "grad_norm": 0.2890625, + "learning_rate": 4.593298023760319e-07, + "logits/chosen": -1.4321620464324951, + "logits/rejected": -1.0346721410751343, + "logps/chosen": -235.98095703125, + "logps/rejected": -261.7032165527344, + "loss": 0.6602, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.020307691767811775, + "rewards/margins": 0.06724603474140167, + "rewards/margins_max": 0.09406231343746185, + "rewards/margins_min": 0.040429744869470596, + "rewards/margins_std": 0.03792395070195198, + "rewards/rejected": -0.04693833738565445, + "step": 700 + }, + { + "epoch": 0.27, + "grad_norm": 0.279296875, + "learning_rate": 4.5749813812614447e-07, + "logits/chosen": -1.5499064922332764, + "logits/rejected": -1.1074297428131104, + "logps/chosen": -237.53274536132812, + "logps/rejected": -226.1111297607422, + "loss": 0.6626, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.020518099889159203, + "rewards/margins": 0.05978738144040108, + "rewards/margins_max": 0.08547325432300568, + "rewards/margins_min": 0.034101493656635284, + "rewards/margins_std": 0.036325328052043915, + "rewards/rejected": -0.03926927596330643, + "step": 710 + }, + { + "epoch": 0.27, + "grad_norm": 0.302734375, + "learning_rate": 4.5562995274820283e-07, + "logits/chosen": -1.4489339590072632, + "logits/rejected": -1.154266595840454, + "logps/chosen": -205.8553466796875, + "logps/rejected": -199.05389404296875, + "loss": 0.6577, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.019976060837507248, + "rewards/margins": 0.07855883985757828, + "rewards/margins_max": 0.11712227761745453, + "rewards/margins_min": 0.039995402097702026, + "rewards/margins_std": 0.05453693866729736, + "rewards/rejected": -0.05858277156949043, + "step": 720 + }, + { + "epoch": 0.28, + "grad_norm": 0.31640625, + "learning_rate": 4.5372557505594024e-07, + "logits/chosen": -1.4426929950714111, + "logits/rejected": -1.1371030807495117, + "logps/chosen": -228.83718872070312, + "logps/rejected": -260.3121032714844, + "loss": 0.6573, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.024684404954314232, + "rewards/margins": 0.07492948323488235, + "rewards/margins_max": 0.1032097116112709, + "rewards/margins_min": 0.04664924368262291, + "rewards/margins_std": 0.039994291961193085, + "rewards/rejected": -0.050245076417922974, + "step": 730 + }, + { + "epoch": 0.28, + "grad_norm": 0.263671875, + "learning_rate": 4.517853402331909e-07, + "logits/chosen": -1.5174484252929688, + "logits/rejected": -1.0977634191513062, + "logps/chosen": -199.60836791992188, + "logps/rejected": -209.7433319091797, + "loss": 0.6595, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.026398539543151855, + "rewards/margins": 0.07308533042669296, + "rewards/margins_max": 0.10156605392694473, + "rewards/margins_min": 0.04460462927818298, + "rewards/margins_std": 0.04027780145406723, + "rewards/rejected": -0.046686798334121704, + "step": 740 + }, + { + "epoch": 0.28, + "grad_norm": 0.2890625, + "learning_rate": 4.4980958977489594e-07, + "logits/chosen": -1.416208267211914, + "logits/rejected": -1.0870482921600342, + "logps/chosen": -216.3725128173828, + "logps/rejected": -215.8811492919922, + "loss": 0.652, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.020642448216676712, + "rewards/margins": 0.07765364646911621, + "rewards/margins_max": 0.11166071891784668, + "rewards/margins_min": 0.043646566569805145, + "rewards/margins_std": 0.0480932779610157, + "rewards/rejected": -0.0570111945271492, + "step": 750 + }, + { + "epoch": 0.29, + "grad_norm": 0.2734375, + "learning_rate": 4.477986714269971e-07, + "logits/chosen": -1.4371469020843506, + "logits/rejected": -1.1657259464263916, + "logps/chosen": -183.5517120361328, + "logps/rejected": -202.76528930664062, + "loss": 0.6589, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.022578328847885132, + "rewards/margins": 0.06701181083917618, + "rewards/margins_max": 0.09141481667757034, + "rewards/margins_min": 0.04260881245136261, + "rewards/margins_std": 0.03451105207204819, + "rewards/rejected": -0.044433485716581345, + "step": 760 + }, + { + "epoch": 0.29, + "grad_norm": 0.26953125, + "learning_rate": 4.457529391252317e-07, + "logits/chosen": -1.4765751361846924, + "logits/rejected": -1.1041843891143799, + "logps/chosen": -217.7615966796875, + "logps/rejected": -216.8796844482422, + "loss": 0.6584, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.021331623196601868, + "rewards/margins": 0.0649794489145279, + "rewards/margins_max": 0.09212762117385864, + "rewards/margins_min": 0.03783128783106804, + "rewards/margins_std": 0.0383933000266552, + "rewards/rejected": -0.043647829443216324, + "step": 770 + }, + { + "epoch": 0.3, + "grad_norm": 0.259765625, + "learning_rate": 4.43672752932837e-07, + "logits/chosen": -1.432063102722168, + "logits/rejected": -1.1789019107818604, + "logps/chosen": -194.9161834716797, + "logps/rejected": -193.73593139648438, + "loss": 0.6625, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.02206316404044628, + "rewards/margins": 0.06179576367139816, + "rewards/margins_max": 0.0931532233953476, + "rewards/margins_min": 0.030438298359513283, + "rewards/margins_std": 0.0443461537361145, + "rewards/rejected": -0.03973260149359703, + "step": 780 + }, + { + "epoch": 0.3, + "grad_norm": 0.296875, + "learning_rate": 4.415584789771769e-07, + "logits/chosen": -1.3746683597564697, + "logits/rejected": -1.1170393228530884, + "logps/chosen": -192.1763153076172, + "logps/rejected": -205.6260223388672, + "loss": 0.661, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.025766903534531593, + "rewards/margins": 0.0719086080789566, + "rewards/margins_max": 0.09772966802120209, + "rewards/margins_min": 0.04608756676316261, + "rewards/margins_std": 0.0365164689719677, + "rewards/rejected": -0.04614170640707016, + "step": 790 + }, + { + "epoch": 0.3, + "grad_norm": 0.279296875, + "learning_rate": 4.394104893853007e-07, + "logits/chosen": -1.4416863918304443, + "logits/rejected": -1.0908607244491577, + "logps/chosen": -219.8855743408203, + "logps/rejected": -217.4703826904297, + "loss": 0.6541, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.02542579174041748, + "rewards/margins": 0.08650486171245575, + "rewards/margins_max": 0.1185513511300087, + "rewards/margins_min": 0.0544583797454834, + "rewards/margins_std": 0.04532057046890259, + "rewards/rejected": -0.06107907369732857, + "step": 800 + }, + { + "epoch": 0.31, + "grad_norm": 0.3203125, + "learning_rate": 4.3722916221844613e-07, + "logits/chosen": -1.3401496410369873, + "logits/rejected": -0.9828430414199829, + "logps/chosen": -213.7498779296875, + "logps/rejected": -208.93325805664062, + "loss": 0.6557, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.028565192595124245, + "rewards/margins": 0.08439178764820099, + "rewards/margins_max": 0.11834411323070526, + "rewards/margins_min": 0.05043945461511612, + "rewards/margins_std": 0.04801584780216217, + "rewards/rejected": -0.055826593190431595, + "step": 810 + }, + { + "epoch": 0.31, + "grad_norm": 0.349609375, + "learning_rate": 4.350148814054982e-07, + "logits/chosen": -1.3076943159103394, + "logits/rejected": -1.0068638324737549, + "logps/chosen": -224.6240997314453, + "logps/rejected": -238.9586181640625, + "loss": 0.6556, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.01850820705294609, + "rewards/margins": 0.0732334703207016, + "rewards/margins_max": 0.11072257906198502, + "rewards/margins_min": 0.03574436157941818, + "rewards/margins_std": 0.05301760509610176, + "rewards/rejected": -0.05472525954246521, + "step": 820 + }, + { + "epoch": 0.32, + "grad_norm": 0.2890625, + "learning_rate": 4.327680366754146e-07, + "logits/chosen": -1.356018304824829, + "logits/rejected": -1.0702496767044067, + "logps/chosen": -211.2010040283203, + "logps/rejected": -209.5356903076172, + "loss": 0.6569, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.02133285626769066, + "rewards/margins": 0.06990201771259308, + "rewards/margins_max": 0.10147367417812347, + "rewards/margins_min": 0.03833036497235298, + "rewards/margins_std": 0.04464906454086304, + "rewards/rejected": -0.04856916517019272, + "step": 830 + }, + { + "epoch": 0.32, + "grad_norm": 0.318359375, + "learning_rate": 4.3048902348863106e-07, + "logits/chosen": -1.469026803970337, + "logits/rejected": -1.1089591979980469, + "logps/chosen": -224.88095092773438, + "logps/rejected": -225.6546173095703, + "loss": 0.6554, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.025625556707382202, + "rewards/margins": 0.0845932811498642, + "rewards/margins_max": 0.11639096587896347, + "rewards/margins_min": 0.05279559642076492, + "rewards/margins_std": 0.04496871680021286, + "rewards/rejected": -0.05896772816777229, + "step": 840 + }, + { + "epoch": 0.32, + "grad_norm": 0.326171875, + "learning_rate": 4.2817824296745736e-07, + "logits/chosen": -1.410362958908081, + "logits/rejected": -1.0984877347946167, + "logps/chosen": -205.9351348876953, + "logps/rejected": -216.6947021484375, + "loss": 0.658, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.022886861115694046, + "rewards/margins": 0.0620783269405365, + "rewards/margins_max": 0.09115530550479889, + "rewards/margins_min": 0.03300134092569351, + "rewards/margins_std": 0.04112106189131737, + "rewards/rejected": -0.039191462099552155, + "step": 850 + }, + { + "epoch": 0.33, + "grad_norm": 0.33203125, + "learning_rate": 4.258361018254769e-07, + "logits/chosen": -1.465014934539795, + "logits/rejected": -1.1755095720291138, + "logps/chosen": -192.93734741210938, + "logps/rejected": -219.0324249267578, + "loss": 0.6532, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.025163287296891212, + "rewards/margins": 0.0787445530295372, + "rewards/margins_max": 0.11643560230731964, + "rewards/margins_min": 0.041053496301174164, + "rewards/margins_std": 0.05330319330096245, + "rewards/rejected": -0.05358126014471054, + "step": 860 + }, + { + "epoch": 0.33, + "grad_norm": 0.28515625, + "learning_rate": 4.234630122959625e-07, + "logits/chosen": -1.5218905210494995, + "logits/rejected": -1.1900856494903564, + "logps/chosen": -195.29864501953125, + "logps/rejected": -254.084716796875, + "loss": 0.6557, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.021066388115286827, + "rewards/margins": 0.0712776631116867, + "rewards/margins_max": 0.09616719186306, + "rewards/margins_min": 0.046388134360313416, + "rewards/margins_std": 0.0351991131901741, + "rewards/rejected": -0.05021127313375473, + "step": 870 + }, + { + "epoch": 0.33, + "grad_norm": 0.703125, + "learning_rate": 4.2105939205932005e-07, + "logits/chosen": -1.3866618871688843, + "logits/rejected": -1.1061947345733643, + "logps/chosen": -197.81729125976562, + "logps/rejected": -253.219970703125, + "loss": 0.6515, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.029097964987158775, + "rewards/margins": 0.08818992227315903, + "rewards/margins_max": 0.12071744352579117, + "rewards/margins_min": 0.055662404745817184, + "rewards/margins_std": 0.04600085690617561, + "rewards/rejected": -0.0590919628739357, + "step": 880 + }, + { + "epoch": 0.34, + "grad_norm": 0.34765625, + "learning_rate": 4.1862566416957444e-07, + "logits/chosen": -1.5151255130767822, + "logits/rejected": -1.148018479347229, + "logps/chosen": -202.05813598632812, + "logps/rejected": -222.6027069091797, + "loss": 0.6562, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.019906148314476013, + "rewards/margins": 0.07728614658117294, + "rewards/margins_max": 0.10957305133342743, + "rewards/margins_min": 0.044999223202466965, + "rewards/margins_std": 0.0456605963408947, + "rewards/rejected": -0.05737999826669693, + "step": 890 + }, + { + "epoch": 0.34, + "grad_norm": 0.306640625, + "learning_rate": 4.161622569799085e-07, + "logits/chosen": -1.3949564695358276, + "logits/rejected": -1.0818382501602173, + "logps/chosen": -195.73452758789062, + "logps/rejected": -196.80213928222656, + "loss": 0.6541, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.021286411210894585, + "rewards/margins": 0.07778953015804291, + "rewards/margins_max": 0.10992787778377533, + "rewards/margins_min": 0.04565117880702019, + "rewards/margins_std": 0.04545048624277115, + "rewards/rejected": -0.056503117084503174, + "step": 900 + }, + { + "epoch": 0.35, + "grad_norm": 0.267578125, + "learning_rate": 4.136696040672702e-07, + "logits/chosen": -1.3432233333587646, + "logits/rejected": -0.9210837483406067, + "logps/chosen": -227.01687622070312, + "logps/rejected": -276.3375549316406, + "loss": 0.6503, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.02953980304300785, + "rewards/margins": 0.09690040349960327, + "rewards/margins_max": 0.13372206687927246, + "rewards/margins_min": 0.06007874757051468, + "rewards/margins_std": 0.052073679864406586, + "rewards/rejected": -0.06736060231924057, + "step": 910 + }, + { + "epoch": 0.35, + "grad_norm": 0.3125, + "learning_rate": 4.1114814415605975e-07, + "logits/chosen": -1.39051353931427, + "logits/rejected": -1.1195346117019653, + "logps/chosen": -172.82054138183594, + "logps/rejected": -214.9634552001953, + "loss": 0.6491, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.025750596076250076, + "rewards/margins": 0.08932848274707794, + "rewards/margins_max": 0.12272658199071884, + "rewards/margins_min": 0.05593038722872734, + "rewards/margins_std": 0.04723203927278519, + "rewards/rejected": -0.06357789039611816, + "step": 920 + }, + { + "epoch": 0.35, + "grad_norm": 0.33984375, + "learning_rate": 4.0859832104091136e-07, + "logits/chosen": -1.4840190410614014, + "logits/rejected": -1.109261155128479, + "logps/chosen": -211.09658813476562, + "logps/rejected": -212.0129852294922, + "loss": 0.6542, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.0225338414311409, + "rewards/margins": 0.0804629772901535, + "rewards/margins_max": 0.1128251925110817, + "rewards/margins_min": 0.048100754618644714, + "rewards/margins_std": 0.04576708376407623, + "rewards/rejected": -0.057929135859012604, + "step": 930 + }, + { + "epoch": 0.36, + "grad_norm": 0.283203125, + "learning_rate": 4.060205835085821e-07, + "logits/chosen": -1.4287742376327515, + "logits/rejected": -1.0760209560394287, + "logps/chosen": -216.3826904296875, + "logps/rejected": -228.55966186523438, + "loss": 0.6575, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.02253951132297516, + "rewards/margins": 0.0758446678519249, + "rewards/margins_max": 0.104688860476017, + "rewards/margins_min": 0.0470004640519619, + "rewards/margins_std": 0.04079186171293259, + "rewards/rejected": -0.05330515652894974, + "step": 940 + }, + { + "epoch": 0.36, + "grad_norm": 0.35546875, + "learning_rate": 4.034153852589623e-07, + "logits/chosen": -1.490159273147583, + "logits/rejected": -1.1104148626327515, + "logps/chosen": -192.82733154296875, + "logps/rejected": -206.4297637939453, + "loss": 0.6549, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.02006385289132595, + "rewards/margins": 0.07609236240386963, + "rewards/margins_max": 0.112637460231781, + "rewards/margins_min": 0.03954725340008736, + "rewards/margins_std": 0.05168258026242256, + "rewards/rejected": -0.056028496474027634, + "step": 950 + }, + { + "epoch": 0.36, + "grad_norm": 0.33984375, + "learning_rate": 4.0078318482522114e-07, + "logits/chosen": -1.417443871498108, + "logits/rejected": -1.0727176666259766, + "logps/chosen": -222.9761962890625, + "logps/rejected": -221.27786254882812, + "loss": 0.6543, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.017086107283830643, + "rewards/margins": 0.07355757057666779, + "rewards/margins_max": 0.10742592811584473, + "rewards/margins_min": 0.03968920186161995, + "rewards/margins_std": 0.0478971004486084, + "rewards/rejected": -0.056471455842256546, + "step": 960 + }, + { + "epoch": 0.37, + "grad_norm": 0.291015625, + "learning_rate": 3.9812444549310166e-07, + "logits/chosen": -1.4559601545333862, + "logits/rejected": -1.1964634656906128, + "logps/chosen": -189.91392517089844, + "logps/rejected": -199.15951538085938, + "loss": 0.6502, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.025768781080842018, + "rewards/margins": 0.08322307467460632, + "rewards/margins_max": 0.11746567487716675, + "rewards/margins_min": 0.048980481922626495, + "rewards/margins_std": 0.04842633754014969, + "rewards/rejected": -0.05745428800582886, + "step": 970 + }, + { + "epoch": 0.37, + "grad_norm": 0.318359375, + "learning_rate": 3.9543963521937915e-07, + "logits/chosen": -1.3898035287857056, + "logits/rejected": -1.138649821281433, + "logps/chosen": -204.3951873779297, + "logps/rejected": -227.2592010498047, + "loss": 0.6544, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.015050689689815044, + "rewards/margins": 0.072553351521492, + "rewards/margins_max": 0.10220275074243546, + "rewards/margins_min": 0.042903970927000046, + "rewards/margins_std": 0.04193056747317314, + "rewards/rejected": -0.05750266835093498, + "step": 980 + }, + { + "epoch": 0.38, + "grad_norm": 0.3046875, + "learning_rate": 3.927292265494978e-07, + "logits/chosen": -1.3583369255065918, + "logits/rejected": -1.1555811166763306, + "logps/chosen": -182.11109924316406, + "logps/rejected": -184.9093017578125, + "loss": 0.6548, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.020562218502163887, + "rewards/margins": 0.06886212527751923, + "rewards/margins_max": 0.10518188774585724, + "rewards/margins_min": 0.032542359083890915, + "rewards/margins_std": 0.051363904029130936, + "rewards/rejected": -0.04829990863800049, + "step": 990 + }, + { + "epoch": 0.38, + "grad_norm": 0.310546875, + "learning_rate": 3.8999369653439883e-07, + "logits/chosen": -1.4692192077636719, + "logits/rejected": -1.1662304401397705, + "logps/chosen": -218.901123046875, + "logps/rejected": -259.073486328125, + "loss": 0.6544, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.021590515971183777, + "rewards/margins": 0.07978875935077667, + "rewards/margins_max": 0.11681132018566132, + "rewards/margins_min": 0.04276617616415024, + "rewards/margins_std": 0.05235783010721207, + "rewards/rejected": -0.0581982359290123, + "step": 1000 + }, + { + "epoch": 0.38, + "grad_norm": 0.431640625, + "learning_rate": 3.872335266465565e-07, + "logits/chosen": -1.449806571006775, + "logits/rejected": -1.1126211881637573, + "logps/chosen": -208.30508422851562, + "logps/rejected": -209.28854370117188, + "loss": 0.6526, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.01985124871134758, + "rewards/margins": 0.09691493213176727, + "rewards/margins_max": 0.13921746611595154, + "rewards/margins_min": 0.05461239814758301, + "rewards/margins_std": 0.059824805706739426, + "rewards/rejected": -0.0770636796951294, + "step": 1010 + }, + { + "epoch": 0.39, + "grad_norm": 0.2578125, + "learning_rate": 3.8444920269523564e-07, + "logits/chosen": -1.4464867115020752, + "logits/rejected": -1.1247550249099731, + "logps/chosen": -208.340087890625, + "logps/rejected": -231.50064086914062, + "loss": 0.6555, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.02373802661895752, + "rewards/margins": 0.07935117930173874, + "rewards/margins_max": 0.11073676496744156, + "rewards/margins_min": 0.04796561598777771, + "rewards/margins_std": 0.04438590258359909, + "rewards/rejected": -0.055613160133361816, + "step": 1020 + }, + { + "epoch": 0.39, + "grad_norm": 0.3046875, + "learning_rate": 3.8164121474098557e-07, + "logits/chosen": -1.4539260864257812, + "logits/rejected": -1.0766693353652954, + "logps/chosen": -208.6253662109375, + "logps/rejected": -219.2858428955078, + "loss": 0.6552, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.015472279861569405, + "rewards/margins": 0.07762763649225235, + "rewards/margins_max": 0.10684994608163834, + "rewards/margins_min": 0.04840531200170517, + "rewards/margins_std": 0.04132659360766411, + "rewards/rejected": -0.0621553435921669, + "step": 1030 + }, + { + "epoch": 0.4, + "grad_norm": 0.2470703125, + "learning_rate": 3.7881005700938627e-07, + "logits/chosen": -1.4579023122787476, + "logits/rejected": -1.067096471786499, + "logps/chosen": -195.89137268066406, + "logps/rejected": -208.55618286132812, + "loss": 0.6504, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.01986720971763134, + "rewards/margins": 0.0889149010181427, + "rewards/margins_max": 0.12377619743347168, + "rewards/margins_min": 0.05405362695455551, + "rewards/margins_std": 0.04930129647254944, + "rewards/rejected": -0.0690477043390274, + "step": 1040 + }, + { + "epoch": 0.4, + "grad_norm": 0.326171875, + "learning_rate": 3.759562278040611e-07, + "logits/chosen": -1.3401424884796143, + "logits/rejected": -1.1433542966842651, + "logps/chosen": -187.8769989013672, + "logps/rejected": -207.67489624023438, + "loss": 0.6523, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.019486157223582268, + "rewards/margins": 0.08506642282009125, + "rewards/margins_max": 0.1269386112689972, + "rewards/margins_min": 0.043194226920604706, + "rewards/margins_std": 0.059216223657131195, + "rewards/rejected": -0.06558026373386383, + "step": 1050 + }, + { + "epoch": 0.4, + "grad_norm": 0.353515625, + "learning_rate": 3.7308022941897176e-07, + "logits/chosen": -1.3987281322479248, + "logits/rejected": -1.0789750814437866, + "logps/chosen": -222.21798706054688, + "logps/rejected": -221.2227783203125, + "loss": 0.6494, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.021621376276016235, + "rewards/margins": 0.09140492975711823, + "rewards/margins_max": 0.1279296576976776, + "rewards/margins_min": 0.05488022044301033, + "rewards/margins_std": 0.05165375396609306, + "rewards/rejected": -0.06978355348110199, + "step": 1060 + }, + { + "epoch": 0.41, + "grad_norm": 0.29296875, + "learning_rate": 3.7018256805001115e-07, + "logits/chosen": -1.4328137636184692, + "logits/rejected": -1.0926902294158936, + "logps/chosen": -208.5886688232422, + "logps/rejected": -234.0699462890625, + "loss": 0.6478, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.025926152244210243, + "rewards/margins": 0.09499989449977875, + "rewards/margins_max": 0.1423562914133072, + "rewards/margins_min": 0.047643501311540604, + "rewards/margins_std": 0.06697206199169159, + "rewards/rejected": -0.06907374411821365, + "step": 1070 + }, + { + "epoch": 0.41, + "grad_norm": 0.310546875, + "learning_rate": 3.6726375370590924e-07, + "logits/chosen": -1.4664019346237183, + "logits/rejected": -1.0370583534240723, + "logps/chosen": -245.40286254882812, + "logps/rejected": -201.60385131835938, + "loss": 0.6554, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.016693558543920517, + "rewards/margins": 0.07288383692502975, + "rewards/margins_max": 0.10198800265789032, + "rewards/margins_min": 0.043779678642749786, + "rewards/margins_std": 0.041159503161907196, + "rewards/rejected": -0.056190282106399536, + "step": 1080 + }, + { + "epoch": 0.41, + "grad_norm": 0.353515625, + "learning_rate": 3.6432430011846825e-07, + "logits/chosen": -1.4667326211929321, + "logits/rejected": -1.1126149892807007, + "logps/chosen": -199.15652465820312, + "logps/rejected": -224.69058227539062, + "loss": 0.6495, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.022613363340497017, + "rewards/margins": 0.09035179764032364, + "rewards/margins_max": 0.1315051019191742, + "rewards/margins_min": 0.04919849708676338, + "rewards/margins_std": 0.05819956213235855, + "rewards/rejected": -0.06773844361305237, + "step": 1090 + }, + { + "epoch": 0.42, + "grad_norm": 0.3359375, + "learning_rate": 3.613647246521419e-07, + "logits/chosen": -1.5111665725708008, + "logits/rejected": -1.047363519668579, + "logps/chosen": -213.94393920898438, + "logps/rejected": -214.4349365234375, + "loss": 0.6534, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.020266389474272728, + "rewards/margins": 0.08451583981513977, + "rewards/margins_max": 0.12752871215343475, + "rewards/margins_min": 0.04150295630097389, + "rewards/margins_std": 0.06082940101623535, + "rewards/rejected": -0.0642494484782219, + "step": 1100 + }, + { + "epoch": 0.42, + "grad_norm": 0.294921875, + "learning_rate": 3.583855482129755e-07, + "logits/chosen": -1.4186838865280151, + "logits/rejected": -1.1392043828964233, + "logps/chosen": -214.18783569335938, + "logps/rejected": -266.89801025390625, + "loss": 0.6486, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.024385053664445877, + "rewards/margins": 0.09580737352371216, + "rewards/margins_max": 0.1446331888437271, + "rewards/margins_min": 0.0469815619289875, + "rewards/margins_std": 0.0690501257777214, + "rewards/rejected": -0.07142232358455658, + "step": 1110 + }, + { + "epoch": 0.43, + "grad_norm": 0.328125, + "learning_rate": 3.5538729515692354e-07, + "logits/chosen": -1.4348571300506592, + "logits/rejected": -1.005236268043518, + "logps/chosen": -222.6161346435547, + "logps/rejected": -215.0897674560547, + "loss": 0.6544, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.02178863435983658, + "rewards/margins": 0.0858430564403534, + "rewards/margins_max": 0.12585435807704926, + "rewards/margins_min": 0.04583176225423813, + "rewards/margins_std": 0.056584518402814865, + "rewards/rejected": -0.06405442208051682, + "step": 1120 + }, + { + "epoch": 0.43, + "grad_norm": 0.310546875, + "learning_rate": 3.523704931975588e-07, + "logits/chosen": -1.4166271686553955, + "logits/rejected": -1.1025335788726807, + "logps/chosen": -214.4542999267578, + "logps/rejected": -231.10617065429688, + "loss": 0.6537, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.02049335464835167, + "rewards/margins": 0.07652363926172256, + "rewards/margins_max": 0.11191650480031967, + "rewards/margins_min": 0.041130781173706055, + "rewards/margins_std": 0.05005306005477905, + "rewards/rejected": -0.056030284613370895, + "step": 1130 + }, + { + "epoch": 0.43, + "grad_norm": 0.328125, + "learning_rate": 3.4933567331319086e-07, + "logits/chosen": -1.4324204921722412, + "logits/rejected": -1.0857694149017334, + "logps/chosen": -220.71957397460938, + "logps/rejected": -210.15182495117188, + "loss": 0.6499, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.021782949566841125, + "rewards/margins": 0.09863193333148956, + "rewards/margins_max": 0.14462308585643768, + "rewards/margins_min": 0.052640777081251144, + "rewards/margins_std": 0.06504130363464355, + "rewards/rejected": -0.07684897631406784, + "step": 1140 + }, + { + "epoch": 0.44, + "grad_norm": 0.390625, + "learning_rate": 3.46283369653411e-07, + "logits/chosen": -1.4875332117080688, + "logits/rejected": -1.1682199239730835, + "logps/chosen": -225.19692993164062, + "logps/rejected": -249.2867889404297, + "loss": 0.6514, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.020085861906409264, + "rewards/margins": 0.08808033168315887, + "rewards/margins_max": 0.12566521763801575, + "rewards/margins_min": 0.05049543455243111, + "rewards/margins_std": 0.05315307527780533, + "rewards/rejected": -0.06799447536468506, + "step": 1150 + }, + { + "epoch": 0.44, + "grad_norm": 0.63671875, + "learning_rate": 3.4321411944507714e-07, + "logits/chosen": -1.4519484043121338, + "logits/rejected": -1.1154358386993408, + "logps/chosen": -211.3180389404297, + "logps/rejected": -290.20086669921875, + "loss": 0.6523, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.02073494717478752, + "rewards/margins": 0.08044596761465073, + "rewards/margins_max": 0.11118495464324951, + "rewards/margins_min": 0.04970698431134224, + "rewards/margins_std": 0.04347149282693863, + "rewards/rejected": -0.059711016714572906, + "step": 1160 + }, + { + "epoch": 0.44, + "grad_norm": 0.3203125, + "learning_rate": 3.40128462897759e-07, + "logits/chosen": -1.4702914953231812, + "logits/rejected": -1.1601308584213257, + "logps/chosen": -199.1900177001953, + "logps/rejected": -213.91455078125, + "loss": 0.6496, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.014372904784977436, + "rewards/margins": 0.0744035467505455, + "rewards/margins_max": 0.10700450092554092, + "rewards/margins_min": 0.04180259257555008, + "rewards/margins_std": 0.04610472172498703, + "rewards/rejected": -0.06003064662218094, + "step": 1170 + }, + { + "epoch": 0.45, + "grad_norm": 0.373046875, + "learning_rate": 3.3702694310865693e-07, + "logits/chosen": -1.334912896156311, + "logits/rejected": -1.1364924907684326, + "logps/chosen": -185.46609497070312, + "logps/rejected": -222.8370819091797, + "loss": 0.6493, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.029090750962495804, + "rewards/margins": 0.09566773474216461, + "rewards/margins_max": 0.1352817267179489, + "rewards/margins_min": 0.05605371668934822, + "rewards/margins_std": 0.05602266266942024, + "rewards/rejected": -0.06657697260379791, + "step": 1180 + }, + { + "epoch": 0.45, + "grad_norm": 0.275390625, + "learning_rate": 3.339101059670131e-07, + "logits/chosen": -1.3308570384979248, + "logits/rejected": -1.011419653892517, + "logps/chosen": -220.4107666015625, + "logps/rejected": -240.52627563476562, + "loss": 0.6521, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.01713314652442932, + "rewards/margins": 0.08281738311052322, + "rewards/margins_max": 0.12792006134986877, + "rewards/margins_min": 0.037714701145887375, + "rewards/margins_std": 0.06378481537103653, + "rewards/rejected": -0.0656842291355133, + "step": 1190 + }, + { + "epoch": 0.46, + "grad_norm": 0.26953125, + "learning_rate": 3.3077850005803125e-07, + "logits/chosen": -1.377722144126892, + "logits/rejected": -1.073115587234497, + "logps/chosen": -217.9221649169922, + "logps/rejected": -207.5659637451172, + "loss": 0.6558, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.02081725373864174, + "rewards/margins": 0.06937672197818756, + "rewards/margins_max": 0.10580576956272125, + "rewards/margins_min": 0.03294768184423447, + "rewards/margins_std": 0.05151844024658203, + "rewards/rejected": -0.048559464514255524, + "step": 1200 + }, + { + "epoch": 0.46, + "grad_norm": 0.35546875, + "learning_rate": 3.276326765663218e-07, + "logits/chosen": -1.3020093441009521, + "logits/rejected": -1.0328706502914429, + "logps/chosen": -237.2034149169922, + "logps/rejected": -226.58364868164062, + "loss": 0.6534, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.01786690205335617, + "rewards/margins": 0.07763433456420898, + "rewards/margins_max": 0.10978218168020248, + "rewards/margins_min": 0.04548647999763489, + "rewards/margins_std": 0.045463927090168, + "rewards/rejected": -0.05976742506027222, + "step": 1210 + }, + { + "epoch": 0.46, + "grad_norm": 0.376953125, + "learning_rate": 3.244731891788893e-07, + "logits/chosen": -1.4796888828277588, + "logits/rejected": -1.1671749353408813, + "logps/chosen": -199.0811767578125, + "logps/rejected": -237.5238494873047, + "loss": 0.6469, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.032245419919490814, + "rewards/margins": 0.09224997460842133, + "rewards/margins_max": 0.12662890553474426, + "rewards/margins_min": 0.05787103250622749, + "rewards/margins_std": 0.04861915856599808, + "rewards/rejected": -0.06000455096364021, + "step": 1220 + }, + { + "epoch": 0.47, + "grad_norm": 0.359375, + "learning_rate": 3.2130059398768005e-07, + "logits/chosen": -1.3609169721603394, + "logits/rejected": -1.0250881910324097, + "logps/chosen": -213.1770782470703, + "logps/rejected": -201.0860595703125, + "loss": 0.652, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.014574935659766197, + "rewards/margins": 0.08546517789363861, + "rewards/margins_max": 0.12767310440540314, + "rewards/margins_min": 0.04325725510716438, + "rewards/margins_std": 0.059691011905670166, + "rewards/rejected": -0.07089023292064667, + "step": 1230 + }, + { + "epoch": 0.47, + "grad_norm": 0.251953125, + "learning_rate": 3.1811544939170573e-07, + "logits/chosen": -1.4496055841445923, + "logits/rejected": -1.250016212463379, + "logps/chosen": -197.13966369628906, + "logps/rejected": -228.1790313720703, + "loss": 0.6529, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 0.022412937134504318, + "rewards/margins": 0.0773552656173706, + "rewards/margins_max": 0.11442377418279648, + "rewards/margins_min": 0.04028675705194473, + "rewards/margins_std": 0.05242278426885605, + "rewards/rejected": -0.054942332208156586, + "step": 1240 + }, + { + "epoch": 0.47, + "grad_norm": 0.33984375, + "learning_rate": 3.1491831599876105e-07, + "logits/chosen": -1.4599530696868896, + "logits/rejected": -1.1350712776184082, + "logps/chosen": -198.54531860351562, + "logps/rejected": -211.37203979492188, + "loss": 0.6482, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.024880778044462204, + "rewards/margins": 0.10913392156362534, + "rewards/margins_max": 0.1501014530658722, + "rewards/margins_min": 0.06816640496253967, + "rewards/margins_std": 0.05793682858347893, + "rewards/rejected": -0.08425314724445343, + "step": 1250 + }, + { + "epoch": 0.48, + "grad_norm": 0.33203125, + "learning_rate": 3.117097565267534e-07, + "logits/chosen": -1.4878017902374268, + "logits/rejected": -1.150596022605896, + "logps/chosen": -236.7520294189453, + "logps/rejected": -269.3097839355469, + "loss": 0.6456, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.023586656898260117, + "rewards/margins": 0.09756810963153839, + "rewards/margins_max": 0.14421576261520386, + "rewards/margins_min": 0.05092043802142143, + "rewards/margins_std": 0.06596976518630981, + "rewards/rejected": -0.07398144900798798, + "step": 1260 + }, + { + "epoch": 0.48, + "grad_norm": 0.296875, + "learning_rate": 3.0849033570466013e-07, + "logits/chosen": -1.3854446411132812, + "logits/rejected": -1.1156353950500488, + "logps/chosen": -211.7381134033203, + "logps/rejected": -230.3738250732422, + "loss": 0.647, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.024476120248436928, + "rewards/margins": 0.09338697046041489, + "rewards/margins_max": 0.13921865820884705, + "rewards/margins_min": 0.04755526781082153, + "rewards/margins_std": 0.06481581181287766, + "rewards/rejected": -0.06891084462404251, + "step": 1270 + }, + { + "epoch": 0.49, + "grad_norm": 0.27734375, + "learning_rate": 3.0526062017313247e-07, + "logits/chosen": -1.379686951637268, + "logits/rejected": -1.134813904762268, + "logps/chosen": -190.8585968017578, + "logps/rejected": -205.110595703125, + "loss": 0.6482, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.018528077751398087, + "rewards/margins": 0.08439335227012634, + "rewards/margins_max": 0.11601463705301285, + "rewards/margins_min": 0.05277208238840103, + "rewards/margins_std": 0.044719234108924866, + "rewards/rejected": -0.06586527824401855, + "step": 1280 + }, + { + "epoch": 0.49, + "grad_norm": 0.3203125, + "learning_rate": 3.020211783847625e-07, + "logits/chosen": -1.5027602910995483, + "logits/rejected": -1.1443543434143066, + "logps/chosen": -200.97781372070312, + "logps/rejected": -213.7756805419922, + "loss": 0.6525, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.02603713609278202, + "rewards/margins": 0.08934494107961655, + "rewards/margins_max": 0.1271466463804245, + "rewards/margins_min": 0.05154324695467949, + "rewards/margins_std": 0.05345967411994934, + "rewards/rejected": -0.06330780684947968, + "step": 1290 + }, + { + "epoch": 0.49, + "grad_norm": 0.3046875, + "learning_rate": 2.987725805040321e-07, + "logits/chosen": -1.5109453201293945, + "logits/rejected": -1.1658785343170166, + "logps/chosen": -194.1292266845703, + "logps/rejected": -226.944091796875, + "loss": 0.65, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.020652558654546738, + "rewards/margins": 0.08578969538211823, + "rewards/margins_max": 0.12059824168682098, + "rewards/margins_min": 0.05098114535212517, + "rewards/margins_std": 0.049226727336645126, + "rewards/rejected": -0.06513713300228119, + "step": 1300 + }, + { + "epoch": 0.5, + "grad_norm": 0.291015625, + "learning_rate": 2.955153983069593e-07, + "logits/chosen": -1.3911088705062866, + "logits/rejected": -1.0021635293960571, + "logps/chosen": -216.1819305419922, + "logps/rejected": -217.72048950195312, + "loss": 0.6533, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.009969010017812252, + "rewards/margins": 0.0843484178185463, + "rewards/margins_max": 0.11825823783874512, + "rewards/margins_min": 0.050438590347766876, + "rewards/margins_std": 0.04795572906732559, + "rewards/rejected": -0.07437939941883087, + "step": 1310 + }, + { + "epoch": 0.5, + "grad_norm": 0.341796875, + "learning_rate": 2.922502050804623e-07, + "logits/chosen": -1.3908171653747559, + "logits/rejected": -1.1572504043579102, + "logps/chosen": -209.4795379638672, + "logps/rejected": -240.86074829101562, + "loss": 0.6489, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.02364761009812355, + "rewards/margins": 0.10014495998620987, + "rewards/margins_max": 0.13577914237976074, + "rewards/margins_min": 0.0645107850432396, + "rewards/margins_std": 0.05039433762431145, + "rewards/rejected": -0.07649735361337662, + "step": 1320 + }, + { + "epoch": 0.51, + "grad_norm": 0.330078125, + "learning_rate": 2.889775755214565e-07, + "logits/chosen": -1.4878746271133423, + "logits/rejected": -1.0873339176177979, + "logps/chosen": -208.18862915039062, + "logps/rejected": -243.1056671142578, + "loss": 0.6486, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.02549285627901554, + "rewards/margins": 0.08646980673074722, + "rewards/margins_max": 0.12378053367137909, + "rewards/margins_min": 0.04915907606482506, + "rewards/margins_std": 0.05276532843708992, + "rewards/rejected": -0.060976944863796234, + "step": 1330 + }, + { + "epoch": 0.51, + "grad_norm": 0.279296875, + "learning_rate": 2.8569808563570406e-07, + "logits/chosen": -1.4435174465179443, + "logits/rejected": -1.0699201822280884, + "logps/chosen": -195.5081024169922, + "logps/rejected": -198.16644287109375, + "loss": 0.6521, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.02266775444149971, + "rewards/margins": 0.07937842607498169, + "rewards/margins_max": 0.11254648864269257, + "rewards/margins_min": 0.046210356056690216, + "rewards/margins_std": 0.0469067320227623, + "rewards/rejected": -0.056710679084062576, + "step": 1340 + }, + { + "epoch": 0.51, + "grad_norm": 0.2734375, + "learning_rate": 2.8241231263643284e-07, + "logits/chosen": -1.456235408782959, + "logits/rejected": -1.1784732341766357, + "logps/chosen": -213.12203979492188, + "logps/rejected": -240.123046875, + "loss": 0.6476, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.01615014672279358, + "rewards/margins": 0.09304296225309372, + "rewards/margins_max": 0.13617947697639465, + "rewards/margins_min": 0.04990645498037338, + "rewards/margins_std": 0.06100423261523247, + "rewards/rejected": -0.07689281553030014, + "step": 1350 + }, + { + "epoch": 0.52, + "grad_norm": 0.291015625, + "learning_rate": 2.791208348427426e-07, + "logits/chosen": -1.4464397430419922, + "logits/rejected": -1.1199182271957397, + "logps/chosen": -214.69924926757812, + "logps/rejected": -198.94326782226562, + "loss": 0.6532, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.01893942430615425, + "rewards/margins": 0.0802186131477356, + "rewards/margins_max": 0.11644063889980316, + "rewards/margins_min": 0.04399657994508743, + "rewards/margins_std": 0.05122567340731621, + "rewards/rejected": -0.061279188841581345, + "step": 1360 + }, + { + "epoch": 0.52, + "grad_norm": 0.265625, + "learning_rate": 2.758242315778172e-07, + "logits/chosen": -1.3660023212432861, + "logits/rejected": -0.9832341074943542, + "logps/chosen": -235.3618927001953, + "logps/rejected": -201.94151306152344, + "loss": 0.6526, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.01969451829791069, + "rewards/margins": 0.08969944715499878, + "rewards/margins_max": 0.12913790345191956, + "rewards/margins_min": 0.050260983407497406, + "rewards/margins_std": 0.05577441304922104, + "rewards/rejected": -0.07000492513179779, + "step": 1370 + }, + { + "epoch": 0.52, + "grad_norm": 0.302734375, + "learning_rate": 2.725230830669591e-07, + "logits/chosen": -1.3614161014556885, + "logits/rejected": -1.0590187311172485, + "logps/chosen": -206.86636352539062, + "logps/rejected": -194.6129150390625, + "loss": 0.6539, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.01725422963500023, + "rewards/margins": 0.07537270337343216, + "rewards/margins_max": 0.11255017668008804, + "rewards/margins_min": 0.03819523751735687, + "rewards/margins_std": 0.052576880902051926, + "rewards/rejected": -0.05811848118901253, + "step": 1380 + }, + { + "epoch": 0.53, + "grad_norm": 0.3828125, + "learning_rate": 2.6921797033546604e-07, + "logits/chosen": -1.4232187271118164, + "logits/rejected": -1.0422379970550537, + "logps/chosen": -225.4199676513672, + "logps/rejected": -247.0299072265625, + "loss": 0.6472, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.023638783022761345, + "rewards/margins": 0.10305992513895035, + "rewards/margins_max": 0.15439125895500183, + "rewards/margins_min": 0.05172859877347946, + "rewards/margins_std": 0.07259346544742584, + "rewards/rejected": -0.07942114025354385, + "step": 1390 + }, + { + "epoch": 0.53, + "grad_norm": 0.330078125, + "learning_rate": 2.6590947510636656e-07, + "logits/chosen": -1.4895226955413818, + "logits/rejected": -1.1499069929122925, + "logps/chosen": -224.0960693359375, + "logps/rejected": -234.5597686767578, + "loss": 0.6522, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.01984899304807186, + "rewards/margins": 0.0820828229188919, + "rewards/margins_max": 0.12106503546237946, + "rewards/margins_min": 0.04310062155127525, + "rewards/margins_std": 0.05512915924191475, + "rewards/rejected": -0.062233828008174896, + "step": 1400 + }, + { + "epoch": 0.54, + "grad_norm": 0.322265625, + "learning_rate": 2.625981796980323e-07, + "logits/chosen": -1.4586188793182373, + "logits/rejected": -1.1192744970321655, + "logps/chosen": -245.0766143798828, + "logps/rejected": -215.29385375976562, + "loss": 0.6537, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.01640249788761139, + "rewards/margins": 0.08490737527608871, + "rewards/margins_max": 0.12561504542827606, + "rewards/margins_min": 0.04419969767332077, + "rewards/margins_std": 0.05756935477256775, + "rewards/rejected": -0.06850487738847733, + "step": 1410 + }, + { + "epoch": 0.54, + "grad_norm": 0.2734375, + "learning_rate": 2.5928466692168616e-07, + "logits/chosen": -1.4357595443725586, + "logits/rejected": -1.1410107612609863, + "logps/chosen": -186.00540161132812, + "logps/rejected": -211.6009063720703, + "loss": 0.6491, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.022374480962753296, + "rewards/margins": 0.09253247827291489, + "rewards/margins_max": 0.13468560576438904, + "rewards/margins_min": 0.05037938430905342, + "rewards/margins_std": 0.059613488614559174, + "rewards/rejected": -0.07015800476074219, + "step": 1420 + }, + { + "epoch": 0.54, + "grad_norm": 0.318359375, + "learning_rate": 2.559695199788234e-07, + "logits/chosen": -1.3736810684204102, + "logits/rejected": -1.1410489082336426, + "logps/chosen": -206.69204711914062, + "logps/rejected": -219.3141326904297, + "loss": 0.6496, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.019356805831193924, + "rewards/margins": 0.0815727710723877, + "rewards/margins_max": 0.11693547666072845, + "rewards/margins_min": 0.046210046857595444, + "rewards/margins_std": 0.05001043155789375, + "rewards/rejected": -0.06221596151590347, + "step": 1430 + }, + { + "epoch": 0.55, + "grad_norm": 0.33984375, + "learning_rate": 2.526533223585641e-07, + "logits/chosen": -1.556718111038208, + "logits/rejected": -1.207423448562622, + "logps/chosen": -178.8986358642578, + "logps/rejected": -196.97682189941406, + "loss": 0.6522, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.01927010528743267, + "rewards/margins": 0.08486685901880264, + "rewards/margins_max": 0.12347618490457535, + "rewards/margins_min": 0.046257536858320236, + "rewards/margins_std": 0.054601818323135376, + "rewards/rejected": -0.06559675186872482, + "step": 1440 + }, + { + "epoch": 0.55, + "grad_norm": 0.318359375, + "learning_rate": 2.4933665773495464e-07, + "logits/chosen": -1.4177541732788086, + "logits/rejected": -1.0632787942886353, + "logps/chosen": -210.5023956298828, + "logps/rejected": -220.0447998046875, + "loss": 0.6456, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.022884273901581764, + "rewards/margins": 0.08784890919923782, + "rewards/margins_max": 0.12712661921977997, + "rewards/margins_min": 0.04857120290398598, + "rewards/margins_std": 0.055547066032886505, + "rewards/rejected": -0.06496462970972061, + "step": 1450 + }, + { + "epoch": 0.55, + "grad_norm": 0.330078125, + "learning_rate": 2.460201098642378e-07, + "logits/chosen": -1.4159841537475586, + "logits/rejected": -1.0748342275619507, + "logps/chosen": -199.53170776367188, + "logps/rejected": -206.43692016601562, + "loss": 0.6423, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.020755691453814507, + "rewards/margins": 0.10842639207839966, + "rewards/margins_max": 0.15878863632678986, + "rewards/margins_min": 0.05806415155529976, + "rewards/margins_std": 0.07122296094894409, + "rewards/rejected": -0.08767069876194, + "step": 1460 + }, + { + "epoch": 0.56, + "grad_norm": 0.3203125, + "learning_rate": 2.4270426248210635e-07, + "logits/chosen": -1.4016398191452026, + "logits/rejected": -1.0987274646759033, + "logps/chosen": -203.04690551757812, + "logps/rejected": -224.54849243164062, + "loss": 0.6527, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.018762022256851196, + "rewards/margins": 0.08783960342407227, + "rewards/margins_max": 0.1270819753408432, + "rewards/margins_min": 0.04859720915555954, + "rewards/margins_std": 0.05549710988998413, + "rewards/rejected": -0.06907757371664047, + "step": 1470 + }, + { + "epoch": 0.56, + "grad_norm": 0.306640625, + "learning_rate": 2.3938969920096296e-07, + "logits/chosen": -1.4343984127044678, + "logits/rejected": -1.121087670326233, + "logps/chosen": -222.9105987548828, + "logps/rejected": -221.96896362304688, + "loss": 0.6503, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.021892910823225975, + "rewards/margins": 0.08273215591907501, + "rewards/margins_max": 0.12083101272583008, + "rewards/margins_min": 0.04463329166173935, + "rewards/margins_std": 0.05387992784380913, + "rewards/rejected": -0.060839246958494186, + "step": 1480 + }, + { + "epoch": 0.57, + "grad_norm": 0.32421875, + "learning_rate": 2.3607700340719872e-07, + "logits/chosen": -1.4712364673614502, + "logits/rejected": -1.1130168437957764, + "logps/chosen": -204.46397399902344, + "logps/rejected": -216.9080047607422, + "loss": 0.6507, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.02178640104830265, + "rewards/margins": 0.08826582878828049, + "rewards/margins_max": 0.1252831369638443, + "rewards/margins_min": 0.05124853178858757, + "rewards/margins_std": 0.052350372076034546, + "rewards/rejected": -0.06647942960262299, + "step": 1490 + }, + { + "epoch": 0.57, + "grad_norm": 0.328125, + "learning_rate": 2.3276675815851439e-07, + "logits/chosen": -1.3515491485595703, + "logits/rejected": -1.1509690284729004, + "logps/chosen": -184.31765747070312, + "logps/rejected": -215.8578643798828, + "loss": 0.6516, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.024958888068795204, + "rewards/margins": 0.09194238483905792, + "rewards/margins_max": 0.13820824027061462, + "rewards/margins_min": 0.04567654803395271, + "rewards/margins_std": 0.06542977690696716, + "rewards/rejected": -0.06698349863290787, + "step": 1500 + }, + { + "epoch": 0.57, + "grad_norm": 0.3203125, + "learning_rate": 2.2945954608129725e-07, + "logits/chosen": -1.4680145978927612, + "logits/rejected": -1.1459739208221436, + "logps/chosen": -220.7042236328125, + "logps/rejected": -237.5146942138672, + "loss": 0.6523, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.018372472375631332, + "rewards/margins": 0.08562228828668594, + "rewards/margins_max": 0.12245050817728043, + "rewards/margins_min": 0.048794087022542953, + "rewards/margins_std": 0.052082955837249756, + "rewards/rejected": -0.0672498270869255, + "step": 1510 + }, + { + "epoch": 0.58, + "grad_norm": 0.259765625, + "learning_rate": 2.261559492680755e-07, + "logits/chosen": -1.5185743570327759, + "logits/rejected": -1.0897482633590698, + "logps/chosen": -221.81869506835938, + "logps/rejected": -223.1243896484375, + "loss": 0.6469, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.024064363911747932, + "rewards/margins": 0.10357926785945892, + "rewards/margins_max": 0.14326122403144836, + "rewards/margins_min": 0.06389732658863068, + "rewards/margins_std": 0.05611874908208847, + "rewards/rejected": -0.07951490581035614, + "step": 1520 + }, + { + "epoch": 0.58, + "grad_norm": 0.400390625, + "learning_rate": 2.2285654917506511e-07, + "logits/chosen": -1.4201505184173584, + "logits/rejected": -1.1784555912017822, + "logps/chosen": -197.08529663085938, + "logps/rejected": -233.5238037109375, + "loss": 0.6519, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.014250082895159721, + "rewards/margins": 0.0851346030831337, + "rewards/margins_max": 0.12699416279792786, + "rewards/margins_min": 0.04327503591775894, + "rewards/margins_std": 0.05919836089015007, + "rewards/rejected": -0.07088451087474823, + "step": 1530 + }, + { + "epoch": 0.59, + "grad_norm": 0.314453125, + "learning_rate": 2.1956192651983025e-07, + "logits/chosen": -1.4901044368743896, + "logits/rejected": -1.2225561141967773, + "logps/chosen": -205.6460418701172, + "logps/rejected": -211.71597290039062, + "loss": 0.6483, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.014587330631911755, + "rewards/margins": 0.08248572051525116, + "rewards/margins_max": 0.11513223499059677, + "rewards/margins_min": 0.049839213490486145, + "rewards/margins_std": 0.04616914689540863, + "rewards/rejected": -0.06789840012788773, + "step": 1540 + }, + { + "epoch": 0.59, + "grad_norm": 0.30859375, + "learning_rate": 2.1627266117907206e-07, + "logits/chosen": -1.4423274993896484, + "logits/rejected": -1.1351933479309082, + "logps/chosen": -192.0325927734375, + "logps/rejected": -214.21127319335938, + "loss": 0.6511, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.019394617527723312, + "rewards/margins": 0.08463616669178009, + "rewards/margins_max": 0.1280382126569748, + "rewards/margins_min": 0.04123411327600479, + "rewards/margins_std": 0.06137976795434952, + "rewards/rejected": -0.06524154543876648, + "step": 1550 + }, + { + "epoch": 0.59, + "grad_norm": 0.328125, + "learning_rate": 2.1298933208656715e-07, + "logits/chosen": -1.4492504596710205, + "logits/rejected": -1.0848209857940674, + "logps/chosen": -213.02944946289062, + "logps/rejected": -244.51010131835938, + "loss": 0.6513, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.01053069718182087, + "rewards/margins": 0.08705185353755951, + "rewards/margins_max": 0.12572325766086578, + "rewards/margins_min": 0.04838045313954353, + "rewards/margins_std": 0.05468962341547012, + "rewards/rejected": -0.07652115821838379, + "step": 1560 + }, + { + "epoch": 0.6, + "grad_norm": 0.390625, + "learning_rate": 2.0971251713127064e-07, + "logits/chosen": -1.4406160116195679, + "logits/rejected": -1.0886666774749756, + "logps/chosen": -214.84988403320312, + "logps/rejected": -218.6672821044922, + "loss": 0.6469, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.023885857313871384, + "rewards/margins": 0.09796186536550522, + "rewards/margins_max": 0.13213284313678741, + "rewards/margins_min": 0.06379088014364243, + "rewards/margins_std": 0.048325065523386, + "rewards/rejected": -0.07407601177692413, + "step": 1570 + }, + { + "epoch": 0.6, + "grad_norm": 0.333984375, + "learning_rate": 2.0644279305560378e-07, + "logits/chosen": -1.3928980827331543, + "logits/rejected": -1.1537230014801025, + "logps/chosen": -209.4993438720703, + "logps/rejected": -224.5998077392578, + "loss": 0.6523, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.024306269362568855, + "rewards/margins": 0.07895907014608383, + "rewards/margins_max": 0.1160157099366188, + "rewards/margins_min": 0.04190244525671005, + "rewards/margins_std": 0.05240599066019058, + "rewards/rejected": -0.054652802646160126, + "step": 1580 + }, + { + "epoch": 0.6, + "grad_norm": 0.29296875, + "learning_rate": 2.0318073535394322e-07, + "logits/chosen": -1.330426812171936, + "logits/rejected": -1.1868915557861328, + "logps/chosen": -209.96444702148438, + "logps/rejected": -228.32839965820312, + "loss": 0.6491, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.021598026156425476, + "rewards/margins": 0.09060736745595932, + "rewards/margins_max": 0.1294335275888443, + "rewards/margins_min": 0.051781199872493744, + "rewards/margins_std": 0.054908476769924164, + "rewards/rejected": -0.06900934129953384, + "step": 1590 + }, + { + "epoch": 0.61, + "grad_norm": 0.33984375, + "learning_rate": 1.9992691817133024e-07, + "logits/chosen": -1.3735512495040894, + "logits/rejected": -1.0827502012252808, + "logps/chosen": -204.10800170898438, + "logps/rejected": -213.89306640625, + "loss": 0.6488, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.021180730313062668, + "rewards/margins": 0.08791421353816986, + "rewards/margins_max": 0.13012385368347168, + "rewards/margins_min": 0.04570458456873894, + "rewards/margins_std": 0.059693437069654465, + "rewards/rejected": -0.06673348695039749, + "step": 1600 + }, + { + "epoch": 0.61, + "grad_norm": 0.296875, + "learning_rate": 1.9668191420241654e-07, + "logits/chosen": -1.411036491394043, + "logits/rejected": -1.0531718730926514, + "logps/chosen": -196.48928833007812, + "logps/rejected": -218.87850952148438, + "loss": 0.6484, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.02457505650818348, + "rewards/margins": 0.09790615737438202, + "rewards/margins_max": 0.13226325809955597, + "rewards/margins_min": 0.06354905664920807, + "rewards/margins_std": 0.04858827963471413, + "rewards/rejected": -0.07333110272884369, + "step": 1610 + }, + { + "epoch": 0.62, + "grad_norm": 0.3046875, + "learning_rate": 1.9344629459066676e-07, + "logits/chosen": -1.3850080966949463, + "logits/rejected": -1.1197946071624756, + "logps/chosen": -199.50949096679688, + "logps/rejected": -246.0616455078125, + "loss": 0.6489, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.02376171015202999, + "rewards/margins": 0.08920769393444061, + "rewards/margins_max": 0.12311413139104843, + "rewards/margins_min": 0.055301256477832794, + "rewards/margins_std": 0.04795095697045326, + "rewards/rejected": -0.06544599682092667, + "step": 1620 + }, + { + "epoch": 0.62, + "grad_norm": 0.357421875, + "learning_rate": 1.902206288278326e-07, + "logits/chosen": -1.5406492948532104, + "logits/rejected": -1.1824986934661865, + "logps/chosen": -202.54025268554688, + "logps/rejected": -201.29934692382812, + "loss": 0.6529, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.01288673747330904, + "rewards/margins": 0.07875024527311325, + "rewards/margins_max": 0.1176532506942749, + "rewards/margins_min": 0.039847247302532196, + "rewards/margins_std": 0.055017150938510895, + "rewards/rejected": -0.06586351245641708, + "step": 1630 + }, + { + "epoch": 0.62, + "grad_norm": 0.279296875, + "learning_rate": 1.8700548465371873e-07, + "logits/chosen": -1.4827055931091309, + "logits/rejected": -1.1093571186065674, + "logps/chosen": -223.68710327148438, + "logps/rejected": -239.01846313476562, + "loss": 0.6496, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.01819652132689953, + "rewards/margins": 0.08910879492759705, + "rewards/margins_max": 0.12292404472827911, + "rewards/margins_min": 0.05529356002807617, + "rewards/margins_std": 0.04782196134328842, + "rewards/rejected": -0.07091227173805237, + "step": 1640 + }, + { + "epoch": 0.63, + "grad_norm": 0.28515625, + "learning_rate": 1.8380142795625613e-07, + "logits/chosen": -1.3712780475616455, + "logits/rejected": -1.0752792358398438, + "logps/chosen": -202.42001342773438, + "logps/rejected": -208.89230346679688, + "loss": 0.6503, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.02562437392771244, + "rewards/margins": 0.08713234215974808, + "rewards/margins_max": 0.1250787079334259, + "rewards/margins_min": 0.04918598383665085, + "rewards/margins_std": 0.053664255887269974, + "rewards/rejected": -0.06150797754526138, + "step": 1650 + }, + { + "epoch": 0.63, + "grad_norm": 0.28125, + "learning_rate": 1.8060902267190248e-07, + "logits/chosen": -1.3778380155563354, + "logits/rejected": -1.0215446949005127, + "logps/chosen": -221.0638885498047, + "logps/rejected": -214.5088348388672, + "loss": 0.6518, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.02021961286664009, + "rewards/margins": 0.08468443900346756, + "rewards/margins_max": 0.11928977817296982, + "rewards/margins_min": 0.050079114735126495, + "rewards/margins_std": 0.04893932491540909, + "rewards/rejected": -0.06446482241153717, + "step": 1660 + }, + { + "epoch": 0.63, + "grad_norm": 0.326171875, + "learning_rate": 1.7742883068638445e-07, + "logits/chosen": -1.3894107341766357, + "logits/rejected": -1.1386574506759644, + "logps/chosen": -192.35226440429688, + "logps/rejected": -209.70242309570312, + "loss": 0.6478, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.02140471711754799, + "rewards/margins": 0.09159474074840546, + "rewards/margins_max": 0.12955918908119202, + "rewards/margins_min": 0.05363030359148979, + "rewards/margins_std": 0.05368983745574951, + "rewards/rejected": -0.07019002735614777, + "step": 1670 + }, + { + "epoch": 0.64, + "grad_norm": 0.294921875, + "learning_rate": 1.742614117358029e-07, + "logits/chosen": -1.3801229000091553, + "logits/rejected": -1.077549934387207, + "logps/chosen": -200.5829620361328, + "logps/rejected": -208.20291137695312, + "loss": 0.6522, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.018527880311012268, + "rewards/margins": 0.09335903823375702, + "rewards/margins_max": 0.13055115938186646, + "rewards/margins_min": 0.05616689473390579, + "rewards/margins_std": 0.05259762331843376, + "rewards/rejected": -0.07483114302158356, + "step": 1680 + }, + { + "epoch": 0.64, + "grad_norm": 0.30078125, + "learning_rate": 1.7110732330811488e-07, + "logits/chosen": -1.3090708255767822, + "logits/rejected": -1.0196329355239868, + "logps/chosen": -216.028076171875, + "logps/rejected": -259.5859680175781, + "loss": 0.6511, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.019178880378603935, + "rewards/margins": 0.08465974032878876, + "rewards/margins_max": 0.11861772835254669, + "rewards/margins_min": 0.05070176720619202, + "rewards/margins_std": 0.04802383482456207, + "rewards/rejected": -0.06548087298870087, + "step": 1690 + }, + { + "epoch": 0.65, + "grad_norm": 0.345703125, + "learning_rate": 1.6796712054501167e-07, + "logits/chosen": -1.3992605209350586, + "logits/rejected": -1.054147720336914, + "logps/chosen": -210.8378448486328, + "logps/rejected": -222.1731719970703, + "loss": 0.652, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.021352563053369522, + "rewards/margins": 0.08200518041849136, + "rewards/margins_max": 0.12033899873495102, + "rewards/margins_min": 0.04367135837674141, + "rewards/margins_std": 0.054212212562561035, + "rewards/rejected": -0.06065262109041214, + "step": 1700 + }, + { + "epoch": 0.65, + "grad_norm": 0.39453125, + "learning_rate": 1.6484135614421036e-07, + "logits/chosen": -1.3646801710128784, + "logits/rejected": -1.1284492015838623, + "logps/chosen": -202.224609375, + "logps/rejected": -231.50820922851562, + "loss": 0.6486, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.020144078880548477, + "rewards/margins": 0.09865118563175201, + "rewards/margins_max": 0.12845909595489502, + "rewards/margins_min": 0.0688432827591896, + "rewards/margins_std": 0.042154744267463684, + "rewards/rejected": -0.07850711047649384, + "step": 1710 + }, + { + "epoch": 0.65, + "grad_norm": 0.2294921875, + "learning_rate": 1.617305802621748e-07, + "logits/chosen": -1.4775947332382202, + "logits/rejected": -1.182840347290039, + "logps/chosen": -226.96572875976562, + "logps/rejected": -248.0793914794922, + "loss": 0.6549, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.022877058014273643, + "rewards/margins": 0.08480893075466156, + "rewards/margins_max": 0.12495218217372894, + "rewards/margins_min": 0.04466567188501358, + "rewards/margins_std": 0.05677112936973572, + "rewards/rejected": -0.06193187087774277, + "step": 1720 + }, + { + "epoch": 0.66, + "grad_norm": 0.3359375, + "learning_rate": 1.586353404172846e-07, + "logits/chosen": -1.5373847484588623, + "logits/rejected": -1.1060173511505127, + "logps/chosen": -220.9475860595703, + "logps/rejected": -232.4903106689453, + "loss": 0.6531, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.018491679802536964, + "rewards/margins": 0.09102120250463486, + "rewards/margins_max": 0.1361791342496872, + "rewards/margins_min": 0.04586326703429222, + "rewards/margins_std": 0.06386296451091766, + "rewards/rejected": -0.07252952456474304, + "step": 1730 + }, + { + "epoch": 0.66, + "grad_norm": 0.27734375, + "learning_rate": 1.5555618139346762e-07, + "logits/chosen": -1.4408385753631592, + "logits/rejected": -1.170309066772461, + "logps/chosen": -208.6848602294922, + "logps/rejected": -236.5205841064453, + "loss": 0.6454, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.0164918415248394, + "rewards/margins": 0.08608251065015793, + "rewards/margins_max": 0.11366148293018341, + "rewards/margins_min": 0.058503538370132446, + "rewards/margins_std": 0.03900256007909775, + "rewards/rejected": -0.06959067285060883, + "step": 1740 + }, + { + "epoch": 0.66, + "grad_norm": 0.30859375, + "learning_rate": 1.5249364514431467e-07, + "logits/chosen": -1.430936574935913, + "logits/rejected": -1.2513033151626587, + "logps/chosen": -195.7074432373047, + "logps/rejected": -222.7970428466797, + "loss": 0.6525, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.014837482944130898, + "rewards/margins": 0.08443491905927658, + "rewards/margins_max": 0.11733835935592651, + "rewards/margins_min": 0.05153145641088486, + "rewards/margins_std": 0.046532515436410904, + "rewards/rejected": -0.06959743797779083, + "step": 1750 + }, + { + "epoch": 0.67, + "grad_norm": 0.3671875, + "learning_rate": 1.4944827069769122e-07, + "logits/chosen": -1.3783493041992188, + "logits/rejected": -1.0512521266937256, + "logps/chosen": -195.98391723632812, + "logps/rejected": -220.15029907226562, + "loss": 0.6491, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.023444708436727524, + "rewards/margins": 0.09378419816493988, + "rewards/margins_max": 0.13183923065662384, + "rewards/margins_min": 0.055729180574417114, + "rewards/margins_std": 0.05381792038679123, + "rewards/rejected": -0.07033950090408325, + "step": 1760 + }, + { + "epoch": 0.67, + "grad_norm": 0.31640625, + "learning_rate": 1.4642059406086543e-07, + "logits/chosen": -1.4898409843444824, + "logits/rejected": -1.1888186931610107, + "logps/chosen": -205.3476104736328, + "logps/rejected": -194.2181396484375, + "loss": 0.6518, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.015554072335362434, + "rewards/margins": 0.08319921046495438, + "rewards/margins_max": 0.12011650949716568, + "rewards/margins_min": 0.046281903982162476, + "rewards/margins_std": 0.052208948880434036, + "rewards/rejected": -0.0676451325416565, + "step": 1770 + }, + { + "epoch": 0.68, + "grad_norm": 0.34765625, + "learning_rate": 1.4341114812616648e-07, + "logits/chosen": -1.44392991065979, + "logits/rejected": -1.0090781450271606, + "logps/chosen": -229.25161743164062, + "logps/rejected": -213.78305053710938, + "loss": 0.6496, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.018382510170340538, + "rewards/margins": 0.08579520881175995, + "rewards/margins_max": 0.1280362457036972, + "rewards/margins_min": 0.043554168194532394, + "rewards/margins_std": 0.05973784998059273, + "rewards/rejected": -0.06741269677877426, + "step": 1780 + }, + { + "epoch": 0.68, + "grad_norm": 0.298828125, + "learning_rate": 1.404204625771926e-07, + "logits/chosen": -1.4878530502319336, + "logits/rejected": -1.113875389099121, + "logps/chosen": -214.8065185546875, + "logps/rejected": -246.6589813232422, + "loss": 0.6471, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.02722371183335781, + "rewards/margins": 0.09628230333328247, + "rewards/margins_max": 0.13469961285591125, + "rewards/margins_min": 0.05786500126123428, + "rewards/margins_std": 0.05433027073740959, + "rewards/rejected": -0.06905858218669891, + "step": 1790 + }, + { + "epoch": 0.68, + "grad_norm": 0.3359375, + "learning_rate": 1.3744906379558164e-07, + "logits/chosen": -1.4668910503387451, + "logits/rejected": -1.199731469154358, + "logps/chosen": -196.6200714111328, + "logps/rejected": -210.1691131591797, + "loss": 0.647, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.023409178480505943, + "rewards/margins": 0.09194588661193848, + "rewards/margins_max": 0.13833829760551453, + "rewards/margins_min": 0.04555344209074974, + "rewards/margins_std": 0.06560881435871124, + "rewards/rejected": -0.06853669881820679, + "step": 1800 + }, + { + "epoch": 0.69, + "grad_norm": 0.314453125, + "learning_rate": 1.3449747476836602e-07, + "logits/chosen": -1.5464714765548706, + "logits/rejected": -1.2193983793258667, + "logps/chosen": -208.71432495117188, + "logps/rejected": -219.8221435546875, + "loss": 0.6489, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.023995716124773026, + "rewards/margins": 0.09224376827478409, + "rewards/margins_max": 0.1384916752576828, + "rewards/margins_min": 0.045995842665433884, + "rewards/margins_std": 0.06540443003177643, + "rewards/rejected": -0.06824804842472076, + "step": 1810 + }, + { + "epoch": 0.69, + "grad_norm": 0.29296875, + "learning_rate": 1.315662149959218e-07, + "logits/chosen": -1.4520776271820068, + "logits/rejected": -1.116135835647583, + "logps/chosen": -209.5501251220703, + "logps/rejected": -214.80197143554688, + "loss": 0.658, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.017536520957946777, + "rewards/margins": 0.06895823031663895, + "rewards/margins_max": 0.09921301901340485, + "rewards/margins_min": 0.038703449070453644, + "rewards/margins_std": 0.042786724865436554, + "rewards/rejected": -0.05142170935869217, + "step": 1820 + }, + { + "epoch": 0.7, + "grad_norm": 0.337890625, + "learning_rate": 1.286558004005338e-07, + "logits/chosen": -1.4225823879241943, + "logits/rejected": -1.0862594842910767, + "logps/chosen": -193.0080108642578, + "logps/rejected": -234.2497100830078, + "loss": 0.6502, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.02093338966369629, + "rewards/margins": 0.08371131867170334, + "rewards/margins_max": 0.11859454959630966, + "rewards/margins_min": 0.04882808402180672, + "rewards/margins_std": 0.04933235049247742, + "rewards/rejected": -0.06277792900800705, + "step": 1830 + }, + { + "epoch": 0.7, + "grad_norm": 0.3046875, + "learning_rate": 1.2576674323558928e-07, + "logits/chosen": -1.4744040966033936, + "logits/rejected": -1.200323462486267, + "logps/chosen": -208.7011260986328, + "logps/rejected": -238.9582061767578, + "loss": 0.6498, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.022860785946249962, + "rewards/margins": 0.09148404747247696, + "rewards/margins_max": 0.1325657218694687, + "rewards/margins_min": 0.05040237307548523, + "rewards/margins_std": 0.05809825658798218, + "rewards/rejected": -0.06862326711416245, + "step": 1840 + }, + { + "epoch": 0.7, + "grad_norm": 0.3125, + "learning_rate": 1.228995519954183e-07, + "logits/chosen": -1.4823616743087769, + "logits/rejected": -1.0789930820465088, + "logps/chosen": -224.8362274169922, + "logps/rejected": -233.29721069335938, + "loss": 0.6511, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.02483968995511532, + "rewards/margins": 0.08517090976238251, + "rewards/margins_max": 0.11456477642059326, + "rewards/margins_min": 0.05577705428004265, + "rewards/margins_std": 0.041569195687770844, + "rewards/rejected": -0.06033121794462204, + "step": 1850 + }, + { + "epoch": 0.71, + "grad_norm": 0.26171875, + "learning_rate": 1.2005473132579407e-07, + "logits/chosen": -1.466604471206665, + "logits/rejected": -1.0293363332748413, + "logps/chosen": -217.72970581054688, + "logps/rejected": -234.6483154296875, + "loss": 0.652, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.021058417856693268, + "rewards/margins": 0.08528687059879303, + "rewards/margins_max": 0.11772221326828003, + "rewards/margins_min": 0.05285150930285454, + "rewards/margins_std": 0.04587051644921303, + "rewards/rejected": -0.06422845274209976, + "step": 1860 + }, + { + "epoch": 0.71, + "grad_norm": 0.306640625, + "learning_rate": 1.1723278193511322e-07, + "logits/chosen": -1.4631662368774414, + "logits/rejected": -1.0792559385299683, + "logps/chosen": -249.8877716064453, + "logps/rejected": -277.57452392578125, + "loss": 0.6463, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.016841616481542587, + "rewards/margins": 0.09164032340049744, + "rewards/margins_max": 0.1224864274263382, + "rewards/margins_min": 0.060794223099946976, + "rewards/margins_std": 0.04362296685576439, + "rewards/rejected": -0.07479871064424515, + "step": 1870 + }, + { + "epoch": 0.71, + "grad_norm": 0.279296875, + "learning_rate": 1.1443420050626623e-07, + "logits/chosen": -1.4762096405029297, + "logits/rejected": -1.2501494884490967, + "logps/chosen": -199.69052124023438, + "logps/rejected": -204.5517120361328, + "loss": 0.6513, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.01751863956451416, + "rewards/margins": 0.0781446248292923, + "rewards/margins_max": 0.11569873243570328, + "rewards/margins_min": 0.04059051349759102, + "rewards/margins_std": 0.05310952663421631, + "rewards/rejected": -0.06062598153948784, + "step": 1880 + }, + { + "epoch": 0.72, + "grad_norm": 0.353515625, + "learning_rate": 1.1165947960921868e-07, + "logits/chosen": -1.4795281887054443, + "logits/rejected": -1.089855670928955, + "logps/chosen": -247.3242645263672, + "logps/rejected": -246.82656860351562, + "loss": 0.6495, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.011459515430033207, + "rewards/margins": 0.0838027149438858, + "rewards/margins_max": 0.11774241924285889, + "rewards/margins_min": 0.04986302927136421, + "rewards/margins_std": 0.04799797758460045, + "rewards/rejected": -0.07234319299459457, + "step": 1890 + }, + { + "epoch": 0.72, + "grad_norm": 0.34765625, + "learning_rate": 1.0890910761431491e-07, + "logits/chosen": -1.3522553443908691, + "logits/rejected": -1.0552892684936523, + "logps/chosen": -207.72488403320312, + "logps/rejected": -233.9858856201172, + "loss": 0.65, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.020823100581765175, + "rewards/margins": 0.08793775737285614, + "rewards/margins_max": 0.12979640066623688, + "rewards/margins_min": 0.04607909545302391, + "rewards/margins_std": 0.059197068214416504, + "rewards/rejected": -0.06711465120315552, + "step": 1900 + }, + { + "epoch": 0.73, + "grad_norm": 0.2431640625, + "learning_rate": 1.0618356860632208e-07, + "logits/chosen": -1.486316442489624, + "logits/rejected": -1.1503719091415405, + "logps/chosen": -191.83619689941406, + "logps/rejected": -218.7319793701172, + "loss": 0.6508, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.029194846749305725, + "rewards/margins": 0.0852990448474884, + "rewards/margins_max": 0.11882505565881729, + "rewards/margins_min": 0.05177304893732071, + "rewards/margins_std": 0.047412920743227005, + "rewards/rejected": -0.05610420182347298, + "step": 1910 + }, + { + "epoch": 0.73, + "grad_norm": 0.3046875, + "learning_rate": 1.0348334229922676e-07, + "logits/chosen": -1.3859325647354126, + "logits/rejected": -1.1395026445388794, + "logps/chosen": -197.09481811523438, + "logps/rejected": -218.48251342773438, + "loss": 0.6476, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.0274784155189991, + "rewards/margins": 0.1017388105392456, + "rewards/margins_max": 0.14634063839912415, + "rewards/margins_min": 0.05713699012994766, + "rewards/margins_std": 0.06307649612426758, + "rewards/rejected": -0.07426039129495621, + "step": 1920 + }, + { + "epoch": 0.73, + "grad_norm": 0.265625, + "learning_rate": 1.0080890395180328e-07, + "logits/chosen": -1.4809856414794922, + "logits/rejected": -1.1530823707580566, + "logps/chosen": -207.95291137695312, + "logps/rejected": -221.2301788330078, + "loss": 0.6498, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.022588089108467102, + "rewards/margins": 0.09043975919485092, + "rewards/margins_max": 0.1265183687210083, + "rewards/margins_min": 0.05436115339398384, + "rewards/margins_std": 0.05102284997701645, + "rewards/rejected": -0.06785167008638382, + "step": 1930 + }, + { + "epoch": 0.74, + "grad_norm": 0.294921875, + "learning_rate": 9.816072428396374e-08, + "logits/chosen": -1.5341602563858032, + "logits/rejected": -1.2367621660232544, + "logps/chosen": -221.6543426513672, + "logps/rejected": -219.3003387451172, + "loss": 0.6499, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.017101837322115898, + "rewards/margins": 0.0833502784371376, + "rewards/margins_max": 0.11911274492740631, + "rewards/margins_min": 0.0475878044962883, + "rewards/margins_std": 0.050575774163007736, + "rewards/rejected": -0.06624843925237656, + "step": 1940 + }, + { + "epoch": 0.74, + "grad_norm": 0.30078125, + "learning_rate": 9.553926939390847e-08, + "logits/chosen": -1.348481297492981, + "logits/rejected": -1.092132329940796, + "logps/chosen": -180.2677001953125, + "logps/rejected": -206.4167938232422, + "loss": 0.6517, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.020975422114133835, + "rewards/margins": 0.08831629902124405, + "rewards/margins_max": 0.12501828372478485, + "rewards/margins_min": 0.05161432549357414, + "rewards/margins_std": 0.051904432475566864, + "rewards/rejected": -0.06734088063240051, + "step": 1950 + }, + { + "epoch": 0.74, + "grad_norm": 0.3046875, + "learning_rate": 9.29450006760894e-08, + "logits/chosen": -1.4164844751358032, + "logits/rejected": -1.0626654624938965, + "logps/chosen": -217.0990753173828, + "logps/rejected": -231.00747680664062, + "loss": 0.6462, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.025280630216002464, + "rewards/margins": 0.08944814652204514, + "rewards/margins_max": 0.12857092916965485, + "rewards/margins_min": 0.05032537132501602, + "rewards/margins_std": 0.05532795935869217, + "rewards/rejected": -0.06416751444339752, + "step": 1960 + }, + { + "epoch": 0.75, + "grad_norm": 0.31640625, + "learning_rate": 9.03783747400017e-08, + "logits/chosen": -1.5357427597045898, + "logits/rejected": -1.2725077867507935, + "logps/chosen": -204.5410919189453, + "logps/rejected": -243.62899780273438, + "loss": 0.6459, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.02568015828728676, + "rewards/margins": 0.10661699622869492, + "rewards/margins_max": 0.14371469616889954, + "rewards/margins_min": 0.0695192739367485, + "rewards/margins_std": 0.05246409773826599, + "rewards/rejected": -0.08093683421611786, + "step": 1970 + }, + { + "epoch": 0.75, + "grad_norm": 0.26171875, + "learning_rate": 8.783984332981648e-08, + "logits/chosen": -1.5041474103927612, + "logits/rejected": -1.130979299545288, + "logps/chosen": -194.86923217773438, + "logps/rejected": -220.365478515625, + "loss": 0.6489, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.024836743250489235, + "rewards/margins": 0.09214451909065247, + "rewards/margins_max": 0.12188988924026489, + "rewards/margins_min": 0.062399156391620636, + "rewards/margins_std": 0.0420663021504879, + "rewards/rejected": -0.06730777770280838, + "step": 1980 + }, + { + "epoch": 0.76, + "grad_norm": 0.3046875, + "learning_rate": 8.532985324487171e-08, + "logits/chosen": -1.4796479940414429, + "logits/rejected": -1.1508190631866455, + "logps/chosen": -184.7924346923828, + "logps/rejected": -212.42373657226562, + "loss": 0.6484, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.01795900985598564, + "rewards/margins": 0.09584374725818634, + "rewards/margins_max": 0.13568690419197083, + "rewards/margins_min": 0.05600060150027275, + "rewards/margins_std": 0.05634673312306404, + "rewards/rejected": -0.0778847485780716, + "step": 1990 + }, + { + "epoch": 0.76, + "grad_norm": 0.29296875, + "learning_rate": 8.284884626103164e-08, + "logits/chosen": -1.4941637516021729, + "logits/rejected": -1.2435134649276733, + "logps/chosen": -188.8251190185547, + "logps/rejected": -210.8880157470703, + "loss": 0.6503, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.02011449821293354, + "rewards/margins": 0.08227307349443436, + "rewards/margins_max": 0.11268408596515656, + "rewards/margins_min": 0.05186206102371216, + "rewards/margins_std": 0.04300766438245773, + "rewards/rejected": -0.06215857341885567, + "step": 2000 + }, + { + "epoch": 0.76, + "grad_norm": 0.2890625, + "learning_rate": 8.039725905293138e-08, + "logits/chosen": -1.3546192646026611, + "logits/rejected": -1.0556727647781372, + "logps/chosen": -189.37301635742188, + "logps/rejected": -232.6674041748047, + "loss": 0.6513, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.02390899695456028, + "rewards/margins": 0.0961272269487381, + "rewards/margins_max": 0.14148414134979248, + "rewards/margins_min": 0.05077032372355461, + "rewards/margins_std": 0.06414436548948288, + "rewards/rejected": -0.07221823185682297, + "step": 2010 + }, + { + "epoch": 0.77, + "grad_norm": 0.30078125, + "learning_rate": 7.797552311711905e-08, + "logits/chosen": -1.5595532655715942, + "logits/rejected": -1.205398678779602, + "logps/chosen": -210.8866729736328, + "logps/rejected": -214.4613800048828, + "loss": 0.652, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.01619151420891285, + "rewards/margins": 0.07569371163845062, + "rewards/margins_max": 0.11168579757213593, + "rewards/margins_min": 0.039701610803604126, + "rewards/margins_std": 0.05090050771832466, + "rewards/rejected": -0.059502195566892624, + "step": 2020 + }, + { + "epoch": 0.77, + "grad_norm": 0.359375, + "learning_rate": 7.558406469610981e-08, + "logits/chosen": -1.479150414466858, + "logits/rejected": -1.1994903087615967, + "logps/chosen": -203.69873046875, + "logps/rejected": -213.0478973388672, + "loss": 0.6519, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.01320022065192461, + "rewards/margins": 0.08063283562660217, + "rewards/margins_max": 0.12000874429941177, + "rewards/margins_min": 0.04125692695379257, + "rewards/margins_std": 0.055685948580503464, + "rewards/rejected": -0.06743261963129044, + "step": 2030 + }, + { + "epoch": 0.78, + "grad_norm": 0.328125, + "learning_rate": 7.322330470336313e-08, + "logits/chosen": -1.5190201997756958, + "logits/rejected": -1.1927928924560547, + "logps/chosen": -202.38818359375, + "logps/rejected": -222.53903198242188, + "loss": 0.6523, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.016741709783673286, + "rewards/margins": 0.08067712187767029, + "rewards/margins_max": 0.11362887918949127, + "rewards/margins_min": 0.047725364565849304, + "rewards/margins_std": 0.0466008223593235, + "rewards/rejected": -0.06393541395664215, + "step": 2040 + }, + { + "epoch": 0.78, + "grad_norm": 0.32421875, + "learning_rate": 7.08936586492003e-08, + "logits/chosen": -1.3734339475631714, + "logits/rejected": -1.0133839845657349, + "logps/chosen": -206.0966339111328, + "logps/rejected": -218.1828155517578, + "loss": 0.6502, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.014113351702690125, + "rewards/margins": 0.08260734379291534, + "rewards/margins_max": 0.12146137654781342, + "rewards/margins_min": 0.043753307312726974, + "rewards/margins_std": 0.054947901517152786, + "rewards/rejected": -0.06849398463964462, + "step": 2050 + }, + { + "epoch": 0.78, + "grad_norm": 0.345703125, + "learning_rate": 6.859553656767112e-08, + "logits/chosen": -1.3706634044647217, + "logits/rejected": -1.0720088481903076, + "logps/chosen": -220.724853515625, + "logps/rejected": -234.53756713867188, + "loss": 0.6471, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.017995189875364304, + "rewards/margins": 0.0858609676361084, + "rewards/margins_max": 0.13481369614601135, + "rewards/margins_min": 0.03690824285149574, + "rewards/margins_std": 0.0692296102643013, + "rewards/rejected": -0.0678657740354538, + "step": 2060 + }, + { + "epoch": 0.79, + "grad_norm": 0.3125, + "learning_rate": 6.63293429443845e-08, + "logits/chosen": -1.484061360359192, + "logits/rejected": -1.1830289363861084, + "logps/chosen": -224.13644409179688, + "logps/rejected": -251.21792602539062, + "loss": 0.6527, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.020492028445005417, + "rewards/margins": 0.0933627113699913, + "rewards/margins_max": 0.1307709515094757, + "rewards/margins_min": 0.055954478681087494, + "rewards/margins_std": 0.052903227508068085, + "rewards/rejected": -0.07287068665027618, + "step": 2070 + }, + { + "epoch": 0.79, + "grad_norm": 0.375, + "learning_rate": 6.409547664531733e-08, + "logits/chosen": -1.392897367477417, + "logits/rejected": -1.0433756113052368, + "logps/chosen": -233.62039184570312, + "logps/rejected": -221.968994140625, + "loss": 0.6485, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.017711572349071503, + "rewards/margins": 0.09063192456960678, + "rewards/margins_max": 0.1303481161594391, + "rewards/margins_min": 0.05091572925448418, + "rewards/margins_std": 0.05616719275712967, + "rewards/rejected": -0.07292035222053528, + "step": 2080 + }, + { + "epoch": 0.79, + "grad_norm": 0.287109375, + "learning_rate": 6.189433084661031e-08, + "logits/chosen": -1.441162347793579, + "logits/rejected": -1.0751087665557861, + "logps/chosen": -211.349365234375, + "logps/rejected": -226.0006103515625, + "loss": 0.6491, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.017282210290431976, + "rewards/margins": 0.0929594486951828, + "rewards/margins_max": 0.13247843086719513, + "rewards/margins_min": 0.05344045162200928, + "rewards/margins_std": 0.05588828772306442, + "rewards/rejected": -0.07567723095417023, + "step": 2090 + }, + { + "epoch": 0.8, + "grad_norm": 0.302734375, + "learning_rate": 5.972629296536655e-08, + "logits/chosen": -1.398723840713501, + "logits/rejected": -1.0770161151885986, + "logps/chosen": -208.3170928955078, + "logps/rejected": -220.9575653076172, + "loss": 0.6522, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.021888596937060356, + "rewards/margins": 0.08464206010103226, + "rewards/margins_max": 0.12029530853033066, + "rewards/margins_min": 0.04898880049586296, + "rewards/margins_std": 0.05042130872607231, + "rewards/rejected": -0.06275346130132675, + "step": 2100 + }, + { + "epoch": 0.8, + "grad_norm": 0.314453125, + "learning_rate": 5.7591744591463375e-08, + "logits/chosen": -1.4472309350967407, + "logits/rejected": -1.021103858947754, + "logps/chosen": -226.20626831054688, + "logps/rejected": -214.0725860595703, + "loss": 0.6477, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.017262738198041916, + "rewards/margins": 0.09329278767108917, + "rewards/margins_max": 0.13951919972896576, + "rewards/margins_min": 0.04706636443734169, + "rewards/margins_std": 0.06537402421236038, + "rewards/rejected": -0.07603004574775696, + "step": 2110 + }, + { + "epoch": 0.81, + "grad_norm": 0.314453125, + "learning_rate": 5.5491061420390174e-08, + "logits/chosen": -1.461857557296753, + "logits/rejected": -1.0689175128936768, + "logps/chosen": -231.24649047851562, + "logps/rejected": -246.7604217529297, + "loss": 0.6547, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.019667720422148705, + "rewards/margins": 0.08514972031116486, + "rewards/margins_max": 0.12298393249511719, + "rewards/margins_min": 0.04731552302837372, + "rewards/margins_std": 0.05350564047694206, + "rewards/rejected": -0.0654820054769516, + "step": 2120 + }, + { + "epoch": 0.81, + "grad_norm": 0.271484375, + "learning_rate": 5.342461318712252e-08, + "logits/chosen": -1.4466235637664795, + "logits/rejected": -1.14009690284729, + "logps/chosen": -176.4794158935547, + "logps/rejected": -235.32144165039062, + "loss": 0.6488, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.018947120755910873, + "rewards/margins": 0.09472303092479706, + "rewards/margins_max": 0.13757546246051788, + "rewards/margins_min": 0.05187060683965683, + "rewards/margins_std": 0.060602474957704544, + "rewards/rejected": -0.07577590644359589, + "step": 2130 + }, + { + "epoch": 0.81, + "grad_norm": 0.365234375, + "learning_rate": 5.1392763601047244e-08, + "logits/chosen": -1.4163846969604492, + "logits/rejected": -1.0834262371063232, + "logps/chosen": -187.30274963378906, + "logps/rejected": -220.01986694335938, + "loss": 0.645, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.01945066824555397, + "rewards/margins": 0.09664531797170639, + "rewards/margins_max": 0.13591603934764862, + "rewards/margins_min": 0.057374607771635056, + "rewards/margins_std": 0.05553717538714409, + "rewards/rejected": -0.07719465345144272, + "step": 2140 + }, + { + "epoch": 0.82, + "grad_norm": 0.326171875, + "learning_rate": 4.939587028194625e-08, + "logits/chosen": -1.4588258266448975, + "logits/rejected": -1.075309157371521, + "logps/chosen": -231.6609344482422, + "logps/rejected": -194.46878051757812, + "loss": 0.6523, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.013223061338067055, + "rewards/margins": 0.08043862879276276, + "rewards/margins_max": 0.11344832181930542, + "rewards/margins_min": 0.0474289245903492, + "rewards/margins_std": 0.04668276757001877, + "rewards/rejected": -0.06721556931734085, + "step": 2150 + }, + { + "epoch": 0.82, + "grad_norm": 0.345703125, + "learning_rate": 4.743428469705335e-08, + "logits/chosen": -1.4122099876403809, + "logits/rejected": -1.1665161848068237, + "logps/chosen": -210.12646484375, + "logps/rejected": -255.7862091064453, + "loss": 0.6507, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.015928715467453003, + "rewards/margins": 0.08897783607244492, + "rewards/margins_max": 0.12421097606420517, + "rewards/margins_min": 0.053744666278362274, + "rewards/margins_std": 0.0498272180557251, + "rewards/rejected": -0.07304911315441132, + "step": 2160 + }, + { + "epoch": 0.82, + "grad_norm": 0.322265625, + "learning_rate": 4.550835209919326e-08, + "logits/chosen": -1.3729497194290161, + "logits/rejected": -1.0899299383163452, + "logps/chosen": -202.81869506835938, + "logps/rejected": -222.53366088867188, + "loss": 0.6513, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.021017426624894142, + "rewards/margins": 0.0997917577624321, + "rewards/margins_max": 0.1454765796661377, + "rewards/margins_min": 0.0541069433093071, + "rewards/margins_std": 0.06460809707641602, + "rewards/rejected": -0.0787743479013443, + "step": 2170 + }, + { + "epoch": 0.83, + "grad_norm": 0.33984375, + "learning_rate": 4.361841146601516e-08, + "logits/chosen": -1.3598549365997314, + "logits/rejected": -1.1428276300430298, + "logps/chosen": -225.26675415039062, + "logps/rejected": -262.83294677734375, + "loss": 0.647, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.02677268348634243, + "rewards/margins": 0.10582437366247177, + "rewards/margins_max": 0.157110795378685, + "rewards/margins_min": 0.05453797057271004, + "rewards/margins_std": 0.07252994179725647, + "rewards/rejected": -0.07905169576406479, + "step": 2180 + }, + { + "epoch": 0.83, + "grad_norm": 0.30078125, + "learning_rate": 4.1764795440329516e-08, + "logits/chosen": -1.4309017658233643, + "logits/rejected": -1.1507847309112549, + "logps/chosen": -207.42568969726562, + "logps/rejected": -230.8451385498047, + "loss": 0.6523, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.01876028999686241, + "rewards/margins": 0.08968784660100937, + "rewards/margins_max": 0.12752839922904968, + "rewards/margins_min": 0.05184728652238846, + "rewards/margins_std": 0.05351463705301285, + "rewards/rejected": -0.07092756032943726, + "step": 2190 + }, + { + "epoch": 0.84, + "grad_norm": 0.376953125, + "learning_rate": 3.994783027156143e-08, + "logits/chosen": -1.5498156547546387, + "logits/rejected": -1.1738865375518799, + "logps/chosen": -235.3194122314453, + "logps/rejected": -264.9289855957031, + "loss": 0.652, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.020533021539449692, + "rewards/margins": 0.09463083744049072, + "rewards/margins_max": 0.1284172236919403, + "rewards/margins_min": 0.06084446236491203, + "rewards/margins_std": 0.04778115078806877, + "rewards/rejected": -0.07409781217575073, + "step": 2200 + }, + { + "epoch": 0.84, + "grad_norm": 0.3203125, + "learning_rate": 3.81678357583278e-08, + "logits/chosen": -1.387663722038269, + "logits/rejected": -1.0362415313720703, + "logps/chosen": -201.65501403808594, + "logps/rejected": -223.9352569580078, + "loss": 0.6507, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.019265640527009964, + "rewards/margins": 0.08608859777450562, + "rewards/margins_max": 0.11621971428394318, + "rewards/margins_min": 0.055957477539777756, + "rewards/margins_std": 0.04261183738708496, + "rewards/rejected": -0.06682296097278595, + "step": 2210 + }, + { + "epoch": 0.84, + "grad_norm": 0.30859375, + "learning_rate": 3.6425125192150854e-08, + "logits/chosen": -1.3626813888549805, + "logits/rejected": -1.0685607194900513, + "logps/chosen": -202.44815063476562, + "logps/rejected": -236.275146484375, + "loss": 0.6504, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.022014940157532692, + "rewards/margins": 0.10008475929498672, + "rewards/margins_max": 0.1432458460330963, + "rewards/margins_min": 0.05692365765571594, + "rewards/margins_std": 0.061039019376039505, + "rewards/rejected": -0.07806982100009918, + "step": 2220 + }, + { + "epoch": 0.85, + "grad_norm": 0.26953125, + "learning_rate": 3.4720005302316555e-08, + "logits/chosen": -1.4205926656723022, + "logits/rejected": -1.059009313583374, + "logps/chosen": -204.2139434814453, + "logps/rejected": -223.8666534423828, + "loss": 0.6446, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.025852352380752563, + "rewards/margins": 0.10249295085668564, + "rewards/margins_max": 0.1471790224313736, + "rewards/margins_min": 0.057806871831417084, + "rewards/margins_std": 0.06319564580917358, + "rewards/rejected": -0.07664059847593307, + "step": 2230 + }, + { + "epoch": 0.85, + "grad_norm": 0.294921875, + "learning_rate": 3.305277620188826e-08, + "logits/chosen": -1.4504072666168213, + "logits/rejected": -1.0379372835159302, + "logps/chosen": -191.2012939453125, + "logps/rejected": -193.94325256347656, + "loss": 0.6515, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.022001946344971657, + "rewards/margins": 0.08406315743923187, + "rewards/margins_max": 0.12489868700504303, + "rewards/margins_min": 0.04322761669754982, + "rewards/margins_std": 0.057750165462493896, + "rewards/rejected": -0.06206120178103447, + "step": 2240 + }, + { + "epoch": 0.85, + "grad_norm": 0.330078125, + "learning_rate": 3.142373133488416e-08, + "logits/chosen": -1.4936898946762085, + "logits/rejected": -1.0924030542373657, + "logps/chosen": -209.015380859375, + "logps/rejected": -224.1560821533203, + "loss": 0.6471, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.02342359907925129, + "rewards/margins": 0.0905032753944397, + "rewards/margins_max": 0.12563428282737732, + "rewards/margins_min": 0.05537227541208267, + "rewards/margins_std": 0.04968274012207985, + "rewards/rejected": -0.06707967817783356, + "step": 2250 + }, + { + "epoch": 0.86, + "grad_norm": 0.302734375, + "learning_rate": 2.9833157424629965e-08, + "logits/chosen": -1.486553430557251, + "logits/rejected": -1.2344576120376587, + "logps/chosen": -190.970703125, + "logps/rejected": -235.8113555908203, + "loss": 0.6472, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.025419479236006737, + "rewards/margins": 0.10403518378734589, + "rewards/margins_max": 0.1522940993309021, + "rewards/margins_min": 0.05577626824378967, + "rewards/margins_std": 0.06824841350317001, + "rewards/rejected": -0.078615702688694, + "step": 2260 + }, + { + "epoch": 0.86, + "grad_norm": 0.27734375, + "learning_rate": 2.8281334423292752e-08, + "logits/chosen": -1.405575156211853, + "logits/rejected": -1.1565072536468506, + "logps/chosen": -184.67291259765625, + "logps/rejected": -215.6078338623047, + "loss": 0.6489, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.022038763388991356, + "rewards/margins": 0.08945713192224503, + "rewards/margins_max": 0.12467725574970245, + "rewards/margins_min": 0.05423703044652939, + "rewards/margins_std": 0.049808748066425323, + "rewards/rejected": -0.06741837412118912, + "step": 2270 + }, + { + "epoch": 0.87, + "grad_norm": 0.240234375, + "learning_rate": 2.6768535462607905e-08, + "logits/chosen": -1.405057668685913, + "logits/rejected": -1.0886411666870117, + "logps/chosen": -207.4287109375, + "logps/rejected": -224.9873809814453, + "loss": 0.6502, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.021886788308620453, + "rewards/margins": 0.09229324758052826, + "rewards/margins_max": 0.13078710436820984, + "rewards/margins_min": 0.053799428045749664, + "rewards/margins_std": 0.054438501596450806, + "rewards/rejected": -0.0704064816236496, + "step": 2280 + }, + { + "epoch": 0.87, + "grad_norm": 0.3046875, + "learning_rate": 2.529502680580578e-08, + "logits/chosen": -1.4395860433578491, + "logits/rejected": -1.1944835186004639, + "logps/chosen": -197.48228454589844, + "logps/rejected": -224.9473419189453, + "loss": 0.6498, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.017150208353996277, + "rewards/margins": 0.0919913500547409, + "rewards/margins_max": 0.13415281474590302, + "rewards/margins_min": 0.049829889088869095, + "rewards/margins_std": 0.05962531641125679, + "rewards/rejected": -0.07484114170074463, + "step": 2290 + }, + { + "epoch": 0.87, + "grad_norm": 0.296875, + "learning_rate": 2.386106780074784e-08, + "logits/chosen": -1.432604432106018, + "logits/rejected": -1.134526014328003, + "logps/chosen": -207.81283569335938, + "logps/rejected": -234.94784545898438, + "loss": 0.6503, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.022798217833042145, + "rewards/margins": 0.09147022664546967, + "rewards/margins_max": 0.13085611164569855, + "rewards/margins_min": 0.05208434537053108, + "rewards/margins_std": 0.055700045078992844, + "rewards/rejected": -0.06867200881242752, + "step": 2300 + }, + { + "epoch": 0.88, + "grad_norm": 0.318359375, + "learning_rate": 2.2466910834278957e-08, + "logits/chosen": -1.4143720865249634, + "logits/rejected": -1.069481611251831, + "logps/chosen": -207.4478759765625, + "logps/rejected": -218.31716918945312, + "loss": 0.6489, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.021387558430433273, + "rewards/margins": 0.09235554933547974, + "rewards/margins_max": 0.13592186570167542, + "rewards/margins_min": 0.048789240419864655, + "rewards/margins_std": 0.0616120770573616, + "rewards/rejected": -0.07096799463033676, + "step": 2310 + }, + { + "epoch": 0.88, + "grad_norm": 0.337890625, + "learning_rate": 2.1112801287806375e-08, + "logits/chosen": -1.3332234621047974, + "logits/rejected": -1.115999698638916, + "logps/chosen": -195.05917358398438, + "logps/rejected": -202.2626495361328, + "loss": 0.6477, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.01643414981663227, + "rewards/margins": 0.0895700454711914, + "rewards/margins_max": 0.12620273232460022, + "rewards/margins_min": 0.0529373399913311, + "rewards/margins_std": 0.051806457340717316, + "rewards/rejected": -0.07313589006662369, + "step": 2320 + }, + { + "epoch": 0.89, + "grad_norm": 0.302734375, + "learning_rate": 1.9798977494110274e-08, + "logits/chosen": -1.473491907119751, + "logits/rejected": -1.1137970685958862, + "logps/chosen": -213.4473114013672, + "logps/rejected": -226.2127685546875, + "loss": 0.65, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.024023082107305527, + "rewards/margins": 0.09007260203361511, + "rewards/margins_max": 0.12320313602685928, + "rewards/margins_min": 0.056942082941532135, + "rewards/margins_std": 0.04685363173484802, + "rewards/rejected": -0.06604952365159988, + "step": 2330 + }, + { + "epoch": 0.89, + "grad_norm": 0.33203125, + "learning_rate": 1.852567069539568e-08, + "logits/chosen": -1.4035086631774902, + "logits/rejected": -1.0367395877838135, + "logps/chosen": -222.76388549804688, + "logps/rejected": -218.81900024414062, + "loss": 0.6472, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.01737220212817192, + "rewards/margins": 0.09154538065195084, + "rewards/margins_max": 0.12961694598197937, + "rewards/margins_min": 0.053473830223083496, + "rewards/margins_std": 0.05384131520986557, + "rewards/rejected": -0.07417318224906921, + "step": 2340 + }, + { + "epoch": 0.89, + "grad_norm": 0.345703125, + "learning_rate": 1.729310500259229e-08, + "logits/chosen": -1.438696265220642, + "logits/rejected": -1.103390097618103, + "logps/chosen": -204.58889770507812, + "logps/rejected": -197.52224731445312, + "loss": 0.6518, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.016056453809142113, + "rewards/margins": 0.08784165978431702, + "rewards/margins_max": 0.13728336989879608, + "rewards/margins_min": 0.03839995712041855, + "rewards/margins_std": 0.06992112845182419, + "rewards/rejected": -0.07178520411252975, + "step": 2350 + }, + { + "epoch": 0.9, + "grad_norm": 0.298828125, + "learning_rate": 1.610149735590949e-08, + "logits/chosen": -1.49490225315094, + "logits/rejected": -1.0492806434631348, + "logps/chosen": -233.8212432861328, + "logps/rejected": -238.4543914794922, + "loss": 0.6482, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.01798924431204796, + "rewards/margins": 0.08889796584844589, + "rewards/margins_max": 0.12274952232837677, + "rewards/margins_min": 0.05504639074206352, + "rewards/margins_std": 0.04787334054708481, + "rewards/rejected": -0.07090871036052704, + "step": 2360 + }, + { + "epoch": 0.9, + "grad_norm": 0.376953125, + "learning_rate": 1.4951057486652845e-08, + "logits/chosen": -1.4752815961837769, + "logits/rejected": -1.0967390537261963, + "logps/chosen": -215.08462524414062, + "logps/rejected": -239.4650115966797, + "loss": 0.6474, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.016311800107359886, + "rewards/margins": 0.09782110154628754, + "rewards/margins_max": 0.14107748866081238, + "rewards/margins_min": 0.0545647032558918, + "rewards/margins_std": 0.06117378547787666, + "rewards/rejected": -0.0815092995762825, + "step": 2370 + }, + { + "epoch": 0.9, + "grad_norm": 0.27734375, + "learning_rate": 1.384198788031063e-08, + "logits/chosen": -1.40728759765625, + "logits/rejected": -1.0882267951965332, + "logps/chosen": -215.80819702148438, + "logps/rejected": -219.1876678466797, + "loss": 0.655, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.013580176047980785, + "rewards/margins": 0.08551829308271408, + "rewards/margins_max": 0.13581404089927673, + "rewards/margins_min": 0.035222552716732025, + "rewards/margins_std": 0.07112891972064972, + "rewards/rejected": -0.07193811982870102, + "step": 2380 + }, + { + "epoch": 0.91, + "grad_norm": 0.318359375, + "learning_rate": 1.2774483740914416e-08, + "logits/chosen": -1.4530203342437744, + "logits/rejected": -1.204655408859253, + "logps/chosen": -201.9489288330078, + "logps/rejected": -252.07650756835938, + "loss": 0.6512, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.02443147823214531, + "rewards/margins": 0.08860547840595245, + "rewards/margins_max": 0.12207148969173431, + "rewards/margins_min": 0.05513947084546089, + "rewards/margins_std": 0.04732808098196983, + "rewards/rejected": -0.06417400389909744, + "step": 2390 + }, + { + "epoch": 0.91, + "grad_norm": 0.271484375, + "learning_rate": 1.1748732956682023e-08, + "logits/chosen": -1.3249019384384155, + "logits/rejected": -1.0664135217666626, + "logps/chosen": -185.0057373046875, + "logps/rejected": -204.3482208251953, + "loss": 0.6547, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.018103353679180145, + "rewards/margins": 0.08620314300060272, + "rewards/margins_max": 0.1258658468723297, + "rewards/margins_min": 0.04654044285416603, + "rewards/margins_std": 0.056091535836458206, + "rewards/rejected": -0.06809979677200317, + "step": 2400 + }, + { + "epoch": 0.92, + "grad_norm": 0.33203125, + "learning_rate": 1.0764916066947794e-08, + "logits/chosen": -1.5028488636016846, + "logits/rejected": -1.139801025390625, + "logps/chosen": -216.3588104248047, + "logps/rejected": -209.8426055908203, + "loss": 0.6507, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.012566794641315937, + "rewards/margins": 0.08007965236902237, + "rewards/margins_max": 0.11527623981237411, + "rewards/margins_min": 0.04488305002450943, + "rewards/margins_std": 0.04977550357580185, + "rewards/rejected": -0.06751285493373871, + "step": 2410 + }, + { + "epoch": 0.92, + "grad_norm": 0.388671875, + "learning_rate": 9.823206230386515e-09, + "logits/chosen": -1.4211446046829224, + "logits/rejected": -1.1179053783416748, + "logps/chosen": -196.47525024414062, + "logps/rejected": -217.45126342773438, + "loss": 0.6468, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.019422296434640884, + "rewards/margins": 0.10285399109125137, + "rewards/margins_max": 0.14698545634746552, + "rewards/margins_min": 0.058722518384456635, + "rewards/margins_std": 0.062411319464445114, + "rewards/rejected": -0.0834316834807396, + "step": 2420 + }, + { + "epoch": 0.92, + "grad_norm": 0.412109375, + "learning_rate": 8.923769194536218e-09, + "logits/chosen": -1.3940680027008057, + "logits/rejected": -1.283018708229065, + "logps/chosen": -179.62240600585938, + "logps/rejected": -204.8430938720703, + "loss": 0.652, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.018096450716257095, + "rewards/margins": 0.07983745634555817, + "rewards/margins_max": 0.11743442714214325, + "rewards/margins_min": 0.04224049299955368, + "rewards/margins_std": 0.053170137107372284, + "rewards/rejected": -0.06174100562930107, + "step": 2430 + }, + { + "epoch": 0.93, + "grad_norm": 0.3203125, + "learning_rate": 8.066763266625282e-09, + "logits/chosen": -1.453731894493103, + "logits/rejected": -1.2243844270706177, + "logps/chosen": -181.93214416503906, + "logps/rejected": -204.25790405273438, + "loss": 0.6461, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.019669560715556145, + "rewards/margins": 0.09502781927585602, + "rewards/margins_max": 0.13680796325206757, + "rewards/margins_min": 0.05324765294790268, + "rewards/margins_std": 0.059086065739393234, + "rewards/rejected": -0.07535825669765472, + "step": 2440 + }, + { + "epoch": 0.93, + "grad_norm": 0.330078125, + "learning_rate": 7.252339285709619e-09, + "logits/chosen": -1.5105646848678589, + "logits/rejected": -1.1885838508605957, + "logps/chosen": -191.6083984375, + "logps/rejected": -217.36001586914062, + "loss": 0.6523, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.025863632559776306, + "rewards/margins": 0.08741247653961182, + "rewards/margins_max": 0.12167079746723175, + "rewards/margins_min": 0.05315415933728218, + "rewards/margins_std": 0.048448581248521805, + "rewards/rejected": -0.06154884770512581, + "step": 2450 + }, + { + "epoch": 0.93, + "grad_norm": 0.296875, + "learning_rate": 6.480640596123549e-09, + "logits/chosen": -1.3854317665100098, + "logits/rejected": -1.246504306793213, + "logps/chosen": -206.4389190673828, + "logps/rejected": -241.0319061279297, + "loss": 0.6471, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.02236495353281498, + "rewards/margins": 0.09376935660839081, + "rewards/margins_max": 0.1314789205789566, + "rewards/margins_min": 0.056059788912534714, + "rewards/margins_std": 0.053329385817050934, + "rewards/rejected": -0.07140441238880157, + "step": 2460 + }, + { + "epoch": 0.94, + "grad_norm": 0.3046875, + "learning_rate": 5.751803022250479e-09, + "logits/chosen": -1.5128895044326782, + "logits/rejected": -1.1328929662704468, + "logps/chosen": -252.16366577148438, + "logps/rejected": -239.92733764648438, + "loss": 0.6499, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.014960886910557747, + "rewards/margins": 0.07866047322750092, + "rewards/margins_max": 0.11670245975255966, + "rewards/margins_min": 0.04061848670244217, + "rewards/margins_std": 0.053799498826265335, + "rewards/rejected": -0.06369959563016891, + "step": 2470 + }, + { + "epoch": 0.94, + "grad_norm": 0.3125, + "learning_rate": 5.065954844616721e-09, + "logits/chosen": -1.308065414428711, + "logits/rejected": -1.095858097076416, + "logps/chosen": -185.49024963378906, + "logps/rejected": -205.8270263671875, + "loss": 0.6528, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.02121925540268421, + "rewards/margins": 0.09081675857305527, + "rewards/margins_max": 0.12591932713985443, + "rewards/margins_min": 0.0557141974568367, + "rewards/margins_std": 0.04964253306388855, + "rewards/rejected": -0.0695975124835968, + "step": 2480 + }, + { + "epoch": 0.95, + "grad_norm": 0.34375, + "learning_rate": 4.4232167773132215e-09, + "logits/chosen": -1.3658572435379028, + "logits/rejected": -1.1204966306686401, + "logps/chosen": -187.7608642578125, + "logps/rejected": -222.4073486328125, + "loss": 0.6518, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.014262977056205273, + "rewards/margins": 0.08244883269071579, + "rewards/margins_max": 0.11362477391958237, + "rewards/margins_min": 0.051272887736558914, + "rewards/margins_std": 0.04408944398164749, + "rewards/rejected": -0.06818585842847824, + "step": 2490 + }, + { + "epoch": 0.95, + "grad_norm": 0.3125, + "learning_rate": 3.823701946749053e-09, + "logits/chosen": -1.4690792560577393, + "logits/rejected": -1.1181033849716187, + "logps/chosen": -185.13882446289062, + "logps/rejected": -213.7282257080078, + "loss": 0.649, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.02593539096415043, + "rewards/margins": 0.08635548502206802, + "rewards/margins_max": 0.12079595029354095, + "rewards/margins_min": 0.0519150085747242, + "rewards/margins_std": 0.04870619252324104, + "rewards/rejected": -0.06042008846998215, + "step": 2500 + }, + { + "epoch": 0.95, + "grad_norm": 0.318359375, + "learning_rate": 3.267515871740484e-09, + "logits/chosen": -1.426261067390442, + "logits/rejected": -1.221677541732788, + "logps/chosen": -185.28851318359375, + "logps/rejected": -215.24221801757812, + "loss": 0.6442, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.018279213458299637, + "rewards/margins": 0.10030057281255722, + "rewards/margins_max": 0.13936129212379456, + "rewards/margins_min": 0.06123984977602959, + "rewards/margins_std": 0.055240195244550705, + "rewards/rejected": -0.08202135562896729, + "step": 2510 + }, + { + "epoch": 0.96, + "grad_norm": 0.30078125, + "learning_rate": 2.754756444938666e-09, + "logits/chosen": -1.5783092975616455, + "logits/rejected": -1.1907222270965576, + "logps/chosen": -210.3064727783203, + "logps/rejected": -241.8881378173828, + "loss": 0.6526, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.016610082238912582, + "rewards/margins": 0.08524598181247711, + "rewards/margins_max": 0.12586773931980133, + "rewards/margins_min": 0.04462422430515289, + "rewards/margins_std": 0.05744783952832222, + "rewards/rejected": -0.06863589584827423, + "step": 2520 + }, + { + "epoch": 0.96, + "grad_norm": 0.28515625, + "learning_rate": 2.285513915600168e-09, + "logits/chosen": -1.398267388343811, + "logits/rejected": -1.142956018447876, + "logps/chosen": -180.8777618408203, + "logps/rejected": -212.1314239501953, + "loss": 0.652, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.019063105806708336, + "rewards/margins": 0.08323358744382858, + "rewards/margins_max": 0.11851847171783447, + "rewards/margins_min": 0.04794871434569359, + "rewards/margins_std": 0.0499003566801548, + "rewards/rejected": -0.0641704872250557, + "step": 2530 + }, + { + "epoch": 0.97, + "grad_norm": 0.322265625, + "learning_rate": 1.859870873702124e-09, + "logits/chosen": -1.4865785837173462, + "logits/rejected": -1.0758774280548096, + "logps/chosen": -229.53744506835938, + "logps/rejected": -225.7979736328125, + "loss": 0.6474, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.027804816141724586, + "rewards/margins": 0.10115760564804077, + "rewards/margins_max": 0.1517588049173355, + "rewards/margins_min": 0.05055641010403633, + "rewards/margins_std": 0.07156090438365936, + "rewards/rejected": -0.07335279136896133, + "step": 2540 + }, + { + "epoch": 0.97, + "grad_norm": 0.326171875, + "learning_rate": 1.4779022354061698e-09, + "logits/chosen": -1.4936548471450806, + "logits/rejected": -1.1473426818847656, + "logps/chosen": -198.7931671142578, + "logps/rejected": -211.90811157226562, + "loss": 0.6559, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 0.017037371173501015, + "rewards/margins": 0.07735893130302429, + "rewards/margins_max": 0.11480559408664703, + "rewards/margins_min": 0.039912257343530655, + "rewards/margins_std": 0.05295759439468384, + "rewards/rejected": -0.06032155826687813, + "step": 2550 + }, + { + "epoch": 0.97, + "grad_norm": 0.330078125, + "learning_rate": 1.1396752298723499e-09, + "logits/chosen": -1.4866702556610107, + "logits/rejected": -1.0954639911651611, + "logps/chosen": -204.0499725341797, + "logps/rejected": -212.27188110351562, + "loss": 0.6457, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.017862681299448013, + "rewards/margins": 0.09754244983196259, + "rewards/margins_max": 0.1323915421962738, + "rewards/margins_min": 0.06269336491823196, + "rewards/margins_std": 0.04928405210375786, + "rewards/rejected": -0.07967977225780487, + "step": 2560 + }, + { + "epoch": 0.98, + "grad_norm": 0.271484375, + "learning_rate": 8.452493874266108e-10, + "logits/chosen": -1.322525978088379, + "logits/rejected": -1.0981214046478271, + "logps/chosen": -210.36325073242188, + "logps/rejected": -231.64987182617188, + "loss": 0.6511, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.017954757437109947, + "rewards/margins": 0.08609770238399506, + "rewards/margins_max": 0.12132489681243896, + "rewards/margins_min": 0.050870515406131744, + "rewards/margins_std": 0.04981876164674759, + "rewards/rejected": -0.06814294308423996, + "step": 2570 + }, + { + "epoch": 0.98, + "grad_norm": 0.287109375, + "learning_rate": 5.946765290827383e-10, + "logits/chosen": -1.3004374504089355, + "logits/rejected": -1.0841004848480225, + "logps/chosen": -201.1930389404297, + "logps/rejected": -225.4813690185547, + "loss": 0.6504, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.020434178411960602, + "rewards/margins": 0.08689124882221222, + "rewards/margins_max": 0.1344389021396637, + "rewards/margins_min": 0.039343591779470444, + "rewards/margins_std": 0.06724254041910172, + "rewards/rejected": -0.06645707041025162, + "step": 2580 + }, + { + "epoch": 0.98, + "grad_norm": 0.3125, + "learning_rate": 3.880007574218469e-10, + "logits/chosen": -1.4542973041534424, + "logits/rejected": -1.1403675079345703, + "logps/chosen": -185.40380859375, + "logps/rejected": -219.89501953125, + "loss": 0.6496, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.028258394449949265, + "rewards/margins": 0.09239298850297928, + "rewards/margins_max": 0.1326712816953659, + "rewards/margins_min": 0.05211470276117325, + "rewards/margins_std": 0.05696210265159607, + "rewards/rejected": -0.06413459777832031, + "step": 2590 + }, + { + "epoch": 0.99, + "grad_norm": 0.34375, + "learning_rate": 2.2525844882964606e-10, + "logits/chosen": -1.496821403503418, + "logits/rejected": -1.1975185871124268, + "logps/chosen": -213.3788604736328, + "logps/rejected": -249.6721649169922, + "loss": 0.6505, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.022496605291962624, + "rewards/margins": 0.08701016753911972, + "rewards/margins_max": 0.12534096837043762, + "rewards/margins_min": 0.04867938160896301, + "rewards/margins_std": 0.05420792102813721, + "rewards/rejected": -0.06451357156038284, + "step": 2600 + }, + { + "epoch": 0.99, + "grad_norm": 0.29296875, + "learning_rate": 1.0647824709419939e-10, + "logits/chosen": -1.4642287492752075, + "logits/rejected": -1.195237398147583, + "logps/chosen": -173.61244201660156, + "logps/rejected": -194.322509765625, + "loss": 0.649, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.026876743882894516, + "rewards/margins": 0.10324034839868546, + "rewards/margins_max": 0.14352624118328094, + "rewards/margins_min": 0.06295443326234818, + "rewards/margins_std": 0.05697287991642952, + "rewards/rejected": -0.07636359333992004, + "step": 2610 + }, + { + "epoch": 1.0, + "grad_norm": 0.34765625, + "learning_rate": 3.168105836440227e-11, + "logits/chosen": -1.430641770362854, + "logits/rejected": -1.058257818222046, + "logps/chosen": -219.7120819091797, + "logps/rejected": -215.89663696289062, + "loss": 0.6453, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.01688862219452858, + "rewards/margins": 0.095039002597332, + "rewards/margins_max": 0.13273173570632935, + "rewards/margins_min": 0.05734627693891525, + "rewards/margins_std": 0.053305573761463165, + "rewards/rejected": -0.07815037667751312, + "step": 2620 + }, + { + "epoch": 1.0, + "grad_norm": 0.3203125, + "learning_rate": 8.800474701475824e-13, + "logits/chosen": -1.5000232458114624, + "logits/rejected": -1.2522644996643066, + "logps/chosen": -189.9681854248047, + "logps/rejected": -215.1830291748047, + "loss": 0.6522, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.019782623276114464, + "rewards/margins": 0.07797619700431824, + "rewards/margins_max": 0.10702647268772125, + "rewards/margins_min": 0.04892592877149582, + "rewards/margins_std": 0.041083287447690964, + "rewards/rejected": -0.058193571865558624, + "step": 2630 + }, + { + "epoch": 1.0, + "eval_logits/chosen": -1.013566493988037, + "eval_logits/rejected": -0.8866661190986633, + "eval_logps/chosen": -326.8977355957031, + "eval_logps/rejected": -315.0963439941406, + "eval_loss": 0.6891594529151917, + "eval_rewards/accuracies": 0.5820000171661377, + "eval_rewards/chosen": 0.004229320678859949, + "eval_rewards/margins": 0.008481495082378387, + "eval_rewards/margins_max": 0.0640675351023674, + "eval_rewards/margins_min": -0.04490014538168907, + "eval_rewards/margins_std": 0.03645985573530197, + "eval_rewards/rejected": -0.004252173937857151, + "eval_runtime": 4330.3916, + "eval_samples_per_second": 2.771, + "eval_steps_per_second": 0.173, + "step": 2632 + }, + { + "epoch": 1.0, + "step": 2632, + "total_flos": 0.0, + "train_loss": 0.658518510975374, + "train_runtime": 42431.3226, + "train_samples_per_second": 0.992, + "train_steps_per_second": 0.062 + } + ], + "logging_steps": 10, + "max_steps": 2632, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}