Quill-v0.9 / trainer_state.json
sam-paech's picture
Upload folder using huggingface_hub
03390e4 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 400,
"global_step": 564,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.008865248226950355,
"grad_norm": 149.09850698280093,
"learning_rate": 4.385964912280701e-09,
"logps/chosen": -2.8927114009857178,
"logps/rejected": -0.7169164419174194,
"loss": 25.3611,
"rewards/accuracies": 0.0,
"rewards/chosen": -28.927114486694336,
"rewards/margins": -21.757949829101562,
"rewards/rejected": -7.169164180755615,
"step": 5
},
{
"epoch": 0.01773049645390071,
"grad_norm": 133.9237477557748,
"learning_rate": 8.771929824561403e-09,
"logps/chosen": -2.677772045135498,
"logps/rejected": -0.7764161825180054,
"loss": 24.583,
"rewards/accuracies": 0.0,
"rewards/chosen": -26.777721405029297,
"rewards/margins": -19.013559341430664,
"rewards/rejected": -7.764161109924316,
"step": 10
},
{
"epoch": 0.026595744680851064,
"grad_norm": 152.65228499963413,
"learning_rate": 1.3157894736842104e-08,
"logps/chosen": -2.491425037384033,
"logps/rejected": -0.7799981832504272,
"loss": 24.3721,
"rewards/accuracies": 0.0,
"rewards/chosen": -24.914249420166016,
"rewards/margins": -17.114269256591797,
"rewards/rejected": -7.79998254776001,
"step": 15
},
{
"epoch": 0.03546099290780142,
"grad_norm": 167.82984889261684,
"learning_rate": 1.7543859649122805e-08,
"logps/chosen": -2.818477153778076,
"logps/rejected": -0.8185766935348511,
"loss": 24.5782,
"rewards/accuracies": 0.0,
"rewards/chosen": -28.184768676757812,
"rewards/margins": -19.999004364013672,
"rewards/rejected": -8.185766220092773,
"step": 20
},
{
"epoch": 0.044326241134751775,
"grad_norm": 158.80212467372465,
"learning_rate": 2.1929824561403507e-08,
"logps/chosen": -3.06237530708313,
"logps/rejected": -0.685140073299408,
"loss": 24.7471,
"rewards/accuracies": 0.0,
"rewards/chosen": -30.62375259399414,
"rewards/margins": -23.77235221862793,
"rewards/rejected": -6.851400852203369,
"step": 25
},
{
"epoch": 0.05319148936170213,
"grad_norm": 146.70835684303188,
"learning_rate": 2.6315789473684208e-08,
"logps/chosen": -2.6460320949554443,
"logps/rejected": -0.7569402456283569,
"loss": 24.7665,
"rewards/accuracies": 0.0,
"rewards/chosen": -26.4603214263916,
"rewards/margins": -18.890918731689453,
"rewards/rejected": -7.569401741027832,
"step": 30
},
{
"epoch": 0.06205673758865248,
"grad_norm": 155.08422103006285,
"learning_rate": 3.070175438596491e-08,
"logps/chosen": -2.725968837738037,
"logps/rejected": -0.7673903703689575,
"loss": 24.0323,
"rewards/accuracies": 0.0,
"rewards/chosen": -27.259693145751953,
"rewards/margins": -19.58578872680664,
"rewards/rejected": -7.673903465270996,
"step": 35
},
{
"epoch": 0.07092198581560284,
"grad_norm": 144.68445966753478,
"learning_rate": 3.508771929824561e-08,
"logps/chosen": -2.8730432987213135,
"logps/rejected": -0.7648515105247498,
"loss": 24.9421,
"rewards/accuracies": 0.0,
"rewards/chosen": -28.73043441772461,
"rewards/margins": -21.081920623779297,
"rewards/rejected": -7.648515224456787,
"step": 40
},
{
"epoch": 0.0797872340425532,
"grad_norm": 136.66852055782576,
"learning_rate": 3.947368421052631e-08,
"logps/chosen": -2.648432970046997,
"logps/rejected": -0.7622749209403992,
"loss": 24.9349,
"rewards/accuracies": 0.0,
"rewards/chosen": -26.484333038330078,
"rewards/margins": -18.861581802368164,
"rewards/rejected": -7.622749328613281,
"step": 45
},
{
"epoch": 0.08865248226950355,
"grad_norm": 137.8970167243902,
"learning_rate": 4.385964912280701e-08,
"logps/chosen": -2.7805895805358887,
"logps/rejected": -0.7085495591163635,
"loss": 23.7959,
"rewards/accuracies": 0.0,
"rewards/chosen": -27.805896759033203,
"rewards/margins": -20.72039794921875,
"rewards/rejected": -7.085495948791504,
"step": 50
},
{
"epoch": 0.0975177304964539,
"grad_norm": 144.53363403919823,
"learning_rate": 4.8245614035087715e-08,
"logps/chosen": -2.7633585929870605,
"logps/rejected": -0.766043484210968,
"loss": 24.5894,
"rewards/accuracies": 0.0,
"rewards/chosen": -27.63358497619629,
"rewards/margins": -19.973148345947266,
"rewards/rejected": -7.660434722900391,
"step": 55
},
{
"epoch": 0.10638297872340426,
"grad_norm": 186.10146168517474,
"learning_rate": 4.999568059583401e-08,
"logps/chosen": -2.605541706085205,
"logps/rejected": -0.7882484197616577,
"loss": 24.1209,
"rewards/accuracies": 0.0,
"rewards/chosen": -26.0554141998291,
"rewards/margins": -18.172931671142578,
"rewards/rejected": -7.88248348236084,
"step": 60
},
{
"epoch": 0.11524822695035461,
"grad_norm": 144.73338067190508,
"learning_rate": 4.9969289642076847e-08,
"logps/chosen": -2.691685199737549,
"logps/rejected": -0.7605332136154175,
"loss": 24.5817,
"rewards/accuracies": 0.0,
"rewards/chosen": -26.916851043701172,
"rewards/margins": -19.311519622802734,
"rewards/rejected": -7.6053314208984375,
"step": 65
},
{
"epoch": 0.12411347517730496,
"grad_norm": 142.10076756137335,
"learning_rate": 4.991893270335525e-08,
"logps/chosen": -2.887664318084717,
"logps/rejected": -0.7798042297363281,
"loss": 25.4368,
"rewards/accuracies": 0.0,
"rewards/chosen": -28.876644134521484,
"rewards/margins": -21.078603744506836,
"rewards/rejected": -7.798041343688965,
"step": 70
},
{
"epoch": 0.13297872340425532,
"grad_norm": 136.39554296889824,
"learning_rate": 4.9844658113188256e-08,
"logps/chosen": -2.554642915725708,
"logps/rejected": -0.741515576839447,
"loss": 25.032,
"rewards/accuracies": 0.0,
"rewards/chosen": -25.546428680419922,
"rewards/margins": -18.131275177001953,
"rewards/rejected": -7.41515588760376,
"step": 75
},
{
"epoch": 0.14184397163120568,
"grad_norm": 150.8035431184681,
"learning_rate": 4.974653716169812e-08,
"logps/chosen": -2.592639923095703,
"logps/rejected": -0.7398035526275635,
"loss": 24.5209,
"rewards/accuracies": 0.0,
"rewards/chosen": -25.9263973236084,
"rewards/margins": -18.528364181518555,
"rewards/rejected": -7.398035526275635,
"step": 80
},
{
"epoch": 0.15070921985815602,
"grad_norm": 131.31880148234296,
"learning_rate": 4.962466402718474e-08,
"logps/chosen": -2.76173734664917,
"logps/rejected": -0.7157766819000244,
"loss": 24.4644,
"rewards/accuracies": 0.0,
"rewards/chosen": -27.61737060546875,
"rewards/margins": -20.459604263305664,
"rewards/rejected": -7.157766819000244,
"step": 85
},
{
"epoch": 0.1595744680851064,
"grad_norm": 134.79926804392827,
"learning_rate": 4.9479155685731585e-08,
"logps/chosen": -2.651050329208374,
"logps/rejected": -0.8522893190383911,
"loss": 23.8276,
"rewards/accuracies": 0.0,
"rewards/chosen": -26.510501861572266,
"rewards/margins": -17.98760986328125,
"rewards/rejected": -8.522892951965332,
"step": 90
},
{
"epoch": 0.16843971631205673,
"grad_norm": 148.45315521916737,
"learning_rate": 4.93101517989299e-08,
"logps/chosen": -2.806105136871338,
"logps/rejected": -0.8227446675300598,
"loss": 24.8131,
"rewards/accuracies": 0.0,
"rewards/chosen": -28.061050415039062,
"rewards/margins": -19.833606719970703,
"rewards/rejected": -8.227446556091309,
"step": 95
},
{
"epoch": 0.1773049645390071,
"grad_norm": 143.9707085484871,
"learning_rate": 4.91178145798289e-08,
"logps/chosen": -2.8190388679504395,
"logps/rejected": -0.7673857808113098,
"loss": 24.4349,
"rewards/accuracies": 0.0,
"rewards/chosen": -28.19038963317871,
"rewards/margins": -20.516530990600586,
"rewards/rejected": -7.673857688903809,
"step": 100
},
{
"epoch": 0.18617021276595744,
"grad_norm": 142.14896402628358,
"learning_rate": 4.890232863724074e-08,
"logps/chosen": -2.5820021629333496,
"logps/rejected": -0.7970396876335144,
"loss": 24.1908,
"rewards/accuracies": 0.0,
"rewards/chosen": -25.820018768310547,
"rewards/margins": -17.849624633789062,
"rewards/rejected": -7.970396518707275,
"step": 105
},
{
"epoch": 0.1950354609929078,
"grad_norm": 146.1009589129858,
"learning_rate": 4.8663900798549555e-08,
"logps/chosen": -2.6352057456970215,
"logps/rejected": -0.7787631154060364,
"loss": 23.47,
"rewards/accuracies": 0.0,
"rewards/chosen": -26.3520565032959,
"rewards/margins": -18.564428329467773,
"rewards/rejected": -7.787631034851074,
"step": 110
},
{
"epoch": 0.20390070921985815,
"grad_norm": 140.00337741613868,
"learning_rate": 4.84027599111947e-08,
"logps/chosen": -2.8055331707000732,
"logps/rejected": -0.7519856095314026,
"loss": 24.6991,
"rewards/accuracies": 0.0,
"rewards/chosen": -28.055328369140625,
"rewards/margins": -20.535472869873047,
"rewards/rejected": -7.5198564529418945,
"step": 115
},
{
"epoch": 0.2127659574468085,
"grad_norm": 141.35660015466556,
"learning_rate": 4.8119156623018765e-08,
"logps/chosen": -2.79301118850708,
"logps/rejected": -0.7545806765556335,
"loss": 24.7028,
"rewards/accuracies": 0.0,
"rewards/chosen": -27.93011474609375,
"rewards/margins": -20.384307861328125,
"rewards/rejected": -7.545806884765625,
"step": 120
},
{
"epoch": 0.22163120567375885,
"grad_norm": 142.6726786477248,
"learning_rate": 4.781336314169116e-08,
"logps/chosen": -2.820788860321045,
"logps/rejected": -0.8155437707901001,
"loss": 24.0466,
"rewards/accuracies": 0.0,
"rewards/chosen": -28.207889556884766,
"rewards/margins": -20.05245018005371,
"rewards/rejected": -8.155437469482422,
"step": 125
},
{
"epoch": 0.23049645390070922,
"grad_norm": 132.8640819326495,
"learning_rate": 4.748567297343817e-08,
"logps/chosen": -2.764223575592041,
"logps/rejected": -0.7645096182823181,
"loss": 24.7092,
"rewards/accuracies": 0.0,
"rewards/chosen": -27.642236709594727,
"rewards/margins": -19.997140884399414,
"rewards/rejected": -7.6450958251953125,
"step": 130
},
{
"epoch": 0.2393617021276596,
"grad_norm": 139.96100373862689,
"learning_rate": 4.713640064133024e-08,
"logps/chosen": -2.7403454780578613,
"logps/rejected": -0.745998740196228,
"loss": 23.5556,
"rewards/accuracies": 0.0,
"rewards/chosen": -27.403453826904297,
"rewards/margins": -19.943464279174805,
"rewards/rejected": -7.459986686706543,
"step": 135
},
{
"epoch": 0.24822695035460993,
"grad_norm": 148.02227270943942,
"learning_rate": 4.676588138339698e-08,
"logps/chosen": -2.7284622192382812,
"logps/rejected": -0.7950869798660278,
"loss": 23.8238,
"rewards/accuracies": 0.0,
"rewards/chosen": -27.284622192382812,
"rewards/margins": -19.333749771118164,
"rewards/rejected": -7.950869560241699,
"step": 140
},
{
"epoch": 0.2570921985815603,
"grad_norm": 176.18112226614372,
"learning_rate": 4.6374470830859435e-08,
"logps/chosen": -2.7229888439178467,
"logps/rejected": -0.8719980120658875,
"loss": 24.1807,
"rewards/accuracies": 0.0,
"rewards/chosen": -27.229888916015625,
"rewards/margins": -18.50990867614746,
"rewards/rejected": -8.719980239868164,
"step": 145
},
{
"epoch": 0.26595744680851063,
"grad_norm": 135.96118644362534,
"learning_rate": 4.596254466678876e-08,
"logps/chosen": -2.56044340133667,
"logps/rejected": -0.8695181012153625,
"loss": 22.9327,
"rewards/accuracies": 0.0,
"rewards/chosen": -25.60443687438965,
"rewards/margins": -16.90925407409668,
"rewards/rejected": -8.695180892944336,
"step": 150
},
{
"epoch": 0.274822695035461,
"grad_norm": 132.24528952538998,
"learning_rate": 4.5530498265518635e-08,
"logps/chosen": -2.813053607940674,
"logps/rejected": -0.808445155620575,
"loss": 23.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": -28.130535125732422,
"rewards/margins": -20.04608726501465,
"rewards/rejected": -8.084451675415039,
"step": 155
},
{
"epoch": 0.28368794326241137,
"grad_norm": 127.79059567307556,
"learning_rate": 4.507874631315768e-08,
"logps/chosen": -2.66123628616333,
"logps/rejected": -0.8127639889717102,
"loss": 23.47,
"rewards/accuracies": 0.0,
"rewards/chosen": -26.61236000061035,
"rewards/margins": -18.48472023010254,
"rewards/rejected": -8.127640724182129,
"step": 160
},
{
"epoch": 0.2925531914893617,
"grad_norm": 129.2315474298329,
"learning_rate": 4.460772240956608e-08,
"logps/chosen": -2.7900562286376953,
"logps/rejected": -0.849897027015686,
"loss": 23.8895,
"rewards/accuracies": 0.0,
"rewards/chosen": -27.900564193725586,
"rewards/margins": -19.401594161987305,
"rewards/rejected": -8.498970985412598,
"step": 165
},
{
"epoch": 0.30141843971631205,
"grad_norm": 130.10241912996838,
"learning_rate": 4.411787865217846e-08,
"logps/chosen": -2.605811595916748,
"logps/rejected": -0.7789794206619263,
"loss": 24.0528,
"rewards/accuracies": 0.0,
"rewards/chosen": -26.0581111907959,
"rewards/margins": -18.268321990966797,
"rewards/rejected": -7.789793491363525,
"step": 170
},
{
"epoch": 0.3102836879432624,
"grad_norm": 125.41956154105759,
"learning_rate": 4.36096852020724e-08,
"logps/chosen": -2.852963924407959,
"logps/rejected": -0.8932555913925171,
"loss": 24.7971,
"rewards/accuracies": 0.0,
"rewards/chosen": -28.529638290405273,
"rewards/margins": -19.59708023071289,
"rewards/rejected": -8.932558059692383,
"step": 175
},
{
"epoch": 0.3191489361702128,
"grad_norm": 129.52798142464744,
"learning_rate": 4.308362983269915e-08,
"logps/chosen": -2.6496150493621826,
"logps/rejected": -0.8426092267036438,
"loss": 24.0708,
"rewards/accuracies": 0.0,
"rewards/chosen": -26.496150970458984,
"rewards/margins": -18.070056915283203,
"rewards/rejected": -8.426092147827148,
"step": 180
},
{
"epoch": 0.3280141843971631,
"grad_norm": 124.39272381055714,
"learning_rate": 4.2540217461709714e-08,
"logps/chosen": -2.640799045562744,
"logps/rejected": -0.7842020988464355,
"loss": 23.3526,
"rewards/accuracies": 0.0,
"rewards/chosen": -26.407989501953125,
"rewards/margins": -18.565967559814453,
"rewards/rejected": -7.8420209884643555,
"step": 185
},
{
"epoch": 0.33687943262411346,
"grad_norm": 135.5931742125443,
"learning_rate": 4.1979969666325505e-08,
"logps/chosen": -2.4907784461975098,
"logps/rejected": -0.8824084997177124,
"loss": 23.1102,
"rewards/accuracies": 0.10000000149011612,
"rewards/chosen": -24.907785415649414,
"rewards/margins": -16.083698272705078,
"rewards/rejected": -8.824087142944336,
"step": 190
},
{
"epoch": 0.34574468085106386,
"grad_norm": 124.78318512461612,
"learning_rate": 4.140342418271896e-08,
"logps/chosen": -2.566643476486206,
"logps/rejected": -0.8403929471969604,
"loss": 23.9738,
"rewards/accuracies": 0.0,
"rewards/chosen": -25.666433334350586,
"rewards/margins": -17.262508392333984,
"rewards/rejected": -8.403928756713867,
"step": 195
},
{
"epoch": 0.3546099290780142,
"grad_norm": 124.54166567693945,
"learning_rate": 4.0811134389884425e-08,
"logps/chosen": -2.662815570831299,
"logps/rejected": -0.8375980257987976,
"loss": 23.2471,
"rewards/accuracies": 0.0,
"rewards/chosen": -26.628154754638672,
"rewards/margins": -18.25217628479004,
"rewards/rejected": -8.37597942352295,
"step": 200
},
{
"epoch": 0.36347517730496454,
"grad_norm": 129.10521131399426,
"learning_rate": 4.020366877849477e-08,
"logps/chosen": -2.899678945541382,
"logps/rejected": -0.9157842397689819,
"loss": 22.6395,
"rewards/accuracies": 0.0,
"rewards/chosen": -28.99679183959961,
"rewards/margins": -19.838947296142578,
"rewards/rejected": -9.157841682434082,
"step": 205
},
{
"epoch": 0.3723404255319149,
"grad_norm": 118.9218343714434,
"learning_rate": 3.958161040525354e-08,
"logps/chosen": -2.7788329124450684,
"logps/rejected": -0.7856232523918152,
"loss": 24.5067,
"rewards/accuracies": 0.0,
"rewards/chosen": -27.788330078125,
"rewards/margins": -19.93209457397461,
"rewards/rejected": -7.856232643127441,
"step": 210
},
{
"epoch": 0.38120567375886527,
"grad_norm": 124.20603604955339,
"learning_rate": 3.894555633326642e-08,
"logps/chosen": -2.766160011291504,
"logps/rejected": -0.8535798192024231,
"loss": 23.9262,
"rewards/accuracies": 0.0,
"rewards/chosen": -27.66160011291504,
"rewards/margins": -19.125802993774414,
"rewards/rejected": -8.535799026489258,
"step": 215
},
{
"epoch": 0.3900709219858156,
"grad_norm": 122.4047572823285,
"learning_rate": 3.829611705896899e-08,
"logps/chosen": -2.758810520172119,
"logps/rejected": -0.8758818507194519,
"loss": 23.0933,
"rewards/accuracies": 0.0,
"rewards/chosen": -27.58810806274414,
"rewards/margins": -18.829286575317383,
"rewards/rejected": -8.758818626403809,
"step": 220
},
{
"epoch": 0.39893617021276595,
"grad_norm": 136.57138875427492,
"learning_rate": 3.763391592616104e-08,
"logps/chosen": -2.629612445831299,
"logps/rejected": -0.8354926109313965,
"loss": 23.2906,
"rewards/accuracies": 0.0,
"rewards/chosen": -26.296123504638672,
"rewards/margins": -17.941198348999023,
"rewards/rejected": -8.354926109313965,
"step": 225
},
{
"epoch": 0.4078014184397163,
"grad_norm": 125.42475135454727,
"learning_rate": 3.695958852770963e-08,
"logps/chosen": -2.629051685333252,
"logps/rejected": -0.8992561101913452,
"loss": 23.2326,
"rewards/accuracies": 0.0,
"rewards/chosen": -26.290517807006836,
"rewards/margins": -17.297956466674805,
"rewards/rejected": -8.992559432983398,
"step": 230
},
{
"epoch": 0.4166666666666667,
"grad_norm": 130.81865526034508,
"learning_rate": 3.627378209549536e-08,
"logps/chosen": -2.713019847869873,
"logps/rejected": -0.9180544018745422,
"loss": 23.2666,
"rewards/accuracies": 0.0,
"rewards/chosen": -27.130197525024414,
"rewards/margins": -17.94965171813965,
"rewards/rejected": -9.180543899536133,
"step": 235
},
{
"epoch": 0.425531914893617,
"grad_norm": 154.01811109390695,
"learning_rate": 3.557715487918728e-08,
"logps/chosen": -2.845736265182495,
"logps/rejected": -0.8981745839118958,
"loss": 22.8914,
"rewards/accuracies": 0.0,
"rewards/chosen": -28.45736312866211,
"rewards/margins": -19.475618362426758,
"rewards/rejected": -8.981744766235352,
"step": 240
},
{
"epoch": 0.43439716312056736,
"grad_norm": 124.72008912272919,
"learning_rate": 3.487037551444267e-08,
"logps/chosen": -2.6402182579040527,
"logps/rejected": -0.9148176908493042,
"loss": 24.0945,
"rewards/accuracies": 0.0,
"rewards/chosen": -26.40218162536621,
"rewards/margins": -17.254005432128906,
"rewards/rejected": -9.148177146911621,
"step": 245
},
{
"epoch": 0.4432624113475177,
"grad_norm": 126.67782694015744,
"learning_rate": 3.4154122381138226e-08,
"logps/chosen": -2.8172085285186768,
"logps/rejected": -0.9476315379142761,
"loss": 22.8683,
"rewards/accuracies": 0.0,
"rewards/chosen": -28.17208480834961,
"rewards/margins": -18.695770263671875,
"rewards/rejected": -9.476313591003418,
"step": 250
},
{
"epoch": 0.4521276595744681,
"grad_norm": 128.76198121819777,
"learning_rate": 3.3429082952248535e-08,
"logps/chosen": -2.6602797508239746,
"logps/rejected": -0.9177526235580444,
"loss": 22.7192,
"rewards/accuracies": 0.0,
"rewards/chosen": -26.602794647216797,
"rewards/margins": -17.42526626586914,
"rewards/rejected": -9.177526473999023,
"step": 255
},
{
"epoch": 0.46099290780141844,
"grad_norm": 120.09861391040339,
"learning_rate": 3.2695953133996826e-08,
"logps/chosen": -2.6814820766448975,
"logps/rejected": -0.8409668803215027,
"loss": 22.8322,
"rewards/accuracies": 0.0,
"rewards/chosen": -26.8148193359375,
"rewards/margins": -18.405153274536133,
"rewards/rejected": -8.409668922424316,
"step": 260
},
{
"epoch": 0.4698581560283688,
"grad_norm": 149.0293076595361,
"learning_rate": 3.195543659791132e-08,
"logps/chosen": -2.6782455444335938,
"logps/rejected": -0.988556981086731,
"loss": 22.749,
"rewards/accuracies": 0.0,
"rewards/chosen": -26.782455444335938,
"rewards/margins": -16.896886825561523,
"rewards/rejected": -9.88556957244873,
"step": 265
},
{
"epoch": 0.4787234042553192,
"grad_norm": 120.49824463691422,
"learning_rate": 3.120824410542833e-08,
"logps/chosen": -2.619314193725586,
"logps/rejected": -0.9258754849433899,
"loss": 22.1466,
"rewards/accuracies": 0.0,
"rewards/chosen": -26.19314193725586,
"rewards/margins": -16.93438720703125,
"rewards/rejected": -9.25875473022461,
"step": 270
},
{
"epoch": 0.4875886524822695,
"grad_norm": 126.28759504359856,
"learning_rate": 3.045509282569031e-08,
"logps/chosen": -2.569866895675659,
"logps/rejected": -0.843720555305481,
"loss": 22.4486,
"rewards/accuracies": 0.0,
"rewards/chosen": -25.69866943359375,
"rewards/margins": -17.261463165283203,
"rewards/rejected": -8.43720531463623,
"step": 275
},
{
"epoch": 0.49645390070921985,
"grad_norm": 133.61205279748194,
"learning_rate": 2.969670564719369e-08,
"logps/chosen": -2.705709457397461,
"logps/rejected": -0.8904563784599304,
"loss": 22.7856,
"rewards/accuracies": 0.0,
"rewards/chosen": -27.057092666625977,
"rewards/margins": -18.152530670166016,
"rewards/rejected": -8.904562950134277,
"step": 280
},
{
"epoch": 0.5053191489361702,
"grad_norm": 125.31007604414539,
"learning_rate": 2.893381048394715e-08,
"logps/chosen": -2.5439467430114746,
"logps/rejected": -0.9845563173294067,
"loss": 22.3419,
"rewards/accuracies": 0.0,
"rewards/chosen": -25.43946647644043,
"rewards/margins": -15.593902587890625,
"rewards/rejected": -9.845562934875488,
"step": 285
},
{
"epoch": 0.5141843971631206,
"grad_norm": 133.3039752253482,
"learning_rate": 2.8167139576806304e-08,
"logps/chosen": -2.7466816902160645,
"logps/rejected": -0.9437241554260254,
"loss": 22.5713,
"rewards/accuracies": 0.0,
"rewards/chosen": -27.466812133789062,
"rewards/margins": -18.029569625854492,
"rewards/rejected": -9.43724250793457,
"step": 290
},
{
"epoch": 0.5230496453900709,
"grad_norm": 125.8510714937569,
"learning_rate": 2.739742879065544e-08,
"logps/chosen": -2.6576900482177734,
"logps/rejected": -0.9526101350784302,
"loss": 22.5247,
"rewards/accuracies": 0.0,
"rewards/chosen": -26.576900482177734,
"rewards/margins": -17.050800323486328,
"rewards/rejected": -9.526101112365723,
"step": 295
},
{
"epoch": 0.5319148936170213,
"grad_norm": 132.63761511287686,
"learning_rate": 2.662541690811082e-08,
"logps/chosen": -2.3779425621032715,
"logps/rejected": -0.8925439715385437,
"loss": 21.2831,
"rewards/accuracies": 0.0,
"rewards/chosen": -23.779422760009766,
"rewards/margins": -14.853982925415039,
"rewards/rejected": -8.925439834594727,
"step": 300
},
{
"epoch": 0.5407801418439716,
"grad_norm": 136.13215779336474,
"learning_rate": 2.585184492042347e-08,
"logps/chosen": -2.6994504928588867,
"logps/rejected": -0.9614452123641968,
"loss": 21.5851,
"rewards/accuracies": 0.0,
"rewards/chosen": -26.9945068359375,
"rewards/margins": -17.38005256652832,
"rewards/rejected": -9.614453315734863,
"step": 305
},
{
"epoch": 0.549645390070922,
"grad_norm": 132.02028150495246,
"learning_rate": 2.5077455316262147e-08,
"logps/chosen": -2.6731841564178467,
"logps/rejected": -0.9815571904182434,
"loss": 21.7416,
"rewards/accuracies": 0.10000000149011612,
"rewards/chosen": -26.73184585571289,
"rewards/margins": -16.916271209716797,
"rewards/rejected": -9.815571784973145,
"step": 310
},
{
"epoch": 0.5585106382978723,
"grad_norm": 130.67893366658126,
"learning_rate": 2.4302991369058958e-08,
"logps/chosen": -2.51406192779541,
"logps/rejected": -1.0564053058624268,
"loss": 20.9766,
"rewards/accuracies": 0.0,
"rewards/chosen": -25.1406192779541,
"rewards/margins": -14.576566696166992,
"rewards/rejected": -10.564051628112793,
"step": 315
},
{
"epoch": 0.5673758865248227,
"grad_norm": 131.46127648897104,
"learning_rate": 2.3529196423601876e-08,
"logps/chosen": -2.443092107772827,
"logps/rejected": -0.9888316988945007,
"loss": 22.5219,
"rewards/accuracies": 0.0,
"rewards/chosen": -24.43092155456543,
"rewards/margins": -14.54260540008545,
"rewards/rejected": -9.888317108154297,
"step": 320
},
{
"epoch": 0.5762411347517731,
"grad_norm": 134.82856786462523,
"learning_rate": 2.27568131825586e-08,
"logps/chosen": -2.7351233959198,
"logps/rejected": -1.0152182579040527,
"loss": 21.4016,
"rewards/accuracies": 0.0,
"rewards/chosen": -27.35123634338379,
"rewards/margins": -17.199050903320312,
"rewards/rejected": -10.152182579040527,
"step": 325
},
{
"epoch": 0.5851063829787234,
"grad_norm": 138.32340221747404,
"learning_rate": 2.1986582993616925e-08,
"logps/chosen": -2.5989935398101807,
"logps/rejected": -0.9776498079299927,
"loss": 21.6639,
"rewards/accuracies": 0.0,
"rewards/chosen": -25.989938735961914,
"rewards/margins": -16.213438034057617,
"rewards/rejected": -9.776496887207031,
"step": 330
},
{
"epoch": 0.5939716312056738,
"grad_norm": 144.27555433213394,
"learning_rate": 2.121924513792548e-08,
"logps/chosen": -2.7326598167419434,
"logps/rejected": -0.993288516998291,
"loss": 22.0246,
"rewards/accuracies": 0.0,
"rewards/chosen": -27.326595306396484,
"rewards/margins": -17.39371109008789,
"rewards/rejected": -9.93288516998291,
"step": 335
},
{
"epoch": 0.6028368794326241,
"grad_norm": 131.66189209302496,
"learning_rate": 2.0455536120518093e-08,
"logps/chosen": -2.7750537395477295,
"logps/rejected": -1.0254470109939575,
"loss": 21.3832,
"rewards/accuracies": 0.0,
"rewards/chosen": -27.750534057617188,
"rewards/margins": -17.496063232421875,
"rewards/rejected": -10.254469871520996,
"step": 340
},
{
"epoch": 0.6117021276595744,
"grad_norm": 133.29472374802276,
"learning_rate": 1.969618896340261e-08,
"logps/chosen": -2.791393756866455,
"logps/rejected": -0.9777949452400208,
"loss": 21.8087,
"rewards/accuracies": 0.0,
"rewards/chosen": -27.9139404296875,
"rewards/margins": -18.1359920501709,
"rewards/rejected": -9.777949333190918,
"step": 345
},
{
"epoch": 0.6205673758865248,
"grad_norm": 132.7071056456499,
"learning_rate": 1.8941932501992915e-08,
"logps/chosen": -2.528127670288086,
"logps/rejected": -0.9332917928695679,
"loss": 22.4318,
"rewards/accuracies": 0.0,
"rewards/chosen": -25.281272888183594,
"rewards/margins": -15.948354721069336,
"rewards/rejected": -9.332918167114258,
"step": 350
},
{
"epoch": 0.6294326241134752,
"grad_norm": 133.23550694363342,
"learning_rate": 1.8193490685559176e-08,
"logps/chosen": -2.6291937828063965,
"logps/rejected": -1.0154783725738525,
"loss": 21.633,
"rewards/accuracies": 0.0,
"rewards/chosen": -26.291940689086914,
"rewards/margins": -16.137157440185547,
"rewards/rejected": -10.154783248901367,
"step": 355
},
{
"epoch": 0.6382978723404256,
"grad_norm": 131.9862073113527,
"learning_rate": 1.745158188236805e-08,
"logps/chosen": -2.7716784477233887,
"logps/rejected": -0.9892207980155945,
"loss": 21.8071,
"rewards/accuracies": 0.0,
"rewards/chosen": -27.716785430908203,
"rewards/margins": -17.824573516845703,
"rewards/rejected": -9.89220905303955,
"step": 360
},
{
"epoch": 0.6471631205673759,
"grad_norm": 133.59469928327707,
"learning_rate": 1.6716918190179507e-08,
"logps/chosen": -2.824233055114746,
"logps/rejected": -1.0325210094451904,
"loss": 21.8658,
"rewards/accuracies": 0.0,
"rewards/chosen": -28.24233055114746,
"rewards/margins": -17.91712188720703,
"rewards/rejected": -10.325210571289062,
"step": 365
},
{
"epoch": 0.6560283687943262,
"grad_norm": 137.91947812898098,
"learning_rate": 1.599020475276227e-08,
"logps/chosen": -2.6540284156799316,
"logps/rejected": -1.0216987133026123,
"loss": 21.8092,
"rewards/accuracies": 0.0,
"rewards/chosen": -26.540287017822266,
"rewards/margins": -16.323299407958984,
"rewards/rejected": -10.216985702514648,
"step": 370
},
{
"epoch": 0.6648936170212766,
"grad_norm": 139.4931571924634,
"learning_rate": 1.527213908308386e-08,
"logps/chosen": -2.6086392402648926,
"logps/rejected": -1.0575668811798096,
"loss": 21.1347,
"rewards/accuracies": 0.0,
"rewards/chosen": -26.086395263671875,
"rewards/margins": -15.510726928710938,
"rewards/rejected": -10.575668334960938,
"step": 375
},
{
"epoch": 0.6737588652482269,
"grad_norm": 140.7825368139536,
"learning_rate": 1.45634103938247e-08,
"logps/chosen": -2.7265937328338623,
"logps/rejected": -1.1351561546325684,
"loss": 21.6724,
"rewards/accuracies": 0.0,
"rewards/chosen": -27.265939712524414,
"rewards/margins": -15.914377212524414,
"rewards/rejected": -11.3515625,
"step": 380
},
{
"epoch": 0.6826241134751773,
"grad_norm": 155.36198841615501,
"learning_rate": 1.3864698935859152e-08,
"logps/chosen": -2.785165309906006,
"logps/rejected": -1.016316533088684,
"loss": 21.7173,
"rewards/accuracies": 0.0,
"rewards/chosen": -27.851654052734375,
"rewards/margins": -17.688488006591797,
"rewards/rejected": -10.163165092468262,
"step": 385
},
{
"epoch": 0.6914893617021277,
"grad_norm": 142.46678106209737,
"learning_rate": 1.3176675345338084e-08,
"logps/chosen": -2.720693349838257,
"logps/rejected": -1.0303252935409546,
"loss": 21.5356,
"rewards/accuracies": 0.0,
"rewards/chosen": -27.206933975219727,
"rewards/margins": -16.903682708740234,
"rewards/rejected": -10.303252220153809,
"step": 390
},
{
"epoch": 0.700354609929078,
"grad_norm": 145.09087804013313,
"learning_rate": 1.2500000000000004e-08,
"logps/chosen": -2.8458621501922607,
"logps/rejected": -1.0235410928726196,
"loss": 21.0196,
"rewards/accuracies": 0.0,
"rewards/chosen": -28.4586181640625,
"rewards/margins": -18.22321128845215,
"rewards/rejected": -10.2354097366333,
"step": 395
},
{
"epoch": 0.7092198581560284,
"grad_norm": 141.31236991069352,
"learning_rate": 1.1835322385328258e-08,
"logps/chosen": -2.525390625,
"logps/rejected": -1.0862586498260498,
"loss": 21.4429,
"rewards/accuracies": 0.0,
"rewards/chosen": -25.25390625,
"rewards/margins": -14.391319274902344,
"rewards/rejected": -10.862588882446289,
"step": 400
},
{
"epoch": 0.7092198581560284,
"eval_logps/chosen": -2.753817319869995,
"eval_logps/rejected": -1.0216267108917236,
"eval_loss": 21.640687942504883,
"eval_rewards/accuracies": 0.0,
"eval_rewards/chosen": -27.538171768188477,
"eval_rewards/margins": -17.3219051361084,
"eval_rewards/rejected": -10.216267585754395,
"eval_runtime": 4.3362,
"eval_samples_per_second": 2.767,
"eval_steps_per_second": 0.692,
"step": 400
},
{
"epoch": 0.7180851063829787,
"grad_norm": 138.30907528011366,
"learning_rate": 1.1183280471162916e-08,
"logps/chosen": -2.6900100708007812,
"logps/rejected": -1.054898977279663,
"loss": 21.6015,
"rewards/accuracies": 0.0,
"rewards/chosen": -26.900100708007812,
"rewards/margins": -16.351110458374023,
"rewards/rejected": -10.548989295959473,
"step": 405
},
{
"epoch": 0.7269503546099291,
"grad_norm": 140.60691958793103,
"learning_rate": 1.0544500099365513e-08,
"logps/chosen": -2.5441603660583496,
"logps/rejected": -1.0485824346542358,
"loss": 20.2114,
"rewards/accuracies": 0.0,
"rewards/chosen": -25.441608428955078,
"rewards/margins": -14.955782890319824,
"rewards/rejected": -10.485824584960938,
"step": 410
},
{
"epoch": 0.7358156028368794,
"grad_norm": 158.54062894661507,
"learning_rate": 9.91959438312451e-09,
"logps/chosen": -2.586798667907715,
"logps/rejected": -1.0556939840316772,
"loss": 20.7519,
"rewards/accuracies": 0.0,
"rewards/chosen": -25.867984771728516,
"rewards/margins": -15.31104564666748,
"rewards/rejected": -10.556941032409668,
"step": 415
},
{
"epoch": 0.7446808510638298,
"grad_norm": 145.7088174939863,
"learning_rate": 9.309163118477952e-09,
"logps/chosen": -2.7213327884674072,
"logps/rejected": -1.10366690158844,
"loss": 20.9045,
"rewards/accuracies": 0.0,
"rewards/chosen": -27.213327407836914,
"rewards/margins": -16.176658630371094,
"rewards/rejected": -11.036666870117188,
"step": 420
},
{
"epoch": 0.7535460992907801,
"grad_norm": 140.87803118012943,
"learning_rate": 8.713792208618095e-09,
"logps/chosen": -2.7333548069000244,
"logps/rejected": -1.0871623754501343,
"loss": 21.3148,
"rewards/accuracies": 0.0,
"rewards/chosen": -27.333545684814453,
"rewards/margins": -16.461925506591797,
"rewards/rejected": -10.871623039245605,
"step": 425
},
{
"epoch": 0.7624113475177305,
"grad_norm": 148.2931552966102,
"learning_rate": 8.134053101530814e-09,
"logps/chosen": -2.6974215507507324,
"logps/rejected": -1.0488805770874023,
"loss": 22.011,
"rewards/accuracies": 0.0,
"rewards/chosen": -26.974212646484375,
"rewards/margins": -16.485408782958984,
"rewards/rejected": -10.488804817199707,
"step": 430
},
{
"epoch": 0.7712765957446809,
"grad_norm": 156.37941932984688,
"learning_rate": 7.570502241509161e-09,
"logps/chosen": -2.6490612030029297,
"logps/rejected": -1.0499160289764404,
"loss": 20.7465,
"rewards/accuracies": 0.0,
"rewards/chosen": -26.490612030029297,
"rewards/margins": -15.991450309753418,
"rewards/rejected": -10.499159812927246,
"step": 435
},
{
"epoch": 0.7801418439716312,
"grad_norm": 142.1179108847554,
"learning_rate": 7.023680535067997e-09,
"logps/chosen": -2.540945053100586,
"logps/rejected": -1.1362855434417725,
"loss": 20.9313,
"rewards/accuracies": 0.0,
"rewards/chosen": -25.40945053100586,
"rewards/margins": -14.046595573425293,
"rewards/rejected": -11.362855911254883,
"step": 440
},
{
"epoch": 0.7890070921985816,
"grad_norm": 142.7591601522917,
"learning_rate": 6.4941128317718e-09,
"logps/chosen": -2.71937894821167,
"logps/rejected": -1.0216131210327148,
"loss": 21.3747,
"rewards/accuracies": 0.0,
"rewards/chosen": -27.19378662109375,
"rewards/margins": -16.977657318115234,
"rewards/rejected": -10.216130256652832,
"step": 445
},
{
"epoch": 0.7978723404255319,
"grad_norm": 146.19832437425603,
"learning_rate": 5.9823074204745e-09,
"logps/chosen": -2.6064822673797607,
"logps/rejected": -1.1812522411346436,
"loss": 20.7936,
"rewards/accuracies": 0.0,
"rewards/chosen": -26.0648250579834,
"rewards/margins": -14.252301216125488,
"rewards/rejected": -11.812520980834961,
"step": 450
},
{
"epoch": 0.8067375886524822,
"grad_norm": 138.88047068502686,
"learning_rate": 5.4887555414543344e-09,
"logps/chosen": -2.6501286029815674,
"logps/rejected": -1.0125081539154053,
"loss": 20.8237,
"rewards/accuracies": 0.0,
"rewards/chosen": -26.50128746032715,
"rewards/margins": -16.376205444335938,
"rewards/rejected": -10.125081062316895,
"step": 455
},
{
"epoch": 0.8156028368794326,
"grad_norm": 136.88698229561803,
"learning_rate": 5.0139309149124755e-09,
"logps/chosen": -2.5912132263183594,
"logps/rejected": -1.1049864292144775,
"loss": 20.3607,
"rewards/accuracies": 0.0,
"rewards/chosen": -25.91213035583496,
"rewards/margins": -14.862268447875977,
"rewards/rejected": -11.049863815307617,
"step": 460
},
{
"epoch": 0.824468085106383,
"grad_norm": 150.56544212976812,
"learning_rate": 4.558289286287545e-09,
"logps/chosen": -2.563302516937256,
"logps/rejected": -1.1222257614135742,
"loss": 20.5554,
"rewards/accuracies": 0.0,
"rewards/chosen": -25.63302993774414,
"rewards/margins": -14.41076946258545,
"rewards/rejected": -11.222257614135742,
"step": 465
},
{
"epoch": 0.8333333333333334,
"grad_norm": 151.83025187850424,
"learning_rate": 4.122267988822792e-09,
"logps/chosen": -2.8155665397644043,
"logps/rejected": -1.1182525157928467,
"loss": 20.6342,
"rewards/accuracies": 0.0,
"rewards/chosen": -28.15566635131836,
"rewards/margins": -16.973140716552734,
"rewards/rejected": -11.182525634765625,
"step": 470
},
{
"epoch": 0.8421985815602837,
"grad_norm": 143.90652908649776,
"learning_rate": 3.7062855238055775e-09,
"logps/chosen": -2.8534111976623535,
"logps/rejected": -1.079416275024414,
"loss": 21.6896,
"rewards/accuracies": 0.0,
"rewards/chosen": -28.534109115600586,
"rewards/margins": -17.739948272705078,
"rewards/rejected": -10.794163703918457,
"step": 475
},
{
"epoch": 0.851063829787234,
"grad_norm": 146.70532693647684,
"learning_rate": 3.310741158882052e-09,
"logps/chosen": -2.5945401191711426,
"logps/rejected": -1.1171306371688843,
"loss": 20.8278,
"rewards/accuracies": 0.0,
"rewards/chosen": -25.945398330688477,
"rewards/margins": -14.774093627929688,
"rewards/rejected": -11.171305656433105,
"step": 480
},
{
"epoch": 0.8599290780141844,
"grad_norm": 152.1225967764815,
"learning_rate": 2.9360145448327933e-09,
"logps/chosen": -2.7132070064544678,
"logps/rejected": -1.10798978805542,
"loss": 20.2016,
"rewards/accuracies": 0.0,
"rewards/chosen": -27.132068634033203,
"rewards/margins": -16.052169799804688,
"rewards/rejected": -11.079896926879883,
"step": 485
},
{
"epoch": 0.8687943262411347,
"grad_norm": 148.4298648799797,
"learning_rate": 2.5824653511768905e-09,
"logps/chosen": -2.7444825172424316,
"logps/rejected": -1.0877773761749268,
"loss": 20.6239,
"rewards/accuracies": 0.0,
"rewards/chosen": -27.44482421875,
"rewards/margins": -16.56705093383789,
"rewards/rejected": -10.877774238586426,
"step": 490
},
{
"epoch": 0.8776595744680851,
"grad_norm": 149.3789958144051,
"learning_rate": 2.250432920954584e-09,
"logps/chosen": -2.625457286834717,
"logps/rejected": -1.1413992643356323,
"loss": 20.8323,
"rewards/accuracies": 0.0,
"rewards/chosen": -26.25457191467285,
"rewards/margins": -14.840580940246582,
"rewards/rejected": -11.413991928100586,
"step": 495
},
{
"epoch": 0.8865248226950354,
"grad_norm": 149.24437602693683,
"learning_rate": 1.9402359450194834e-09,
"logps/chosen": -2.633744716644287,
"logps/rejected": -1.0395913124084473,
"loss": 21.1488,
"rewards/accuracies": 0.0,
"rewards/chosen": -26.337448120117188,
"rewards/margins": -15.941534042358398,
"rewards/rejected": -10.395914077758789,
"step": 500
},
{
"epoch": 0.8953900709219859,
"grad_norm": 148.69800996723336,
"learning_rate": 1.6521721561532642e-09,
"logps/chosen": -2.753237724304199,
"logps/rejected": -1.091736078262329,
"loss": 21.1955,
"rewards/accuracies": 0.0,
"rewards/chosen": -27.53237533569336,
"rewards/margins": -16.61501693725586,
"rewards/rejected": -10.91736125946045,
"step": 505
},
{
"epoch": 0.9042553191489362,
"grad_norm": 140.23345832085906,
"learning_rate": 1.3865180432961975e-09,
"logps/chosen": -2.7156546115875244,
"logps/rejected": -1.033905029296875,
"loss": 20.9517,
"rewards/accuracies": 0.0,
"rewards/chosen": -27.156545639038086,
"rewards/margins": -16.8174991607666,
"rewards/rejected": -10.33905029296875,
"step": 510
},
{
"epoch": 0.9131205673758865,
"grad_norm": 135.7495898891431,
"learning_rate": 1.1435285861680105e-09,
"logps/chosen": -2.6717796325683594,
"logps/rejected": -1.1057870388031006,
"loss": 20.2493,
"rewards/accuracies": 0.0,
"rewards/chosen": -26.71779441833496,
"rewards/margins": -15.65992546081543,
"rewards/rejected": -11.057870864868164,
"step": 515
},
{
"epoch": 0.9219858156028369,
"grad_norm": 146.54140042392083,
"learning_rate": 9.234370105336037e-10,
"logps/chosen": -2.7197585105895996,
"logps/rejected": -1.0512771606445312,
"loss": 20.9221,
"rewards/accuracies": 0.0,
"rewards/chosen": -27.197586059570312,
"rewards/margins": -16.684812545776367,
"rewards/rejected": -10.512772560119629,
"step": 520
},
{
"epoch": 0.9308510638297872,
"grad_norm": 154.92783543731412,
"learning_rate": 7.264545643486997e-10,
"logps/chosen": -2.7849745750427246,
"logps/rejected": -1.0905792713165283,
"loss": 21.5713,
"rewards/accuracies": 0.0,
"rewards/chosen": -27.849742889404297,
"rewards/margins": -16.943950653076172,
"rewards/rejected": -10.905792236328125,
"step": 525
},
{
"epoch": 0.9397163120567376,
"grad_norm": 151.56459387858007,
"learning_rate": 5.527703150001173e-10,
"logps/chosen": -2.8318445682525635,
"logps/rejected": -1.042283535003662,
"loss": 21.4995,
"rewards/accuracies": 0.0,
"rewards/chosen": -28.31844711303711,
"rewards/margins": -17.895610809326172,
"rewards/rejected": -10.422834396362305,
"step": 530
},
{
"epoch": 0.9485815602836879,
"grad_norm": 163.48745110663447,
"learning_rate": 4.0255096783542995e-10,
"logps/chosen": -2.5114052295684814,
"logps/rejected": -1.1133381128311157,
"loss": 20.1608,
"rewards/accuracies": 0.0,
"rewards/chosen": -25.114055633544922,
"rewards/margins": -13.980671882629395,
"rewards/rejected": -11.133380889892578,
"step": 535
},
{
"epoch": 0.9574468085106383,
"grad_norm": 149.97953788101626,
"learning_rate": 2.759407061560942e-10,
"logps/chosen": -2.641373872756958,
"logps/rejected": -1.0878345966339111,
"loss": 20.6395,
"rewards/accuracies": 0.0,
"rewards/chosen": -26.41373634338379,
"rewards/margins": -15.535390853881836,
"rewards/rejected": -10.878347396850586,
"step": 540
},
{
"epoch": 0.9663120567375887,
"grad_norm": 146.82377405274374,
"learning_rate": 1.7306105282764162e-10,
"logps/chosen": -2.7007031440734863,
"logps/rejected": -1.1214299201965332,
"loss": 21.5189,
"rewards/accuracies": 0.0,
"rewards/chosen": -27.007030487060547,
"rewards/margins": -15.792730331420898,
"rewards/rejected": -11.214300155639648,
"step": 545
},
{
"epoch": 0.975177304964539,
"grad_norm": 146.05557779309504,
"learning_rate": 9.401075363981436e-11,
"logps/chosen": -2.5641696453094482,
"logps/rejected": -1.0827744007110596,
"loss": 20.7057,
"rewards/accuracies": 0.0,
"rewards/chosen": -25.641698837280273,
"rewards/margins": -14.813952445983887,
"rewards/rejected": -10.827742576599121,
"step": 550
},
{
"epoch": 0.9840425531914894,
"grad_norm": 141.1483903438878,
"learning_rate": 3.886568252850497e-11,
"logps/chosen": -2.5521302223205566,
"logps/rejected": -1.1713207960128784,
"loss": 20.7053,
"rewards/accuracies": 0.0,
"rewards/chosen": -25.521303176879883,
"rewards/margins": -13.808095932006836,
"rewards/rejected": -11.713208198547363,
"step": 555
},
{
"epoch": 0.9929078014184397,
"grad_norm": 150.04371038548183,
"learning_rate": 7.678768750579711e-12,
"logps/chosen": -2.5215792655944824,
"logps/rejected": -1.0503337383270264,
"loss": 20.6892,
"rewards/accuracies": 0.10000000149011612,
"rewards/chosen": -25.215795516967773,
"rewards/margins": -14.712457656860352,
"rewards/rejected": -10.503335952758789,
"step": 560
},
{
"epoch": 1.0,
"step": 564,
"total_flos": 0.0,
"train_loss": 22.58866818745931,
"train_runtime": 4347.6828,
"train_samples_per_second": 1.297,
"train_steps_per_second": 0.13
}
],
"logging_steps": 5,
"max_steps": 564,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1000000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}