inter-play-sim-assistant-dpo-test / trainer_state.json
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9997327870312639,
"eval_steps": 500,
"global_step": 1403,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0007125679166295537,
"grad_norm": 35.624961853027344,
"learning_rate": 0.0,
"logits/chosen": -3.107421875,
"logits/rejected": -3.0234375,
"logps/chosen": -106.375,
"logps/rejected": -64.125,
"loss": 0.6914,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.0035628395831477687,
"grad_norm": 26.24993324279785,
"learning_rate": 2.8368794326241133e-08,
"logits/chosen": -3.1044921875,
"logits/rejected": -3.08642578125,
"logps/chosen": -95.46875,
"logps/rejected": -64.515625,
"loss": 0.6931,
"rewards/accuracies": 0.1875,
"rewards/chosen": -0.0012693405151367188,
"rewards/margins": -0.0015630722045898438,
"rewards/rejected": 0.0002932548522949219,
"step": 5
},
{
"epoch": 0.0071256791662955375,
"grad_norm": 24.48309326171875,
"learning_rate": 6.382978723404254e-08,
"logits/chosen": -3.10546875,
"logits/rejected": -3.072265625,
"logps/chosen": -88.23750305175781,
"logps/rejected": -55.287498474121094,
"loss": 0.6921,
"rewards/accuracies": 0.3375000059604645,
"rewards/chosen": 0.00015716553025413305,
"rewards/margins": 0.002190017607063055,
"rewards/rejected": -0.0020355223678052425,
"step": 10
},
{
"epoch": 0.010688518749443307,
"grad_norm": 54.63581848144531,
"learning_rate": 9.929078014184397e-08,
"logits/chosen": -3.080078125,
"logits/rejected": -3.0648436546325684,
"logps/chosen": -98.9437484741211,
"logps/rejected": -59.84375,
"loss": 0.6889,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.0001586914004292339,
"rewards/margins": 0.00680465716868639,
"rewards/rejected": -0.006960677914321423,
"step": 15
},
{
"epoch": 0.014251358332591075,
"grad_norm": 50.01255416870117,
"learning_rate": 1.3475177304964538e-07,
"logits/chosen": -3.099609375,
"logits/rejected": -3.0687499046325684,
"logps/chosen": -101.0562515258789,
"logps/rejected": -56.58124923706055,
"loss": 0.685,
"rewards/accuracies": 0.46875,
"rewards/chosen": 0.006333542056381702,
"rewards/margins": 0.01415863074362278,
"rewards/rejected": -0.00781860388815403,
"step": 20
},
{
"epoch": 0.017814197915738843,
"grad_norm": 142.6243896484375,
"learning_rate": 1.702127659574468e-07,
"logits/chosen": -3.076171875,
"logits/rejected": -3.0640625953674316,
"logps/chosen": -111.58125305175781,
"logps/rejected": -76.98750305175781,
"loss": 0.6806,
"rewards/accuracies": 0.4625000059604645,
"rewards/chosen": 0.01683807373046875,
"rewards/margins": 0.02606506273150444,
"rewards/rejected": -0.009222030639648438,
"step": 25
},
{
"epoch": 0.021377037498886614,
"grad_norm": 16.588232040405273,
"learning_rate": 2.0567375886524822e-07,
"logits/chosen": -3.083203077316284,
"logits/rejected": -3.0679688453674316,
"logps/chosen": -106.0250015258789,
"logps/rejected": -67.875,
"loss": 0.6634,
"rewards/accuracies": 0.518750011920929,
"rewards/chosen": 0.05385131761431694,
"rewards/margins": 0.06231040880084038,
"rewards/rejected": -0.008445357903838158,
"step": 30
},
{
"epoch": 0.024939877082034382,
"grad_norm": 23.556861877441406,
"learning_rate": 2.411347517730496e-07,
"logits/chosen": -3.0796875953674316,
"logits/rejected": -3.08203125,
"logps/chosen": -88.76249694824219,
"logps/rejected": -53.837501525878906,
"loss": 0.6479,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": 0.09301147609949112,
"rewards/margins": 0.10019302368164062,
"rewards/rejected": -0.00728950509801507,
"step": 35
},
{
"epoch": 0.02850271666518215,
"grad_norm": 29.775815963745117,
"learning_rate": 2.7659574468085106e-07,
"logits/chosen": -3.0914063453674316,
"logits/rejected": -3.057421922683716,
"logps/chosen": -102.15625,
"logps/rejected": -71.5250015258789,
"loss": 0.6043,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": 0.202159121632576,
"rewards/margins": 0.22055740654468536,
"rewards/rejected": -0.01839141920208931,
"step": 40
},
{
"epoch": 0.03206555624832992,
"grad_norm": 25.135190963745117,
"learning_rate": 3.1205673758865245e-07,
"logits/chosen": -3.071093797683716,
"logits/rejected": -3.0601563453674316,
"logps/chosen": -87.21875,
"logps/rejected": -47.756248474121094,
"loss": 0.6027,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": 0.25634080171585083,
"rewards/margins": 0.26132506132125854,
"rewards/rejected": -0.005035400390625,
"step": 45
},
{
"epoch": 0.035628395831477686,
"grad_norm": 13.338802337646484,
"learning_rate": 3.475177304964539e-07,
"logits/chosen": -3.0531249046325684,
"logits/rejected": -3.065234422683716,
"logps/chosen": -107.1875,
"logps/rejected": -79.0374984741211,
"loss": 0.5993,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": 0.207794189453125,
"rewards/margins": 0.29575881361961365,
"rewards/rejected": -0.08815918117761612,
"step": 50
},
{
"epoch": 0.03919123541462546,
"grad_norm": 12.088021278381348,
"learning_rate": 3.829787234042553e-07,
"logits/chosen": -3.0746092796325684,
"logits/rejected": -3.073437452316284,
"logps/chosen": -91.6187515258789,
"logps/rejected": -63.34375,
"loss": 0.5622,
"rewards/accuracies": 0.6875,
"rewards/chosen": 0.36282652616500854,
"rewards/margins": 0.417471319437027,
"rewards/rejected": -0.054642487317323685,
"step": 55
},
{
"epoch": 0.04275407499777323,
"grad_norm": 13.686232566833496,
"learning_rate": 4.184397163120567e-07,
"logits/chosen": -3.083984375,
"logits/rejected": -3.079296827316284,
"logps/chosen": -101.8062515258789,
"logps/rejected": -72.42500305175781,
"loss": 0.5372,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": 0.42637938261032104,
"rewards/margins": 0.6403244137763977,
"rewards/rejected": -0.2142478972673416,
"step": 60
},
{
"epoch": 0.04631691458092099,
"grad_norm": 10.232641220092773,
"learning_rate": 4.5390070921985813e-07,
"logits/chosen": -3.065624952316284,
"logits/rejected": -3.0667967796325684,
"logps/chosen": -86.7750015258789,
"logps/rejected": -56.353126525878906,
"loss": 0.5248,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": 0.6796913146972656,
"rewards/margins": 0.741424560546875,
"rewards/rejected": -0.061974335461854935,
"step": 65
},
{
"epoch": 0.049879754164068764,
"grad_norm": 17.83266830444336,
"learning_rate": 4.893617021276595e-07,
"logits/chosen": -3.049999952316284,
"logits/rejected": -3.05078125,
"logps/chosen": -101.2874984741211,
"logps/rejected": -77.9749984741211,
"loss": 0.5423,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": 0.3456260561943054,
"rewards/margins": 0.6742362976074219,
"rewards/rejected": -0.32913780212402344,
"step": 70
},
{
"epoch": 0.05344259374721653,
"grad_norm": 23.34439468383789,
"learning_rate": 5.248226950354609e-07,
"logits/chosen": -3.060546875,
"logits/rejected": -3.0562500953674316,
"logps/chosen": -86.29374694824219,
"logps/rejected": -62.92499923706055,
"loss": 0.5162,
"rewards/accuracies": 0.7875000238418579,
"rewards/chosen": 0.6199722290039062,
"rewards/margins": 0.7961105108261108,
"rewards/rejected": -0.176055908203125,
"step": 75
},
{
"epoch": 0.0570054333303643,
"grad_norm": 13.863036155700684,
"learning_rate": 5.602836879432624e-07,
"logits/chosen": -3.065624952316284,
"logits/rejected": -3.0374999046325684,
"logps/chosen": -101.5625,
"logps/rejected": -79.7750015258789,
"loss": 0.5003,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.655413806438446,
"rewards/margins": 0.987408459186554,
"rewards/rejected": -0.33138352632522583,
"step": 80
},
{
"epoch": 0.06056827291351207,
"grad_norm": 14.414058685302734,
"learning_rate": 5.957446808510638e-07,
"logits/chosen": -3.0542969703674316,
"logits/rejected": -3.07421875,
"logps/chosen": -93.625,
"logps/rejected": -69.44999694824219,
"loss": 0.5144,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": 0.5085555911064148,
"rewards/margins": 0.8773147463798523,
"rewards/rejected": -0.3694648742675781,
"step": 85
},
{
"epoch": 0.06413111249665984,
"grad_norm": 12.498698234558105,
"learning_rate": 6.312056737588652e-07,
"logits/chosen": -3.056640625,
"logits/rejected": -3.055859327316284,
"logps/chosen": -115.61250305175781,
"logps/rejected": -93.48750305175781,
"loss": 0.519,
"rewards/accuracies": 0.768750011920929,
"rewards/chosen": 0.4992126524448395,
"rewards/margins": 0.902966320514679,
"rewards/rejected": -0.4039718508720398,
"step": 90
},
{
"epoch": 0.0676939520798076,
"grad_norm": 11.825833320617676,
"learning_rate": 6.666666666666666e-07,
"logits/chosen": -3.048046827316284,
"logits/rejected": -3.072265625,
"logps/chosen": -88.01249694824219,
"logps/rejected": -62.79375076293945,
"loss": 0.4863,
"rewards/accuracies": 0.893750011920929,
"rewards/chosen": 0.911865234375,
"rewards/margins": 1.028173804283142,
"rewards/rejected": -0.11660919338464737,
"step": 95
},
{
"epoch": 0.07125679166295537,
"grad_norm": 17.15355682373047,
"learning_rate": 7.021276595744681e-07,
"logits/chosen": -3.020703077316284,
"logits/rejected": -3.037890672683716,
"logps/chosen": -92.09375,
"logps/rejected": -78.25,
"loss": 0.4907,
"rewards/accuracies": 0.8687499761581421,
"rewards/chosen": 0.714630126953125,
"rewards/margins": 0.987384021282196,
"rewards/rejected": -0.2730239927768707,
"step": 100
},
{
"epoch": 0.07481963124610315,
"grad_norm": 14.750085830688477,
"learning_rate": 7.375886524822694e-07,
"logits/chosen": -3.033203125,
"logits/rejected": -3.0386719703674316,
"logps/chosen": -98.8062515258789,
"logps/rejected": -73.3062515258789,
"loss": 0.4708,
"rewards/accuracies": 0.862500011920929,
"rewards/chosen": 0.8927696347236633,
"rewards/margins": 1.1517212390899658,
"rewards/rejected": -0.25947266817092896,
"step": 105
},
{
"epoch": 0.07838247082925091,
"grad_norm": 9.73326301574707,
"learning_rate": 7.730496453900709e-07,
"logits/chosen": -3.0648436546325684,
"logits/rejected": -3.0335936546325684,
"logps/chosen": -92.2874984741211,
"logps/rejected": -64.23124694824219,
"loss": 0.4229,
"rewards/accuracies": 0.8687499761581421,
"rewards/chosen": 1.2247527837753296,
"rewards/margins": 1.42816162109375,
"rewards/rejected": -0.20256957411766052,
"step": 110
},
{
"epoch": 0.08194531041239868,
"grad_norm": 8.861647605895996,
"learning_rate": 8.085106382978723e-07,
"logits/chosen": -3.0152344703674316,
"logits/rejected": -3.016796827316284,
"logps/chosen": -84.01875305175781,
"logps/rejected": -62.01250076293945,
"loss": 0.4404,
"rewards/accuracies": 0.8687499761581421,
"rewards/chosen": 1.268524169921875,
"rewards/margins": 1.392974853515625,
"rewards/rejected": -0.12417755275964737,
"step": 115
},
{
"epoch": 0.08550814999554646,
"grad_norm": 11.753287315368652,
"learning_rate": 8.439716312056737e-07,
"logits/chosen": -3.021484375,
"logits/rejected": -3.0257811546325684,
"logps/chosen": -70.6187515258789,
"logps/rejected": -54.650001525878906,
"loss": 0.457,
"rewards/accuracies": 0.893750011920929,
"rewards/chosen": 1.0282318592071533,
"rewards/margins": 1.1394774913787842,
"rewards/rejected": -0.11103515326976776,
"step": 120
},
{
"epoch": 0.08907098957869422,
"grad_norm": 16.672988891601562,
"learning_rate": 8.794326241134752e-07,
"logits/chosen": -2.983593702316284,
"logits/rejected": -3.01171875,
"logps/chosen": -84.30000305175781,
"logps/rejected": -61.98125076293945,
"loss": 0.41,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": 1.290094017982483,
"rewards/margins": 1.4578125476837158,
"rewards/rejected": -0.16876526176929474,
"step": 125
},
{
"epoch": 0.09263382916184199,
"grad_norm": 10.004197120666504,
"learning_rate": 9.148936170212766e-07,
"logits/chosen": -2.984375,
"logits/rejected": -3.0121092796325684,
"logps/chosen": -82.8375015258789,
"logps/rejected": -61.41875076293945,
"loss": 0.4163,
"rewards/accuracies": 0.84375,
"rewards/chosen": 1.3681640625,
"rewards/margins": 1.491455078125,
"rewards/rejected": -0.12366028130054474,
"step": 130
},
{
"epoch": 0.09619666874498976,
"grad_norm": 12.494372367858887,
"learning_rate": 9.50354609929078e-07,
"logits/chosen": -2.975781202316284,
"logits/rejected": -2.9828124046325684,
"logps/chosen": -90.48750305175781,
"logps/rejected": -74.5562515258789,
"loss": 0.3907,
"rewards/accuracies": 0.8812500238418579,
"rewards/chosen": 1.3906981945037842,
"rewards/margins": 1.6629638671875,
"rewards/rejected": -0.2710815370082855,
"step": 135
},
{
"epoch": 0.09975950832813753,
"grad_norm": 13.188983917236328,
"learning_rate": 9.858156028368794e-07,
"logits/chosen": -2.9781250953674316,
"logits/rejected": -2.955859422683716,
"logps/chosen": -76.2437515258789,
"logps/rejected": -61.16875076293945,
"loss": 0.3928,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": 1.412744164466858,
"rewards/margins": 1.584985375404358,
"rewards/rejected": -0.17142944037914276,
"step": 140
},
{
"epoch": 0.10332234791128529,
"grad_norm": 12.645596504211426,
"learning_rate": 9.999860568295915e-07,
"logits/chosen": -2.948046922683716,
"logits/rejected": -2.9703125953674316,
"logps/chosen": -78.86250305175781,
"logps/rejected": -66.6875,
"loss": 0.4173,
"rewards/accuracies": 0.875,
"rewards/chosen": 1.195831298828125,
"rewards/margins": 1.5179870128631592,
"rewards/rejected": -0.32008057832717896,
"step": 145
},
{
"epoch": 0.10688518749443306,
"grad_norm": 11.21827220916748,
"learning_rate": 9.999008513821418e-07,
"logits/chosen": -2.9410157203674316,
"logits/rejected": -2.9488282203674316,
"logps/chosen": -77.67500305175781,
"logps/rejected": -57.64374923706055,
"loss": 0.3616,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": 1.391027808189392,
"rewards/margins": 1.7245604991912842,
"rewards/rejected": -0.3327087461948395,
"step": 150
},
{
"epoch": 0.11044802707758083,
"grad_norm": 19.32581901550293,
"learning_rate": 9.997381998772935e-07,
"logits/chosen": -2.928515672683716,
"logits/rejected": -2.944531202316284,
"logps/chosen": -94.01249694824219,
"logps/rejected": -76.98750305175781,
"loss": 0.3437,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": 1.4373290538787842,
"rewards/margins": 1.874505639076233,
"rewards/rejected": -0.4372314512729645,
"step": 155
},
{
"epoch": 0.1140108666607286,
"grad_norm": 9.437000274658203,
"learning_rate": 9.99498127513479e-07,
"logits/chosen": -2.9027342796325684,
"logits/rejected": -2.920703172683716,
"logps/chosen": -74.16874694824219,
"logps/rejected": -60.243751525878906,
"loss": 0.3419,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": 1.5391356945037842,
"rewards/margins": 2.053997755050659,
"rewards/rejected": -0.51385498046875,
"step": 160
},
{
"epoch": 0.11757370624387636,
"grad_norm": 7.837158679962158,
"learning_rate": 9.991806714833894e-07,
"logits/chosen": -2.9039063453674316,
"logits/rejected": -2.9156250953674316,
"logps/chosen": -87.0875015258789,
"logps/rejected": -70.16874694824219,
"loss": 0.3555,
"rewards/accuracies": 0.90625,
"rewards/chosen": 1.413110375404358,
"rewards/margins": 1.951440453529358,
"rewards/rejected": -0.537158191204071,
"step": 165
},
{
"epoch": 0.12113654582702414,
"grad_norm": 11.675161361694336,
"learning_rate": 9.987858809682132e-07,
"logits/chosen": -2.8902344703674316,
"logits/rejected": -2.910937547683716,
"logps/chosen": -80.60624694824219,
"logps/rejected": -64.75,
"loss": 0.3047,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": 1.405981421470642,
"rewards/margins": 2.0376954078674316,
"rewards/rejected": -0.629956066608429,
"step": 170
},
{
"epoch": 0.1246993854101719,
"grad_norm": 10.536681175231934,
"learning_rate": 9.983138171300162e-07,
"logits/chosen": -2.8675780296325684,
"logits/rejected": -2.88671875,
"logps/chosen": -80.84375,
"logps/rejected": -67.55000305175781,
"loss": 0.3357,
"rewards/accuracies": 0.918749988079071,
"rewards/chosen": 1.3307921886444092,
"rewards/margins": 1.8141601085662842,
"rewards/rejected": -0.4823974668979645,
"step": 175
},
{
"epoch": 0.12826222499331968,
"grad_norm": 18.031932830810547,
"learning_rate": 9.977645531022672e-07,
"logits/chosen": -2.8734374046325684,
"logits/rejected": -2.896484375,
"logps/chosen": -76.4375,
"logps/rejected": -72.40625,
"loss": 0.3215,
"rewards/accuracies": 0.8687499761581421,
"rewards/chosen": 1.909423828125,
"rewards/margins": 2.198779344558716,
"rewards/rejected": -0.28810423612594604,
"step": 180
},
{
"epoch": 0.13182506457646745,
"grad_norm": 11.270240783691406,
"learning_rate": 9.971381739785065e-07,
"logits/chosen": -2.859375,
"logits/rejected": -2.883593797683716,
"logps/chosen": -90.1500015258789,
"logps/rejected": -74.66874694824219,
"loss": 0.3281,
"rewards/accuracies": 0.893750011920929,
"rewards/chosen": 1.711999535560608,
"rewards/margins": 2.1112303733825684,
"rewards/rejected": -0.39727783203125,
"step": 185
},
{
"epoch": 0.1353879041596152,
"grad_norm": 7.635077476501465,
"learning_rate": 9.964347767991644e-07,
"logits/chosen": -2.8558592796325684,
"logits/rejected": -2.862499952316284,
"logps/chosen": -95.88749694824219,
"logps/rejected": -84.53125,
"loss": 0.2675,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": 1.230224609375,
"rewards/margins": 2.361132860183716,
"rewards/rejected": -1.1302001476287842,
"step": 190
},
{
"epoch": 0.13895074374276298,
"grad_norm": 9.399760246276855,
"learning_rate": 9.956544705365262e-07,
"logits/chosen": -2.8539061546325684,
"logits/rejected": -2.860546827316284,
"logps/chosen": -85.8375015258789,
"logps/rejected": -74.95625305175781,
"loss": 0.2563,
"rewards/accuracies": 0.90625,
"rewards/chosen": 1.636621117591858,
"rewards/margins": 2.4004883766174316,
"rewards/rejected": -0.7652435302734375,
"step": 195
},
{
"epoch": 0.14251358332591074,
"grad_norm": 20.745431900024414,
"learning_rate": 9.947973760778508e-07,
"logits/chosen": -2.830859422683716,
"logits/rejected": -2.856250047683716,
"logps/chosen": -73.9000015258789,
"logps/rejected": -63.41875076293945,
"loss": 0.2483,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": 1.80950927734375,
"rewards/margins": 2.4383788108825684,
"rewards/rejected": -0.6286865472793579,
"step": 200
},
{
"epoch": 0.1460764229090585,
"grad_norm": 17.357345581054688,
"learning_rate": 9.938636262066423e-07,
"logits/chosen": -2.821093797683716,
"logits/rejected": -2.8414063453674316,
"logps/chosen": -87.64375305175781,
"logps/rejected": -77.25,
"loss": 0.2311,
"rewards/accuracies": 0.9375,
"rewards/chosen": 1.82574462890625,
"rewards/margins": 2.6576170921325684,
"rewards/rejected": -0.8322509527206421,
"step": 205
},
{
"epoch": 0.1496392624922063,
"grad_norm": 12.413117408752441,
"learning_rate": 9.928533655820778e-07,
"logits/chosen": -2.8140625953674316,
"logits/rejected": -2.82421875,
"logps/chosen": -86.48124694824219,
"logps/rejected": -79.54374694824219,
"loss": 0.3201,
"rewards/accuracies": 0.875,
"rewards/chosen": 1.5425536632537842,
"rewards/margins": 2.359692335128784,
"rewards/rejected": -0.8163818120956421,
"step": 210
},
{
"epoch": 0.15320210207535406,
"grad_norm": 17.220603942871094,
"learning_rate": 9.917667507165988e-07,
"logits/chosen": -2.8363280296325684,
"logits/rejected": -2.8285155296325684,
"logps/chosen": -77.04374694824219,
"logps/rejected": -70.4375,
"loss": 0.2564,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": 1.592675805091858,
"rewards/margins": 2.5828003883361816,
"rewards/rejected": -0.9895995855331421,
"step": 215
},
{
"epoch": 0.15676494165850183,
"grad_norm": 8.446002960205078,
"learning_rate": 9.90603949951661e-07,
"logits/chosen": -2.8246092796325684,
"logits/rejected": -2.837109327316284,
"logps/chosen": -91.78125,
"logps/rejected": -82.59375,
"loss": 0.2734,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": 1.702734351158142,
"rewards/margins": 2.5569825172424316,
"rewards/rejected": -0.854077160358429,
"step": 220
},
{
"epoch": 0.1603277812416496,
"grad_norm": 7.998837471008301,
"learning_rate": 9.89365143431656e-07,
"logits/chosen": -2.815624952316284,
"logits/rejected": -2.842578172683716,
"logps/chosen": -77.2125015258789,
"logps/rejected": -77.8125,
"loss": 0.1894,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": 2.030078172683716,
"rewards/margins": 2.8941407203674316,
"rewards/rejected": -0.8644775152206421,
"step": 225
},
{
"epoch": 0.16389062082479736,
"grad_norm": 9.701436996459961,
"learning_rate": 9.880505230760025e-07,
"logits/chosen": -2.787890672683716,
"logits/rejected": -2.826171875,
"logps/chosen": -73.625,
"logps/rejected": -74.35624694824219,
"loss": 0.2395,
"rewards/accuracies": 0.9312499761581421,
"rewards/chosen": 2.212085008621216,
"rewards/margins": 2.756884813308716,
"rewards/rejected": -0.543957531452179,
"step": 230
},
{
"epoch": 0.16745346040794512,
"grad_norm": 8.417997360229492,
"learning_rate": 9.866602925494141e-07,
"logits/chosen": -2.7718749046325684,
"logits/rejected": -2.817187547683716,
"logps/chosen": -90.19999694824219,
"logps/rejected": -81.1500015258789,
"loss": 0.2562,
"rewards/accuracies": 0.893750011920929,
"rewards/chosen": 1.513342261314392,
"rewards/margins": 2.689257860183716,
"rewards/rejected": -1.1742675304412842,
"step": 235
},
{
"epoch": 0.1710162999910929,
"grad_norm": 7.670560359954834,
"learning_rate": 9.851946672303459e-07,
"logits/chosen": -2.793750047683716,
"logits/rejected": -2.788281202316284,
"logps/chosen": -96.0374984741211,
"logps/rejected": -86.01249694824219,
"loss": 0.2326,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": 1.3016846179962158,
"rewards/margins": 3.0846190452575684,
"rewards/rejected": -1.782128930091858,
"step": 240
},
{
"epoch": 0.17457913957424068,
"grad_norm": 12.10120964050293,
"learning_rate": 9.836538741776283e-07,
"logits/chosen": -2.791015625,
"logits/rejected": -2.802734375,
"logps/chosen": -89.48124694824219,
"logps/rejected": -85.63749694824219,
"loss": 0.2696,
"rewards/accuracies": 0.90625,
"rewards/chosen": 1.33062744140625,
"rewards/margins": 2.7452392578125,
"rewards/rejected": -1.411718726158142,
"step": 245
},
{
"epoch": 0.17814197915738844,
"grad_norm": 9.357841491699219,
"learning_rate": 9.8203815209529e-07,
"logits/chosen": -2.78125,
"logits/rejected": -2.8042969703674316,
"logps/chosen": -73.15625,
"logps/rejected": -73.1312484741211,
"loss": 0.1795,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": 2.027270555496216,
"rewards/margins": 2.994921922683716,
"rewards/rejected": -0.967480480670929,
"step": 250
},
{
"epoch": 0.1817048187405362,
"grad_norm": 8.387929916381836,
"learning_rate": 9.80347751295577e-07,
"logits/chosen": -2.7914061546325684,
"logits/rejected": -2.8046875,
"logps/chosen": -96.3187484741211,
"logps/rejected": -98.40625,
"loss": 0.1988,
"rewards/accuracies": 0.9312499761581421,
"rewards/chosen": 2.1563477516174316,
"rewards/margins": 3.224609375,
"rewards/rejected": -1.0703246593475342,
"step": 255
},
{
"epoch": 0.18526765832368397,
"grad_norm": 7.9508137702941895,
"learning_rate": 9.78582933660175e-07,
"logits/chosen": -2.7789063453674316,
"logits/rejected": -2.7972655296325684,
"logps/chosen": -85.13749694824219,
"logps/rejected": -84.64375305175781,
"loss": 0.2766,
"rewards/accuracies": 0.8812500238418579,
"rewards/chosen": 2.044140577316284,
"rewards/margins": 2.802441358566284,
"rewards/rejected": -0.75970458984375,
"step": 260
},
{
"epoch": 0.18883049790683173,
"grad_norm": 6.185698509216309,
"learning_rate": 9.767439725996362e-07,
"logits/chosen": -2.753124952316284,
"logits/rejected": -2.770312547683716,
"logps/chosen": -89.39375305175781,
"logps/rejected": -89.0562515258789,
"loss": 0.2549,
"rewards/accuracies": 0.9437500238418579,
"rewards/chosen": 1.800146460533142,
"rewards/margins": 2.896484375,
"rewards/rejected": -1.096582055091858,
"step": 265
},
{
"epoch": 0.19239333748997953,
"grad_norm": 9.08353328704834,
"learning_rate": 9.748311530110229e-07,
"logits/chosen": -2.748828172683716,
"logits/rejected": -2.7718749046325684,
"logps/chosen": -100.5625,
"logps/rejected": -97.46875,
"loss": 0.2605,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": 1.925512671470642,
"rewards/margins": 3.23291015625,
"rewards/rejected": -1.30859375,
"step": 270
},
{
"epoch": 0.1959561770731273,
"grad_norm": 4.137091159820557,
"learning_rate": 9.728447712337691e-07,
"logits/chosen": -2.744921922683716,
"logits/rejected": -2.759765625,
"logps/chosen": -87.5999984741211,
"logps/rejected": -91.3062515258789,
"loss": 0.2278,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 2.05633544921875,
"rewards/margins": 3.37890625,
"rewards/rejected": -1.320715308189392,
"step": 275
},
{
"epoch": 0.19951901665627506,
"grad_norm": 12.414250373840332,
"learning_rate": 9.707851350037725e-07,
"logits/chosen": -2.729296922683716,
"logits/rejected": -2.7542967796325684,
"logps/chosen": -77.55000305175781,
"logps/rejected": -77.8125,
"loss": 0.1781,
"rewards/accuracies": 0.9437500238418579,
"rewards/chosen": 2.095141649246216,
"rewards/margins": 3.4671874046325684,
"rewards/rejected": -1.3744628429412842,
"step": 280
},
{
"epoch": 0.20308185623942282,
"grad_norm": 7.794823169708252,
"learning_rate": 9.686525634057183e-07,
"logits/chosen": -2.733203172683716,
"logits/rejected": -2.7464842796325684,
"logps/chosen": -99.1500015258789,
"logps/rejected": -100.1500015258789,
"loss": 0.231,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": 2.1944947242736816,
"rewards/margins": 3.321044921875,
"rewards/rejected": -1.124609351158142,
"step": 285
},
{
"epoch": 0.20664469582257058,
"grad_norm": 8.649138450622559,
"learning_rate": 9.664473868236452e-07,
"logits/chosen": -2.755859375,
"logits/rejected": -2.76953125,
"logps/chosen": -80.39375305175781,
"logps/rejected": -76.4312515258789,
"loss": 0.1813,
"rewards/accuracies": 0.9375,
"rewards/chosen": 2.514404296875,
"rewards/margins": 3.5414061546325684,
"rewards/rejected": -1.02783203125,
"step": 290
},
{
"epoch": 0.21020753540571835,
"grad_norm": 12.671677589416504,
"learning_rate": 9.641699468897624e-07,
"logits/chosen": -2.7093749046325684,
"logits/rejected": -2.739453077316284,
"logps/chosen": -60.45000076293945,
"logps/rejected": -56.525001525878906,
"loss": 0.2041,
"rewards/accuracies": 0.9437500238418579,
"rewards/chosen": 2.3620848655700684,
"rewards/margins": 3.497265577316284,
"rewards/rejected": -1.1365234851837158,
"step": 295
},
{
"epoch": 0.2137703749888661,
"grad_norm": 4.332060813903809,
"learning_rate": 9.618205964315222e-07,
"logits/chosen": -2.727734327316284,
"logits/rejected": -2.757031202316284,
"logps/chosen": -98.5374984741211,
"logps/rejected": -100.4937515258789,
"loss": 0.2254,
"rewards/accuracies": 0.90625,
"rewards/chosen": 1.587915062904358,
"rewards/margins": 3.080639600753784,
"rewards/rejected": -1.492285132408142,
"step": 300
},
{
"epoch": 0.2173332145720139,
"grad_norm": 15.351773262023926,
"learning_rate": 9.593996994169595e-07,
"logits/chosen": -2.7203125953674316,
"logits/rejected": -2.7249999046325684,
"logps/chosen": -75.66874694824219,
"logps/rejected": -77.15625,
"loss": 0.2185,
"rewards/accuracies": 0.893750011920929,
"rewards/chosen": 2.589599609375,
"rewards/margins": 3.6607422828674316,
"rewards/rejected": -1.0699951648712158,
"step": 305
},
{
"epoch": 0.22089605415516167,
"grad_norm": 26.818561553955078,
"learning_rate": 9.569076308983043e-07,
"logits/chosen": -2.696484327316284,
"logits/rejected": -2.7085938453674316,
"logps/chosen": -75.07499694824219,
"logps/rejected": -86.9749984741211,
"loss": 0.2727,
"rewards/accuracies": 0.893750011920929,
"rewards/chosen": 2.1810059547424316,
"rewards/margins": 3.570849657058716,
"rewards/rejected": -1.389892578125,
"step": 310
},
{
"epoch": 0.22445889373830943,
"grad_norm": 8.661190032958984,
"learning_rate": 9.54344776953878e-07,
"logits/chosen": -2.6617188453674316,
"logits/rejected": -2.6832032203674316,
"logps/chosen": -79.5875015258789,
"logps/rejected": -76.4124984741211,
"loss": 0.214,
"rewards/accuracies": 0.893750011920929,
"rewards/chosen": 1.545873999595642,
"rewards/margins": 3.3358397483825684,
"rewards/rejected": -1.7917969226837158,
"step": 315
},
{
"epoch": 0.2280217333214572,
"grad_norm": 10.709281921386719,
"learning_rate": 9.517115346282807e-07,
"logits/chosen": -2.677734375,
"logits/rejected": -2.713671922683716,
"logps/chosen": -81.0687484741211,
"logps/rejected": -86.5,
"loss": 0.3241,
"rewards/accuracies": 0.918749988079071,
"rewards/chosen": 2.3992676734924316,
"rewards/margins": 3.3475098609924316,
"rewards/rejected": -0.9508301019668579,
"step": 320
},
{
"epoch": 0.23158457290460496,
"grad_norm": 8.042264938354492,
"learning_rate": 9.490083118708802e-07,
"logits/chosen": -2.666015625,
"logits/rejected": -2.6875,
"logps/chosen": -82.4937515258789,
"logps/rejected": -84.82499694824219,
"loss": 0.2036,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": 2.216723680496216,
"rewards/margins": 3.622363328933716,
"rewards/rejected": -1.403662085533142,
"step": 325
},
{
"epoch": 0.23514741248775273,
"grad_norm": 5.525402545928955,
"learning_rate": 9.462355274726115e-07,
"logits/chosen": -2.670703172683716,
"logits/rejected": -2.70703125,
"logps/chosen": -77.34375,
"logps/rejected": -76.76875305175781,
"loss": 0.1855,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 2.365652561187744,
"rewards/margins": 3.461718797683716,
"rewards/rejected": -1.0956542491912842,
"step": 330
},
{
"epoch": 0.23871025207090052,
"grad_norm": 10.05048942565918,
"learning_rate": 9.433936110010956e-07,
"logits/chosen": -2.667187452316284,
"logits/rejected": -2.6871094703674316,
"logps/chosen": -78.17500305175781,
"logps/rejected": -76.9937515258789,
"loss": 0.1874,
"rewards/accuracies": 0.9375,
"rewards/chosen": 2.0904572010040283,
"rewards/margins": 3.5014405250549316,
"rewards/rejected": -1.4128906726837158,
"step": 335
},
{
"epoch": 0.24227309165404828,
"grad_norm": 9.706530570983887,
"learning_rate": 9.404830027340911e-07,
"logits/chosen": -2.6640625,
"logits/rejected": -2.694531202316284,
"logps/chosen": -69.0562515258789,
"logps/rejected": -75.125,
"loss": 0.2029,
"rewards/accuracies": 0.9375,
"rewards/chosen": 1.834741234779358,
"rewards/margins": 3.663867235183716,
"rewards/rejected": -1.8297851085662842,
"step": 340
},
{
"epoch": 0.24583593123719605,
"grad_norm": 6.624788284301758,
"learning_rate": 9.375041535912838e-07,
"logits/chosen": -2.639453172683716,
"logits/rejected": -2.6953125,
"logps/chosen": -92.89375305175781,
"logps/rejected": -91.64375305175781,
"loss": 0.1853,
"rewards/accuracies": 0.9312499761581421,
"rewards/chosen": 1.5345947742462158,
"rewards/margins": 3.5376954078674316,
"rewards/rejected": -2.003124952316284,
"step": 345
},
{
"epoch": 0.2493987708203438,
"grad_norm": 41.15028762817383,
"learning_rate": 9.344575250644295e-07,
"logits/chosen": -2.6402344703674316,
"logits/rejected": -2.6488280296325684,
"logps/chosen": -79.6812515258789,
"logps/rejected": -83.7249984741211,
"loss": 0.2387,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": 2.3847413063049316,
"rewards/margins": 4.047461032867432,
"rewards/rejected": -1.664794921875,
"step": 350
},
{
"epoch": 0.2529616104034916,
"grad_norm": 10.601265907287598,
"learning_rate": 9.313435891458587e-07,
"logits/chosen": -2.651562452316284,
"logits/rejected": -2.67578125,
"logps/chosen": -81.4000015258789,
"logps/rejected": -91.5875015258789,
"loss": 0.1739,
"rewards/accuracies": 0.956250011920929,
"rewards/chosen": 2.215380907058716,
"rewards/margins": 3.997180223464966,
"rewards/rejected": -1.7802734375,
"step": 355
},
{
"epoch": 0.25652444998663937,
"grad_norm": 22.471799850463867,
"learning_rate": 9.281628282553535e-07,
"logits/chosen": -2.627734422683716,
"logits/rejected": -2.673828125,
"logps/chosen": -83.95625305175781,
"logps/rejected": -93.0,
"loss": 0.2203,
"rewards/accuracies": 0.90625,
"rewards/chosen": 2.1271729469299316,
"rewards/margins": 3.6353516578674316,
"rewards/rejected": -1.506982445716858,
"step": 360
},
{
"epoch": 0.2600872895697871,
"grad_norm": 203.5772705078125,
"learning_rate": 9.249157351654104e-07,
"logits/chosen": -2.643359422683716,
"logits/rejected": -2.676562547683716,
"logps/chosen": -89.5687484741211,
"logps/rejected": -84.54374694824219,
"loss": 0.212,
"rewards/accuracies": 0.9312499761581421,
"rewards/chosen": 1.6217529773712158,
"rewards/margins": 3.0572266578674316,
"rewards/rejected": -1.4359023571014404,
"step": 365
},
{
"epoch": 0.2636501291529349,
"grad_norm": 9.114107131958008,
"learning_rate": 9.216028129248985e-07,
"logits/chosen": -2.63671875,
"logits/rejected": -2.673828125,
"logps/chosen": -92.1500015258789,
"logps/rejected": -92.40625,
"loss": 0.1802,
"rewards/accuracies": 0.956250011920929,
"rewards/chosen": 1.858483910560608,
"rewards/margins": 3.7662110328674316,
"rewards/rejected": -1.9098632335662842,
"step": 370
},
{
"epoch": 0.26721296873608263,
"grad_norm": 8.871646881103516,
"learning_rate": 9.182245747811248e-07,
"logits/chosen": -2.6390624046325684,
"logits/rejected": -2.655078172683716,
"logps/chosen": -91.5,
"logps/rejected": -87.88749694824219,
"loss": 0.2021,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": 2.110546827316284,
"rewards/margins": 3.6029295921325684,
"rewards/rejected": -1.4890625476837158,
"step": 375
},
{
"epoch": 0.2707758083192304,
"grad_norm": 25.285552978515625,
"learning_rate": 9.147815441003221e-07,
"logits/chosen": -2.653125047683716,
"logits/rejected": -2.666796922683716,
"logps/chosen": -91.6875,
"logps/rejected": -100.0687484741211,
"loss": 0.2094,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": 2.166583299636841,
"rewards/margins": 3.796191453933716,
"rewards/rejected": -1.6293213367462158,
"step": 380
},
{
"epoch": 0.2743386479023782,
"grad_norm": 11.9544038772583,
"learning_rate": 9.112742542865664e-07,
"logits/chosen": -2.623046875,
"logits/rejected": -2.647656202316284,
"logps/chosen": -69.10624694824219,
"logps/rejected": -73.8125,
"loss": 0.1568,
"rewards/accuracies": 0.9437500238418579,
"rewards/chosen": 2.197265625,
"rewards/margins": 4.13671875,
"rewards/rejected": -1.93994140625,
"step": 385
},
{
"epoch": 0.27790148748552596,
"grad_norm": 7.545533657073975,
"learning_rate": 9.077032486991407e-07,
"logits/chosen": -2.6390624046325684,
"logits/rejected": -2.6527342796325684,
"logps/chosen": -76.2125015258789,
"logps/rejected": -78.25,
"loss": 0.164,
"rewards/accuracies": 0.956250011920929,
"rewards/chosen": 2.2819581031799316,
"rewards/margins": 4.1171875,
"rewards/rejected": -1.8315918445587158,
"step": 390
},
{
"epoch": 0.28146432706867375,
"grad_norm": 17.746479034423828,
"learning_rate": 9.040690805683566e-07,
"logits/chosen": -2.6285157203674316,
"logits/rejected": -2.654296875,
"logps/chosen": -91.58125305175781,
"logps/rejected": -96.57499694824219,
"loss": 0.1974,
"rewards/accuracies": 0.9312499761581421,
"rewards/chosen": 2.3270201683044434,
"rewards/margins": 3.623730421066284,
"rewards/rejected": -1.2992675304412842,
"step": 395
},
{
"epoch": 0.2850271666518215,
"grad_norm": 6.393121719360352,
"learning_rate": 9.003723129098458e-07,
"logits/chosen": -2.5835938453674316,
"logits/rejected": -2.6171875,
"logps/chosen": -67.17500305175781,
"logps/rejected": -64.5374984741211,
"loss": 0.1381,
"rewards/accuracies": 0.956250011920929,
"rewards/chosen": 2.3965821266174316,
"rewards/margins": 3.896484375,
"rewards/rejected": -1.499121069908142,
"step": 400
},
{
"epoch": 0.2885900062349693,
"grad_norm": 9.168136596679688,
"learning_rate": 8.966135184373361e-07,
"logits/chosen": -2.59375,
"logits/rejected": -2.611328125,
"logps/chosen": -91.40625,
"logps/rejected": -89.26249694824219,
"loss": 0.1728,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 2.112548828125,
"rewards/margins": 3.8931641578674316,
"rewards/rejected": -1.783105492591858,
"step": 405
},
{
"epoch": 0.292152845818117,
"grad_norm": 7.881724834442139,
"learning_rate": 8.927932794739257e-07,
"logits/chosen": -2.578906297683716,
"logits/rejected": -2.610156297683716,
"logps/chosen": -74.96875,
"logps/rejected": -79.33125305175781,
"loss": 0.1594,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": 1.987982153892517,
"rewards/margins": 3.7542967796325684,
"rewards/rejected": -1.769140601158142,
"step": 410
},
{
"epoch": 0.2957156854012648,
"grad_norm": 17.415807723999023,
"learning_rate": 8.889121878618675e-07,
"logits/chosen": -2.5550780296325684,
"logits/rejected": -2.594921827316284,
"logps/chosen": -76.9124984741211,
"logps/rejected": -78.9375,
"loss": 0.1577,
"rewards/accuracies": 0.956250011920929,
"rewards/chosen": 1.731329321861267,
"rewards/margins": 3.586132764816284,
"rewards/rejected": -1.855126976966858,
"step": 415
},
{
"epoch": 0.2992785249844126,
"grad_norm": 19.104272842407227,
"learning_rate": 8.849708448708789e-07,
"logits/chosen": -2.5941405296325684,
"logits/rejected": -2.607421875,
"logps/chosen": -85.20625305175781,
"logps/rejected": -90.34375,
"loss": 0.1708,
"rewards/accuracies": 0.9312499761581421,
"rewards/chosen": 1.9668457508087158,
"rewards/margins": 4.154101371765137,
"rewards/rejected": -2.1869139671325684,
"step": 420
},
{
"epoch": 0.30284136456756033,
"grad_norm": 9.433489799499512,
"learning_rate": 8.809698611049922e-07,
"logits/chosen": -2.5746092796325684,
"logits/rejected": -2.6011719703674316,
"logps/chosen": -89.78125,
"logps/rejected": -102.30000305175781,
"loss": 0.1482,
"rewards/accuracies": 0.9437500238418579,
"rewards/chosen": 1.6952025890350342,
"rewards/margins": 4.217577934265137,
"rewards/rejected": -2.5201172828674316,
"step": 425
},
{
"epoch": 0.3064042041507081,
"grad_norm": 9.763529777526855,
"learning_rate": 8.769098564079573e-07,
"logits/chosen": -2.582812547683716,
"logits/rejected": -2.6058592796325684,
"logps/chosen": -77.29374694824219,
"logps/rejected": -87.48750305175781,
"loss": 0.1348,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 2.206249952316284,
"rewards/margins": 4.6728515625,
"rewards/rejected": -2.46875,
"step": 430
},
{
"epoch": 0.30996704373385586,
"grad_norm": 110.477294921875,
"learning_rate": 8.727914597672146e-07,
"logits/chosen": -2.569140672683716,
"logits/rejected": -2.6070313453674316,
"logps/chosen": -98.33125305175781,
"logps/rejected": -109.48124694824219,
"loss": 0.1594,
"rewards/accuracies": 0.9312499761581421,
"rewards/chosen": 1.8028564453125,
"rewards/margins": 3.9544920921325684,
"rewards/rejected": -2.153027296066284,
"step": 435
},
{
"epoch": 0.31352988331700365,
"grad_norm": 14.672298431396484,
"learning_rate": 8.686153092164492e-07,
"logits/chosen": -2.5316405296325684,
"logits/rejected": -2.575000047683716,
"logps/chosen": -76.9124984741211,
"logps/rejected": -81.6937484741211,
"loss": 0.1387,
"rewards/accuracies": 0.956250011920929,
"rewards/chosen": 1.836523413658142,
"rewards/margins": 3.8916015625,
"rewards/rejected": -2.058398485183716,
"step": 440
},
{
"epoch": 0.31709272290015145,
"grad_norm": 8.469518661499023,
"learning_rate": 8.643820517367467e-07,
"logits/chosen": -2.522656202316284,
"logits/rejected": -2.548828125,
"logps/chosen": -94.51875305175781,
"logps/rejected": -93.4312515258789,
"loss": 0.2304,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": 1.2852294445037842,
"rewards/margins": 3.875537157058716,
"rewards/rejected": -2.592578172683716,
"step": 445
},
{
"epoch": 0.3206555624832992,
"grad_norm": 17.38793182373047,
"learning_rate": 8.600923431563589e-07,
"logits/chosen": -2.5218749046325684,
"logits/rejected": -2.551953077316284,
"logps/chosen": -97.5625,
"logps/rejected": -101.07499694824219,
"loss": 0.2786,
"rewards/accuracies": 0.918749988079071,
"rewards/chosen": 0.6149657964706421,
"rewards/margins": 4.080859184265137,
"rewards/rejected": -3.467578172683716,
"step": 450
},
{
"epoch": 0.324218402066447,
"grad_norm": 10.183024406433105,
"learning_rate": 8.557468480491035e-07,
"logits/chosen": -2.5523438453674316,
"logits/rejected": -2.5445313453674316,
"logps/chosen": -107.8812484741211,
"logps/rejected": -117.9437484741211,
"loss": 0.2774,
"rewards/accuracies": 0.9312499761581421,
"rewards/chosen": 0.05524902418255806,
"rewards/margins": 4.509814262390137,
"rewards/rejected": -4.458398342132568,
"step": 455
},
{
"epoch": 0.3277812416495947,
"grad_norm": 5.875920295715332,
"learning_rate": 8.513462396314041e-07,
"logits/chosen": -2.5562500953674316,
"logits/rejected": -2.567578077316284,
"logps/chosen": -102.76875305175781,
"logps/rejected": -110.07499694824219,
"loss": 0.3396,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": 0.13802489638328552,
"rewards/margins": 4.290234565734863,
"rewards/rejected": -4.151757717132568,
"step": 460
},
{
"epoch": 0.3313440812327425,
"grad_norm": 13.03176212310791,
"learning_rate": 8.46891199657995e-07,
"logits/chosen": -2.516796827316284,
"logits/rejected": -2.5433592796325684,
"logps/chosen": -80.4000015258789,
"logps/rejected": -85.76249694824219,
"loss": 0.176,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": 0.7038635015487671,
"rewards/margins": 3.87890625,
"rewards/rejected": -3.173828125,
"step": 465
},
{
"epoch": 0.33490692081589024,
"grad_norm": 5.999340057373047,
"learning_rate": 8.423824183163015e-07,
"logits/chosen": -2.5425782203674316,
"logits/rejected": -2.55859375,
"logps/chosen": -86.125,
"logps/rejected": -90.82499694824219,
"loss": 0.1629,
"rewards/accuracies": 0.9312499761581421,
"rewards/chosen": 1.2559814453125,
"rewards/margins": 4.011328220367432,
"rewards/rejected": -2.756640672683716,
"step": 470
},
{
"epoch": 0.33846976039903803,
"grad_norm": 10.494653701782227,
"learning_rate": 8.37820594119514e-07,
"logits/chosen": -2.5570311546325684,
"logits/rejected": -2.5503907203674316,
"logps/chosen": -94.70625305175781,
"logps/rejected": -101.88749694824219,
"loss": 0.3166,
"rewards/accuracies": 0.8812500238418579,
"rewards/chosen": 0.986376941204071,
"rewards/margins": 3.810375928878784,
"rewards/rejected": -2.823193311691284,
"step": 475
},
{
"epoch": 0.3420325999821858,
"grad_norm": 6.390571594238281,
"learning_rate": 8.332064337983725e-07,
"logits/chosen": -2.508593797683716,
"logits/rejected": -2.536328077316284,
"logps/chosen": -82.83125305175781,
"logps/rejected": -84.9124984741211,
"loss": 0.1201,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": 1.835424780845642,
"rewards/margins": 4.11328125,
"rewards/rejected": -2.278515577316284,
"step": 480
},
{
"epoch": 0.34559543956533356,
"grad_norm": 62.394775390625,
"learning_rate": 8.285406521916776e-07,
"logits/chosen": -2.54296875,
"logits/rejected": -2.5542969703674316,
"logps/chosen": -86.91874694824219,
"logps/rejected": -94.3499984741211,
"loss": 0.1883,
"rewards/accuracies": 0.9437500238418579,
"rewards/chosen": 1.5449950695037842,
"rewards/margins": 4.182275295257568,
"rewards/rejected": -2.636767625808716,
"step": 485
},
{
"epoch": 0.34915827914848135,
"grad_norm": 9.866166114807129,
"learning_rate": 8.23823972135546e-07,
"logits/chosen": -2.473437547683716,
"logits/rejected": -2.501171827316284,
"logps/chosen": -71.8499984741211,
"logps/rejected": -76.29374694824219,
"loss": 0.1806,
"rewards/accuracies": 0.90625,
"rewards/chosen": 1.8723846673965454,
"rewards/margins": 3.8558592796325684,
"rewards/rejected": -1.983862280845642,
"step": 490
},
{
"epoch": 0.3527211187316291,
"grad_norm": 8.677438735961914,
"learning_rate": 8.190571243514265e-07,
"logits/chosen": -2.542187452316284,
"logits/rejected": -2.580859422683716,
"logps/chosen": -94.5062484741211,
"logps/rejected": -103.3375015258789,
"loss": 0.1849,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": 0.524304211139679,
"rewards/margins": 3.6502928733825684,
"rewards/rejected": -3.1285157203674316,
"step": 495
},
{
"epoch": 0.3562839583147769,
"grad_norm": 7.261411666870117,
"learning_rate": 8.142408473328944e-07,
"logits/chosen": -2.5062499046325684,
"logits/rejected": -2.521484375,
"logps/chosen": -70.9312515258789,
"logps/rejected": -89.0250015258789,
"loss": 0.1543,
"rewards/accuracies": 0.956250011920929,
"rewards/chosen": 1.715905785560608,
"rewards/margins": 4.405468940734863,
"rewards/rejected": -2.6905274391174316,
"step": 500
},
{
"epoch": 0.3598467978979246,
"grad_norm": 10.193512916564941,
"learning_rate": 8.093758872312423e-07,
"logits/chosen": -2.5394530296325684,
"logits/rejected": -2.5746092796325684,
"logps/chosen": -95.79374694824219,
"logps/rejected": -104.95625305175781,
"loss": 0.1999,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": 0.8937820196151733,
"rewards/margins": 4.2265625,
"rewards/rejected": -3.332812547683716,
"step": 505
},
{
"epoch": 0.3634096374810724,
"grad_norm": 9.057995796203613,
"learning_rate": 8.044629977398845e-07,
"logits/chosen": -2.521484375,
"logits/rejected": -2.5492186546325684,
"logps/chosen": -84.61250305175781,
"logps/rejected": -100.2125015258789,
"loss": 0.226,
"rewards/accuracies": 0.9375,
"rewards/chosen": 1.5130493640899658,
"rewards/margins": 4.856640815734863,
"rewards/rejected": -3.340136766433716,
"step": 510
},
{
"epoch": 0.3669724770642202,
"grad_norm": 6.788172245025635,
"learning_rate": 7.995029399775912e-07,
"logits/chosen": -2.4839844703674316,
"logits/rejected": -2.5132813453674316,
"logps/chosen": -75.07499694824219,
"logps/rejected": -85.2437515258789,
"loss": 0.1204,
"rewards/accuracies": 0.981249988079071,
"rewards/chosen": 1.325585961341858,
"rewards/margins": 4.249218940734863,
"rewards/rejected": -2.926562547683716,
"step": 515
},
{
"epoch": 0.37053531664736794,
"grad_norm": 64.83377838134766,
"learning_rate": 7.944964823705759e-07,
"logits/chosen": -2.4761719703674316,
"logits/rejected": -2.510546922683716,
"logps/chosen": -85.46875,
"logps/rejected": -98.6312484741211,
"loss": 0.14,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": 0.808642566204071,
"rewards/margins": 3.9560546875,
"rewards/rejected": -3.145312547683716,
"step": 520
},
{
"epoch": 0.37409815623051573,
"grad_norm": 10.828370094299316,
"learning_rate": 7.894444005334471e-07,
"logits/chosen": -2.483593702316284,
"logits/rejected": -2.5093750953674316,
"logps/chosen": -82.57499694824219,
"logps/rejected": -83.0999984741211,
"loss": 0.3036,
"rewards/accuracies": 0.918749988079071,
"rewards/chosen": 1.2034575939178467,
"rewards/margins": 3.845410108566284,
"rewards/rejected": -2.64306640625,
"step": 525
},
{
"epoch": 0.37766099581366347,
"grad_norm": 6.788658618927002,
"learning_rate": 7.843474771490485e-07,
"logits/chosen": -2.498046875,
"logits/rejected": -2.516796827316284,
"logps/chosen": -83.8125,
"logps/rejected": -92.9375,
"loss": 0.1235,
"rewards/accuracies": 0.956250011920929,
"rewards/chosen": 2.125561475753784,
"rewards/margins": 4.599413871765137,
"rewards/rejected": -2.474902391433716,
"step": 530
},
{
"epoch": 0.38122383539681126,
"grad_norm": 11.926194190979004,
"learning_rate": 7.792065018472035e-07,
"logits/chosen": -2.485156297683716,
"logits/rejected": -2.4925780296325684,
"logps/chosen": -75.98124694824219,
"logps/rejected": -84.58125305175781,
"loss": 0.2244,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": 1.962408423423767,
"rewards/margins": 4.793554782867432,
"rewards/rejected": -2.8294920921325684,
"step": 535
},
{
"epoch": 0.38478667497995905,
"grad_norm": 6.597282886505127,
"learning_rate": 7.740222710823836e-07,
"logits/chosen": -2.505859375,
"logits/rejected": -2.51953125,
"logps/chosen": -87.7874984741211,
"logps/rejected": -94.8187484741211,
"loss": 0.1883,
"rewards/accuracies": 0.9312499761581421,
"rewards/chosen": 1.200537085533142,
"rewards/margins": 4.257177829742432,
"rewards/rejected": -3.0601563453674316,
"step": 540
},
{
"epoch": 0.3883495145631068,
"grad_norm": 7.958057880401611,
"learning_rate": 7.687955880103189e-07,
"logits/chosen": -2.490234375,
"logits/rejected": -2.503124952316284,
"logps/chosen": -90.0062484741211,
"logps/rejected": -96.61250305175781,
"loss": 0.1619,
"rewards/accuracies": 0.9312499761581421,
"rewards/chosen": 1.556249976158142,
"rewards/margins": 4.306445121765137,
"rewards/rejected": -2.7484374046325684,
"step": 545
},
{
"epoch": 0.3919123541462546,
"grad_norm": 5.72054386138916,
"learning_rate": 7.635272623635716e-07,
"logits/chosen": -2.524609327316284,
"logits/rejected": -2.544140577316284,
"logps/chosen": -84.875,
"logps/rejected": -97.2874984741211,
"loss": 0.172,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 1.576513648033142,
"rewards/margins": 4.614648342132568,
"rewards/rejected": -3.0403809547424316,
"step": 550
},
{
"epoch": 0.3954751937294023,
"grad_norm": 10.896967887878418,
"learning_rate": 7.582181103260896e-07,
"logits/chosen": -2.51171875,
"logits/rejected": -2.5289063453674316,
"logps/chosen": -97.76875305175781,
"logps/rejected": -115.2562484741211,
"loss": 0.154,
"rewards/accuracies": 0.918749988079071,
"rewards/chosen": 0.833831787109375,
"rewards/margins": 4.320898532867432,
"rewards/rejected": -3.486132860183716,
"step": 555
},
{
"epoch": 0.3990380333125501,
"grad_norm": 4.492983341217041,
"learning_rate": 7.528689544067612e-07,
"logits/chosen": -2.516796827316284,
"logits/rejected": -2.533203125,
"logps/chosen": -95.07499694824219,
"logps/rejected": -107.0999984741211,
"loss": 0.2136,
"rewards/accuracies": 0.9375,
"rewards/chosen": 0.541247546672821,
"rewards/margins": 3.962109327316284,
"rewards/rejected": -3.4200196266174316,
"step": 560
},
{
"epoch": 0.40260087289569785,
"grad_norm": 7.211833477020264,
"learning_rate": 7.474806233119889e-07,
"logits/chosen": -2.5054688453674316,
"logits/rejected": -2.557812452316284,
"logps/chosen": -97.6624984741211,
"logps/rejected": -107.6812515258789,
"loss": 0.1502,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": 0.6728760004043579,
"rewards/margins": 3.773632764816284,
"rewards/rejected": -3.101757764816284,
"step": 565
},
{
"epoch": 0.40616371247884564,
"grad_norm": 10.611228942871094,
"learning_rate": 7.420539518173053e-07,
"logits/chosen": -2.501171827316284,
"logits/rejected": -2.5230469703674316,
"logps/chosen": -84.89375305175781,
"logps/rejected": -96.04374694824219,
"loss": 0.2756,
"rewards/accuracies": 0.9437500238418579,
"rewards/chosen": 0.957080066204071,
"rewards/margins": 4.474218845367432,
"rewards/rejected": -3.5152344703674316,
"step": 570
},
{
"epoch": 0.40972655206199343,
"grad_norm": 8.464762687683105,
"learning_rate": 7.365897806380457e-07,
"logits/chosen": -2.4691405296325684,
"logits/rejected": -2.490234375,
"logps/chosen": -74.5875015258789,
"logps/rejected": -92.9375,
"loss": 0.1274,
"rewards/accuracies": 0.956250011920929,
"rewards/chosen": 1.570068359375,
"rewards/margins": 4.603906154632568,
"rewards/rejected": -3.0337891578674316,
"step": 575
},
{
"epoch": 0.41328939164514117,
"grad_norm": 16.32123565673828,
"learning_rate": 7.310889562991036e-07,
"logits/chosen": -2.458203077316284,
"logits/rejected": -2.479687452316284,
"logps/chosen": -94.0999984741211,
"logps/rejected": -104.98124694824219,
"loss": 0.1985,
"rewards/accuracies": 0.9375,
"rewards/chosen": 1.3551514148712158,
"rewards/margins": 4.083203315734863,
"rewards/rejected": -2.726855516433716,
"step": 580
},
{
"epoch": 0.41685223122828896,
"grad_norm": 5.781502723693848,
"learning_rate": 7.255523310037832e-07,
"logits/chosen": -2.442187547683716,
"logits/rejected": -2.4574217796325684,
"logps/chosen": -79.4625015258789,
"logps/rejected": -91.8687515258789,
"loss": 0.1093,
"rewards/accuracies": 0.96875,
"rewards/chosen": 0.938916027545929,
"rewards/margins": 4.7197265625,
"rewards/rejected": -3.7837891578674316,
"step": 585
},
{
"epoch": 0.4204150708114367,
"grad_norm": 6.917849540710449,
"learning_rate": 7.199807625017749e-07,
"logits/chosen": -2.450390577316284,
"logits/rejected": -2.4691405296325684,
"logps/chosen": -93.23750305175781,
"logps/rejected": -97.26249694824219,
"loss": 0.1563,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 1.2737548351287842,
"rewards/margins": 4.183203220367432,
"rewards/rejected": -2.9102845191955566,
"step": 590
},
{
"epoch": 0.4239779103945845,
"grad_norm": 14.325077056884766,
"learning_rate": 7.143751139562694e-07,
"logits/chosen": -2.4664063453674316,
"logits/rejected": -2.4683594703674316,
"logps/chosen": -100.33125305175781,
"logps/rejected": -115.8812484741211,
"loss": 0.2218,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": 0.18304443359375,
"rewards/margins": 3.9588866233825684,
"rewards/rejected": -3.7740235328674316,
"step": 595
},
{
"epoch": 0.4275407499777322,
"grad_norm": 8.037822723388672,
"learning_rate": 7.08736253810235e-07,
"logits/chosen": -2.401171922683716,
"logits/rejected": -2.4214844703674316,
"logps/chosen": -77.76249694824219,
"logps/rejected": -88.64375305175781,
"loss": 0.119,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": 1.3977539539337158,
"rewards/margins": 4.58984375,
"rewards/rejected": -3.192578077316284,
"step": 600
},
{
"epoch": 0.43110358956088,
"grad_norm": 5.8383636474609375,
"learning_rate": 7.030650556518742e-07,
"logits/chosen": -2.444531202316284,
"logits/rejected": -2.473828077316284,
"logps/chosen": -93.4437484741211,
"logps/rejected": -104.63749694824219,
"loss": 0.1362,
"rewards/accuracies": 0.96875,
"rewards/chosen": 1.25640869140625,
"rewards/margins": 4.8349609375,
"rewards/rejected": -3.578125,
"step": 605
},
{
"epoch": 0.4346664291440278,
"grad_norm": 4.72075891494751,
"learning_rate": 6.973623980792874e-07,
"logits/chosen": -2.4136719703674316,
"logits/rejected": -2.423828125,
"logps/chosen": -90.25,
"logps/rejected": -103.0562515258789,
"loss": 0.2161,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": 1.163671851158142,
"rewards/margins": 4.592236518859863,
"rewards/rejected": -3.4292969703674316,
"step": 610
},
{
"epoch": 0.43822926872717555,
"grad_norm": 7.089286804199219,
"learning_rate": 6.916291645643557e-07,
"logits/chosen": -2.4195313453674316,
"logits/rejected": -2.457812547683716,
"logps/chosen": -89.0999984741211,
"logps/rejected": -115.44999694824219,
"loss": 0.1713,
"rewards/accuracies": 0.9312499761581421,
"rewards/chosen": 0.977294921875,
"rewards/margins": 4.731640815734863,
"rewards/rejected": -3.75390625,
"step": 615
},
{
"epoch": 0.44179210831032334,
"grad_norm": 10.222452163696289,
"learning_rate": 6.858662433158724e-07,
"logits/chosen": -2.411328077316284,
"logits/rejected": -2.451953172683716,
"logps/chosen": -105.6187515258789,
"logps/rejected": -113.46875,
"loss": 0.1649,
"rewards/accuracies": 0.9312499761581421,
"rewards/chosen": 0.13162842392921448,
"rewards/margins": 4.344336032867432,
"rewards/rejected": -4.2119140625,
"step": 620
},
{
"epoch": 0.4453549478934711,
"grad_norm": 4.796070575714111,
"learning_rate": 6.800745271419382e-07,
"logits/chosen": -2.382031202316284,
"logits/rejected": -2.408984422683716,
"logps/chosen": -75.1312484741211,
"logps/rejected": -81.4375,
"loss": 0.1686,
"rewards/accuracies": 0.9437500238418579,
"rewards/chosen": 1.6754150390625,
"rewards/margins": 4.466406345367432,
"rewards/rejected": -2.793652296066284,
"step": 625
},
{
"epoch": 0.44891778747661887,
"grad_norm": 12.59176254272461,
"learning_rate": 6.742549133116458e-07,
"logits/chosen": -2.393359422683716,
"logits/rejected": -2.428906202316284,
"logps/chosen": -79.39375305175781,
"logps/rejected": -99.51875305175781,
"loss": 0.2592,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": 0.839855968952179,
"rewards/margins": 4.597460746765137,
"rewards/rejected": -3.752734422683716,
"step": 630
},
{
"epoch": 0.45248062705976666,
"grad_norm": 11.1537504196167,
"learning_rate": 6.684083034160716e-07,
"logits/chosen": -2.4027342796325684,
"logits/rejected": -2.3941407203674316,
"logps/chosen": -88.94999694824219,
"logps/rejected": -94.60624694824219,
"loss": 0.1405,
"rewards/accuracies": 0.956250011920929,
"rewards/chosen": 1.701879858970642,
"rewards/margins": 5.148046970367432,
"rewards/rejected": -3.4483399391174316,
"step": 635
},
{
"epoch": 0.4560434666429144,
"grad_norm": 6.074214458465576,
"learning_rate": 6.62535603228599e-07,
"logits/chosen": -2.3843750953674316,
"logits/rejected": -2.404296875,
"logps/chosen": -79.69999694824219,
"logps/rejected": -93.61250305175781,
"loss": 0.1523,
"rewards/accuracies": 0.9375,
"rewards/chosen": 1.164160132408142,
"rewards/margins": 4.486718654632568,
"rewards/rejected": -3.321582078933716,
"step": 640
},
{
"epoch": 0.4596063062260622,
"grad_norm": 8.071775436401367,
"learning_rate": 6.566377225645938e-07,
"logits/chosen": -2.4292969703674316,
"logits/rejected": -2.4781250953674316,
"logps/chosen": -103.96875,
"logps/rejected": -115.1937484741211,
"loss": 0.1832,
"rewards/accuracies": 0.918749988079071,
"rewards/chosen": 1.35205078125,
"rewards/margins": 4.561913967132568,
"rewards/rejected": -3.2134766578674316,
"step": 645
},
{
"epoch": 0.4631691458092099,
"grad_norm": 25.342777252197266,
"learning_rate": 6.507155751404518e-07,
"logits/chosen": -2.3851561546325684,
"logits/rejected": -2.4195313453674316,
"logps/chosen": -92.91874694824219,
"logps/rejected": -109.58125305175781,
"loss": 0.1515,
"rewards/accuracies": 0.9437500238418579,
"rewards/chosen": 1.623998999595642,
"rewards/margins": 5.013281345367432,
"rewards/rejected": -3.393749952316284,
"step": 650
},
{
"epoch": 0.4667319853923577,
"grad_norm": 12.96956729888916,
"learning_rate": 6.447700784320449e-07,
"logits/chosen": -2.3804688453674316,
"logits/rejected": -2.405468702316284,
"logps/chosen": -76.1875,
"logps/rejected": -93.0999984741211,
"loss": 0.1773,
"rewards/accuracies": 0.9375,
"rewards/chosen": 1.213952660560608,
"rewards/margins": 3.945117235183716,
"rewards/rejected": -2.731640577316284,
"step": 655
},
{
"epoch": 0.47029482497550545,
"grad_norm": 9.139556884765625,
"learning_rate": 6.38802153532582e-07,
"logits/chosen": -2.3753905296325684,
"logits/rejected": -2.393359422683716,
"logps/chosen": -87.54374694824219,
"logps/rejected": -91.7750015258789,
"loss": 0.1862,
"rewards/accuracies": 0.918749988079071,
"rewards/chosen": 0.614367663860321,
"rewards/margins": 4.119140625,
"rewards/rejected": -3.505664110183716,
"step": 660
},
{
"epoch": 0.47385766455865325,
"grad_norm": 7.9081339836120605,
"learning_rate": 6.328127250099111e-07,
"logits/chosen": -2.4085936546325684,
"logits/rejected": -2.419921875,
"logps/chosen": -92.58125305175781,
"logps/rejected": -103.55000305175781,
"loss": 0.3,
"rewards/accuracies": 0.9375,
"rewards/chosen": 1.018774390220642,
"rewards/margins": 4.601855278015137,
"rewards/rejected": -3.581372022628784,
"step": 665
},
{
"epoch": 0.47742050414180104,
"grad_norm": 8.504165649414062,
"learning_rate": 6.268027207632821e-07,
"logits/chosen": -2.376171827316284,
"logits/rejected": -2.381640672683716,
"logps/chosen": -81.94999694824219,
"logps/rejected": -97.55000305175781,
"loss": 0.1193,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 1.4761962890625,
"rewards/margins": 4.615234375,
"rewards/rejected": -3.1357421875,
"step": 670
},
{
"epoch": 0.4809833437249488,
"grad_norm": 8.712873458862305,
"learning_rate": 6.207730718795948e-07,
"logits/chosen": -2.342968702316284,
"logits/rejected": -2.3753905296325684,
"logps/chosen": -79.13749694824219,
"logps/rejected": -94.98750305175781,
"loss": 0.1471,
"rewards/accuracies": 0.9375,
"rewards/chosen": 0.8577514886856079,
"rewards/margins": 3.96484375,
"rewards/rejected": -3.1075196266174316,
"step": 675
},
{
"epoch": 0.48454618330809657,
"grad_norm": 15.8992338180542,
"learning_rate": 6.147247124891518e-07,
"logits/chosen": -2.3609375953674316,
"logits/rejected": -2.3746094703674316,
"logps/chosen": -82.59375,
"logps/rejected": -91.4124984741211,
"loss": 0.1209,
"rewards/accuracies": 0.956250011920929,
"rewards/chosen": 1.4687988758087158,
"rewards/margins": 4.640625,
"rewards/rejected": -3.1724610328674316,
"step": 680
},
{
"epoch": 0.4881090228912443,
"grad_norm": 7.288065433502197,
"learning_rate": 6.086585796209404e-07,
"logits/chosen": -2.3714842796325684,
"logits/rejected": -2.3773436546325684,
"logps/chosen": -77.9124984741211,
"logps/rejected": -95.8125,
"loss": 0.1402,
"rewards/accuracies": 0.956250011920929,
"rewards/chosen": 1.0320556163787842,
"rewards/margins": 4.360156059265137,
"rewards/rejected": -3.329296827316284,
"step": 685
},
{
"epoch": 0.4916718624743921,
"grad_norm": 75.57015991210938,
"learning_rate": 6.025756130574652e-07,
"logits/chosen": -2.380859375,
"logits/rejected": -2.3902344703674316,
"logps/chosen": -91.91874694824219,
"logps/rejected": -106.76875305175781,
"loss": 0.1206,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": 1.6959717273712158,
"rewards/margins": 4.78515625,
"rewards/rejected": -3.089062452316284,
"step": 690
},
{
"epoch": 0.49523470205753983,
"grad_norm": 9.346633911132812,
"learning_rate": 5.96476755189155e-07,
"logits/chosen": -2.3636717796325684,
"logits/rejected": -2.3515625,
"logps/chosen": -86.60624694824219,
"logps/rejected": -92.7249984741211,
"loss": 0.2749,
"rewards/accuracies": 0.90625,
"rewards/chosen": 1.4318358898162842,
"rewards/margins": 4.559765815734863,
"rewards/rejected": -3.130078077316284,
"step": 695
},
{
"epoch": 0.4987975416406876,
"grad_norm": 5.443614482879639,
"learning_rate": 5.903629508683649e-07,
"logits/chosen": -2.348437547683716,
"logits/rejected": -2.364453077316284,
"logps/chosen": -75.1624984741211,
"logps/rejected": -92.7874984741211,
"loss": 0.1063,
"rewards/accuracies": 0.96875,
"rewards/chosen": 1.4652831554412842,
"rewards/margins": 4.849023342132568,
"rewards/rejected": -3.383984327316284,
"step": 700
},
{
"epoch": 0.5023603812238354,
"grad_norm": 7.599747657775879,
"learning_rate": 5.842351472629959e-07,
"logits/chosen": -2.34765625,
"logits/rejected": -2.385546922683716,
"logps/chosen": -88.2437515258789,
"logps/rejected": -101.01249694824219,
"loss": 0.139,
"rewards/accuracies": 0.9437500238418579,
"rewards/chosen": 1.15228271484375,
"rewards/margins": 4.364062309265137,
"rewards/rejected": -3.2127928733825684,
"step": 705
},
{
"epoch": 0.5059232208069832,
"grad_norm": 7.442669868469238,
"learning_rate": 5.780942937097584e-07,
"logits/chosen": -2.3828125,
"logits/rejected": -2.4105467796325684,
"logps/chosen": -82.2562484741211,
"logps/rejected": -106.9437484741211,
"loss": 0.1851,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 1.423883080482483,
"rewards/margins": 4.710058689117432,
"rewards/rejected": -3.2855467796325684,
"step": 710
},
{
"epoch": 0.5094860603901309,
"grad_norm": 4.387864589691162,
"learning_rate": 5.719413415670976e-07,
"logits/chosen": -2.3765625953674316,
"logits/rejected": -2.382031202316284,
"logps/chosen": -75.7750015258789,
"logps/rejected": -90.01875305175781,
"loss": 0.0862,
"rewards/accuracies": 0.981249988079071,
"rewards/chosen": 1.4503662586212158,
"rewards/margins": 5.015820503234863,
"rewards/rejected": -3.560351610183716,
"step": 715
},
{
"epoch": 0.5130488999732787,
"grad_norm": 6.781715393066406,
"learning_rate": 5.657772440678069e-07,
"logits/chosen": -2.362499952316284,
"logits/rejected": -2.3804688453674316,
"logps/chosen": -92.4749984741211,
"logps/rejected": -106.51875305175781,
"loss": 0.1194,
"rewards/accuracies": 0.956250011920929,
"rewards/chosen": 0.975146472454071,
"rewards/margins": 4.7578125,
"rewards/rejected": -3.7816405296325684,
"step": 720
},
{
"epoch": 0.5166117395564265,
"grad_norm": 9.871294975280762,
"learning_rate": 5.596029561713493e-07,
"logits/chosen": -2.3695311546325684,
"logits/rejected": -2.384765625,
"logps/chosen": -99.88749694824219,
"logps/rejected": -107.4312515258789,
"loss": 0.1378,
"rewards/accuracies": 0.9437500238418579,
"rewards/chosen": 1.3436005115509033,
"rewards/margins": 4.878515720367432,
"rewards/rejected": -3.534374952316284,
"step": 725
},
{
"epoch": 0.5201745791395742,
"grad_norm": 7.945125579833984,
"learning_rate": 5.534194344159136e-07,
"logits/chosen": -2.4078125953674316,
"logits/rejected": -2.4156250953674316,
"logps/chosen": -108.8187484741211,
"logps/rejected": -127.9375,
"loss": 0.1956,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": 0.9227539300918579,
"rewards/margins": 5.07421875,
"rewards/rejected": -4.1484375,
"step": 730
},
{
"epoch": 0.5237374187227221,
"grad_norm": 7.44891881942749,
"learning_rate": 5.472276367702236e-07,
"logits/chosen": -2.3570313453674316,
"logits/rejected": -2.366015672683716,
"logps/chosen": -94.91874694824219,
"logps/rejected": -106.01249694824219,
"loss": 0.1449,
"rewards/accuracies": 0.918749988079071,
"rewards/chosen": 0.844006359577179,
"rewards/margins": 4.509179592132568,
"rewards/rejected": -3.6656250953674316,
"step": 735
},
{
"epoch": 0.5273002583058698,
"grad_norm": 18.9903621673584,
"learning_rate": 5.410285224851281e-07,
"logits/chosen": -2.328906297683716,
"logits/rejected": -2.3597655296325684,
"logps/chosen": -83.8187484741211,
"logps/rejected": -95.0062484741211,
"loss": 0.1806,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.957568347454071,
"rewards/margins": 4.262499809265137,
"rewards/rejected": -3.30810546875,
"step": 740
},
{
"epoch": 0.5308630978890175,
"grad_norm": 9.177763938903809,
"learning_rate": 5.348230519449901e-07,
"logits/chosen": -2.382031202316284,
"logits/rejected": -2.3726563453674316,
"logps/chosen": -81.125,
"logps/rejected": -100.1187515258789,
"loss": 0.1606,
"rewards/accuracies": 0.918749988079071,
"rewards/chosen": 1.4339721202850342,
"rewards/margins": 4.755859375,
"rewards/rejected": -3.322265625,
"step": 745
},
{
"epoch": 0.5344259374721653,
"grad_norm": 9.381056785583496,
"learning_rate": 5.286121865189017e-07,
"logits/chosen": -2.362499952316284,
"logits/rejected": -2.3539061546325684,
"logps/chosen": -89.9749984741211,
"logps/rejected": -101.79374694824219,
"loss": 0.1385,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 1.702539086341858,
"rewards/margins": 4.629687309265137,
"rewards/rejected": -2.930468797683716,
"step": 750
},
{
"epoch": 0.5379887770553131,
"grad_norm": 22.414461135864258,
"learning_rate": 5.223968884117458e-07,
"logits/chosen": -2.3519530296325684,
"logits/rejected": -2.3726563453674316,
"logps/chosen": -98.01875305175781,
"logps/rejected": -104.3375015258789,
"loss": 0.1909,
"rewards/accuracies": 0.9375,
"rewards/chosen": 0.934985339641571,
"rewards/margins": 4.401562690734863,
"rewards/rejected": -3.46875,
"step": 755
},
{
"epoch": 0.5415516166384609,
"grad_norm": 14.190799713134766,
"learning_rate": 5.161781205151293e-07,
"logits/chosen": -2.3734374046325684,
"logits/rejected": -2.405468702316284,
"logps/chosen": -101.76249694824219,
"logps/rejected": -122.0250015258789,
"loss": 0.1682,
"rewards/accuracies": 0.9375,
"rewards/chosen": 0.6546630859375,
"rewards/margins": 4.376562595367432,
"rewards/rejected": -3.7222657203674316,
"step": 760
},
{
"epoch": 0.5451144562216086,
"grad_norm": 6.251968860626221,
"learning_rate": 5.099568462582087e-07,
"logits/chosen": -2.319140672683716,
"logits/rejected": -2.3343749046325684,
"logps/chosen": -73.5999984741211,
"logps/rejected": -96.88749694824219,
"loss": 0.106,
"rewards/accuracies": 0.956250011920929,
"rewards/chosen": 2.074597120285034,
"rewards/margins": 5.300000190734863,
"rewards/rejected": -3.227099657058716,
"step": 765
},
{
"epoch": 0.5486772958047564,
"grad_norm": 9.493823051452637,
"learning_rate": 5.037340294584323e-07,
"logits/chosen": -2.348437547683716,
"logits/rejected": -2.3765625953674316,
"logps/chosen": -95.94999694824219,
"logps/rejected": -111.57499694824219,
"loss": 0.1611,
"rewards/accuracies": 0.9437500238418579,
"rewards/chosen": 0.665844738483429,
"rewards/margins": 4.508008003234863,
"rewards/rejected": -3.8408203125,
"step": 770
},
{
"epoch": 0.5522401353879042,
"grad_norm": 8.721504211425781,
"learning_rate": 4.975106341722242e-07,
"logits/chosen": -2.349609375,
"logits/rejected": -2.3675780296325684,
"logps/chosen": -81.91874694824219,
"logps/rejected": -91.33125305175781,
"loss": 0.2756,
"rewards/accuracies": 0.956250011920929,
"rewards/chosen": 0.837109386920929,
"rewards/margins": 4.758008003234863,
"rewards/rejected": -3.91796875,
"step": 775
},
{
"epoch": 0.5558029749710519,
"grad_norm": 7.29752779006958,
"learning_rate": 4.912876245456287e-07,
"logits/chosen": -2.3472657203674316,
"logits/rejected": -2.35546875,
"logps/chosen": -81.8499984741211,
"logps/rejected": -105.7249984741211,
"loss": 0.1115,
"rewards/accuracies": 0.96875,
"rewards/chosen": 1.476263403892517,
"rewards/margins": 5.165234565734863,
"rewards/rejected": -3.6888670921325684,
"step": 780
},
{
"epoch": 0.5593658145541996,
"grad_norm": 15.453743934631348,
"learning_rate": 4.850659646649433e-07,
"logits/chosen": -2.367968797683716,
"logits/rejected": -2.3695311546325684,
"logps/chosen": -90.0,
"logps/rejected": -111.75,
"loss": 0.1826,
"rewards/accuracies": 0.9312499761581421,
"rewards/chosen": 1.060827612876892,
"rewards/margins": 4.824999809265137,
"rewards/rejected": -3.762500047683716,
"step": 785
},
{
"epoch": 0.5629286541373475,
"grad_norm": 7.576887130737305,
"learning_rate": 4.788466184073585e-07,
"logits/chosen": -2.3140625953674316,
"logits/rejected": -2.346484422683716,
"logps/chosen": -82.10624694824219,
"logps/rejected": -102.0250015258789,
"loss": 0.2602,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 1.2728393077850342,
"rewards/margins": 4.8544921875,
"rewards/rejected": -3.58203125,
"step": 790
},
{
"epoch": 0.5664914937204952,
"grad_norm": 14.275249481201172,
"learning_rate": 4.7263054929163175e-07,
"logits/chosen": -2.322265625,
"logits/rejected": -2.338671922683716,
"logps/chosen": -88.67500305175781,
"logps/rejected": -101.13749694824219,
"loss": 0.1402,
"rewards/accuracies": 0.9375,
"rewards/chosen": 1.216101050376892,
"rewards/margins": 4.634179592132568,
"rewards/rejected": -3.4203124046325684,
"step": 795
},
{
"epoch": 0.570054333303643,
"grad_norm": 5.2038044929504395,
"learning_rate": 4.664187203288167e-07,
"logits/chosen": -2.330078125,
"logits/rejected": -2.3597655296325684,
"logps/chosen": -90.9000015258789,
"logps/rejected": -111.6500015258789,
"loss": 0.1215,
"rewards/accuracies": 0.956250011920929,
"rewards/chosen": 2.1827635765075684,
"rewards/margins": 5.364453315734863,
"rewards/rejected": -3.179394483566284,
"step": 800
},
{
"epoch": 0.5736171728867908,
"grad_norm": 12.148797035217285,
"learning_rate": 4.6021209387307025e-07,
"logits/chosen": -2.343945264816284,
"logits/rejected": -2.346484422683716,
"logps/chosen": -113.58125305175781,
"logps/rejected": -122.98750305175781,
"loss": 0.215,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": 0.3671875,
"rewards/margins": 4.382177829742432,
"rewards/rejected": -4.015234470367432,
"step": 805
},
{
"epoch": 0.5771800124699386,
"grad_norm": 13.842655181884766,
"learning_rate": 4.540116314725622e-07,
"logits/chosen": -2.333203077316284,
"logits/rejected": -2.3726563453674316,
"logps/chosen": -101.0374984741211,
"logps/rejected": -114.25,
"loss": 0.2076,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": 1.048675537109375,
"rewards/margins": 4.542578220367432,
"rewards/rejected": -3.491406202316284,
"step": 810
},
{
"epoch": 0.5807428520530863,
"grad_norm": 8.721251487731934,
"learning_rate": 4.478182937205096e-07,
"logits/chosen": -2.307421922683716,
"logits/rejected": -2.313281297683716,
"logps/chosen": -83.90625,
"logps/rejected": -94.54374694824219,
"loss": 0.3232,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": 1.7117431163787842,
"rewards/margins": 4.647753715515137,
"rewards/rejected": -2.9359374046325684,
"step": 815
},
{
"epoch": 0.584305691636234,
"grad_norm": 6.957128047943115,
"learning_rate": 4.4163304010635873e-07,
"logits/chosen": -2.3324217796325684,
"logits/rejected": -2.37109375,
"logps/chosen": -92.45625305175781,
"logps/rejected": -104.15625,
"loss": 0.2184,
"rewards/accuracies": 0.9375,
"rewards/chosen": 1.1681396961212158,
"rewards/margins": 4.538378715515137,
"rewards/rejected": -3.3729491233825684,
"step": 820
},
{
"epoch": 0.5878685312193819,
"grad_norm": 6.509469032287598,
"learning_rate": 4.3545682886713785e-07,
"logits/chosen": -2.346874952316284,
"logits/rejected": -2.367968797683716,
"logps/chosen": -97.5875015258789,
"logps/rejected": -116.90625,
"loss": 0.1391,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 1.130731225013733,
"rewards/margins": 5.1904296875,
"rewards/rejected": -4.060937404632568,
"step": 825
},
{
"epoch": 0.5914313708025296,
"grad_norm": 8.003087997436523,
"learning_rate": 4.2929061683900547e-07,
"logits/chosen": -2.3363280296325684,
"logits/rejected": -2.3394532203674316,
"logps/chosen": -93.26249694824219,
"logps/rejected": -101.44999694824219,
"loss": 0.1551,
"rewards/accuracies": 0.9312499761581421,
"rewards/chosen": 1.0355713367462158,
"rewards/margins": 4.733593940734863,
"rewards/rejected": -3.700488328933716,
"step": 830
},
{
"epoch": 0.5949942103856773,
"grad_norm": 9.408036231994629,
"learning_rate": 4.2313535930901357e-07,
"logits/chosen": -2.382031202316284,
"logits/rejected": -2.3828125,
"logps/chosen": -89.88749694824219,
"logps/rejected": -120.4375,
"loss": 0.1501,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 1.1078369617462158,
"rewards/margins": 5.233984470367432,
"rewards/rejected": -4.1220703125,
"step": 835
},
{
"epoch": 0.5985570499688252,
"grad_norm": 8.164095878601074,
"learning_rate": 4.1699200986711235e-07,
"logits/chosen": -2.3257813453674316,
"logits/rejected": -2.3433594703674316,
"logps/chosen": -97.4937515258789,
"logps/rejected": -113.9000015258789,
"loss": 0.1906,
"rewards/accuracies": 0.918749988079071,
"rewards/chosen": 0.912426769733429,
"rewards/margins": 4.489160060882568,
"rewards/rejected": -3.57568359375,
"step": 840
},
{
"epoch": 0.6021198895519729,
"grad_norm": 17.470115661621094,
"learning_rate": 4.108615202584175e-07,
"logits/chosen": -2.346874952316284,
"logits/rejected": -2.357421875,
"logps/chosen": -97.58125305175781,
"logps/rejected": -116.94999694824219,
"loss": 0.1324,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.9498535394668579,
"rewards/margins": 4.808984279632568,
"rewards/rejected": -3.857714891433716,
"step": 845
},
{
"epoch": 0.6056827291351207,
"grad_norm": 3.3351809978485107,
"learning_rate": 4.047448402357622e-07,
"logits/chosen": -2.279296875,
"logits/rejected": -2.319140672683716,
"logps/chosen": -70.6937484741211,
"logps/rejected": -86.0250015258789,
"loss": 0.2401,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 1.5939209461212158,
"rewards/margins": 4.990332126617432,
"rewards/rejected": -3.397656202316284,
"step": 850
},
{
"epoch": 0.6092455687182685,
"grad_norm": 9.039243698120117,
"learning_rate": 3.9864291741255997e-07,
"logits/chosen": -2.325000047683716,
"logits/rejected": -2.33984375,
"logps/chosen": -96.15625,
"logps/rejected": -119.4437484741211,
"loss": 0.0906,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": 1.2700684070587158,
"rewards/margins": 5.166796684265137,
"rewards/rejected": -3.899609327316284,
"step": 855
},
{
"epoch": 0.6128084083014163,
"grad_norm": 7.753138065338135,
"learning_rate": 3.9255669711599703e-07,
"logits/chosen": -2.283984422683716,
"logits/rejected": -2.3335938453674316,
"logps/chosen": -80.9375,
"logps/rejected": -90.55000305175781,
"loss": 0.2316,
"rewards/accuracies": 0.9125000238418579,
"rewards/chosen": 0.857067883014679,
"rewards/margins": 4.355615139007568,
"rewards/rejected": -3.503222703933716,
"step": 860
},
{
"epoch": 0.616371247884564,
"grad_norm": 12.482671737670898,
"learning_rate": 3.8648712224057975e-07,
"logits/chosen": -2.353515625,
"logits/rejected": -2.335156202316284,
"logps/chosen": -90.9937515258789,
"logps/rejected": -116.0062484741211,
"loss": 0.1113,
"rewards/accuracies": 0.956250011920929,
"rewards/chosen": 1.6822509765625,
"rewards/margins": 5.601953029632568,
"rewards/rejected": -3.9214844703674316,
"step": 865
},
{
"epoch": 0.6199340874677117,
"grad_norm": 14.556914329528809,
"learning_rate": 3.804351331020583e-07,
"logits/chosen": -2.313671827316284,
"logits/rejected": -2.323046922683716,
"logps/chosen": -80.7874984741211,
"logps/rejected": -97.20625305175781,
"loss": 0.1081,
"rewards/accuracies": 0.956250011920929,
"rewards/chosen": 1.885522484779358,
"rewards/margins": 4.939453125,
"rewards/rejected": -3.056835889816284,
"step": 870
},
{
"epoch": 0.6234969270508596,
"grad_norm": 7.730465412139893,
"learning_rate": 3.744016672917509e-07,
"logits/chosen": -2.325390577316284,
"logits/rejected": -2.3453125953674316,
"logps/chosen": -88.4375,
"logps/rejected": -104.19999694824219,
"loss": 0.219,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": 0.844067394733429,
"rewards/margins": 4.600878715515137,
"rewards/rejected": -3.7572264671325684,
"step": 875
},
{
"epoch": 0.6270597666340073,
"grad_norm": 11.791308403015137,
"learning_rate": 3.6838765953128914e-07,
"logits/chosen": -2.345703125,
"logits/rejected": -2.3687500953674316,
"logps/chosen": -86.75,
"logps/rejected": -112.86250305175781,
"loss": 0.1796,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": 1.0369873046875,
"rewards/margins": 5.278515815734863,
"rewards/rejected": -4.239062309265137,
"step": 880
},
{
"epoch": 0.630622606217155,
"grad_norm": 14.919589042663574,
"learning_rate": 3.623940415278086e-07,
"logits/chosen": -2.2718749046325684,
"logits/rejected": -2.275390625,
"logps/chosen": -81.6343765258789,
"logps/rejected": -96.8375015258789,
"loss": 0.1462,
"rewards/accuracies": 0.956250011920929,
"rewards/chosen": 1.508947730064392,
"rewards/margins": 5.040625095367432,
"rewards/rejected": -3.529296875,
"step": 885
},
{
"epoch": 0.6341854458003029,
"grad_norm": 18.789249420166016,
"learning_rate": 3.564217418296055e-07,
"logits/chosen": -2.305468797683716,
"logits/rejected": -2.328125,
"logps/chosen": -95.85624694824219,
"logps/rejected": -111.40625,
"loss": 0.1828,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": 0.99578857421875,
"rewards/margins": 4.781542778015137,
"rewards/rejected": -3.78662109375,
"step": 890
},
{
"epoch": 0.6377482853834506,
"grad_norm": 17.591827392578125,
"learning_rate": 3.5047168568228394e-07,
"logits/chosen": -2.323437452316284,
"logits/rejected": -2.331249952316284,
"logps/chosen": -89.94999694824219,
"logps/rejected": -107.0687484741211,
"loss": 0.2022,
"rewards/accuracies": 0.9312499761581421,
"rewards/chosen": 1.7130126953125,
"rewards/margins": 4.793847560882568,
"rewards/rejected": -3.081982374191284,
"step": 895
},
{
"epoch": 0.6413111249665984,
"grad_norm": 17.939722061157227,
"learning_rate": 3.445447948854141e-07,
"logits/chosen": -2.3042969703674316,
"logits/rejected": -2.328906297683716,
"logps/chosen": -101.6187515258789,
"logps/rejected": -114.15625,
"loss": 0.1749,
"rewards/accuracies": 0.9312499761581421,
"rewards/chosen": 1.1605224609375,
"rewards/margins": 4.574999809265137,
"rewards/rejected": -3.413281202316284,
"step": 900
},
{
"epoch": 0.6448739645497461,
"grad_norm": 27.744525909423828,
"learning_rate": 3.386419876497244e-07,
"logits/chosen": -2.3695311546325684,
"logits/rejected": -2.375,
"logps/chosen": -110.9312515258789,
"logps/rejected": -131.7375030517578,
"loss": 0.1293,
"rewards/accuracies": 0.9437500238418579,
"rewards/chosen": 0.83831787109375,
"rewards/margins": 4.964453220367432,
"rewards/rejected": -4.127734184265137,
"step": 905
},
{
"epoch": 0.648436804132894,
"grad_norm": 11.689591407775879,
"learning_rate": 3.327641784548494e-07,
"logits/chosen": -2.330859422683716,
"logits/rejected": -2.360546827316284,
"logps/chosen": -97.2874984741211,
"logps/rejected": -111.78125,
"loss": 0.1031,
"rewards/accuracies": 0.956250011920929,
"rewards/chosen": 0.701641857624054,
"rewards/margins": 4.783984184265137,
"rewards/rejected": -4.083203315734863,
"step": 910
},
{
"epoch": 0.6519996437160417,
"grad_norm": 4.99397087097168,
"learning_rate": 3.2691227790765674e-07,
"logits/chosen": -2.323046922683716,
"logits/rejected": -2.352734327316284,
"logps/chosen": -81.75,
"logps/rejected": -96.2874984741211,
"loss": 0.0998,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": 1.350073218345642,
"rewards/margins": 5.151757717132568,
"rewards/rejected": -3.7984375953674316,
"step": 915
},
{
"epoch": 0.6555624832991894,
"grad_norm": 12.093426704406738,
"learning_rate": 3.210871926011724e-07,
"logits/chosen": -2.319531202316284,
"logits/rejected": -2.3267579078674316,
"logps/chosen": -84.58125305175781,
"logps/rejected": -104.80000305175781,
"loss": 0.1376,
"rewards/accuracies": 0.9312499761581421,
"rewards/chosen": 1.0358489751815796,
"rewards/margins": 4.966894626617432,
"rewards/rejected": -3.9325194358825684,
"step": 920
},
{
"epoch": 0.6591253228823373,
"grad_norm": 7.205654621124268,
"learning_rate": 3.1528982497412983e-07,
"logits/chosen": -2.3238282203674316,
"logits/rejected": -2.3511719703674316,
"logps/chosen": -103.3125,
"logps/rejected": -119.3125,
"loss": 0.1892,
"rewards/accuracies": 0.918749988079071,
"rewards/chosen": 0.536755383014679,
"rewards/margins": 4.951171875,
"rewards/rejected": -4.412890434265137,
"step": 925
},
{
"epoch": 0.662688162465485,
"grad_norm": 14.197951316833496,
"learning_rate": 3.095210731711603e-07,
"logits/chosen": -2.317187547683716,
"logits/rejected": -2.340625047683716,
"logps/chosen": -87.2874984741211,
"logps/rejected": -97.8375015258789,
"loss": 0.1569,
"rewards/accuracies": 0.9375,
"rewards/chosen": 1.5652587413787842,
"rewards/margins": 5.275976657867432,
"rewards/rejected": -3.709277391433716,
"step": 930
},
{
"epoch": 0.6662510020486327,
"grad_norm": 4.761596202850342,
"learning_rate": 3.0378183090365086e-07,
"logits/chosen": -2.3031249046325684,
"logits/rejected": -2.323437452316284,
"logps/chosen": -81.5999984741211,
"logps/rejected": -97.79374694824219,
"loss": 0.1236,
"rewards/accuracies": 0.956250011920929,
"rewards/chosen": 0.888842761516571,
"rewards/margins": 4.931250095367432,
"rewards/rejected": -4.043749809265137,
"step": 935
},
{
"epoch": 0.6698138416317805,
"grad_norm": 12.251235008239746,
"learning_rate": 2.9807298731128774e-07,
"logits/chosen": -2.28515625,
"logits/rejected": -2.3050780296325684,
"logps/chosen": -90.19999694824219,
"logps/rejected": -107.4625015258789,
"loss": 0.1699,
"rewards/accuracies": 0.9437500238418579,
"rewards/chosen": 1.154022216796875,
"rewards/margins": 4.8662109375,
"rewards/rejected": -3.709179639816284,
"step": 940
},
{
"epoch": 0.6733766812149283,
"grad_norm": 9.046751022338867,
"learning_rate": 2.92395426824308e-07,
"logits/chosen": -2.3316407203674316,
"logits/rejected": -2.342968702316284,
"logps/chosen": -86.04374694824219,
"logps/rejected": -103.79374694824219,
"loss": 0.1121,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 1.208886742591858,
"rewards/margins": 5.004492282867432,
"rewards/rejected": -3.7982420921325684,
"step": 945
},
{
"epoch": 0.6769395207980761,
"grad_norm": 6.2202277183532715,
"learning_rate": 2.867500290264814e-07,
"logits/chosen": -2.318359375,
"logits/rejected": -2.328125,
"logps/chosen": -92.0999984741211,
"logps/rejected": -108.3499984741211,
"loss": 0.1039,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": 1.389562964439392,
"rewards/margins": 5.326171875,
"rewards/rejected": -3.936328172683716,
"step": 950
},
{
"epoch": 0.6805023603812238,
"grad_norm": 9.26759147644043,
"learning_rate": 2.8113766851884257e-07,
"logits/chosen": -2.31640625,
"logits/rejected": -2.325390577316284,
"logps/chosen": -88.7437515258789,
"logps/rejected": -104.32499694824219,
"loss": 0.0947,
"rewards/accuracies": 0.981249988079071,
"rewards/chosen": 1.828588843345642,
"rewards/margins": 5.205273628234863,
"rewards/rejected": -3.3765625953674316,
"step": 955
},
{
"epoch": 0.6840651999643717,
"grad_norm": 7.322399616241455,
"learning_rate": 2.75559214784196e-07,
"logits/chosen": -2.315624952316284,
"logits/rejected": -2.329296827316284,
"logps/chosen": -88.3687515258789,
"logps/rejected": -102.66874694824219,
"loss": 0.1189,
"rewards/accuracies": 0.956250011920929,
"rewards/chosen": 1.0693359375,
"rewards/margins": 5.214062690734863,
"rewards/rejected": -4.143164157867432,
"step": 960
},
{
"epoch": 0.6876280395475194,
"grad_norm": 7.353623390197754,
"learning_rate": 2.700155320524119e-07,
"logits/chosen": -2.282421827316284,
"logits/rejected": -2.315234422683716,
"logps/chosen": -76.9937515258789,
"logps/rejected": -90.01249694824219,
"loss": 0.1001,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": 1.221459984779358,
"rewards/margins": 4.799218654632568,
"rewards/rejected": -3.5804686546325684,
"step": 965
},
{
"epoch": 0.6911908791306671,
"grad_norm": 16.381595611572266,
"learning_rate": 2.6450747916653853e-07,
"logits/chosen": -2.3167967796325684,
"logits/rejected": -2.3304686546325684,
"logps/chosen": -91.53125,
"logps/rejected": -113.7874984741211,
"loss": 0.1369,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": 1.2250487804412842,
"rewards/margins": 5.068749904632568,
"rewards/rejected": -3.841992139816284,
"step": 970
},
{
"epoch": 0.6947537187138149,
"grad_norm": 6.328347206115723,
"learning_rate": 2.5903590944974787e-07,
"logits/chosen": -2.3199219703674316,
"logits/rejected": -2.327343702316284,
"logps/chosen": -98.11250305175781,
"logps/rejected": -126.39375305175781,
"loss": 0.1543,
"rewards/accuracies": 0.956250011920929,
"rewards/chosen": 0.3526367247104645,
"rewards/margins": 5.135546684265137,
"rewards/rejected": -4.785742282867432,
"step": 975
},
{
"epoch": 0.6983165582969627,
"grad_norm": 4.730679988861084,
"learning_rate": 2.5360167057313507e-07,
"logits/chosen": -2.331249952316284,
"logits/rejected": -2.350390672683716,
"logps/chosen": -101.5625,
"logps/rejected": -118.92500305175781,
"loss": 0.1445,
"rewards/accuracies": 0.9312499761581421,
"rewards/chosen": 1.254968285560608,
"rewards/margins": 5.006249904632568,
"rewards/rejected": -3.749218702316284,
"step": 980
},
{
"epoch": 0.7018793978801104,
"grad_norm": 27.285436630249023,
"learning_rate": 2.4820560442439597e-07,
"logits/chosen": -2.301953077316284,
"logits/rejected": -2.31640625,
"logps/chosen": -76.82499694824219,
"logps/rejected": -97.3125,
"loss": 0.1167,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 1.947167992591858,
"rewards/margins": 5.776171684265137,
"rewards/rejected": -3.8340821266174316,
"step": 985
},
{
"epoch": 0.7054422374632582,
"grad_norm": 9.996295928955078,
"learning_rate": 2.428485469773997e-07,
"logits/chosen": -2.302929639816284,
"logits/rejected": -2.3179688453674316,
"logps/chosen": -95.64375305175781,
"logps/rejected": -113.75,
"loss": 0.1502,
"rewards/accuracies": 0.9437500238418579,
"rewards/chosen": 0.806933581829071,
"rewards/margins": 4.828125,
"rewards/rejected": -4.019690036773682,
"step": 990
},
{
"epoch": 0.709005077046406,
"grad_norm": 10.70380973815918,
"learning_rate": 2.3753132816267573e-07,
"logits/chosen": -2.3402342796325684,
"logits/rejected": -2.335156202316284,
"logps/chosen": -100.13749694824219,
"logps/rejected": -118.3375015258789,
"loss": 0.131,
"rewards/accuracies": 0.956250011920929,
"rewards/chosen": 1.5045897960662842,
"rewards/margins": 5.556250095367432,
"rewards/rejected": -4.048828125,
"step": 995
},
{
"epoch": 0.7125679166295538,
"grad_norm": 18.55549430847168,
"learning_rate": 2.322547717388406e-07,
"logits/chosen": -2.2847657203674316,
"logits/rejected": -2.299609422683716,
"logps/chosen": -86.5625,
"logps/rejected": -107.65625,
"loss": 0.1475,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.8027588129043579,
"rewards/margins": 4.818457126617432,
"rewards/rejected": -4.014843940734863,
"step": 1000
},
{
"epoch": 0.7161307562127015,
"grad_norm": 8.651657104492188,
"learning_rate": 2.2701969516497738e-07,
"logits/chosen": -2.267578125,
"logits/rejected": -2.2945313453674316,
"logps/chosen": -81.4625015258789,
"logps/rejected": -97.1624984741211,
"loss": 0.1165,
"rewards/accuracies": 0.956250011920929,
"rewards/chosen": 0.997119128704071,
"rewards/margins": 5.078125,
"rewards/rejected": -4.081250190734863,
"step": 1005
},
{
"epoch": 0.7196935957958492,
"grad_norm": 8.88605785369873,
"learning_rate": 2.2182690947399303e-07,
"logits/chosen": -2.3101563453674316,
"logits/rejected": -2.299999952316284,
"logps/chosen": -94.23750305175781,
"logps/rejected": -113.11250305175781,
"loss": 0.1974,
"rewards/accuracies": 0.9312499761581421,
"rewards/chosen": 0.884765625,
"rewards/margins": 5.18798828125,
"rewards/rejected": -4.304101467132568,
"step": 1010
},
{
"epoch": 0.7232564353789971,
"grad_norm": 355.3295593261719,
"learning_rate": 2.1667721914697173e-07,
"logits/chosen": -2.3101563453674316,
"logits/rejected": -2.3218750953674316,
"logps/chosen": -84.8187484741211,
"logps/rejected": -97.64375305175781,
"loss": 0.1277,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 1.81243896484375,
"rewards/margins": 5.278515815734863,
"rewards/rejected": -3.4693360328674316,
"step": 1015
},
{
"epoch": 0.7268192749621448,
"grad_norm": 13.823278427124023,
"learning_rate": 2.11571421988541e-07,
"logits/chosen": -2.305468797683716,
"logits/rejected": -2.315624952316284,
"logps/chosen": -89.63749694824219,
"logps/rejected": -110.94999694824219,
"loss": 0.1137,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 1.272363305091858,
"rewards/margins": 5.442187309265137,
"rewards/rejected": -4.17041015625,
"step": 1020
},
{
"epoch": 0.7303821145452926,
"grad_norm": 8.384671211242676,
"learning_rate": 2.065103090032743e-07,
"logits/chosen": -2.3109374046325684,
"logits/rejected": -2.3167967796325684,
"logps/chosen": -91.98750305175781,
"logps/rejected": -102.36250305175781,
"loss": 0.1786,
"rewards/accuracies": 0.9437500238418579,
"rewards/chosen": 0.9146575927734375,
"rewards/margins": 4.707129001617432,
"rewards/rejected": -3.791015625,
"step": 1025
},
{
"epoch": 0.7339449541284404,
"grad_norm": 17.72806167602539,
"learning_rate": 2.014946642731468e-07,
"logits/chosen": -2.2828125953674316,
"logits/rejected": -2.270703077316284,
"logps/chosen": -74.35624694824219,
"logps/rejected": -95.42500305175781,
"loss": 0.1665,
"rewards/accuracies": 0.9437500238418579,
"rewards/chosen": 1.3673095703125,
"rewards/margins": 5.188672065734863,
"rewards/rejected": -3.819140672683716,
"step": 1030
},
{
"epoch": 0.7375077937115881,
"grad_norm": 14.17636775970459,
"learning_rate": 1.9652526483606196e-07,
"logits/chosen": -2.2515625953674316,
"logits/rejected": -2.276562452316284,
"logps/chosen": -74.0625,
"logps/rejected": -95.38749694824219,
"loss": 0.1077,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": 1.250244140625,
"rewards/margins": 5.008203029632568,
"rewards/rejected": -3.7601561546325684,
"step": 1035
},
{
"epoch": 0.7410706332947359,
"grad_norm": 8.393796920776367,
"learning_rate": 1.9160288056547196e-07,
"logits/chosen": -2.256640672683716,
"logits/rejected": -2.3023438453674316,
"logps/chosen": -88.26875305175781,
"logps/rejected": -103.7249984741211,
"loss": 0.1336,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 1.1659667491912842,
"rewards/margins": 4.723242282867432,
"rewards/rejected": -3.5577149391174316,
"step": 1040
},
{
"epoch": 0.7446334728778837,
"grad_norm": 4.969725131988525,
"learning_rate": 1.867282740511056e-07,
"logits/chosen": -2.2894530296325684,
"logits/rejected": -2.305468797683716,
"logps/chosen": -89.5875015258789,
"logps/rejected": -112.2125015258789,
"loss": 0.1522,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": 0.8788086175918579,
"rewards/margins": 5.113476753234863,
"rewards/rejected": -4.235547065734863,
"step": 1045
},
{
"epoch": 0.7481963124610315,
"grad_norm": 10.194605827331543,
"learning_rate": 1.819022004808261e-07,
"logits/chosen": -2.303906202316284,
"logits/rejected": -2.3179688453674316,
"logps/chosen": -95.14375305175781,
"logps/rejected": -117.89375305175781,
"loss": 0.1688,
"rewards/accuracies": 0.9437500238418579,
"rewards/chosen": 1.0526001453399658,
"rewards/margins": 4.858056545257568,
"rewards/rejected": -3.808300733566284,
"step": 1050
},
{
"epoch": 0.7517591520441792,
"grad_norm": 8.145842552185059,
"learning_rate": 1.7712540752363607e-07,
"logits/chosen": -2.301953077316284,
"logits/rejected": -2.302734375,
"logps/chosen": -81.76249694824219,
"logps/rejected": -106.76249694824219,
"loss": 0.2246,
"rewards/accuracies": 0.9312499761581421,
"rewards/chosen": 1.1788451671600342,
"rewards/margins": 5.237695217132568,
"rewards/rejected": -4.058789253234863,
"step": 1055
},
{
"epoch": 0.7553219916273269,
"grad_norm": 6.336703777313232,
"learning_rate": 1.7239863521384517e-07,
"logits/chosen": -2.332812547683716,
"logits/rejected": -2.321093797683716,
"logps/chosen": -91.76875305175781,
"logps/rejected": -111.5875015258789,
"loss": 0.1455,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": 1.3840820789337158,
"rewards/margins": 4.992383003234863,
"rewards/rejected": -3.6078124046325684,
"step": 1060
},
{
"epoch": 0.7588848312104748,
"grad_norm": 8.184171676635742,
"learning_rate": 1.677226158364225e-07,
"logits/chosen": -2.2992186546325684,
"logits/rejected": -2.315234422683716,
"logps/chosen": -107.5374984741211,
"logps/rejected": -118.79374694824219,
"loss": 0.2229,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": 1.201904296875,
"rewards/margins": 4.680468559265137,
"rewards/rejected": -3.4779295921325684,
"step": 1065
},
{
"epoch": 0.7624476707936225,
"grad_norm": 9.234628677368164,
"learning_rate": 1.6309807381354957e-07,
"logits/chosen": -2.291210889816284,
"logits/rejected": -2.314453125,
"logps/chosen": -90.58125305175781,
"logps/rejected": -106.86250305175781,
"loss": 0.1176,
"rewards/accuracies": 0.9375,
"rewards/chosen": 1.29840087890625,
"rewards/margins": 5.166796684265137,
"rewards/rejected": -3.872265577316284,
"step": 1070
},
{
"epoch": 0.7660105103767703,
"grad_norm": 5.269473075866699,
"learning_rate": 1.5852572559238941e-07,
"logits/chosen": -2.289843797683716,
"logits/rejected": -2.32421875,
"logps/chosen": -99.6312484741211,
"logps/rejected": -115.7125015258789,
"loss": 0.1284,
"rewards/accuracies": 0.956250011920929,
"rewards/chosen": 1.0069701671600342,
"rewards/margins": 4.695214748382568,
"rewards/rejected": -3.688183546066284,
"step": 1075
},
{
"epoch": 0.7695733499599181,
"grad_norm": 9.90135669708252,
"learning_rate": 1.5400627953409394e-07,
"logits/chosen": -2.3121094703674316,
"logits/rejected": -2.309375047683716,
"logps/chosen": -90.0062484741211,
"logps/rejected": -111.0,
"loss": 0.1327,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": 1.312280297279358,
"rewards/margins": 5.36328125,
"rewards/rejected": -4.051562309265137,
"step": 1080
},
{
"epoch": 0.7731361895430658,
"grad_norm": 11.21921443939209,
"learning_rate": 1.4954043580406155e-07,
"logits/chosen": -2.294921875,
"logits/rejected": -2.301562547683716,
"logps/chosen": -97.3499984741211,
"logps/rejected": -112.95625305175781,
"loss": 0.2028,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": 1.258544921875,
"rewards/margins": 5.259570121765137,
"rewards/rejected": -4.002831935882568,
"step": 1085
},
{
"epoch": 0.7766990291262136,
"grad_norm": 17.780261993408203,
"learning_rate": 1.4512888626346598e-07,
"logits/chosen": -2.2953124046325684,
"logits/rejected": -2.334765672683716,
"logps/chosen": -91.4312515258789,
"logps/rejected": -105.42500305175781,
"loss": 0.1559,
"rewards/accuracies": 0.9312499761581421,
"rewards/chosen": 1.1874573230743408,
"rewards/margins": 4.756054878234863,
"rewards/rejected": -3.5699219703674316,
"step": 1090
},
{
"epoch": 0.7802618687093613,
"grad_norm": 7.047048091888428,
"learning_rate": 1.407723143620716e-07,
"logits/chosen": -2.3238282203674316,
"logits/rejected": -2.334765672683716,
"logps/chosen": -104.80000305175781,
"logps/rejected": -128.02499389648438,
"loss": 0.0925,
"rewards/accuracies": 0.96875,
"rewards/chosen": 1.6564819812774658,
"rewards/margins": 5.543359279632568,
"rewards/rejected": -3.887402296066284,
"step": 1095
},
{
"epoch": 0.7838247082925092,
"grad_norm": 9.77812671661377,
"learning_rate": 1.3647139503235045e-07,
"logits/chosen": -2.262500047683716,
"logits/rejected": -2.2998046875,
"logps/chosen": -97.88749694824219,
"logps/rejected": -114.2874984741211,
"loss": 0.1671,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": 0.3259338438510895,
"rewards/margins": 4.857617378234863,
"rewards/rejected": -4.532422065734863,
"step": 1100
},
{
"epoch": 0.7873875478756569,
"grad_norm": 8.321101188659668,
"learning_rate": 1.3222679458492086e-07,
"logits/chosen": -2.2890625,
"logits/rejected": -2.3089842796325684,
"logps/chosen": -109.0250015258789,
"logps/rejected": -124.01249694824219,
"loss": 0.1252,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.8846069574356079,
"rewards/margins": 4.947070121765137,
"rewards/rejected": -4.061327934265137,
"step": 1105
},
{
"epoch": 0.7909503874588046,
"grad_norm": 73.0202865600586,
"learning_rate": 1.2803917060531993e-07,
"logits/chosen": -2.2777342796325684,
"logits/rejected": -2.309375047683716,
"logps/chosen": -99.2249984741211,
"logps/rejected": -110.9000015258789,
"loss": 0.1357,
"rewards/accuracies": 0.9437500238418579,
"rewards/chosen": 0.3677612245082855,
"rewards/margins": 4.772656440734863,
"rewards/rejected": -4.407422065734863,
"step": 1110
},
{
"epoch": 0.7945132270419525,
"grad_norm": 4.360289096832275,
"learning_rate": 1.2390917185212863e-07,
"logits/chosen": -2.262500047683716,
"logits/rejected": -2.278125047683716,
"logps/chosen": -92.60624694824219,
"logps/rejected": -107.5250015258789,
"loss": 0.1152,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 1.065679907798767,
"rewards/margins": 4.841601371765137,
"rewards/rejected": -3.7777342796325684,
"step": 1115
},
{
"epoch": 0.7980760666251002,
"grad_norm": 23.889127731323242,
"learning_rate": 1.1983743815646508e-07,
"logits/chosen": -2.251171827316284,
"logits/rejected": -2.2945313453674316,
"logps/chosen": -98.3187484741211,
"logps/rejected": -109.01249694824219,
"loss": 0.2434,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.17170409858226776,
"rewards/margins": 4.185156345367432,
"rewards/rejected": -4.01220703125,
"step": 1120
},
{
"epoch": 0.801638906208248,
"grad_norm": 3.363276481628418,
"learning_rate": 1.158246003228589e-07,
"logits/chosen": -2.2861328125,
"logits/rejected": -2.29296875,
"logps/chosen": -93.25,
"logps/rejected": -108.80000305175781,
"loss": 0.1153,
"rewards/accuracies": 0.956250011920929,
"rewards/chosen": 0.910595715045929,
"rewards/margins": 5.132616996765137,
"rewards/rejected": -4.221289157867432,
"step": 1125
},
{
"epoch": 0.8052017457913957,
"grad_norm": 7.905906677246094,
"learning_rate": 1.1187128003152579e-07,
"logits/chosen": -2.283203125,
"logits/rejected": -2.288281202316284,
"logps/chosen": -85.39375305175781,
"logps/rejected": -107.3499984741211,
"loss": 0.1325,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 1.2729613780975342,
"rewards/margins": 5.160742282867432,
"rewards/rejected": -3.8871092796325684,
"step": 1130
},
{
"epoch": 0.8087645853745435,
"grad_norm": 7.837643146514893,
"learning_rate": 1.0797808974205552e-07,
"logits/chosen": -2.289843797683716,
"logits/rejected": -2.287890672683716,
"logps/chosen": -82.48750305175781,
"logps/rejected": -99.38749694824219,
"loss": 0.1171,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": 1.919165015220642,
"rewards/margins": 5.359765529632568,
"rewards/rejected": -3.4407715797424316,
"step": 1135
},
{
"epoch": 0.8123274249576913,
"grad_norm": 11.885445594787598,
"learning_rate": 1.0414563259852682e-07,
"logits/chosen": -2.298046827316284,
"logits/rejected": -2.29296875,
"logps/chosen": -97.96875,
"logps/rejected": -120.92500305175781,
"loss": 0.1138,
"rewards/accuracies": 0.956250011920929,
"rewards/chosen": 0.620800793170929,
"rewards/margins": 5.161328315734863,
"rewards/rejected": -4.541406154632568,
"step": 1140
},
{
"epoch": 0.815890264540839,
"grad_norm": 8.711338996887207,
"learning_rate": 1.0037450233606782e-07,
"logits/chosen": -2.262500047683716,
"logits/rejected": -2.270703077316284,
"logps/chosen": -84.22187805175781,
"logps/rejected": -105.32499694824219,
"loss": 0.1101,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 1.4210205078125,
"rewards/margins": 5.448046684265137,
"rewards/rejected": -4.029687404632568,
"step": 1145
},
{
"epoch": 0.8194531041239869,
"grad_norm": 11.828937530517578,
"learning_rate": 9.666528318887196e-08,
"logits/chosen": -2.263867139816284,
"logits/rejected": -2.305468797683716,
"logps/chosen": -90.71875,
"logps/rejected": -104.2750015258789,
"loss": 0.151,
"rewards/accuracies": 0.956250011920929,
"rewards/chosen": 1.256616234779358,
"rewards/margins": 4.792187690734863,
"rewards/rejected": -3.5337891578674316,
"step": 1150
},
{
"epoch": 0.8230159437071346,
"grad_norm": 6.786935329437256,
"learning_rate": 9.301854979968715e-08,
"logits/chosen": -2.2972655296325684,
"logits/rejected": -2.315624952316284,
"logps/chosen": -87.6500015258789,
"logps/rejected": -104.4000015258789,
"loss": 0.1213,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 1.2689208984375,
"rewards/margins": 4.808203220367432,
"rewards/rejected": -3.541015625,
"step": 1155
},
{
"epoch": 0.8265787832902823,
"grad_norm": 6.224339962005615,
"learning_rate": 8.943486713079068e-08,
"logits/chosen": -2.317187547683716,
"logits/rejected": -2.309765577316284,
"logps/chosen": -92.5562515258789,
"logps/rejected": -113.80000305175781,
"loss": 0.1581,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.9830077886581421,
"rewards/margins": 5.277148246765137,
"rewards/rejected": -4.294531345367432,
"step": 1160
},
{
"epoch": 0.8301416228734301,
"grad_norm": 12.381536483764648,
"learning_rate": 8.59147903764636e-08,
"logits/chosen": -2.2671875953674316,
"logits/rejected": -2.287890672683716,
"logps/chosen": -96.29374694824219,
"logps/rejected": -107.6500015258789,
"loss": 0.1256,
"rewards/accuracies": 0.9375,
"rewards/chosen": 1.2315673828125,
"rewards/margins": 4.880859375,
"rewards/rejected": -3.6522459983825684,
"step": 1165
},
{
"epoch": 0.8337044624565779,
"grad_norm": 6.709888458251953,
"learning_rate": 8.245886487697778e-08,
"logits/chosen": -2.3125,
"logits/rejected": -2.3101563453674316,
"logps/chosen": -92.6624984741211,
"logps/rejected": -111.0875015258789,
"loss": 0.147,
"rewards/accuracies": 0.9437500238418579,
"rewards/chosen": 1.50537109375,
"rewards/margins": 5.387890815734863,
"rewards/rejected": -3.8837890625,
"step": 1170
},
{
"epoch": 0.8372673020397257,
"grad_norm": 9.564090728759766,
"learning_rate": 7.906762603411132e-08,
"logits/chosen": -2.2457032203674316,
"logits/rejected": -2.254687547683716,
"logps/chosen": -71.33125305175781,
"logps/rejected": -92.80000305175781,
"loss": 0.1114,
"rewards/accuracies": 0.956250011920929,
"rewards/chosen": 1.35357666015625,
"rewards/margins": 4.905859470367432,
"rewards/rejected": -3.548046827316284,
"step": 1175
},
{
"epoch": 0.8408301416228734,
"grad_norm": 6.03350305557251,
"learning_rate": 7.574159922820184e-08,
"logits/chosen": -2.30859375,
"logits/rejected": -2.325390577316284,
"logps/chosen": -93.82499694824219,
"logps/rejected": -116.7562484741211,
"loss": 0.1185,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": 1.304834008216858,
"rewards/margins": 5.476758003234863,
"rewards/rejected": -4.173047065734863,
"step": 1180
},
{
"epoch": 0.8443929812060212,
"grad_norm": 7.182165145874023,
"learning_rate": 7.24812997367531e-08,
"logits/chosen": -2.262890577316284,
"logits/rejected": -2.2757811546325684,
"logps/chosen": -87.3812484741211,
"logps/rejected": -101.5250015258789,
"loss": 0.091,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 1.500463843345642,
"rewards/margins": 5.475976467132568,
"rewards/rejected": -3.97216796875,
"step": 1185
},
{
"epoch": 0.847955820789169,
"grad_norm": 9.104799270629883,
"learning_rate": 6.928723265460734e-08,
"logits/chosen": -2.255859375,
"logits/rejected": -2.260546922683716,
"logps/chosen": -88.0687484741211,
"logps/rejected": -105.2874984741211,
"loss": 0.1121,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": 1.0933105945587158,
"rewards/margins": 5.320508003234863,
"rewards/rejected": -4.226758003234863,
"step": 1190
},
{
"epoch": 0.8515186603723167,
"grad_norm": 17.943103790283203,
"learning_rate": 6.615989281569373e-08,
"logits/chosen": -2.3070311546325684,
"logits/rejected": -2.317187547683716,
"logps/chosen": -100.07499694824219,
"logps/rejected": -117.2249984741211,
"loss": 0.1486,
"rewards/accuracies": 0.9437500238418579,
"rewards/chosen": 1.299291968345642,
"rewards/margins": 5.075781345367432,
"rewards/rejected": -3.7769532203674316,
"step": 1195
},
{
"epoch": 0.8550814999554645,
"grad_norm": 7.1349287033081055,
"learning_rate": 6.309976471636808e-08,
"logits/chosen": -2.3128905296325684,
"logits/rejected": -2.315624952316284,
"logps/chosen": -87.25,
"logps/rejected": -102.5,
"loss": 0.0928,
"rewards/accuracies": 0.96875,
"rewards/chosen": 1.4921143054962158,
"rewards/margins": 5.408398628234863,
"rewards/rejected": -3.91796875,
"step": 1200
},
{
"epoch": 0.8586443395386123,
"grad_norm": 10.989272117614746,
"learning_rate": 6.010732244035266e-08,
"logits/chosen": -2.272265672683716,
"logits/rejected": -2.297656297683716,
"logps/chosen": -77.6812515258789,
"logps/rejected": -101.48750305175781,
"loss": 0.1136,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 1.5633544921875,
"rewards/margins": 5.681640625,
"rewards/rejected": -4.119531154632568,
"step": 1205
},
{
"epoch": 0.86220717912176,
"grad_norm": 26.202138900756836,
"learning_rate": 5.7183029585289975e-08,
"logits/chosen": -2.2964844703674316,
"logits/rejected": -2.2992186546325684,
"logps/chosen": -95.7874984741211,
"logps/rejected": -115.6875,
"loss": 0.1312,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 1.4753844738006592,
"rewards/margins": 5.235547065734863,
"rewards/rejected": -3.76171875,
"step": 1210
},
{
"epoch": 0.8657700187049078,
"grad_norm": 10.135671615600586,
"learning_rate": 5.432733919092147e-08,
"logits/chosen": -2.2984375953674316,
"logits/rejected": -2.2890625,
"logps/chosen": -94.0625,
"logps/rejected": -120.42500305175781,
"loss": 0.1334,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 1.158136010169983,
"rewards/margins": 5.209374904632568,
"rewards/rejected": -4.052783012390137,
"step": 1215
},
{
"epoch": 0.8693328582880556,
"grad_norm": 14.827692031860352,
"learning_rate": 5.1540693668900346e-08,
"logits/chosen": -2.2582030296325684,
"logits/rejected": -2.268359422683716,
"logps/chosen": -92.1937484741211,
"logps/rejected": -112.4375,
"loss": 0.1352,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 1.57037353515625,
"rewards/margins": 5.102734565734863,
"rewards/rejected": -3.5328125953674316,
"step": 1220
},
{
"epoch": 0.8728956978712034,
"grad_norm": 2.7754666805267334,
"learning_rate": 4.882352473425255e-08,
"logits/chosen": -2.2503905296325684,
"logits/rejected": -2.2750000953674316,
"logps/chosen": -82.34375,
"logps/rejected": -100.625,
"loss": 0.1322,
"rewards/accuracies": 0.9437500238418579,
"rewards/chosen": 1.242883324623108,
"rewards/margins": 4.870898246765137,
"rewards/rejected": -3.628710985183716,
"step": 1225
},
{
"epoch": 0.8764585374543511,
"grad_norm": 8.408917427062988,
"learning_rate": 4.6176253338494344e-08,
"logits/chosen": -2.232421875,
"logits/rejected": -2.251171827316284,
"logps/chosen": -86.6875,
"logps/rejected": -100.7750015258789,
"loss": 0.1269,
"rewards/accuracies": 0.956250011920929,
"rewards/chosen": 0.8370605707168579,
"rewards/margins": 4.563672065734863,
"rewards/rejected": -3.7256836891174316,
"step": 1230
},
{
"epoch": 0.8800213770374989,
"grad_norm": 4.768385887145996,
"learning_rate": 4.3599289604416614e-08,
"logits/chosen": -2.2777342796325684,
"logits/rejected": -2.2796874046325684,
"logps/chosen": -84.90625,
"logps/rejected": -101.0374984741211,
"loss": 0.1378,
"rewards/accuracies": 0.956250011920929,
"rewards/chosen": 1.4487793445587158,
"rewards/margins": 5.4912109375,
"rewards/rejected": -4.040234565734863,
"step": 1235
},
{
"epoch": 0.8835842166206467,
"grad_norm": 9.690278053283691,
"learning_rate": 4.10930327625485e-08,
"logits/chosen": -2.298828125,
"logits/rejected": -2.307812452316284,
"logps/chosen": -94.4937515258789,
"logps/rejected": -111.41874694824219,
"loss": 0.1103,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 0.762249767780304,
"rewards/margins": 4.697265625,
"rewards/rejected": -3.93701171875,
"step": 1240
},
{
"epoch": 0.8871470562037944,
"grad_norm": 8.591952323913574,
"learning_rate": 3.865787108930646e-08,
"logits/chosen": -2.2587890625,
"logits/rejected": -2.2367186546325684,
"logps/chosen": -97.4625015258789,
"logps/rejected": -110.23750305175781,
"loss": 0.2029,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 1.166015625,
"rewards/margins": 5.281054496765137,
"rewards/rejected": -4.11328125,
"step": 1245
},
{
"epoch": 0.8907098957869422,
"grad_norm": 4.010148525238037,
"learning_rate": 3.629418184684185e-08,
"logits/chosen": -2.291015625,
"logits/rejected": -2.291796922683716,
"logps/chosen": -95.9625015258789,
"logps/rejected": -115.5250015258789,
"loss": 0.1164,
"rewards/accuracies": 0.96875,
"rewards/chosen": 1.5761597156524658,
"rewards/margins": 5.4794921875,
"rewards/rejected": -3.8990235328674316,
"step": 1250
},
{
"epoch": 0.89427273537009,
"grad_norm": 12.358043670654297,
"learning_rate": 3.400233122459473e-08,
"logits/chosen": -2.2300782203674316,
"logits/rejected": -2.2855467796325684,
"logps/chosen": -99.0687484741211,
"logps/rejected": -109.19999694824219,
"loss": 0.163,
"rewards/accuracies": 0.9437500238418579,
"rewards/chosen": 0.37403565645217896,
"rewards/margins": 4.477343559265137,
"rewards/rejected": -4.104687690734863,
"step": 1255
},
{
"epoch": 0.8978355749532377,
"grad_norm": 5.314749717712402,
"learning_rate": 3.1782674282562094e-08,
"logits/chosen": -2.2640624046325684,
"logits/rejected": -2.268749952316284,
"logps/chosen": -76.625,
"logps/rejected": -94.67500305175781,
"loss": 0.1039,
"rewards/accuracies": 0.96875,
"rewards/chosen": 1.71014404296875,
"rewards/margins": 5.317773342132568,
"rewards/rejected": -3.611035108566284,
"step": 1260
},
{
"epoch": 0.9013984145363855,
"grad_norm": 11.101000785827637,
"learning_rate": 2.9635554896291326e-08,
"logits/chosen": -2.26171875,
"logits/rejected": -2.278125047683716,
"logps/chosen": -98.7874984741211,
"logps/rejected": -113.3125,
"loss": 0.2612,
"rewards/accuracies": 0.9312499761581421,
"rewards/chosen": 0.4896484315395355,
"rewards/margins": 4.54150390625,
"rewards/rejected": -4.050000190734863,
"step": 1265
},
{
"epoch": 0.9049612541195333,
"grad_norm": 7.049961090087891,
"learning_rate": 2.7561305703606207e-08,
"logits/chosen": -2.285937547683716,
"logits/rejected": -2.295703172683716,
"logps/chosen": -97.1937484741211,
"logps/rejected": -112.5875015258789,
"loss": 0.1048,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": 1.5007812976837158,
"rewards/margins": 5.321875095367432,
"rewards/rejected": -3.821484327316284,
"step": 1270
},
{
"epoch": 0.9085240937026811,
"grad_norm": 14.669742584228516,
"learning_rate": 2.5560248053073164e-08,
"logits/chosen": -2.2640624046325684,
"logits/rejected": -2.280078172683716,
"logps/chosen": -98.5,
"logps/rejected": -121.6875,
"loss": 0.1746,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": 0.07766113430261612,
"rewards/margins": 4.587890625,
"rewards/rejected": -4.511034965515137,
"step": 1275
},
{
"epoch": 0.9120869332858288,
"grad_norm": 5.834444522857666,
"learning_rate": 2.3632691954217742e-08,
"logits/chosen": -2.253124952316284,
"logits/rejected": -2.270312547683716,
"logps/chosen": -87.1312484741211,
"logps/rejected": -107.2750015258789,
"loss": 0.1106,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 1.564550757408142,
"rewards/margins": 5.344336032867432,
"rewards/rejected": -3.7816405296325684,
"step": 1280
},
{
"epoch": 0.9156497728689765,
"grad_norm": 3.6613357067108154,
"learning_rate": 2.1778936029496376e-08,
"logits/chosen": -2.3011717796325684,
"logits/rejected": -2.299609422683716,
"logps/chosen": -99.10624694824219,
"logps/rejected": -116.0999984741211,
"loss": 0.119,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": 0.7597290277481079,
"rewards/margins": 5.019140720367432,
"rewards/rejected": -4.260546684265137,
"step": 1285
},
{
"epoch": 0.9192126124521244,
"grad_norm": 10.60155963897705,
"learning_rate": 1.999926746803332e-08,
"logits/chosen": -2.265625,
"logits/rejected": -2.250195264816284,
"logps/chosen": -79.88749694824219,
"logps/rejected": -101.5,
"loss": 0.13,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": 1.0681030750274658,
"rewards/margins": 5.093359470367432,
"rewards/rejected": -4.023046970367432,
"step": 1290
},
{
"epoch": 0.9227754520352721,
"grad_norm": 3.0230162143707275,
"learning_rate": 1.8293961981128592e-08,
"logits/chosen": -2.306640625,
"logits/rejected": -2.313671827316284,
"logps/chosen": -107.9000015258789,
"logps/rejected": -122.07499694824219,
"loss": 0.2062,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": 0.44135743379592896,
"rewards/margins": 5.301171779632568,
"rewards/rejected": -4.86376953125,
"step": 1295
},
{
"epoch": 0.9263382916184199,
"grad_norm": 6.370534896850586,
"learning_rate": 1.6663283759543678e-08,
"logits/chosen": -2.247265577316284,
"logits/rejected": -2.274218797683716,
"logps/chosen": -96.35624694824219,
"logps/rejected": -115.8687515258789,
"loss": 0.1412,
"rewards/accuracies": 0.956250011920929,
"rewards/chosen": 0.6146484613418579,
"rewards/margins": 4.799218654632568,
"rewards/rejected": -4.187402248382568,
"step": 1300
},
{
"epoch": 0.9299011312015677,
"grad_norm": 6.875704765319824,
"learning_rate": 1.510748543257262e-08,
"logits/chosen": -2.276171922683716,
"logits/rejected": -2.276171922683716,
"logps/chosen": -81.54374694824219,
"logps/rejected": -98.6875,
"loss": 0.0842,
"rewards/accuracies": 0.96875,
"rewards/chosen": 1.9011962413787842,
"rewards/margins": 5.594336032867432,
"rewards/rejected": -3.694140672683716,
"step": 1305
},
{
"epoch": 0.9334639707847154,
"grad_norm": 5.4383721351623535,
"learning_rate": 1.3626808028903757e-08,
"logits/chosen": -2.278125047683716,
"logits/rejected": -2.3148436546325684,
"logps/chosen": -86.5625,
"logps/rejected": -110.7750015258789,
"loss": 0.0813,
"rewards/accuracies": 0.981249988079071,
"rewards/chosen": 1.6395142078399658,
"rewards/margins": 5.509130954742432,
"rewards/rejected": -3.87109375,
"step": 1310
},
{
"epoch": 0.9370268103678632,
"grad_norm": 4.379004955291748,
"learning_rate": 1.2221480939278938e-08,
"logits/chosen": -2.288281202316284,
"logits/rejected": -2.274218797683716,
"logps/chosen": -97.1312484741211,
"logps/rejected": -116.13749694824219,
"loss": 0.1139,
"rewards/accuracies": 0.956250011920929,
"rewards/chosen": 1.6423828601837158,
"rewards/margins": 5.208300590515137,
"rewards/rejected": -3.566210985183716,
"step": 1315
},
{
"epoch": 0.9405896499510109,
"grad_norm": 9.59549617767334,
"learning_rate": 1.0891721880955996e-08,
"logits/chosen": -2.291015625,
"logits/rejected": -2.3125,
"logps/chosen": -92.05000305175781,
"logps/rejected": -101.92500305175781,
"loss": 0.0903,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": 1.4332764148712158,
"rewards/margins": 5.380273342132568,
"rewards/rejected": -3.946484327316284,
"step": 1320
},
{
"epoch": 0.9441524895341588,
"grad_norm": 10.30654239654541,
"learning_rate": 9.63773686397873e-09,
"logits/chosen": -2.2796874046325684,
"logits/rejected": -2.299609422683716,
"logps/chosen": -98.1500015258789,
"logps/rejected": -116.11250305175781,
"loss": 0.1958,
"rewards/accuracies": 0.9375,
"rewards/chosen": 0.719470202922821,
"rewards/margins": 4.872265815734863,
"rewards/rejected": -4.152734279632568,
"step": 1325
},
{
"epoch": 0.9477153291173065,
"grad_norm": 8.464811325073242,
"learning_rate": 8.459720159261718e-09,
"logits/chosen": -2.241992235183716,
"logits/rejected": -2.255859375,
"logps/chosen": -97.71875,
"logps/rejected": -104.10624694824219,
"loss": 0.1578,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.4008544981479645,
"rewards/margins": 4.505468845367432,
"rewards/rejected": -4.103125095367432,
"step": 1330
},
{
"epoch": 0.9512781687004542,
"grad_norm": 12.606581687927246,
"learning_rate": 7.35785426849328e-09,
"logits/chosen": -2.2738280296325684,
"logits/rejected": -2.298828125,
"logps/chosen": -78.5687484741211,
"logps/rejected": -97.07499694824219,
"loss": 0.1097,
"rewards/accuracies": 0.956250011920929,
"rewards/chosen": 1.5250976085662842,
"rewards/margins": 5.138671875,
"rewards/rejected": -3.612109422683716,
"step": 1335
},
{
"epoch": 0.9548410082836021,
"grad_norm": 6.5607805252075195,
"learning_rate": 6.3323098958615314e-09,
"logits/chosen": -2.283984422683716,
"logits/rejected": -2.2874999046325684,
"logps/chosen": -87.58125305175781,
"logps/rejected": -105.1500015258789,
"loss": 0.1212,
"rewards/accuracies": 0.981249988079071,
"rewards/chosen": 1.039770483970642,
"rewards/margins": 5.138476371765137,
"rewards/rejected": -4.1015625,
"step": 1340
},
{
"epoch": 0.9584038478667498,
"grad_norm": 84.18962097167969,
"learning_rate": 5.38324592160877e-09,
"logits/chosen": -2.268359422683716,
"logits/rejected": -2.274609327316284,
"logps/chosen": -103.46875,
"logps/rejected": -119.51875305175781,
"loss": 0.2778,
"rewards/accuracies": 0.887499988079071,
"rewards/chosen": 0.715136706829071,
"rewards/margins": 4.048925876617432,
"rewards/rejected": -3.3335938453674316,
"step": 1345
},
{
"epoch": 0.9619666874498976,
"grad_norm": 9.378565788269043,
"learning_rate": 4.5108093774169356e-09,
"logits/chosen": -2.267578125,
"logits/rejected": -2.291210889816284,
"logps/chosen": -104.89375305175781,
"logps/rejected": -122.2750015258789,
"loss": 0.2457,
"rewards/accuracies": 0.9312499761581421,
"rewards/chosen": 0.806683361530304,
"rewards/margins": 5.057812690734863,
"rewards/rejected": -4.248437404632568,
"step": 1350
},
{
"epoch": 0.9655295270330453,
"grad_norm": 11.539667129516602,
"learning_rate": 3.7151354236293897e-09,
"logits/chosen": -2.283984422683716,
"logits/rejected": -2.319531202316284,
"logps/chosen": -103.3499984741211,
"logps/rejected": -116.25,
"loss": 0.1492,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": 0.752197265625,
"rewards/margins": 4.77294921875,
"rewards/rejected": -4.019140720367432,
"step": 1355
},
{
"epoch": 0.9690923666161931,
"grad_norm": 29.576169967651367,
"learning_rate": 2.9963473283112216e-09,
"logits/chosen": -2.2437500953674316,
"logits/rejected": -2.2601561546325684,
"logps/chosen": -82.6187515258789,
"logps/rejected": -96.5875015258789,
"loss": 0.1919,
"rewards/accuracies": 0.90625,
"rewards/chosen": 1.0454590320587158,
"rewards/margins": 5.02978515625,
"rewards/rejected": -3.981738328933716,
"step": 1360
},
{
"epoch": 0.9726552061993409,
"grad_norm": 5.123119831085205,
"learning_rate": 2.3545564481523005e-09,
"logits/chosen": -2.278515577316284,
"logits/rejected": -2.282421827316284,
"logps/chosen": -84.13749694824219,
"logps/rejected": -93.4375,
"loss": 0.1302,
"rewards/accuracies": 0.9437500238418579,
"rewards/chosen": 1.513513207435608,
"rewards/margins": 5.091796875,
"rewards/rejected": -3.576367139816284,
"step": 1365
},
{
"epoch": 0.9762180457824886,
"grad_norm": 12.932695388793945,
"learning_rate": 1.7898622112156314e-09,
"logits/chosen": -2.2816405296325684,
"logits/rejected": -2.280468702316284,
"logps/chosen": -85.46875,
"logps/rejected": -110.5875015258789,
"loss": 0.1621,
"rewards/accuracies": 0.9312499761581421,
"rewards/chosen": 1.1466248035430908,
"rewards/margins": 5.542578220367432,
"rewards/rejected": -4.399218559265137,
"step": 1370
},
{
"epoch": 0.9797808853656365,
"grad_norm": 8.279297828674316,
"learning_rate": 1.3023521015336768e-09,
"logits/chosen": -2.303515672683716,
"logits/rejected": -2.294140577316284,
"logps/chosen": -109.96875,
"logps/rejected": -120.4000015258789,
"loss": 0.1537,
"rewards/accuracies": 0.9437500238418579,
"rewards/chosen": 0.3752075135707855,
"rewards/margins": 4.730664253234863,
"rewards/rejected": -4.355859279632568,
"step": 1375
},
{
"epoch": 0.9833437249487842,
"grad_norm": 14.932358741760254,
"learning_rate": 8.921016455548658e-10,
"logits/chosen": -2.234375,
"logits/rejected": -2.270312547683716,
"logps/chosen": -90.1500015258789,
"logps/rejected": -101.625,
"loss": 0.1677,
"rewards/accuracies": 0.9375,
"rewards/chosen": 0.3759964108467102,
"rewards/margins": 4.690625190734863,
"rewards/rejected": -4.31494140625,
"step": 1380
},
{
"epoch": 0.9869065645319319,
"grad_norm": 5.358438968658447,
"learning_rate": 5.591744004432853e-10,
"logits/chosen": -2.2476563453674316,
"logits/rejected": -2.2632813453674316,
"logps/chosen": -89.57499694824219,
"logps/rejected": -108.0374984741211,
"loss": 0.1115,
"rewards/accuracies": 0.956250011920929,
"rewards/chosen": 1.094567894935608,
"rewards/margins": 4.923828125,
"rewards/rejected": -3.8304686546325684,
"step": 1385
},
{
"epoch": 0.9904694041150797,
"grad_norm": 12.948234558105469,
"learning_rate": 3.036219442317245e-10,
"logits/chosen": -2.258984327316284,
"logits/rejected": -2.28515625,
"logps/chosen": -81.38749694824219,
"logps/rejected": -102.125,
"loss": 0.1064,
"rewards/accuracies": 0.956250011920929,
"rewards/chosen": 1.3934326171875,
"rewards/margins": 5.1748046875,
"rewards/rejected": -3.782031297683716,
"step": 1390
},
{
"epoch": 0.9940322436982275,
"grad_norm": 7.298853874206543,
"learning_rate": 1.2548386783134413e-10,
"logits/chosen": -2.2681641578674316,
"logits/rejected": -2.2554688453674316,
"logps/chosen": -89.92500305175781,
"logps/rejected": -107.67500305175781,
"loss": 0.2167,
"rewards/accuracies": 0.9624999761581421,
"rewards/chosen": 0.764086902141571,
"rewards/margins": 5.166211128234863,
"rewards/rejected": -4.402539253234863,
"step": 1395
},
{
"epoch": 0.9975950832813753,
"grad_norm": 14.198148727416992,
"learning_rate": 2.4787768897971405e-11,
"logits/chosen": -2.305859327316284,
"logits/rejected": -2.309765577316284,
"logps/chosen": -111.25,
"logps/rejected": -129.9499969482422,
"loss": 0.1927,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": 0.5697265863418579,
"rewards/margins": 4.261132717132568,
"rewards/rejected": -3.6939454078674316,
"step": 1400
},
{
"epoch": 0.9997327870312639,
"step": 1403,
"total_flos": 0.0,
"train_loss": 0.2128710214231835,
"train_runtime": 9706.3948,
"train_samples_per_second": 4.626,
"train_steps_per_second": 0.145
}
],
"logging_steps": 5,
"max_steps": 1403,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}