diff --git "a/checkpoint-2709/trainer_state.json" "b/checkpoint-2709/trainer_state.json" new file mode 100644--- /dev/null +++ "b/checkpoint-2709/trainer_state.json" @@ -0,0 +1,40797 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 339, + "global_step": 2709, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0, + "eval_kl": 0.0, + "eval_logits/chosen": 373229834.73730683, + "eval_logits/rejected": 328986149.5224586, + "eval_logps/chosen": -383.5455298013245, + "eval_logps/rejected": -378.2536569148936, + "eval_loss": 0.5, + "eval_rewards/chosen": 0.0, + "eval_rewards/margins": 0.0, + "eval_rewards/rejected": 0.0, + "eval_runtime": 50.0707, + "eval_samples_per_second": 17.495, + "eval_steps_per_second": 4.374, + "step": 0 + }, + { + "epoch": 0.0003691569378432006, + "grad_norm": 15.1875, + "kl": 0.0, + "learning_rate": 0.0, + "logits/chosen": 474826556.95238096, + "logits/rejected": 312812706.90909094, + "logps/chosen": -392.2476748511905, + "logps/rejected": -328.4491077769886, + "loss": 0.5, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.0007383138756864012, + "grad_norm": 15.8125, + "kl": 0.0, + "learning_rate": 2.5000000000000004e-07, + "logits/chosen": 415271019.7894737, + "logits/rejected": 368185659.0769231, + "logps/chosen": -425.96885279605266, + "logps/rejected": -383.43453275240387, + "loss": 0.5, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2 + }, + { + "epoch": 0.0011074708135296017, + "grad_norm": 19.875, + "kl": 0.23936080932617188, + "learning_rate": 5.000000000000001e-07, + "logits/chosen": 359570870.85714287, + "logits/rejected": 477672163.5555556, + "logps/chosen": -387.8623744419643, + "logps/rejected": -470.8349609375, + "loss": 0.5042, + "rewards/chosen": 0.01692763715982437, + "rewards/margins": -0.009892906579706404, + "rewards/rejected": 0.026820543739530776, + "step": 3 + }, + { + "epoch": 0.0014766277513728024, + "grad_norm": 15.9375, + "kl": 0.05631542205810547, + "learning_rate": 7.5e-07, + "logits/chosen": 412145902.93333334, + "logits/rejected": 412330074.35294116, + "logps/chosen": -393.81090494791664, + "logps/rejected": -316.34581801470586, + "loss": 0.4985, + "rewards/chosen": 0.012642414371172587, + "rewards/margins": 0.017811821780952752, + "rewards/rejected": -0.0051694074097801655, + "step": 4 + }, + { + "epoch": 0.001845784689216003, + "grad_norm": 17.75, + "kl": 0.0675201416015625, + "learning_rate": 1.0000000000000002e-06, + "logits/chosen": 332565423.15789473, + "logits/rejected": 378657752.61538464, + "logps/chosen": -375.96104029605266, + "logps/rejected": -456.05799278846155, + "loss": 0.5024, + "rewards/chosen": -0.019428014755249023, + "rewards/margins": -0.014616626959580641, + "rewards/rejected": -0.004811387795668382, + "step": 5 + }, + { + "epoch": 0.0022149416270592034, + "grad_norm": 16.875, + "kl": 0.16606926918029785, + "learning_rate": 1.25e-06, + "logits/chosen": 416385890.46153843, + "logits/rejected": 441124244.2105263, + "logps/chosen": -540.5455228365385, + "logps/rejected": -336.21217105263156, + "loss": 0.5, + "rewards/chosen": 0.021362187770696785, + "rewards/margins": 0.008230634365487193, + "rewards/rejected": 0.013131553405209592, + "step": 6 + }, + { + "epoch": 0.0025840985649024043, + "grad_norm": 17.75, + "kl": 0.015097618103027344, + "learning_rate": 1.5e-06, + "logits/chosen": 301306709.3333333, + "logits/rejected": 529852536.4705882, + "logps/chosen": -303.4392903645833, + "logps/rejected": -411.36282169117646, + "loss": 0.5012, + "rewards/chosen": -0.008790486057599385, + "rewards/margins": -0.013039745010581671, + "rewards/rejected": 0.004249258952982286, + "step": 7 + }, + { + "epoch": 0.002953255502745605, + "grad_norm": 15.375, + "kl": 0.044513702392578125, + "learning_rate": 1.75e-06, + "logits/chosen": 345534683.4285714, + "logits/rejected": 363957760.0, + "logps/chosen": -316.6180478050595, + "logps/rejected": -376.21555397727275, + "loss": 0.5066, + "rewards/chosen": -0.018353399776277087, + "rewards/margins": -0.0474008618495165, + "rewards/rejected": 0.029047462073239414, + "step": 8 + }, + { + "epoch": 0.0033224124405888053, + "grad_norm": 17.125, + "kl": 0.10809516906738281, + "learning_rate": 2.0000000000000003e-06, + "logits/chosen": 383110144.0, + "logits/rejected": 458202258.28571427, + "logps/chosen": -362.90180121527777, + "logps/rejected": -372.32080078125, + "loss": 0.4943, + "rewards/chosen": 0.005824238889747196, + "rewards/margins": 0.0605001330139145, + "rewards/rejected": -0.054675894124167304, + "step": 9 + }, + { + "epoch": 0.003691569378432006, + "grad_norm": 17.125, + "kl": 0.07309794425964355, + "learning_rate": 2.25e-06, + "logits/chosen": 267813068.8, + "logits/rejected": 394245073.45454544, + "logps/chosen": -462.605908203125, + "logps/rejected": -351.65531782670456, + "loss": 0.494, + "rewards/chosen": -0.041304168105125424, + "rewards/margins": 0.010074618458747867, + "rewards/rejected": -0.05137878656387329, + "step": 10 + }, + { + "epoch": 0.004060726316275206, + "grad_norm": 15.125, + "kl": 0.07145237922668457, + "learning_rate": 2.5e-06, + "logits/chosen": 335684684.8, + "logits/rejected": 319132020.3636364, + "logps/chosen": -354.8703369140625, + "logps/rejected": -286.3788396661932, + "loss": 0.4898, + "rewards/chosen": 0.07037982940673829, + "rewards/margins": 0.09293101917613637, + "rewards/rejected": -0.022551189769398083, + "step": 11 + }, + { + "epoch": 0.004429883254118407, + "grad_norm": 19.125, + "kl": 0.11628150939941406, + "learning_rate": 2.7500000000000004e-06, + "logits/chosen": 303244018.5263158, + "logits/rejected": 442440585.84615386, + "logps/chosen": -383.3953279194079, + "logps/rejected": -494.53140024038464, + "loss": 0.4863, + "rewards/chosen": 0.016711906382912083, + "rewards/margins": 0.13954222395352506, + "rewards/rejected": -0.12283031757061298, + "step": 12 + }, + { + "epoch": 0.004799040191961607, + "grad_norm": 16.5, + "kl": 0.04451560974121094, + "learning_rate": 3e-06, + "logits/chosen": 306039552.0, + "logits/rejected": 397014624.0, + "logps/chosen": -335.736328125, + "logps/rejected": -340.09735107421875, + "loss": 0.4797, + "rewards/chosen": 0.05380403995513916, + "rewards/margins": 0.17203910648822784, + "rewards/rejected": -0.11823506653308868, + "step": 13 + }, + { + "epoch": 0.005168197129804809, + "grad_norm": 16.75, + "kl": 0.051006317138671875, + "learning_rate": 3.2500000000000002e-06, + "logits/chosen": 406864414.11764705, + "logits/rejected": 392859579.73333335, + "logps/chosen": -388.68118106617646, + "logps/rejected": -393.38317057291664, + "loss": 0.4696, + "rewards/chosen": 0.061786343069637525, + "rewards/margins": 0.26320304216123097, + "rewards/rejected": -0.20141669909159343, + "step": 14 + }, + { + "epoch": 0.005537354067648009, + "grad_norm": 17.5, + "kl": 0.0, + "learning_rate": 3.5e-06, + "logits/chosen": 274786389.3333333, + "logits/rejected": 362175616.0, + "logps/chosen": -252.7392578125, + "logps/rejected": -340.553515625, + "loss": 0.4583, + "rewards/chosen": 0.08242848515510559, + "rewards/margins": 0.3015967905521393, + "rewards/rejected": -0.2191683053970337, + "step": 15 + }, + { + "epoch": 0.00590651100549121, + "grad_norm": 15.1875, + "kl": 0.10133552551269531, + "learning_rate": 3.7500000000000005e-06, + "logits/chosen": 352869339.4285714, + "logits/rejected": 417453283.5555556, + "logps/chosen": -350.76436941964283, + "logps/rejected": -277.96161566840277, + "loss": 0.466, + "rewards/chosen": 0.039474163736615865, + "rewards/margins": 0.2625933166534182, + "rewards/rejected": -0.2231191529168023, + "step": 16 + }, + { + "epoch": 0.00627566794333441, + "grad_norm": 17.875, + "kl": 0.0, + "learning_rate": 4.000000000000001e-06, + "logits/chosen": 455833284.9230769, + "logits/rejected": 421311595.7894737, + "logps/chosen": -433.39956430288464, + "logps/rejected": -390.2200349506579, + "loss": 0.4462, + "rewards/chosen": 0.029306501150131226, + "rewards/margins": 0.3800164414079566, + "rewards/rejected": -0.35070994025782537, + "step": 17 + }, + { + "epoch": 0.006644824881177611, + "grad_norm": 15.4375, + "kl": 0.0, + "learning_rate": 4.25e-06, + "logits/chosen": 349242908.4444444, + "logits/rejected": 507096502.85714287, + "logps/chosen": -384.63878038194446, + "logps/rejected": -431.13821847098217, + "loss": 0.4587, + "rewards/chosen": 0.0491667456097073, + "rewards/margins": 0.3833783675753881, + "rewards/rejected": -0.3342116219656808, + "step": 18 + }, + { + "epoch": 0.007013981819020811, + "grad_norm": 15.5, + "kl": 0.0, + "learning_rate": 4.5e-06, + "logits/chosen": 335161728.0, + "logits/rejected": 393190400.0, + "logps/chosen": -368.4449157714844, + "logps/rejected": -364.5025329589844, + "loss": 0.4615, + "rewards/chosen": 0.012668704614043236, + "rewards/margins": 0.3185649272054434, + "rewards/rejected": -0.30589622259140015, + "step": 19 + }, + { + "epoch": 0.007383138756864012, + "grad_norm": 16.0, + "kl": 0.0, + "learning_rate": 4.75e-06, + "logits/chosen": 285666797.71428573, + "logits/rejected": 308338062.2222222, + "logps/chosen": -354.10281808035717, + "logps/rejected": -392.2115071614583, + "loss": 0.436, + "rewards/chosen": -0.021644590156418935, + "rewards/margins": 0.4674336136806579, + "rewards/rejected": -0.48907820383707684, + "step": 20 + }, + { + "epoch": 0.007752295694707212, + "grad_norm": 16.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 319922272.0, + "logits/rejected": 365256256.0, + "logps/chosen": -410.46197509765625, + "logps/rejected": -384.87884521484375, + "loss": 0.4225, + "rewards/chosen": 0.06947574764490128, + "rewards/margins": 0.6510091796517372, + "rewards/rejected": -0.5815334320068359, + "step": 21 + }, + { + "epoch": 0.008121452632550413, + "grad_norm": 14.8125, + "kl": 0.19563651084899902, + "learning_rate": 5.2500000000000006e-06, + "logits/chosen": 351332070.4, + "logits/rejected": 424257792.0, + "logps/chosen": -366.22119140625, + "logps/rejected": -374.7484944661458, + "loss": 0.4387, + "rewards/chosen": 0.10225703716278076, + "rewards/margins": 0.6457206249237061, + "rewards/rejected": -0.5434635877609253, + "step": 22 + }, + { + "epoch": 0.008490609570393614, + "grad_norm": 14.5625, + "kl": 0.20517826080322266, + "learning_rate": 5.500000000000001e-06, + "logits/chosen": 356672302.54545456, + "logits/rejected": 235229696.0, + "logps/chosen": -315.94777610085225, + "logps/rejected": -368.89090401785717, + "loss": 0.4, + "rewards/chosen": 0.08411303433504971, + "rewards/margins": 0.7500077243491169, + "rewards/rejected": -0.6658946900140672, + "step": 23 + }, + { + "epoch": 0.008859766508236814, + "grad_norm": 15.75, + "kl": 0.0, + "learning_rate": 5.75e-06, + "logits/chosen": 303122097.2307692, + "logits/rejected": 360183107.3684211, + "logps/chosen": -400.69677734375, + "logps/rejected": -503.8579358552632, + "loss": 0.3487, + "rewards/chosen": 0.03575533399215111, + "rewards/margins": 1.2289766771831976, + "rewards/rejected": -1.1932213431910466, + "step": 24 + }, + { + "epoch": 0.009228923446080015, + "grad_norm": 13.25, + "kl": 0.018645763397216797, + "learning_rate": 6e-06, + "logits/chosen": 303889180.4444444, + "logits/rejected": 327348992.0, + "logps/chosen": -401.233154296875, + "logps/rejected": -385.915771484375, + "loss": 0.377, + "rewards/chosen": 0.14710291226704916, + "rewards/margins": 1.2394635450272333, + "rewards/rejected": -1.0923606327601842, + "step": 25 + }, + { + "epoch": 0.009598080383923215, + "grad_norm": 12.6875, + "kl": 0.0, + "learning_rate": 6.25e-06, + "logits/chosen": 419199036.2352941, + "logits/rejected": 241346781.86666667, + "logps/chosen": -343.08777573529414, + "logps/rejected": -365.7247721354167, + "loss": 0.3938, + "rewards/chosen": 0.022248520570642808, + "rewards/margins": 1.0710360826230516, + "rewards/rejected": -1.0487875620524088, + "step": 26 + }, + { + "epoch": 0.009967237321766416, + "grad_norm": 12.375, + "kl": 0.06976032257080078, + "learning_rate": 6.5000000000000004e-06, + "logits/chosen": 256960372.36363637, + "logits/rejected": 298067507.2, + "logps/chosen": -303.26611328125, + "logps/rejected": -342.1369140625, + "loss": 0.3967, + "rewards/chosen": 0.1635995561426336, + "rewards/margins": 1.3905141527002507, + "rewards/rejected": -1.2269145965576171, + "step": 27 + }, + { + "epoch": 0.010336394259609617, + "grad_norm": 16.375, + "kl": 0.0, + "learning_rate": 6.750000000000001e-06, + "logits/chosen": 262899510.85714287, + "logits/rejected": 244764188.44444445, + "logps/chosen": -309.42288643973217, + "logps/rejected": -339.638671875, + "loss": 0.3632, + "rewards/chosen": 0.015676796436309814, + "rewards/margins": 1.3344966901673212, + "rewards/rejected": -1.3188198937310114, + "step": 28 + }, + { + "epoch": 0.010705551197452817, + "grad_norm": 13.3125, + "kl": 0.0, + "learning_rate": 7e-06, + "logits/chosen": 329828181.3333333, + "logits/rejected": 328157334.5882353, + "logps/chosen": -416.4065755208333, + "logps/rejected": -373.37353515625, + "loss": 0.3309, + "rewards/chosen": 0.07431103388468424, + "rewards/margins": 1.6938583729313867, + "rewards/rejected": -1.6195473390467026, + "step": 29 + }, + { + "epoch": 0.011074708135296018, + "grad_norm": 11.1875, + "kl": 0.0, + "learning_rate": 7.25e-06, + "logits/chosen": 356692992.0, + "logits/rejected": 377564790.15384614, + "logps/chosen": -274.3595548930921, + "logps/rejected": -392.7765549879808, + "loss": 0.3602, + "rewards/chosen": 0.038800330538498726, + "rewards/margins": 1.82862122169873, + "rewards/rejected": -1.7898208911602314, + "step": 30 + }, + { + "epoch": 0.011443865073139218, + "grad_norm": 10.5, + "kl": 0.0, + "learning_rate": 7.500000000000001e-06, + "logits/chosen": 337196032.0, + "logits/rejected": 399349162.6666667, + "logps/chosen": -353.6226283482143, + "logps/rejected": -390.86463758680554, + "loss": 0.2984, + "rewards/chosen": 0.018596757735524858, + "rewards/margins": 2.068742224857921, + "rewards/rejected": -2.050145467122396, + "step": 31 + }, + { + "epoch": 0.01181302201098242, + "grad_norm": 10.75, + "kl": 0.0, + "learning_rate": 7.75e-06, + "logits/chosen": 349669558.85714287, + "logits/rejected": 421645368.8888889, + "logps/chosen": -363.6336146763393, + "logps/rejected": -435.9499782986111, + "loss": 0.2635, + "rewards/chosen": 0.247344970703125, + "rewards/margins": 2.600150638156467, + "rewards/rejected": -2.352805667453342, + "step": 32 + }, + { + "epoch": 0.012182178948825619, + "grad_norm": 10.6875, + "kl": 0.0, + "learning_rate": 8.000000000000001e-06, + "logits/chosen": 310082699.6363636, + "logits/rejected": 305781735.61904764, + "logps/chosen": -372.0051935369318, + "logps/rejected": -339.1004929315476, + "loss": 0.3011, + "rewards/chosen": 0.009333941069516268, + "rewards/margins": 1.9164469709128014, + "rewards/rejected": -1.907113029843285, + "step": 33 + }, + { + "epoch": 0.01255133588666882, + "grad_norm": 9.375, + "kl": 0.0, + "learning_rate": 8.25e-06, + "logits/chosen": 333365248.0, + "logits/rejected": 343682706.28571427, + "logps/chosen": -319.61445756392044, + "logps/rejected": -433.8258463541667, + "loss": 0.2452, + "rewards/chosen": -0.05058004097505049, + "rewards/margins": 2.55988886887893, + "rewards/rejected": -2.6104689098539806, + "step": 34 + }, + { + "epoch": 0.01292049282451202, + "grad_norm": 12.0625, + "kl": 0.0, + "learning_rate": 8.5e-06, + "logits/chosen": 221356700.44444445, + "logits/rejected": 256468150.85714287, + "logps/chosen": -390.35009765625, + "logps/rejected": -281.3192661830357, + "loss": 0.3375, + "rewards/chosen": 0.2238397863176134, + "rewards/margins": 2.301216424457611, + "rewards/rejected": -2.0773766381399974, + "step": 35 + }, + { + "epoch": 0.013289649762355221, + "grad_norm": 11.25, + "kl": 0.0, + "learning_rate": 8.750000000000001e-06, + "logits/chosen": 256884736.0, + "logits/rejected": 455072494.93333334, + "logps/chosen": -346.01648667279414, + "logps/rejected": -457.2962239583333, + "loss": 0.2965, + "rewards/chosen": 0.19446908726411707, + "rewards/margins": 2.708487336775836, + "rewards/rejected": -2.514018249511719, + "step": 36 + }, + { + "epoch": 0.013658806700198423, + "grad_norm": 11.875, + "kl": 0.0, + "learning_rate": 9e-06, + "logits/chosen": 314880056.8888889, + "logits/rejected": 391169426.28571427, + "logps/chosen": -402.8432345920139, + "logps/rejected": -330.20242745535717, + "loss": 0.4004, + "rewards/chosen": -0.21582206090291342, + "rewards/margins": 1.9510044597444083, + "rewards/rejected": -2.1668265206473216, + "step": 37 + }, + { + "epoch": 0.014027963638041622, + "grad_norm": 10.3125, + "kl": 0.0, + "learning_rate": 9.250000000000001e-06, + "logits/chosen": 314442816.0, + "logits/rejected": 345654400.0, + "logps/chosen": -356.58160400390625, + "logps/rejected": -438.89447021484375, + "loss": 0.2868, + "rewards/chosen": 0.11346978694200516, + "rewards/margins": 3.5261012986302376, + "rewards/rejected": -3.4126315116882324, + "step": 38 + }, + { + "epoch": 0.014397120575884824, + "grad_norm": 9.5, + "kl": 0.0, + "learning_rate": 9.5e-06, + "logits/chosen": 342463561.14285713, + "logits/rejected": 275317134.2222222, + "logps/chosen": -391.74557059151783, + "logps/rejected": -383.7624782986111, + "loss": 0.2419, + "rewards/chosen": 0.3539630344935826, + "rewards/margins": 3.5844606066507003, + "rewards/rejected": -3.230497572157118, + "step": 39 + }, + { + "epoch": 0.014766277513728023, + "grad_norm": 10.3125, + "kl": 0.0, + "learning_rate": 9.75e-06, + "logits/chosen": 279884378.35294116, + "logits/rejected": 261839342.93333334, + "logps/chosen": -341.8717256433824, + "logps/rejected": -395.26295572916666, + "loss": 0.2773, + "rewards/chosen": 0.2336867837344899, + "rewards/margins": 3.5508485981062345, + "rewards/rejected": -3.3171618143717447, + "step": 40 + }, + { + "epoch": 0.015135434451571225, + "grad_norm": 9.25, + "kl": 0.0, + "learning_rate": 1e-05, + "logits/chosen": 265730523.42857143, + "logits/rejected": 371406620.4444444, + "logps/chosen": -368.2596958705357, + "logps/rejected": -420.9895290798611, + "loss": 0.2876, + "rewards/chosen": 0.005165440695626395, + "rewards/margins": 3.6974935758681524, + "rewards/rejected": -3.692328135172526, + "step": 41 + }, + { + "epoch": 0.015504591389414424, + "grad_norm": 10.1875, + "kl": 0.0, + "learning_rate": 9.999996536281763e-06, + "logits/chosen": 257816832.0, + "logits/rejected": 292840305.7777778, + "logps/chosen": -334.50352260044644, + "logps/rejected": -429.2851291232639, + "loss": 0.254, + "rewards/chosen": 0.09336684431348528, + "rewards/margins": 3.9117953077195184, + "rewards/rejected": -3.818428463406033, + "step": 42 + }, + { + "epoch": 0.015873748327257624, + "grad_norm": 12.3125, + "kl": 0.0, + "learning_rate": 9.999986145131847e-06, + "logits/chosen": 274582208.0, + "logits/rejected": 299395840.0, + "logps/chosen": -518.48046875, + "logps/rejected": -359.36669921875, + "loss": 0.2487, + "rewards/chosen": 0.37832848230997723, + "rewards/margins": 3.0091773668924966, + "rewards/rejected": -2.6308488845825195, + "step": 43 + }, + { + "epoch": 0.016242905265100825, + "grad_norm": 10.1875, + "kl": 0.0, + "learning_rate": 9.999968826564655e-06, + "logits/chosen": 273017417.14285713, + "logits/rejected": 293055744.0, + "logps/chosen": -401.48695591517856, + "logps/rejected": -420.4306640625, + "loss": 0.2732, + "rewards/chosen": 0.12526428699493408, + "rewards/margins": 3.4193922811084323, + "rewards/rejected": -3.294127994113498, + "step": 44 + }, + { + "epoch": 0.016612062202944027, + "grad_norm": 10.8125, + "kl": 0.0, + "learning_rate": 9.999944580604174e-06, + "logits/chosen": 286375731.2, + "logits/rejected": 252583333.6470588, + "logps/chosen": -376.8478190104167, + "logps/rejected": -447.5671817555147, + "loss": 0.2272, + "rewards/chosen": 0.5172962506612142, + "rewards/margins": 4.3331504915274826, + "rewards/rejected": -3.8158542408662686, + "step": 45 + }, + { + "epoch": 0.016981219140787228, + "grad_norm": 10.875, + "kl": 0.1224822998046875, + "learning_rate": 9.999913407284001e-06, + "logits/chosen": 266186865.7777778, + "logits/rejected": 425804690.28571427, + "logps/chosen": -340.48792860243054, + "logps/rejected": -356.6359165736607, + "loss": 0.2853, + "rewards/chosen": 0.4025382995605469, + "rewards/margins": 3.869560514177595, + "rewards/rejected": -3.467022214617048, + "step": 46 + }, + { + "epoch": 0.01735037607863043, + "grad_norm": 9.8125, + "kl": 0.0, + "learning_rate": 9.999875306647327e-06, + "logits/chosen": 375453312.0, + "logits/rejected": 354228992.0, + "logps/chosen": -384.3309631347656, + "logps/rejected": -439.713623046875, + "loss": 0.2571, + "rewards/chosen": 0.5080668926239014, + "rewards/margins": 3.799616813659668, + "rewards/rejected": -3.2915499210357666, + "step": 47 + }, + { + "epoch": 0.017719533016473627, + "grad_norm": 10.5, + "kl": 0.0, + "learning_rate": 9.999830278746938e-06, + "logits/chosen": 363176960.0, + "logits/rejected": 330604303.0588235, + "logps/chosen": -391.06341145833335, + "logps/rejected": -361.4995978860294, + "loss": 0.2721, + "rewards/chosen": 0.16286640167236327, + "rewards/margins": 3.0725700490614947, + "rewards/rejected": -2.9097036473891316, + "step": 48 + }, + { + "epoch": 0.01808868995431683, + "grad_norm": 10.3125, + "kl": 0.04401874542236328, + "learning_rate": 9.99977832364522e-06, + "logits/chosen": 312114062.2222222, + "logits/rejected": 190066358.85714287, + "logps/chosen": -347.960205078125, + "logps/rejected": -327.246826171875, + "loss": 0.2944, + "rewards/chosen": 0.499189641740587, + "rewards/margins": 3.2440008360242087, + "rewards/rejected": -2.7448111942836215, + "step": 49 + }, + { + "epoch": 0.01845784689216003, + "grad_norm": 11.5625, + "kl": 0.07481837272644043, + "learning_rate": 9.999719441414155e-06, + "logits/chosen": 283986858.6666667, + "logits/rejected": 223094820.57142857, + "logps/chosen": -369.5777180989583, + "logps/rejected": -227.64693777901786, + "loss": 0.3206, + "rewards/chosen": 0.44436009724934894, + "rewards/margins": 2.2482952844528925, + "rewards/rejected": -1.8039351872035436, + "step": 50 + }, + { + "epoch": 0.01882700383000323, + "grad_norm": 10.75, + "kl": 0.0, + "learning_rate": 9.999653632135325e-06, + "logits/chosen": 306593536.0, + "logits/rejected": 496720603.4285714, + "logps/chosen": -424.532958984375, + "logps/rejected": -527.3479352678571, + "loss": 0.2666, + "rewards/chosen": 0.4731411404079861, + "rewards/margins": 4.860521467905196, + "rewards/rejected": -4.38738032749721, + "step": 51 + }, + { + "epoch": 0.01919616076784643, + "grad_norm": 9.5625, + "kl": 0.0, + "learning_rate": 9.999580895899908e-06, + "logits/chosen": 378179345.06666666, + "logits/rejected": 302257483.2941176, + "logps/chosen": -342.4811197916667, + "logps/rejected": -352.52096737132354, + "loss": 0.2612, + "rewards/chosen": 0.24362347920735677, + "rewards/margins": 3.331483586629232, + "rewards/rejected": -3.087860107421875, + "step": 52 + }, + { + "epoch": 0.01956531770568963, + "grad_norm": 7.4375, + "kl": 0.9425196647644043, + "learning_rate": 9.999501232808678e-06, + "logits/chosen": 280927382.5882353, + "logits/rejected": 334792942.93333334, + "logps/chosen": -267.6831916360294, + "logps/rejected": -434.90234375, + "loss": 0.2206, + "rewards/chosen": 0.8271479326135972, + "rewards/margins": 4.814646915360993, + "rewards/rejected": -3.987498982747396, + "step": 53 + }, + { + "epoch": 0.019934474643532832, + "grad_norm": 10.0, + "kl": 0.0, + "learning_rate": 9.99941464297201e-06, + "logits/chosen": 349592177.7777778, + "logits/rejected": 313964032.0, + "logps/chosen": -349.2178005642361, + "logps/rejected": -373.2586146763393, + "loss": 0.2741, + "rewards/chosen": 0.4040897952185737, + "rewards/margins": 3.1830297235458613, + "rewards/rejected": -2.7789399283272878, + "step": 54 + }, + { + "epoch": 0.020303631581376033, + "grad_norm": 10.375, + "kl": 0.0, + "learning_rate": 9.99932112650987e-06, + "logits/chosen": 412536259.7647059, + "logits/rejected": 339180270.93333334, + "logps/chosen": -367.23790785845586, + "logps/rejected": -413.68297526041664, + "loss": 0.2338, + "rewards/chosen": 0.5423377541934743, + "rewards/margins": 4.057457179649203, + "rewards/rejected": -3.515119425455729, + "step": 55 + }, + { + "epoch": 0.020672788519219235, + "grad_norm": 8.9375, + "kl": 0.0, + "learning_rate": 9.999220683551823e-06, + "logits/chosen": 306387904.0, + "logits/rejected": 326218419.2, + "logps/chosen": -361.5515543619792, + "logps/rejected": -384.8688232421875, + "loss": 0.2032, + "rewards/chosen": 0.794142484664917, + "rewards/margins": 4.118092679977417, + "rewards/rejected": -3.3239501953125, + "step": 56 + }, + { + "epoch": 0.021041945457062432, + "grad_norm": 9.125, + "kl": 0.0, + "learning_rate": 9.999113314237036e-06, + "logits/chosen": 405485440.0, + "logits/rejected": 309575360.0, + "logps/chosen": -385.48577880859375, + "logps/rejected": -404.5628967285156, + "loss": 0.2203, + "rewards/chosen": 0.6929687857627869, + "rewards/margins": 4.291114270687103, + "rewards/rejected": -3.5981454849243164, + "step": 57 + }, + { + "epoch": 0.021411102394905634, + "grad_norm": 11.4375, + "kl": 0.2973060607910156, + "learning_rate": 9.998999018714264e-06, + "logits/chosen": 353016320.0, + "logits/rejected": 316979309.71428573, + "logps/chosen": -402.54546440972223, + "logps/rejected": -506.54983956473217, + "loss": 0.2914, + "rewards/chosen": 0.39720482296413845, + "rewards/margins": 3.467344571673681, + "rewards/rejected": -3.0701397487095425, + "step": 58 + }, + { + "epoch": 0.021780259332748835, + "grad_norm": 9.5625, + "kl": 0.0, + "learning_rate": 9.998877797141864e-06, + "logits/chosen": 350721184.0, + "logits/rejected": 351332352.0, + "logps/chosen": -469.38946533203125, + "logps/rejected": -452.00396728515625, + "loss": 0.1915, + "rewards/chosen": 0.8586090207099915, + "rewards/margins": 4.556110441684723, + "rewards/rejected": -3.6975014209747314, + "step": 59 + }, + { + "epoch": 0.022149416270592037, + "grad_norm": 7.625, + "kl": 0.0, + "learning_rate": 9.998749649687784e-06, + "logits/chosen": 235645479.3846154, + "logits/rejected": 339345434.94736844, + "logps/chosen": -363.8684645432692, + "logps/rejected": -395.74195620888156, + "loss": 0.172, + "rewards/chosen": 1.104478542621319, + "rewards/margins": 4.663402765868646, + "rewards/rejected": -3.5589242232473275, + "step": 60 + }, + { + "epoch": 0.022518573208435234, + "grad_norm": 8.6875, + "kl": 0.2657625675201416, + "learning_rate": 9.998614576529575e-06, + "logits/chosen": 401137427.6923077, + "logits/rejected": 263157544.42105263, + "logps/chosen": -405.45981069711536, + "logps/rejected": -349.4937037417763, + "loss": 0.1913, + "rewards/chosen": 0.9418674615713266, + "rewards/margins": 4.046600376546142, + "rewards/rejected": -3.104732914974815, + "step": 61 + }, + { + "epoch": 0.022887730146278436, + "grad_norm": 10.3125, + "kl": 0.0, + "learning_rate": 9.998472577854377e-06, + "logits/chosen": 416104891.73333335, + "logits/rejected": 286862817.88235295, + "logps/chosen": -452.41868489583334, + "logps/rejected": -324.92029526654414, + "loss": 0.2396, + "rewards/chosen": 0.6576098759969076, + "rewards/margins": 3.231464883392932, + "rewards/rejected": -2.5738550073960247, + "step": 62 + }, + { + "epoch": 0.023256887084121637, + "grad_norm": 9.75, + "kl": 0.0, + "learning_rate": 9.998323653858927e-06, + "logits/chosen": 385074488.8888889, + "logits/rejected": 352292096.0, + "logps/chosen": -328.39276801215277, + "logps/rejected": -461.98793247767856, + "loss": 0.2525, + "rewards/chosen": 0.7723926968044705, + "rewards/margins": 3.8862469688294428, + "rewards/rejected": -3.1138542720249722, + "step": 63 + }, + { + "epoch": 0.02362604402196484, + "grad_norm": 9.9375, + "kl": 0.8653106689453125, + "learning_rate": 9.998167804749557e-06, + "logits/chosen": 261918691.55555555, + "logits/rejected": 405813760.0, + "logps/chosen": -401.7871365017361, + "logps/rejected": -508.4546595982143, + "loss": 0.233, + "rewards/chosen": 0.786613040500217, + "rewards/margins": 4.414777331882053, + "rewards/rejected": -3.628164291381836, + "step": 64 + }, + { + "epoch": 0.02399520095980804, + "grad_norm": 8.0625, + "kl": 0.0, + "learning_rate": 9.998005030742195e-06, + "logits/chosen": 271594188.8, + "logits/rejected": 239106544.94117647, + "logps/chosen": -363.3728841145833, + "logps/rejected": -416.7379365808824, + "loss": 0.1832, + "rewards/chosen": 0.9753187815348308, + "rewards/margins": 4.4913420808081534, + "rewards/rejected": -3.5160232992733227, + "step": 65 + }, + { + "epoch": 0.024364357897651238, + "grad_norm": 9.75, + "kl": 0.0, + "learning_rate": 9.997835332062362e-06, + "logits/chosen": 314997555.2, + "logits/rejected": 302486377.4117647, + "logps/chosen": -341.03626302083336, + "logps/rejected": -345.77073759191177, + "loss": 0.2354, + "rewards/chosen": 0.592924690246582, + "rewards/margins": 3.457871167800006, + "rewards/rejected": -2.8649464775534237, + "step": 66 + }, + { + "epoch": 0.02473351483549444, + "grad_norm": 7.625, + "kl": 0.0, + "learning_rate": 9.997658708945173e-06, + "logits/chosen": 389068288.0, + "logits/rejected": 290456243.2, + "logps/chosen": -362.78125, + "logps/rejected": -403.643359375, + "loss": 0.1612, + "rewards/chosen": 0.81753937403361, + "rewards/margins": 4.370340506235759, + "rewards/rejected": -3.5528011322021484, + "step": 67 + }, + { + "epoch": 0.02510267177333764, + "grad_norm": 9.375, + "kl": 0.0, + "learning_rate": 9.997475161635339e-06, + "logits/chosen": 307853131.2941176, + "logits/rejected": 265687569.06666666, + "logps/chosen": -409.72472426470586, + "logps/rejected": -381.36243489583336, + "loss": 0.2226, + "rewards/chosen": 1.0089447919060202, + "rewards/margins": 3.9125069562126606, + "rewards/rejected": -2.9035621643066407, + "step": 68 + }, + { + "epoch": 0.025471828711180842, + "grad_norm": 8.8125, + "kl": 0.0, + "learning_rate": 9.99728469038716e-06, + "logits/chosen": 339982414.7692308, + "logits/rejected": 375402280.42105263, + "logps/chosen": -354.3288762019231, + "logps/rejected": -434.92295435855266, + "loss": 0.198, + "rewards/chosen": 0.5803516461299016, + "rewards/margins": 4.516590386749762, + "rewards/rejected": -3.93623874061986, + "step": 69 + }, + { + "epoch": 0.02584098564902404, + "grad_norm": 8.0, + "kl": 0.0, + "learning_rate": 9.99708729546453e-06, + "logits/chosen": 321726227.6923077, + "logits/rejected": 299321882.94736844, + "logps/chosen": -366.22228064903845, + "logps/rejected": -353.8169716282895, + "loss": 0.1713, + "rewards/chosen": 0.8388754037710336, + "rewards/margins": 4.398169853426666, + "rewards/rejected": -3.559294449655633, + "step": 70 + }, + { + "epoch": 0.02621014258686724, + "grad_norm": 8.5, + "kl": 0.0, + "learning_rate": 9.996882977140942e-06, + "logits/chosen": 281173708.8, + "logits/rejected": 396852856.4705882, + "logps/chosen": -335.42942708333334, + "logps/rejected": -525.3773552389706, + "loss": 0.194, + "rewards/chosen": 0.5295801480611165, + "rewards/margins": 4.8037150644788555, + "rewards/rejected": -4.274134916417739, + "step": 71 + }, + { + "epoch": 0.026579299524710442, + "grad_norm": 9.625, + "kl": 0.5552797317504883, + "learning_rate": 9.996671735699473e-06, + "logits/chosen": 246517609.4117647, + "logits/rejected": 359049659.73333335, + "logps/chosen": -352.669921875, + "logps/rejected": -440.55677083333336, + "loss": 0.1917, + "rewards/chosen": 1.2500796598546646, + "rewards/margins": 4.943696399763519, + "rewards/rejected": -3.693616739908854, + "step": 72 + }, + { + "epoch": 0.026948456462553644, + "grad_norm": 9.125, + "kl": 0.23992586135864258, + "learning_rate": 9.996453571432797e-06, + "logits/chosen": 428351390.47619045, + "logits/rejected": 303390208.0, + "logps/chosen": -291.8870907738095, + "logps/rejected": -358.95607688210225, + "loss": 0.2745, + "rewards/chosen": 0.6823966162545341, + "rewards/margins": 3.970230783735003, + "rewards/rejected": -3.2878341674804688, + "step": 73 + }, + { + "epoch": 0.027317613400396845, + "grad_norm": 11.8125, + "kl": 0.0, + "learning_rate": 9.996228484643176e-06, + "logits/chosen": 373825938.28571427, + "logits/rejected": 319098652.4444444, + "logps/chosen": -486.30161830357144, + "logps/rejected": -433.66650390625, + "loss": 0.1994, + "rewards/chosen": 0.5486773082188198, + "rewards/margins": 4.294139513893733, + "rewards/rejected": -3.7454622056749134, + "step": 74 + }, + { + "epoch": 0.027686770338240043, + "grad_norm": 8.9375, + "kl": 0.008473873138427734, + "learning_rate": 9.995996475642466e-06, + "logits/chosen": 426807637.3333333, + "logits/rejected": 251093865.4117647, + "logps/chosen": -450.0958984375, + "logps/rejected": -463.7391142003676, + "loss": 0.1942, + "rewards/chosen": 0.9612097422281901, + "rewards/margins": 4.780588314579982, + "rewards/rejected": -3.819378572351792, + "step": 75 + }, + { + "epoch": 0.028055927276083244, + "grad_norm": 9.5625, + "kl": 0.42736339569091797, + "learning_rate": 9.995757544752114e-06, + "logits/chosen": 392942523.73333335, + "logits/rejected": 273063092.7058824, + "logps/chosen": -401.34641927083334, + "logps/rejected": -315.3045438878676, + "loss": 0.2218, + "rewards/chosen": 0.8567731857299805, + "rewards/margins": 3.8002265817978804, + "rewards/rejected": -2.9434533960678997, + "step": 76 + }, + { + "epoch": 0.028425084213926446, + "grad_norm": 10.875, + "kl": 0.0, + "learning_rate": 9.995511692303153e-06, + "logits/chosen": 371514240.0, + "logits/rejected": 297214848.0, + "logps/chosen": -402.6493225097656, + "logps/rejected": -417.052734375, + "loss": 0.225, + "rewards/chosen": 0.755180299282074, + "rewards/margins": 3.873466670513153, + "rewards/rejected": -3.118286371231079, + "step": 77 + }, + { + "epoch": 0.028794241151769647, + "grad_norm": 9.9375, + "kl": 0.6281604766845703, + "learning_rate": 9.995258918636209e-06, + "logits/chosen": 296659174.4, + "logits/rejected": 290886357.3333333, + "logps/chosen": -349.0162841796875, + "logps/rejected": -385.4923909505208, + "loss": 0.2234, + "rewards/chosen": 0.9800780296325684, + "rewards/margins": 5.1309639294942215, + "rewards/rejected": -4.150885899861653, + "step": 78 + }, + { + "epoch": 0.029163398089612845, + "grad_norm": 9.75, + "kl": 0.6763195991516113, + "learning_rate": 9.994999224101498e-06, + "logits/chosen": 294466446.2222222, + "logits/rejected": 278251392.0, + "logps/chosen": -358.1522623697917, + "logps/rejected": -416.99560546875, + "loss": 0.1996, + "rewards/chosen": 1.490166770087348, + "rewards/margins": 4.728133307562934, + "rewards/rejected": -3.237966537475586, + "step": 79 + }, + { + "epoch": 0.029532555027456046, + "grad_norm": 8.5, + "kl": 0.0, + "learning_rate": 9.994732609058824e-06, + "logits/chosen": 351943920.9411765, + "logits/rejected": 456485922.1333333, + "logps/chosen": -363.4836856617647, + "logps/rejected": -363.96067708333334, + "loss": 0.1892, + "rewards/chosen": 1.1173793568330652, + "rewards/margins": 4.358363955628638, + "rewards/rejected": -3.2409845987955728, + "step": 80 + }, + { + "epoch": 0.029901711965299248, + "grad_norm": 8.5, + "kl": 0.0, + "learning_rate": 9.994459073877577e-06, + "logits/chosen": 254556558.2222222, + "logits/rejected": 390862336.0, + "logps/chosen": -296.3024631076389, + "logps/rejected": -373.031982421875, + "loss": 0.2032, + "rewards/chosen": 1.0861679712931316, + "rewards/margins": 4.89876769837879, + "rewards/rejected": -3.8125997270856584, + "step": 81 + }, + { + "epoch": 0.03027086890314245, + "grad_norm": 7.8125, + "kl": 0.0, + "learning_rate": 9.994178618936736e-06, + "logits/chosen": 315314082.90909094, + "logits/rejected": 335286320.7619048, + "logps/chosen": -290.0667613636364, + "logps/rejected": -337.75899832589283, + "loss": 0.1773, + "rewards/chosen": 0.8498925295743075, + "rewards/margins": 3.8311772614846498, + "rewards/rejected": -2.981284731910342, + "step": 82 + }, + { + "epoch": 0.03064002584098565, + "grad_norm": 11.375, + "kl": 0.0, + "learning_rate": 9.99389124462487e-06, + "logits/chosen": 273487239.5294118, + "logits/rejected": 334498816.0, + "logps/chosen": -345.8655790441176, + "logps/rejected": -373.06236979166664, + "loss": 0.2703, + "rewards/chosen": 0.5492019653320312, + "rewards/margins": 3.7657808939615887, + "rewards/rejected": -3.2165789286295574, + "step": 83 + }, + { + "epoch": 0.03100918277882885, + "grad_norm": 9.625, + "kl": 0.0, + "learning_rate": 9.99359695134013e-06, + "logits/chosen": 348711148.3076923, + "logits/rejected": 379988183.57894737, + "logps/chosen": -430.2646484375, + "logps/rejected": -511.5122841282895, + "loss": 0.1893, + "rewards/chosen": 0.4756373625535231, + "rewards/margins": 4.780072289439831, + "rewards/rejected": -4.304434926886308, + "step": 84 + }, + { + "epoch": 0.03137833971667205, + "grad_norm": 7.40625, + "kl": 0.0, + "learning_rate": 9.993295739490259e-06, + "logits/chosen": 228402832.0, + "logits/rejected": 356094720.0, + "logps/chosen": -301.5395202636719, + "logps/rejected": -507.272705078125, + "loss": 0.1584, + "rewards/chosen": 1.5112659931182861, + "rewards/margins": 5.482000350952148, + "rewards/rejected": -3.9707343578338623, + "step": 85 + }, + { + "epoch": 0.03174749665451525, + "grad_norm": 7.09375, + "kl": 1.1118309497833252, + "learning_rate": 9.992987609492578e-06, + "logits/chosen": 357287424.0, + "logits/rejected": 428834432.0, + "logps/chosen": -344.8846842447917, + "logps/rejected": -308.4556396484375, + "loss": 0.16, + "rewards/chosen": 1.0538597106933594, + "rewards/margins": 4.254974746704102, + "rewards/rejected": -3.201115036010742, + "step": 86 + }, + { + "epoch": 0.03211665359235845, + "grad_norm": 12.0625, + "kl": 0.0, + "learning_rate": 9.992672561774001e-06, + "logits/chosen": 241644008.72727272, + "logits/rejected": 428525926.4, + "logps/chosen": -393.9236949573864, + "logps/rejected": -389.99990234375, + "loss": 0.2493, + "rewards/chosen": 0.8203854994340376, + "rewards/margins": 4.843164201216265, + "rewards/rejected": -4.022778701782227, + "step": 87 + }, + { + "epoch": 0.03248581053020165, + "grad_norm": 8.1875, + "kl": 0.23165082931518555, + "learning_rate": 9.99235059677102e-06, + "logits/chosen": 312903232.0, + "logits/rejected": 514941664.0, + "logps/chosen": -402.7296142578125, + "logps/rejected": -621.26416015625, + "loss": 0.193, + "rewards/chosen": 0.7602296471595764, + "rewards/margins": 5.797093451023102, + "rewards/rejected": -5.036863803863525, + "step": 88 + }, + { + "epoch": 0.03285496746804485, + "grad_norm": 9.8125, + "kl": 0.0, + "learning_rate": 9.992021714929714e-06, + "logits/chosen": 329516407.46666664, + "logits/rejected": 318402891.2941176, + "logps/chosen": -416.9033203125, + "logps/rejected": -428.8330078125, + "loss": 0.1791, + "rewards/chosen": 0.9379507064819336, + "rewards/margins": 4.424440395130831, + "rewards/rejected": -3.486489688648897, + "step": 89 + }, + { + "epoch": 0.03322412440588805, + "grad_norm": 10.375, + "kl": 0.0, + "learning_rate": 9.991685916705748e-06, + "logits/chosen": 319387596.8, + "logits/rejected": 349975786.6666667, + "logps/chosen": -444.41298828125, + "logps/rejected": -419.4962972005208, + "loss": 0.1795, + "rewards/chosen": 1.8672292709350586, + "rewards/margins": 4.959722963968913, + "rewards/rejected": -3.092493693033854, + "step": 90 + }, + { + "epoch": 0.033593281343731254, + "grad_norm": 7.3125, + "kl": 0.0, + "learning_rate": 9.991343202564358e-06, + "logits/chosen": 218810525.53846154, + "logits/rejected": 302818250.1052632, + "logps/chosen": -303.1082106370192, + "logps/rejected": -352.9018297697368, + "loss": 0.1554, + "rewards/chosen": 1.2257785063523512, + "rewards/margins": 5.0314497966998015, + "rewards/rejected": -3.8056712903474508, + "step": 91 + }, + { + "epoch": 0.033962438281574456, + "grad_norm": 8.375, + "kl": 0.0, + "learning_rate": 9.99099357298038e-06, + "logits/chosen": 300118784.0, + "logits/rejected": 308193064.42105263, + "logps/chosen": -424.34314903846155, + "logps/rejected": -513.3343955592105, + "loss": 0.1774, + "rewards/chosen": 0.747009644141564, + "rewards/margins": 5.431760652827831, + "rewards/rejected": -4.684751008686266, + "step": 92 + }, + { + "epoch": 0.03433159521941766, + "grad_norm": 9.0, + "kl": 0.0, + "learning_rate": 9.990637028438213e-06, + "logits/chosen": 263378059.63636363, + "logits/rejected": 252253184.0, + "logps/chosen": -334.7020818536932, + "logps/rejected": -368.99823288690476, + "loss": 0.158, + "rewards/chosen": 0.9120744358409535, + "rewards/margins": 4.263903238040544, + "rewards/rejected": -3.3518288021995906, + "step": 93 + }, + { + "epoch": 0.03470075215726086, + "grad_norm": 9.5625, + "kl": 0.05284881591796875, + "learning_rate": 9.99027356943185e-06, + "logits/chosen": 314341708.8, + "logits/rejected": 290959082.6666667, + "logps/chosen": -420.551611328125, + "logps/rejected": -360.5419108072917, + "loss": 0.203, + "rewards/chosen": 1.119022274017334, + "rewards/margins": 4.673091793060303, + "rewards/rejected": -3.5540695190429688, + "step": 94 + }, + { + "epoch": 0.03506990909510405, + "grad_norm": 8.625, + "kl": 0.9799938201904297, + "learning_rate": 9.989903196464858e-06, + "logits/chosen": 370128640.0, + "logits/rejected": 297286542.2222222, + "logps/chosen": -331.2320033482143, + "logps/rejected": -327.39002821180554, + "loss": 0.221, + "rewards/chosen": 0.7470052582877023, + "rewards/margins": 4.073850215427459, + "rewards/rejected": -3.326844957139757, + "step": 95 + }, + { + "epoch": 0.035439066032947254, + "grad_norm": 7.625, + "kl": 0.18118619918823242, + "learning_rate": 9.989525910050382e-06, + "logits/chosen": 324916404.7058824, + "logits/rejected": 184376610.13333333, + "logps/chosen": -253.76953125, + "logps/rejected": -359.6565755208333, + "loss": 0.2325, + "rewards/chosen": 0.8021718193502987, + "rewards/margins": 4.722054275811887, + "rewards/rejected": -3.9198824564615884, + "step": 96 + }, + { + "epoch": 0.035808222970790456, + "grad_norm": 11.3125, + "kl": 0.4659719467163086, + "learning_rate": 9.989141710711149e-06, + "logits/chosen": 302910841.2631579, + "logits/rejected": 298687428.9230769, + "logps/chosen": -449.24342105263156, + "logps/rejected": -422.1397235576923, + "loss": 0.2068, + "rewards/chosen": 1.097245969270405, + "rewards/margins": 4.519127718350183, + "rewards/rejected": -3.4218817490797777, + "step": 97 + }, + { + "epoch": 0.03617737990863366, + "grad_norm": 9.375, + "kl": 0.0, + "learning_rate": 9.988750598979464e-06, + "logits/chosen": 374013138.8235294, + "logits/rejected": 244761531.73333332, + "logps/chosen": -401.3678768382353, + "logps/rejected": -422.00257161458336, + "loss": 0.1906, + "rewards/chosen": 1.0932338938993567, + "rewards/margins": 4.912030433205997, + "rewards/rejected": -3.8187965393066405, + "step": 98 + }, + { + "epoch": 0.03654653684647686, + "grad_norm": 9.1875, + "kl": 0.0, + "learning_rate": 9.988352575397204e-06, + "logits/chosen": 416420803.7647059, + "logits/rejected": 180892569.6, + "logps/chosen": -401.83220358455884, + "logps/rejected": -303.0835286458333, + "loss": 0.2368, + "rewards/chosen": 0.6204564150641946, + "rewards/margins": 4.126768213159898, + "rewards/rejected": -3.506311798095703, + "step": 99 + }, + { + "epoch": 0.03691569378432006, + "grad_norm": 6.90625, + "kl": 0.0, + "learning_rate": 9.987947640515827e-06, + "logits/chosen": 252719675.07692307, + "logits/rejected": 221417364.21052632, + "logps/chosen": -392.6477614182692, + "logps/rejected": -318.1152086759868, + "loss": 0.1485, + "rewards/chosen": 1.6389163090632513, + "rewards/margins": 5.080467517559345, + "rewards/rejected": -3.4415512084960938, + "step": 100 + }, + { + "epoch": 0.03728485072216326, + "grad_norm": 5.78125, + "kl": 0.0, + "learning_rate": 9.987535794896366e-06, + "logits/chosen": 351328796.4444444, + "logits/rejected": 366703104.0, + "logps/chosen": -293.8620876736111, + "logps/rejected": -449.96199898097825, + "loss": 0.1388, + "rewards/chosen": 1.0316431257459853, + "rewards/margins": 4.973506475992249, + "rewards/rejected": -3.9418633502462637, + "step": 101 + }, + { + "epoch": 0.03765400766000646, + "grad_norm": 9.125, + "kl": 0.0, + "learning_rate": 9.987117039109427e-06, + "logits/chosen": 269788618.1052632, + "logits/rejected": 304533326.7692308, + "logps/chosen": -339.4221833881579, + "logps/rejected": -438.7384690504808, + "loss": 0.199, + "rewards/chosen": 1.1157162314967106, + "rewards/margins": 5.7478394450446375, + "rewards/rejected": -4.632123213547927, + "step": 102 + }, + { + "epoch": 0.038023164597849664, + "grad_norm": 11.1875, + "kl": 0.3963775634765625, + "learning_rate": 9.986691373735191e-06, + "logits/chosen": 273418418.0869565, + "logits/rejected": 361095708.4444444, + "logps/chosen": -384.0732846467391, + "logps/rejected": -363.6285807291667, + "loss": 0.2456, + "rewards/chosen": 1.4204776598059612, + "rewards/margins": 5.50661267635327, + "rewards/rejected": -4.086135016547309, + "step": 103 + }, + { + "epoch": 0.03839232153569286, + "grad_norm": 9.0625, + "kl": 0.0, + "learning_rate": 9.986258799363412e-06, + "logits/chosen": 331096576.0, + "logits/rejected": 235719338.66666666, + "logps/chosen": -397.95237821691177, + "logps/rejected": -310.84619140625, + "loss": 0.18, + "rewards/chosen": 1.1037871416877298, + "rewards/margins": 4.5804925656786155, + "rewards/rejected": -3.4767054239908854, + "step": 104 + }, + { + "epoch": 0.03876147847353606, + "grad_norm": 8.125, + "kl": 0.0, + "learning_rate": 9.985819316593416e-06, + "logits/chosen": 251620608.0, + "logits/rejected": 448176459.2941176, + "logps/chosen": -355.357421875, + "logps/rejected": -317.8449276194853, + "loss": 0.1669, + "rewards/chosen": 1.4944662729899088, + "rewards/margins": 4.6583932390400005, + "rewards/rejected": -3.163926966050092, + "step": 105 + }, + { + "epoch": 0.03913063541137926, + "grad_norm": 11.25, + "kl": 0.0, + "learning_rate": 9.9853729260341e-06, + "logits/chosen": 333496589.4736842, + "logits/rejected": 314872320.0, + "logps/chosen": -398.21361019736844, + "logps/rejected": -383.6670673076923, + "loss": 0.2159, + "rewards/chosen": 0.983243139166581, + "rewards/margins": 5.6393505281764975, + "rewards/rejected": -4.656107389009916, + "step": 106 + }, + { + "epoch": 0.03949979234922246, + "grad_norm": 9.8125, + "kl": 0.0, + "learning_rate": 9.984919628303934e-06, + "logits/chosen": 358484764.4444444, + "logits/rejected": 693173174.8571428, + "logps/chosen": -396.2157389322917, + "logps/rejected": -577.5203683035714, + "loss": 0.1924, + "rewards/chosen": 0.9471362431844076, + "rewards/margins": 5.573704628717332, + "rewards/rejected": -4.6265683855329245, + "step": 107 + }, + { + "epoch": 0.039868949287065664, + "grad_norm": 8.0, + "kl": 0.43378210067749023, + "learning_rate": 9.984459424030958e-06, + "logits/chosen": 256968118.85714287, + "logits/rejected": 331611927.27272725, + "logps/chosen": -335.4313151041667, + "logps/rejected": -432.4142400568182, + "loss": 0.1514, + "rewards/chosen": 1.5621163504464286, + "rewards/margins": 6.244262249438794, + "rewards/rejected": -4.682145898992365, + "step": 108 + }, + { + "epoch": 0.040238106224908865, + "grad_norm": 7.5625, + "kl": 0.035178184509277344, + "learning_rate": 9.983992313852776e-06, + "logits/chosen": 318791204.5714286, + "logits/rejected": 314867797.3333333, + "logps/chosen": -351.17710658482144, + "logps/rejected": -389.99848090277777, + "loss": 0.1465, + "rewards/chosen": 1.3561897277832031, + "rewards/margins": 5.2549633449978295, + "rewards/rejected": -3.898773617214627, + "step": 109 + }, + { + "epoch": 0.040607263162752066, + "grad_norm": 8.875, + "kl": 0.0, + "learning_rate": 9.983518298416564e-06, + "logits/chosen": 306447457.52380955, + "logits/rejected": 255381248.0, + "logps/chosen": -418.57421875, + "logps/rejected": -404.61661044034093, + "loss": 0.1708, + "rewards/chosen": 1.5019299643380302, + "rewards/margins": 5.392597966379934, + "rewards/rejected": -3.8906680020419033, + "step": 110 + }, + { + "epoch": 0.04097642010059527, + "grad_norm": 8.125, + "kl": 0.0, + "learning_rate": 9.983037378379064e-06, + "logits/chosen": 298363964.2352941, + "logits/rejected": 264843878.4, + "logps/chosen": -274.0853630514706, + "logps/rejected": -352.46689453125, + "loss": 0.1582, + "rewards/chosen": 1.5677040324491613, + "rewards/margins": 5.243775962380802, + "rewards/rejected": -3.6760719299316404, + "step": 111 + }, + { + "epoch": 0.04134557703843847, + "grad_norm": 8.375, + "kl": 0.0, + "learning_rate": 9.982549554406585e-06, + "logits/chosen": 267116784.94117647, + "logits/rejected": 247671534.93333334, + "logps/chosen": -330.68502987132354, + "logps/rejected": -436.13824869791665, + "loss": 0.2026, + "rewards/chosen": 0.8622505524579216, + "rewards/margins": 5.308252749723547, + "rewards/rejected": -4.446002197265625, + "step": 112 + }, + { + "epoch": 0.041714733976281664, + "grad_norm": 9.0625, + "kl": 0.0, + "learning_rate": 9.982054827175e-06, + "logits/chosen": 277465063.61904764, + "logits/rejected": 228848896.0, + "logps/chosen": -339.31817336309524, + "logps/rejected": -381.86479048295456, + "loss": 0.2182, + "rewards/chosen": 1.0680358523414248, + "rewards/margins": 4.982539123270935, + "rewards/rejected": -3.91450327092951, + "step": 113 + }, + { + "epoch": 0.042083890914124865, + "grad_norm": 9.1875, + "kl": 0.16224908828735352, + "learning_rate": 9.981553197369752e-06, + "logits/chosen": 375597824.0, + "logits/rejected": 400459520.0, + "logps/chosen": -463.62608506944446, + "logps/rejected": -367.50526646205356, + "loss": 0.1638, + "rewards/chosen": 1.8150278727213542, + "rewards/margins": 5.5834555852980845, + "rewards/rejected": -3.76842771257673, + "step": 114 + }, + { + "epoch": 0.042453047851968066, + "grad_norm": 8.75, + "kl": 0.0, + "learning_rate": 9.981044665685834e-06, + "logits/chosen": 314240955.73333335, + "logits/rejected": 269935164.2352941, + "logps/chosen": -334.66123046875, + "logps/rejected": -423.1309455422794, + "loss": 0.1833, + "rewards/chosen": 0.8523624420166016, + "rewards/margins": 4.961700641407686, + "rewards/rejected": -4.109338199391084, + "step": 115 + }, + { + "epoch": 0.04282220478981127, + "grad_norm": 7.15625, + "kl": 0.0, + "learning_rate": 9.980529232827819e-06, + "logits/chosen": 389786692.26666665, + "logits/rejected": 243833705.4117647, + "logps/chosen": -307.6694010416667, + "logps/rejected": -429.80055147058823, + "loss": 0.1688, + "rewards/chosen": 1.1600802103678385, + "rewards/margins": 5.170056765687232, + "rewards/rejected": -4.009976555319393, + "step": 116 + }, + { + "epoch": 0.04319136172765447, + "grad_norm": 6.46875, + "kl": 0.0, + "learning_rate": 9.980006899509827e-06, + "logits/chosen": 479721984.0, + "logits/rejected": 345092761.6, + "logps/chosen": -400.3775227864583, + "logps/rejected": -427.2904296875, + "loss": 0.1121, + "rewards/chosen": 1.84227720896403, + "rewards/margins": 5.669276873270671, + "rewards/rejected": -3.8269996643066406, + "step": 117 + }, + { + "epoch": 0.04356051866549767, + "grad_norm": 8.6875, + "kl": 0.2181229591369629, + "learning_rate": 9.979477666455547e-06, + "logits/chosen": 340531379.2, + "logits/rejected": 258693056.0, + "logps/chosen": -326.0709716796875, + "logps/rejected": -366.6095784505208, + "loss": 0.2404, + "rewards/chosen": 1.0486746788024903, + "rewards/margins": 3.9630210558573404, + "rewards/rejected": -2.91434637705485, + "step": 118 + }, + { + "epoch": 0.04392967560334087, + "grad_norm": 10.75, + "kl": 3.605989933013916, + "learning_rate": 9.978941534398224e-06, + "logits/chosen": 227956886.5882353, + "logits/rejected": 368813260.8, + "logps/chosen": -442.7810489430147, + "logps/rejected": -421.88662109375, + "loss": 0.2019, + "rewards/chosen": 1.62519152024213, + "rewards/margins": 5.9227562175077555, + "rewards/rejected": -4.297564697265625, + "step": 119 + }, + { + "epoch": 0.04429883254118407, + "grad_norm": 6.6875, + "kl": 0.0, + "learning_rate": 9.978398504080661e-06, + "logits/chosen": 291126332.2352941, + "logits/rejected": 273167189.3333333, + "logps/chosen": -308.32361557904414, + "logps/rejected": -401.38727213541665, + "loss": 0.1576, + "rewards/chosen": 1.8196807188146256, + "rewards/margins": 5.721738112206553, + "rewards/rejected": -3.902057393391927, + "step": 120 + }, + { + "epoch": 0.044667989479027274, + "grad_norm": 6.84375, + "kl": 0.0, + "learning_rate": 9.97784857625522e-06, + "logits/chosen": 319254835.2, + "logits/rejected": 302651182.54545456, + "logps/chosen": -433.366162109375, + "logps/rejected": -471.0048828125, + "loss": 0.1253, + "rewards/chosen": 1.3802021026611329, + "rewards/margins": 6.166778460415927, + "rewards/rejected": -4.786576357754794, + "step": 121 + }, + { + "epoch": 0.04503714641687047, + "grad_norm": 8.625, + "kl": 0.07045173645019531, + "learning_rate": 9.977291751683821e-06, + "logits/chosen": 259084743.1111111, + "logits/rejected": 324581595.4285714, + "logps/chosen": -312.06255425347223, + "logps/rejected": -377.1827915736607, + "loss": 0.198, + "rewards/chosen": 1.4615031348334417, + "rewards/margins": 5.102500612773593, + "rewards/rejected": -3.6409974779401506, + "step": 122 + }, + { + "epoch": 0.04540630335471367, + "grad_norm": 8.0625, + "kl": 0.0, + "learning_rate": 9.976728031137936e-06, + "logits/chosen": 315404219.73333335, + "logits/rejected": 346952673.88235295, + "logps/chosen": -339.0673502604167, + "logps/rejected": -459.1176183363971, + "loss": 0.1728, + "rewards/chosen": 1.1944433848063152, + "rewards/margins": 5.485798742256913, + "rewards/rejected": -4.291355357450597, + "step": 123 + }, + { + "epoch": 0.04577546029255687, + "grad_norm": 9.9375, + "kl": 0.0, + "learning_rate": 9.976157415398591e-06, + "logits/chosen": 320804608.0, + "logits/rejected": 224511872.0, + "logps/chosen": -426.78236607142856, + "logps/rejected": -294.39320203993054, + "loss": 0.2278, + "rewards/chosen": 0.6897065980093819, + "rewards/margins": 3.5275404264056496, + "rewards/rejected": -2.8378338283962674, + "step": 124 + }, + { + "epoch": 0.04614461723040007, + "grad_norm": 9.625, + "kl": 0.0, + "learning_rate": 9.97557990525637e-06, + "logits/chosen": 416835463.5294118, + "logits/rejected": 260438869.33333334, + "logps/chosen": -405.9875057444853, + "logps/rejected": -354.8162109375, + "loss": 0.2121, + "rewards/chosen": 0.859857390908634, + "rewards/margins": 4.134220336465274, + "rewards/rejected": -3.2743629455566405, + "step": 125 + }, + { + "epoch": 0.046513774168243274, + "grad_norm": 9.25, + "kl": 0.0, + "learning_rate": 9.974995501511404e-06, + "logits/chosen": 267375966.31578946, + "logits/rejected": 446170899.6923077, + "logps/chosen": -379.74735300164474, + "logps/rejected": -432.5303485576923, + "loss": 0.1723, + "rewards/chosen": 1.3327726062975431, + "rewards/margins": 5.372063107818727, + "rewards/rejected": -4.039290501521184, + "step": 126 + }, + { + "epoch": 0.046882931106086476, + "grad_norm": 8.4375, + "kl": 0.22330379486083984, + "learning_rate": 9.974404204973376e-06, + "logits/chosen": 273397589.3333333, + "logits/rejected": 314191579.4285714, + "logps/chosen": -316.99359809027777, + "logps/rejected": -468.52015904017856, + "loss": 0.1687, + "rewards/chosen": 1.3576190736558702, + "rewards/margins": 5.748626360817561, + "rewards/rejected": -4.391007287161691, + "step": 127 + }, + { + "epoch": 0.04725208804392968, + "grad_norm": 6.46875, + "kl": 0.4679269790649414, + "learning_rate": 9.973806016461522e-06, + "logits/chosen": 301194304.0, + "logits/rejected": 327783104.0, + "logps/chosen": -241.81646728515625, + "logps/rejected": -373.468994140625, + "loss": 0.1707, + "rewards/chosen": 1.677526593208313, + "rewards/margins": 5.5490957498550415, + "rewards/rejected": -3.8715691566467285, + "step": 128 + }, + { + "epoch": 0.04762124498177288, + "grad_norm": 8.125, + "kl": 0.0, + "learning_rate": 9.973200936804624e-06, + "logits/chosen": 289407085.71428573, + "logits/rejected": 318413340.4444444, + "logps/chosen": -376.61826869419644, + "logps/rejected": -465.5563693576389, + "loss": 0.1469, + "rewards/chosen": 1.431586810520717, + "rewards/margins": 5.621098427545457, + "rewards/rejected": -4.189511617024739, + "step": 129 + }, + { + "epoch": 0.04799040191961608, + "grad_norm": 8.125, + "kl": 0.0, + "learning_rate": 9.972588966841013e-06, + "logits/chosen": 183140590.93333334, + "logits/rejected": 256630753.88235295, + "logps/chosen": -310.63365885416664, + "logps/rejected": -410.8678193933824, + "loss": 0.1192, + "rewards/chosen": 1.8707721710205079, + "rewards/margins": 5.816050563139074, + "rewards/rejected": -3.945278392118566, + "step": 130 + }, + { + "epoch": 0.048359558857459274, + "grad_norm": 8.625, + "kl": 0.0, + "learning_rate": 9.971970107418562e-06, + "logits/chosen": 273494851.3684211, + "logits/rejected": 261218441.84615386, + "logps/chosen": -259.1568153782895, + "logps/rejected": -378.7028996394231, + "loss": 0.2338, + "rewards/chosen": 0.7704107886866519, + "rewards/margins": 5.235968473951826, + "rewards/rejected": -4.465557685265174, + "step": 131 + }, + { + "epoch": 0.048728715795302475, + "grad_norm": 7.84375, + "kl": 0.29563140869140625, + "learning_rate": 9.971344359394696e-06, + "logits/chosen": 478417920.0, + "logits/rejected": 484892637.8666667, + "logps/chosen": -385.94944852941177, + "logps/rejected": -474.2452799479167, + "loss": 0.1435, + "rewards/chosen": 1.8829482583438648, + "rewards/margins": 5.812026169720818, + "rewards/rejected": -3.929077911376953, + "step": 132 + }, + { + "epoch": 0.04909787273314568, + "grad_norm": 8.125, + "kl": 0.0, + "learning_rate": 9.970711723636382e-06, + "logits/chosen": 292155411.6923077, + "logits/rejected": 281149170.5263158, + "logps/chosen": -356.39028695913464, + "logps/rejected": -388.7280016447368, + "loss": 0.1819, + "rewards/chosen": 0.923422593336839, + "rewards/margins": 4.929248238382069, + "rewards/rejected": -4.00582564504523, + "step": 133 + }, + { + "epoch": 0.04946702967098888, + "grad_norm": 8.375, + "kl": 0.0, + "learning_rate": 9.970072201020127e-06, + "logits/chosen": 233941452.8, + "logits/rejected": 225131640.47058824, + "logps/chosen": -385.0292643229167, + "logps/rejected": -420.1394473805147, + "loss": 0.1648, + "rewards/chosen": 1.192016855875651, + "rewards/margins": 4.717214726466759, + "rewards/rejected": -3.5251978705911076, + "step": 134 + }, + { + "epoch": 0.04983618660883208, + "grad_norm": 8.5625, + "kl": 0.8571453094482422, + "learning_rate": 9.969425792431982e-06, + "logits/chosen": 395858716.4444444, + "logits/rejected": 265721673.14285713, + "logps/chosen": -349.02159288194446, + "logps/rejected": -446.73646763392856, + "loss": 0.1865, + "rewards/chosen": 1.2168162663777669, + "rewards/margins": 5.653805369422549, + "rewards/rejected": -4.436989103044782, + "step": 135 + }, + { + "epoch": 0.05020534354667528, + "grad_norm": 8.25, + "kl": 0.0, + "learning_rate": 9.968772498767537e-06, + "logits/chosen": 320206396.2352941, + "logits/rejected": 227199078.4, + "logps/chosen": -415.4093807444853, + "logps/rejected": -395.77942708333336, + "loss": 0.1474, + "rewards/chosen": 1.8795318603515625, + "rewards/margins": 6.51103515625, + "rewards/rejected": -4.631503295898438, + "step": 136 + }, + { + "epoch": 0.05057450048451848, + "grad_norm": 7.90625, + "kl": 0.0, + "learning_rate": 9.96811232093192e-06, + "logits/chosen": 225557430.85714287, + "logits/rejected": 448236344.8888889, + "logps/chosen": -354.842041015625, + "logps/rejected": -488.8160807291667, + "loss": 0.1311, + "rewards/chosen": 1.6143718447004045, + "rewards/margins": 5.864613442193894, + "rewards/rejected": -4.250241597493489, + "step": 137 + }, + { + "epoch": 0.050943657422361684, + "grad_norm": 7.1875, + "kl": 0.0, + "learning_rate": 9.967445259839805e-06, + "logits/chosen": 308308992.0, + "logits/rejected": 328203468.8, + "logps/chosen": -332.3967715992647, + "logps/rejected": -491.7895833333333, + "loss": 0.1351, + "rewards/chosen": 1.5746678745045382, + "rewards/margins": 7.144504300285788, + "rewards/rejected": -5.56983642578125, + "step": 138 + }, + { + "epoch": 0.051312814360204885, + "grad_norm": 9.5, + "kl": 0.0, + "learning_rate": 9.966771316415391e-06, + "logits/chosen": 249094656.0, + "logits/rejected": 275655111.1111111, + "logps/chosen": -379.40304129464283, + "logps/rejected": -380.59136284722223, + "loss": 0.1806, + "rewards/chosen": 1.4639245441981725, + "rewards/margins": 5.1028727728223044, + "rewards/rejected": -3.638948228624132, + "step": 139 + }, + { + "epoch": 0.05168197129804808, + "grad_norm": 7.90625, + "kl": 0.0, + "learning_rate": 9.966090491592422e-06, + "logits/chosen": 389128576.0, + "logits/rejected": 325952307.2, + "logps/chosen": -359.4988199869792, + "logps/rejected": -472.613134765625, + "loss": 0.1213, + "rewards/chosen": 1.4962137540181477, + "rewards/margins": 6.272291787465413, + "rewards/rejected": -4.7760780334472654, + "step": 140 + }, + { + "epoch": 0.05205112823589128, + "grad_norm": 7.0625, + "kl": 0.0, + "learning_rate": 9.96540278631417e-06, + "logits/chosen": 245114299.73333332, + "logits/rejected": 219405718.5882353, + "logps/chosen": -319.49443359375, + "logps/rejected": -362.98058363970586, + "loss": 0.1165, + "rewards/chosen": 1.560525894165039, + "rewards/margins": 5.694459915161133, + "rewards/rejected": -4.133934020996094, + "step": 141 + }, + { + "epoch": 0.05242028517373448, + "grad_norm": 9.625, + "kl": 0.0, + "learning_rate": 9.964708201533441e-06, + "logits/chosen": 539857618.8235294, + "logits/rejected": 510283161.6, + "logps/chosen": -388.03845932904414, + "logps/rejected": -483.61253255208334, + "loss": 0.2057, + "rewards/chosen": 0.6531078675213982, + "rewards/margins": 4.890123400968664, + "rewards/rejected": -4.2370155334472654, + "step": 142 + }, + { + "epoch": 0.052789442111577684, + "grad_norm": 7.15625, + "kl": 0.0, + "learning_rate": 9.964006738212574e-06, + "logits/chosen": 317007872.0, + "logits/rejected": 413728768.0, + "logps/chosen": -334.69078776041664, + "logps/rejected": -444.70217715992646, + "loss": 0.1463, + "rewards/chosen": 1.6978284200032552, + "rewards/margins": 6.9596153857661225, + "rewards/rejected": -5.261786965762868, + "step": 143 + }, + { + "epoch": 0.053158599049420885, + "grad_norm": 9.875, + "kl": 0.20581722259521484, + "learning_rate": 9.963298397323443e-06, + "logits/chosen": 274823509.3333333, + "logits/rejected": 403079350.85714287, + "logps/chosen": -416.20855034722223, + "logps/rejected": -357.83077566964283, + "loss": 0.2002, + "rewards/chosen": 1.0152624977959528, + "rewards/margins": 4.671112499539815, + "rewards/rejected": -3.6558500017438615, + "step": 144 + }, + { + "epoch": 0.053527755987264086, + "grad_norm": 7.46875, + "kl": 0.0, + "learning_rate": 9.962583179847436e-06, + "logits/chosen": 255563485.86666667, + "logits/rejected": 330639420.2352941, + "logps/chosen": -328.04560546875, + "logps/rejected": -424.52737247242646, + "loss": 0.1521, + "rewards/chosen": 1.4234608968098958, + "rewards/margins": 5.742534293380438, + "rewards/rejected": -4.319073396570542, + "step": 145 + }, + { + "epoch": 0.05389691292510729, + "grad_norm": 7.75, + "kl": 0.0, + "learning_rate": 9.961861086775483e-06, + "logits/chosen": 276119444.2105263, + "logits/rejected": 356425846.15384614, + "logps/chosen": -429.7837685032895, + "logps/rejected": -465.5176532451923, + "loss": 0.1408, + "rewards/chosen": 1.8471464859811884, + "rewards/margins": 6.405739355666435, + "rewards/rejected": -4.5585928696852465, + "step": 146 + }, + { + "epoch": 0.05426606986295049, + "grad_norm": 7.21875, + "kl": 0.11877727508544922, + "learning_rate": 9.961132119108036e-06, + "logits/chosen": 327336379.73333335, + "logits/rejected": 235832064.0, + "logps/chosen": -311.32705078125, + "logps/rejected": -405.7168830422794, + "loss": 0.1491, + "rewards/chosen": 1.8262680053710938, + "rewards/margins": 5.752885347254136, + "rewards/rejected": -3.926617341883042, + "step": 147 + }, + { + "epoch": 0.05463522680079369, + "grad_norm": 7.21875, + "kl": 0.0, + "learning_rate": 9.960396277855067e-06, + "logits/chosen": 258973730.13333333, + "logits/rejected": 217821921.88235295, + "logps/chosen": -280.6552408854167, + "logps/rejected": -414.375, + "loss": 0.16, + "rewards/chosen": 1.2650960286458333, + "rewards/margins": 5.964657203823912, + "rewards/rejected": -4.699561175178079, + "step": 148 + }, + { + "epoch": 0.055004383738636885, + "grad_norm": 7.3125, + "kl": 0.0, + "learning_rate": 9.959653564036077e-06, + "logits/chosen": 273285831.1111111, + "logits/rejected": 308575451.4285714, + "logps/chosen": -343.52012803819446, + "logps/rejected": -401.10829380580356, + "loss": 0.1554, + "rewards/chosen": 1.86662843492296, + "rewards/margins": 6.09674432542589, + "rewards/rejected": -4.23011589050293, + "step": 149 + }, + { + "epoch": 0.055373540676480086, + "grad_norm": 8.3125, + "kl": 0.0, + "learning_rate": 9.958903978680086e-06, + "logits/chosen": 398187200.0, + "logits/rejected": 198803008.0, + "logps/chosen": -406.8353576660156, + "logps/rejected": -411.4206848144531, + "loss": 0.1842, + "rewards/chosen": 1.114051342010498, + "rewards/margins": 4.566049575805664, + "rewards/rejected": -3.451998233795166, + "step": 150 + }, + { + "epoch": 0.05574269761432329, + "grad_norm": 8.6875, + "kl": 1.6537694931030273, + "learning_rate": 9.958147522825634e-06, + "logits/chosen": 258585248.0, + "logits/rejected": 457821312.0, + "logps/chosen": -425.6638488769531, + "logps/rejected": -446.73626708984375, + "loss": 0.1715, + "rewards/chosen": 1.716949462890625, + "rewards/margins": 5.530115842819214, + "rewards/rejected": -3.813166379928589, + "step": 151 + }, + { + "epoch": 0.05611185455216649, + "grad_norm": 5.90625, + "kl": 0.0, + "learning_rate": 9.957384197520782e-06, + "logits/chosen": 297413180.2352941, + "logits/rejected": 321907302.4, + "logps/chosen": -300.11448759191177, + "logps/rejected": -495.47776692708334, + "loss": 0.123, + "rewards/chosen": 1.946786319508272, + "rewards/margins": 6.369486819996553, + "rewards/rejected": -4.422700500488281, + "step": 152 + }, + { + "epoch": 0.05648101149000969, + "grad_norm": 8.5, + "kl": 0.0, + "learning_rate": 9.956614003823107e-06, + "logits/chosen": 320942907.0769231, + "logits/rejected": 257940075.78947368, + "logps/chosen": -422.94640174278845, + "logps/rejected": -316.71849300986844, + "loss": 0.1534, + "rewards/chosen": 1.2408673213078425, + "rewards/margins": 4.716812643444973, + "rewards/rejected": -3.47594532213713, + "step": 153 + }, + { + "epoch": 0.05685016842785289, + "grad_norm": 8.125, + "kl": 0.07266712188720703, + "learning_rate": 9.955836942799704e-06, + "logits/chosen": 372142272.0, + "logits/rejected": 332437248.0, + "logps/chosen": -420.97491455078125, + "logps/rejected": -308.2636413574219, + "loss": 0.1688, + "rewards/chosen": 1.5455937385559082, + "rewards/margins": 5.042505979537964, + "rewards/rejected": -3.4969122409820557, + "step": 154 + }, + { + "epoch": 0.05721932536569609, + "grad_norm": 7.71875, + "kl": 0.0, + "learning_rate": 9.955053015527178e-06, + "logits/chosen": 313786304.0, + "logits/rejected": 355635136.0, + "logps/chosen": -368.1385498046875, + "logps/rejected": -374.7402648925781, + "loss": 0.1871, + "rewards/chosen": 0.9262558221817017, + "rewards/margins": 4.556788325309753, + "rewards/rejected": -3.6305325031280518, + "step": 155 + }, + { + "epoch": 0.057588482303539294, + "grad_norm": 7.5625, + "kl": 0.0, + "learning_rate": 9.954262223091654e-06, + "logits/chosen": 312358816.0, + "logits/rejected": 470130496.0, + "logps/chosen": -315.6360168457031, + "logps/rejected": -513.9529418945312, + "loss": 0.1874, + "rewards/chosen": 1.285167932510376, + "rewards/margins": 5.558533430099487, + "rewards/rejected": -4.273365497589111, + "step": 156 + }, + { + "epoch": 0.057957639241382496, + "grad_norm": 5.28125, + "kl": 0.0, + "learning_rate": 9.953464566588762e-06, + "logits/chosen": 245951152.0, + "logits/rejected": 339736533.3333333, + "logps/chosen": -365.2420349121094, + "logps/rejected": -381.6001790364583, + "loss": 0.0765, + "rewards/chosen": 2.2076256275177, + "rewards/margins": 6.678771257400513, + "rewards/rejected": -4.4711456298828125, + "step": 157 + }, + { + "epoch": 0.05832679617922569, + "grad_norm": 9.0625, + "kl": 0.9355654716491699, + "learning_rate": 9.952660047123647e-06, + "logits/chosen": 302508893.09090906, + "logits/rejected": 288198988.8, + "logps/chosen": -296.07619406960225, + "logps/rejected": -456.481982421875, + "loss": 0.2734, + "rewards/chosen": 0.7660180005160245, + "rewards/margins": 4.840207758816805, + "rewards/rejected": -4.074189758300781, + "step": 158 + }, + { + "epoch": 0.05869595311706889, + "grad_norm": 8.1875, + "kl": 0.2217702865600586, + "learning_rate": 9.95184866581096e-06, + "logits/chosen": 335529164.8, + "logits/rejected": 307083023.0588235, + "logps/chosen": -445.33118489583336, + "logps/rejected": -396.9530675551471, + "loss": 0.1479, + "rewards/chosen": 1.498296356201172, + "rewards/margins": 6.101204771154068, + "rewards/rejected": -4.602908414952895, + "step": 159 + }, + { + "epoch": 0.05906511005491209, + "grad_norm": 7.75, + "kl": 0.2670764923095703, + "learning_rate": 9.951030423774858e-06, + "logits/chosen": 356343958.5882353, + "logits/rejected": 412706099.2, + "logps/chosen": -409.6879021139706, + "logps/rejected": -486.78984375, + "loss": 0.139, + "rewards/chosen": 1.6835448321174173, + "rewards/margins": 6.0347685870002294, + "rewards/rejected": -4.351223754882812, + "step": 160 + }, + { + "epoch": 0.059434266992755294, + "grad_norm": 7.125, + "kl": 0.9117159843444824, + "learning_rate": 9.950205322149007e-06, + "logits/chosen": 309120995.5555556, + "logits/rejected": 240697782.85714287, + "logps/chosen": -263.6728515625, + "logps/rejected": -353.1136997767857, + "loss": 0.1815, + "rewards/chosen": 1.5831931432088215, + "rewards/margins": 5.32743349529448, + "rewards/rejected": -3.7442403520856584, + "step": 161 + }, + { + "epoch": 0.059803423930598495, + "grad_norm": 7.34375, + "kl": 0.05079841613769531, + "learning_rate": 9.949373362076572e-06, + "logits/chosen": 309587613.53846157, + "logits/rejected": 344851051.7894737, + "logps/chosen": -363.37984525240387, + "logps/rejected": -429.78104440789474, + "loss": 0.1249, + "rewards/chosen": 1.5192301823542669, + "rewards/margins": 5.675996170352828, + "rewards/rejected": -4.156765987998561, + "step": 162 + }, + { + "epoch": 0.0601725808684417, + "grad_norm": 10.0, + "kl": 0.0, + "learning_rate": 9.948534544710228e-06, + "logits/chosen": 304221312.0, + "logits/rejected": 419248341.3333333, + "logps/chosen": -399.3435302734375, + "logps/rejected": -418.095703125, + "loss": 0.1988, + "rewards/chosen": 1.6100788116455078, + "rewards/margins": 5.58925724029541, + "rewards/rejected": -3.9791784286499023, + "step": 163 + }, + { + "epoch": 0.0605417378062849, + "grad_norm": 6.71875, + "kl": 0.0, + "learning_rate": 9.947688871212142e-06, + "logits/chosen": 266658944.0, + "logits/rejected": 295912220.4444444, + "logps/chosen": -313.83279854910717, + "logps/rejected": -419.814453125, + "loss": 0.1273, + "rewards/chosen": 1.3187923431396484, + "rewards/margins": 5.922294828626844, + "rewards/rejected": -4.603502485487196, + "step": 164 + }, + { + "epoch": 0.0609108947441281, + "grad_norm": 8.75, + "kl": 0.0, + "learning_rate": 9.946836342753982e-06, + "logits/chosen": 339851264.0, + "logits/rejected": 336855771.4285714, + "logps/chosen": -360.3514811197917, + "logps/rejected": -448.7241908482143, + "loss": 0.1742, + "rewards/chosen": 1.3098813162909613, + "rewards/margins": 5.352542710682703, + "rewards/rejected": -4.042661394391741, + "step": 165 + }, + { + "epoch": 0.0612800516819713, + "grad_norm": 9.1875, + "kl": 0.9411334991455078, + "learning_rate": 9.945976960516921e-06, + "logits/chosen": 259195418.9473684, + "logits/rejected": 264450599.3846154, + "logps/chosen": -405.10572574013156, + "logps/rejected": -423.2333233173077, + "loss": 0.1849, + "rewards/chosen": 1.4465279830129523, + "rewards/margins": 5.890519717444293, + "rewards/rejected": -4.44399173443134, + "step": 166 + }, + { + "epoch": 0.061649208619814495, + "grad_norm": 7.3125, + "kl": 0.0, + "learning_rate": 9.945110725691618e-06, + "logits/chosen": 239445623.46666667, + "logits/rejected": 361009904.9411765, + "logps/chosen": -355.12379557291666, + "logps/rejected": -359.43347886029414, + "loss": 0.1564, + "rewards/chosen": 2.184929911295573, + "rewards/margins": 6.074471537272135, + "rewards/rejected": -3.8895416259765625, + "step": 167 + }, + { + "epoch": 0.0620183655576577, + "grad_norm": 6.875, + "kl": 0.11162447929382324, + "learning_rate": 9.944237639478232e-06, + "logits/chosen": 238864420.57142857, + "logits/rejected": 323727047.1111111, + "logps/chosen": -371.21358816964283, + "logps/rejected": -433.3231608072917, + "loss": 0.1305, + "rewards/chosen": 2.2776993342808316, + "rewards/margins": 6.757272901989165, + "rewards/rejected": -4.479573567708333, + "step": 168 + }, + { + "epoch": 0.0623875224955009, + "grad_norm": 7.5, + "kl": 0.0, + "learning_rate": 9.943357703086411e-06, + "logits/chosen": 304505446.4, + "logits/rejected": 506872040.72727275, + "logps/chosen": -414.857421875, + "logps/rejected": -533.3612393465909, + "loss": 0.096, + "rewards/chosen": 1.5230682373046875, + "rewards/margins": 6.65949048128995, + "rewards/rejected": -5.136422243985263, + "step": 169 + }, + { + "epoch": 0.0627566794333441, + "grad_norm": 8.25, + "kl": 0.0, + "learning_rate": 9.942470917735299e-06, + "logits/chosen": 286740871.5294118, + "logits/rejected": 286876535.46666664, + "logps/chosen": -339.9931066176471, + "logps/rejected": -398.8551432291667, + "loss": 0.1802, + "rewards/chosen": 1.2512098200180952, + "rewards/margins": 5.489997916128122, + "rewards/rejected": -4.238788096110026, + "step": 170 + }, + { + "epoch": 0.06312583637118731, + "grad_norm": 7.78125, + "kl": 0.0, + "learning_rate": 9.941577284653523e-06, + "logits/chosen": 458455661.71428573, + "logits/rejected": 270750151.1111111, + "logps/chosen": -390.19252232142856, + "logps/rejected": -417.2972005208333, + "loss": 0.1709, + "rewards/chosen": 0.9631140572684151, + "rewards/margins": 5.503997863285125, + "rewards/rejected": -4.54088380601671, + "step": 171 + }, + { + "epoch": 0.0634949933090305, + "grad_norm": 7.0625, + "kl": 0.0, + "learning_rate": 9.940676805079201e-06, + "logits/chosen": 341574083.7647059, + "logits/rejected": 349436552.53333336, + "logps/chosen": -351.15113740808823, + "logps/rejected": -345.94833984375, + "loss": 0.1592, + "rewards/chosen": 1.664536195642808, + "rewards/margins": 5.578178824630438, + "rewards/rejected": -3.9136426289876303, + "step": 172 + }, + { + "epoch": 0.0638641502468737, + "grad_norm": 6.03125, + "kl": 0.0, + "learning_rate": 9.939769480259937e-06, + "logits/chosen": 439148832.0, + "logits/rejected": 242804053.33333334, + "logps/chosen": -339.0603942871094, + "logps/rejected": -380.2032877604167, + "loss": 0.1036, + "rewards/chosen": 1.653731346130371, + "rewards/margins": 5.582206408182779, + "rewards/rejected": -3.9284750620524087, + "step": 173 + }, + { + "epoch": 0.0642333071847169, + "grad_norm": 9.5, + "kl": 0.0, + "learning_rate": 9.938855311452818e-06, + "logits/chosen": 395835661.4736842, + "logits/rejected": 298734119.38461536, + "logps/chosen": -374.25663034539474, + "logps/rejected": -308.4858961838942, + "loss": 0.1916, + "rewards/chosen": 1.2129101000334088, + "rewards/margins": 5.5642592704247855, + "rewards/rejected": -4.3513491703913765, + "step": 174 + }, + { + "epoch": 0.0646024641225601, + "grad_norm": 6.90625, + "kl": 0.0, + "learning_rate": 9.93793429992441e-06, + "logits/chosen": 289189824.0, + "logits/rejected": 291518976.0, + "logps/chosen": -384.0808919270833, + "logps/rejected": -542.0236328125, + "loss": 0.1073, + "rewards/chosen": 1.1705416043599446, + "rewards/margins": 6.757867654164632, + "rewards/rejected": -5.5873260498046875, + "step": 175 + }, + { + "epoch": 0.0649716210604033, + "grad_norm": 9.1875, + "kl": 0.03723287582397461, + "learning_rate": 9.937006446950768e-06, + "logits/chosen": 334387687.61904764, + "logits/rejected": 223848680.72727272, + "logps/chosen": -346.52320498511904, + "logps/rejected": -388.96395596590907, + "loss": 0.2138, + "rewards/chosen": 1.286037808372861, + "rewards/margins": 5.145251897506384, + "rewards/rejected": -3.859214089133523, + "step": 176 + }, + { + "epoch": 0.0653407779982465, + "grad_norm": 8.0625, + "kl": 0.0, + "learning_rate": 9.936071753817416e-06, + "logits/chosen": 295773610.6666667, + "logits/rejected": 249304320.0, + "logps/chosen": -347.6120876736111, + "logps/rejected": -331.43125697544644, + "loss": 0.1766, + "rewards/chosen": 1.4730228847927518, + "rewards/margins": 5.011464134095207, + "rewards/rejected": -3.5384412493024553, + "step": 177 + }, + { + "epoch": 0.0657099349360897, + "grad_norm": 6.625, + "kl": 0.0, + "learning_rate": 9.935130221819361e-06, + "logits/chosen": 344345429.3333333, + "logits/rejected": 437376621.71428573, + "logps/chosen": -343.80864800347223, + "logps/rejected": -384.70968191964283, + "loss": 0.155, + "rewards/chosen": 1.3710865444607205, + "rewards/margins": 5.574907454233321, + "rewards/rejected": -4.203820909772601, + "step": 178 + }, + { + "epoch": 0.0660790918739329, + "grad_norm": 8.375, + "kl": 0.0, + "learning_rate": 9.934181852261084e-06, + "logits/chosen": 325282950.7368421, + "logits/rejected": 212661051.07692307, + "logps/chosen": -328.31224300986844, + "logps/rejected": -404.09018179086536, + "loss": 0.1817, + "rewards/chosen": 1.439181177239669, + "rewards/margins": 6.391211065686183, + "rewards/rejected": -4.952029888446514, + "step": 179 + }, + { + "epoch": 0.0664482488117761, + "grad_norm": 6.125, + "kl": 0.8568601608276367, + "learning_rate": 9.93322664645654e-06, + "logits/chosen": 313048302.93333334, + "logits/rejected": 218992308.70588234, + "logps/chosen": -367.23583984375, + "logps/rejected": -403.5110294117647, + "loss": 0.127, + "rewards/chosen": 2.138080088297526, + "rewards/margins": 6.844103809431488, + "rewards/rejected": -4.7060237211339615, + "step": 180 + }, + { + "epoch": 0.06681740574961931, + "grad_norm": 6.5, + "kl": 0.0, + "learning_rate": 9.932264605729152e-06, + "logits/chosen": 313594976.0, + "logits/rejected": 271518272.0, + "logps/chosen": -341.2051696777344, + "logps/rejected": -412.02813720703125, + "loss": 0.1882, + "rewards/chosen": 0.9823825359344482, + "rewards/margins": 5.304265737533569, + "rewards/rejected": -4.321883201599121, + "step": 181 + }, + { + "epoch": 0.06718656268746251, + "grad_norm": 7.46875, + "kl": 0.0, + "learning_rate": 9.931295731411819e-06, + "logits/chosen": 420180952.61538464, + "logits/rejected": 250369320.42105263, + "logps/chosen": -434.88900991586536, + "logps/rejected": -399.5580283717105, + "loss": 0.1537, + "rewards/chosen": 1.190131114079402, + "rewards/margins": 5.2083746025919435, + "rewards/rejected": -4.0182434885125415, + "step": 182 + }, + { + "epoch": 0.06755571962530571, + "grad_norm": 6.9375, + "kl": 0.0, + "learning_rate": 9.930320024846899e-06, + "logits/chosen": 263188112.0, + "logits/rejected": 226581856.0, + "logps/chosen": -321.74468994140625, + "logps/rejected": -332.2611999511719, + "loss": 0.164, + "rewards/chosen": 1.2463661432266235, + "rewards/margins": 4.765966534614563, + "rewards/rejected": -3.5196003913879395, + "step": 183 + }, + { + "epoch": 0.06792487656314891, + "grad_norm": 8.125, + "kl": 0.0, + "learning_rate": 9.929337487386225e-06, + "logits/chosen": 282012637.8666667, + "logits/rejected": 272976534.5882353, + "logps/chosen": -336.70172526041665, + "logps/rejected": -453.31298828125, + "loss": 0.1945, + "rewards/chosen": 1.0984366099039713, + "rewards/margins": 4.858925456626743, + "rewards/rejected": -3.760488846722771, + "step": 184 + }, + { + "epoch": 0.06829403350099211, + "grad_norm": 7.625, + "kl": 0.0, + "learning_rate": 9.928348120391087e-06, + "logits/chosen": 373041984.0, + "logits/rejected": 307085216.0, + "logps/chosen": -341.9854431152344, + "logps/rejected": -498.4840087890625, + "loss": 0.1358, + "rewards/chosen": 1.6039799451828003, + "rewards/margins": 7.162896990776062, + "rewards/rejected": -5.558917045593262, + "step": 185 + }, + { + "epoch": 0.06866319043883531, + "grad_norm": 10.5625, + "kl": 0.18115997314453125, + "learning_rate": 9.927351925232245e-06, + "logits/chosen": 255777536.0, + "logits/rejected": 217817408.0, + "logps/chosen": -481.417333984375, + "logps/rejected": -357.8868815104167, + "loss": 0.1718, + "rewards/chosen": 1.5395903587341309, + "rewards/margins": 5.691773255666097, + "rewards/rejected": -4.152182896931966, + "step": 186 + }, + { + "epoch": 0.06903234737667852, + "grad_norm": 8.375, + "kl": 0.3335151672363281, + "learning_rate": 9.92634890328991e-06, + "logits/chosen": 295333285.64705884, + "logits/rejected": 251600247.46666667, + "logps/chosen": -335.2776309742647, + "logps/rejected": -332.80494791666666, + "loss": 0.1727, + "rewards/chosen": 1.2335779526654411, + "rewards/margins": 5.463569356880936, + "rewards/rejected": -4.229991404215495, + "step": 187 + }, + { + "epoch": 0.06940150431452172, + "grad_norm": 7.65625, + "kl": 0.0, + "learning_rate": 9.92533905595376e-06, + "logits/chosen": 316081302.5882353, + "logits/rejected": 237886873.6, + "logps/chosen": -340.16771024816177, + "logps/rejected": -370.61904296875, + "loss": 0.1524, + "rewards/chosen": 1.276721617754768, + "rewards/margins": 5.8653558544084134, + "rewards/rejected": -4.588634236653646, + "step": 188 + }, + { + "epoch": 0.06977066125236492, + "grad_norm": 7.09375, + "kl": 0.0, + "learning_rate": 9.924322384622922e-06, + "logits/chosen": 317039241.84615386, + "logits/rejected": 413932436.2105263, + "logps/chosen": -422.02283653846155, + "logps/rejected": -392.88247841282896, + "loss": 0.1059, + "rewards/chosen": 1.8514213562011719, + "rewards/margins": 5.860632444682874, + "rewards/rejected": -4.0092110884817025, + "step": 189 + }, + { + "epoch": 0.0701398181902081, + "grad_norm": 8.9375, + "kl": 0.0, + "learning_rate": 9.923298890705983e-06, + "logits/chosen": 374345581.71428573, + "logits/rejected": 309292259.5555556, + "logps/chosen": -407.8926478794643, + "logps/rejected": -461.08338758680554, + "loss": 0.153, + "rewards/chosen": 0.8513574600219727, + "rewards/margins": 6.327481693691677, + "rewards/rejected": -5.4761242336697045, + "step": 190 + }, + { + "epoch": 0.07050897512805131, + "grad_norm": 6.71875, + "kl": 0.22394943237304688, + "learning_rate": 9.922268575620981e-06, + "logits/chosen": 303580224.0, + "logits/rejected": 223742566.4, + "logps/chosen": -365.0611165364583, + "logps/rejected": -381.8359375, + "loss": 0.1193, + "rewards/chosen": 1.9390835762023926, + "rewards/margins": 6.394034290313721, + "rewards/rejected": -4.4549507141113285, + "step": 191 + }, + { + "epoch": 0.07087813206589451, + "grad_norm": 8.25, + "kl": 0.29738759994506836, + "learning_rate": 9.921231440795404e-06, + "logits/chosen": 393311829.3333333, + "logits/rejected": 283681645.71428573, + "logps/chosen": -295.8737521701389, + "logps/rejected": -447.69705636160717, + "loss": 0.188, + "rewards/chosen": 1.2896958457099066, + "rewards/margins": 5.813491382296123, + "rewards/rejected": -4.523795536586216, + "step": 192 + }, + { + "epoch": 0.07124728900373771, + "grad_norm": 6.65625, + "kl": 0.0, + "learning_rate": 9.92018748766619e-06, + "logits/chosen": 255876078.93333334, + "logits/rejected": 235182802.82352942, + "logps/chosen": -350.01197916666666, + "logps/rejected": -390.8358513327206, + "loss": 0.1102, + "rewards/chosen": 1.841059112548828, + "rewards/margins": 6.385167335061466, + "rewards/rejected": -4.544108222512638, + "step": 193 + }, + { + "epoch": 0.07161644594158091, + "grad_norm": 7.0, + "kl": 0.0, + "learning_rate": 9.919136717679723e-06, + "logits/chosen": 373161386.6666667, + "logits/rejected": 291403556.5714286, + "logps/chosen": -378.7184787326389, + "logps/rejected": -336.82425362723217, + "loss": 0.1165, + "rewards/chosen": 1.920990202162001, + "rewards/margins": 5.902765213497101, + "rewards/rejected": -3.9817750113351003, + "step": 194 + }, + { + "epoch": 0.07198560287942411, + "grad_norm": 6.53125, + "kl": 0.0, + "learning_rate": 9.918079132291828e-06, + "logits/chosen": 233104580.92307693, + "logits/rejected": 255369162.10526314, + "logps/chosen": -337.6476487379808, + "logps/rejected": -330.5843955592105, + "loss": 0.1229, + "rewards/chosen": 1.6972368680513823, + "rewards/margins": 5.732576625067213, + "rewards/rejected": -4.035339757015831, + "step": 195 + }, + { + "epoch": 0.07235475981726731, + "grad_norm": 8.625, + "kl": 0.0, + "learning_rate": 9.917014732967782e-06, + "logits/chosen": 357956300.8, + "logits/rejected": 275511657.4117647, + "logps/chosen": -450.94182942708335, + "logps/rejected": -384.501953125, + "loss": 0.1482, + "rewards/chosen": 1.241439946492513, + "rewards/margins": 4.676839170268938, + "rewards/rejected": -3.435399223776425, + "step": 196 + }, + { + "epoch": 0.07272391675511052, + "grad_norm": 7.71875, + "kl": 0.016225814819335938, + "learning_rate": 9.915943521182292e-06, + "logits/chosen": 260036230.7368421, + "logits/rejected": 286717400.61538464, + "logps/chosen": -283.3358090049342, + "logps/rejected": -476.32279146634613, + "loss": 0.1603, + "rewards/chosen": 1.7275266145405017, + "rewards/margins": 6.593955870099396, + "rewards/rejected": -4.866429255558894, + "step": 197 + }, + { + "epoch": 0.07309307369295372, + "grad_norm": 6.75, + "kl": 0.2572416067123413, + "learning_rate": 9.91486549841951e-06, + "logits/chosen": 290801379.5555556, + "logits/rejected": 289948214.85714287, + "logps/chosen": -302.0366482204861, + "logps/rejected": -392.28062220982144, + "loss": 0.1638, + "rewards/chosen": 1.4963736004299588, + "rewards/margins": 6.546287824237158, + "rewards/rejected": -5.049914223807199, + "step": 198 + }, + { + "epoch": 0.07346223063079692, + "grad_norm": 10.6875, + "kl": 0.0, + "learning_rate": 9.913780666173022e-06, + "logits/chosen": 287367968.0, + "logits/rejected": 356499904.0, + "logps/chosen": -420.9107666015625, + "logps/rejected": -446.5137939453125, + "loss": 0.1901, + "rewards/chosen": 0.9786157608032227, + "rewards/margins": 6.086407661437988, + "rewards/rejected": -5.107791900634766, + "step": 199 + }, + { + "epoch": 0.07383138756864012, + "grad_norm": 8.125, + "kl": 0.0, + "learning_rate": 9.912689025945851e-06, + "logits/chosen": 305120557.1764706, + "logits/rejected": 383667200.0, + "logps/chosen": -304.02429917279414, + "logps/rejected": -419.69518229166664, + "loss": 0.1356, + "rewards/chosen": 1.7920981014476103, + "rewards/margins": 6.511443194221048, + "rewards/rejected": -4.7193450927734375, + "step": 200 + }, + { + "epoch": 0.07420054450648332, + "grad_norm": 6.84375, + "kl": 0.0, + "learning_rate": 9.91159057925045e-06, + "logits/chosen": 301771659.6363636, + "logits/rejected": 372669123.04761904, + "logps/chosen": -323.02297141335225, + "logps/rejected": -478.3452845982143, + "loss": 0.1211, + "rewards/chosen": 1.101763205094771, + "rewards/margins": 6.045637019268877, + "rewards/rejected": -4.943873814174107, + "step": 201 + }, + { + "epoch": 0.07456970144432652, + "grad_norm": 6.65625, + "kl": 0.0, + "learning_rate": 9.910485327608702e-06, + "logits/chosen": 319891817.4117647, + "logits/rejected": 244827665.06666666, + "logps/chosen": -356.96472886029414, + "logps/rejected": -420.98450520833336, + "loss": 0.1356, + "rewards/chosen": 1.6521095949060776, + "rewards/margins": 6.491807877783682, + "rewards/rejected": -4.839698282877604, + "step": 202 + }, + { + "epoch": 0.07493885838216972, + "grad_norm": 5.6875, + "kl": 0.0, + "learning_rate": 9.909373272551919e-06, + "logits/chosen": 348588311.27272725, + "logits/rejected": 203605455.23809522, + "logps/chosen": -283.8688299005682, + "logps/rejected": -288.35337611607144, + "loss": 0.1147, + "rewards/chosen": 1.702762950550426, + "rewards/margins": 5.332174193807495, + "rewards/rejected": -3.6294112432570684, + "step": 203 + }, + { + "epoch": 0.07530801532001292, + "grad_norm": 6.28125, + "kl": 0.0, + "learning_rate": 9.90825441562084e-06, + "logits/chosen": 455769736.53333336, + "logits/rejected": 337404205.1764706, + "logps/chosen": -369.7958984375, + "logps/rejected": -450.46642348345586, + "loss": 0.1173, + "rewards/chosen": 1.6605462392171224, + "rewards/margins": 6.613336443433575, + "rewards/rejected": -4.952790204216452, + "step": 204 + }, + { + "epoch": 0.07567717225785613, + "grad_norm": 6.3125, + "kl": 3.8450279235839844, + "learning_rate": 9.907128758365627e-06, + "logits/chosen": 256014125.17647058, + "logits/rejected": 374057437.8666667, + "logps/chosen": -292.39869600183823, + "logps/rejected": -425.98951822916666, + "loss": 0.1912, + "rewards/chosen": 1.824619405409869, + "rewards/margins": 6.8294706830791405, + "rewards/rejected": -5.004851277669271, + "step": 205 + }, + { + "epoch": 0.07604632919569933, + "grad_norm": 8.625, + "kl": 0.8574504852294922, + "learning_rate": 9.905996302345863e-06, + "logits/chosen": 396116224.0, + "logits/rejected": 305739093.3333333, + "logps/chosen": -447.20400390625, + "logps/rejected": -382.073974609375, + "loss": 0.154, + "rewards/chosen": 1.9291723251342774, + "rewards/margins": 5.742176818847656, + "rewards/rejected": -3.813004493713379, + "step": 206 + }, + { + "epoch": 0.07641548613354253, + "grad_norm": 8.125, + "kl": 0.0, + "learning_rate": 9.904857049130553e-06, + "logits/chosen": 345949026.46153843, + "logits/rejected": 296476375.57894737, + "logps/chosen": -395.16849459134613, + "logps/rejected": -408.4721165707237, + "loss": 0.1339, + "rewards/chosen": 1.4804719778207631, + "rewards/margins": 6.40992015576073, + "rewards/rejected": -4.9294481779399675, + "step": 207 + }, + { + "epoch": 0.07678464307138572, + "grad_norm": 6.8125, + "kl": 0.0, + "learning_rate": 9.903711000298118e-06, + "logits/chosen": 290564544.0, + "logits/rejected": 325439539.2, + "logps/chosen": -363.1607666015625, + "logps/rejected": -479.90654296875, + "loss": 0.1141, + "rewards/chosen": 1.367384433746338, + "rewards/margins": 6.667228031158447, + "rewards/rejected": -5.299843597412109, + "step": 208 + }, + { + "epoch": 0.07715380000922892, + "grad_norm": 7.15625, + "kl": 0.0, + "learning_rate": 9.902558157436392e-06, + "logits/chosen": 268974656.0, + "logits/rejected": 397717568.0, + "logps/chosen": -295.08172607421875, + "logps/rejected": -438.4521789550781, + "loss": 0.1307, + "rewards/chosen": 1.5989700555801392, + "rewards/margins": 6.272290349006653, + "rewards/rejected": -4.673320293426514, + "step": 209 + }, + { + "epoch": 0.07752295694707212, + "grad_norm": 7.90625, + "kl": 0.20082616806030273, + "learning_rate": 9.901398522142624e-06, + "logits/chosen": 274037729.88235295, + "logits/rejected": 244789811.2, + "logps/chosen": -350.22610294117646, + "logps/rejected": -323.8180338541667, + "loss": 0.1532, + "rewards/chosen": 1.4677040997673483, + "rewards/margins": 5.396482684565525, + "rewards/rejected": -3.928778584798177, + "step": 210 + }, + { + "epoch": 0.07789211388491532, + "grad_norm": 6.71875, + "kl": 0.0, + "learning_rate": 9.900232096023478e-06, + "logits/chosen": 229888737.88235295, + "logits/rejected": 307194163.2, + "logps/chosen": -278.5174345128676, + "logps/rejected": -509.79791666666665, + "loss": 0.1353, + "rewards/chosen": 1.6924011006074793, + "rewards/margins": 6.0720046772676355, + "rewards/rejected": -4.379603576660156, + "step": 211 + }, + { + "epoch": 0.07826127082275852, + "grad_norm": 5.875, + "kl": 0.0, + "learning_rate": 9.899058880695019e-06, + "logits/chosen": 240647246.76923078, + "logits/rejected": 320449374.31578946, + "logps/chosen": -359.54349459134613, + "logps/rejected": -448.48848684210526, + "loss": 0.0902, + "rewards/chosen": 1.7600888472336988, + "rewards/margins": 7.4207478341786, + "rewards/rejected": -5.6606589869449015, + "step": 212 + }, + { + "epoch": 0.07863042776060172, + "grad_norm": 6.96875, + "kl": 0.0, + "learning_rate": 9.89787887778272e-06, + "logits/chosen": 404444448.0, + "logits/rejected": 305878304.0, + "logps/chosen": -320.0046081542969, + "logps/rejected": -351.04302978515625, + "loss": 0.1716, + "rewards/chosen": 1.4353272914886475, + "rewards/margins": 5.3439977169036865, + "rewards/rejected": -3.908670425415039, + "step": 213 + }, + { + "epoch": 0.07899958469844492, + "grad_norm": 8.6875, + "kl": 0.0, + "learning_rate": 9.896692088921466e-06, + "logits/chosen": 271762602.6666667, + "logits/rejected": 370946740.7058824, + "logps/chosen": -385.94733072916665, + "logps/rejected": -396.0329159007353, + "loss": 0.0968, + "rewards/chosen": 2.61397221883138, + "rewards/margins": 6.650785393808402, + "rewards/rejected": -4.036813174977022, + "step": 214 + }, + { + "epoch": 0.07936874163628813, + "grad_norm": 7.1875, + "kl": 0.0, + "learning_rate": 9.895498515755532e-06, + "logits/chosen": 233154011.42857143, + "logits/rejected": 345250076.4444444, + "logps/chosen": -314.76443917410717, + "logps/rejected": -361.26860894097223, + "loss": 0.1414, + "rewards/chosen": 1.3316773005894251, + "rewards/margins": 5.707847564939469, + "rewards/rejected": -4.376170264350043, + "step": 215 + }, + { + "epoch": 0.07973789857413133, + "grad_norm": 8.1875, + "kl": 0.0, + "learning_rate": 9.894298159938605e-06, + "logits/chosen": 234113126.4, + "logits/rejected": 220001174.5882353, + "logps/chosen": -379.48186848958335, + "logps/rejected": -402.4491325827206, + "loss": 0.1536, + "rewards/chosen": 1.339400863647461, + "rewards/margins": 6.411089347390568, + "rewards/rejected": -5.071688483743107, + "step": 216 + }, + { + "epoch": 0.08010705551197453, + "grad_norm": 9.6875, + "kl": 0.2727031707763672, + "learning_rate": 9.893091023133756e-06, + "logits/chosen": 259223779.55555555, + "logits/rejected": 188234112.0, + "logps/chosen": -378.70103624131946, + "logps/rejected": -329.05604771205356, + "loss": 0.1874, + "rewards/chosen": 1.5177618662516277, + "rewards/margins": 5.939156486874535, + "rewards/rejected": -4.421394620622907, + "step": 217 + }, + { + "epoch": 0.08047621244981773, + "grad_norm": 9.0, + "kl": 0.0, + "learning_rate": 9.891877107013461e-06, + "logits/chosen": 215877888.0, + "logits/rejected": 296866099.2, + "logps/chosen": -339.45851643880206, + "logps/rejected": -391.4420166015625, + "loss": 0.1155, + "rewards/chosen": 1.801455020904541, + "rewards/margins": 6.12431230545044, + "rewards/rejected": -4.322857284545899, + "step": 218 + }, + { + "epoch": 0.08084536938766093, + "grad_norm": 7.65625, + "kl": 0.0, + "learning_rate": 9.890656413259585e-06, + "logits/chosen": 322389674.6666667, + "logits/rejected": 212040118.85714287, + "logps/chosen": -374.1025119357639, + "logps/rejected": -320.2081821986607, + "loss": 0.1259, + "rewards/chosen": 2.284708023071289, + "rewards/margins": 6.263603210449219, + "rewards/rejected": -3.9788951873779297, + "step": 219 + }, + { + "epoch": 0.08121452632550413, + "grad_norm": 8.5625, + "kl": 0.4835491180419922, + "learning_rate": 9.889428943563382e-06, + "logits/chosen": 338962670.93333334, + "logits/rejected": 195878189.17647058, + "logps/chosen": -421.10556640625, + "logps/rejected": -374.24425551470586, + "loss": 0.1828, + "rewards/chosen": 0.870482063293457, + "rewards/margins": 4.820454328200396, + "rewards/rejected": -3.9499722649069393, + "step": 220 + }, + { + "epoch": 0.08158368326334733, + "grad_norm": 5.09375, + "kl": 0.0, + "learning_rate": 9.888194699625499e-06, + "logits/chosen": 269815893.3333333, + "logits/rejected": 259220505.6, + "logps/chosen": -355.6497395833333, + "logps/rejected": -343.227978515625, + "loss": 0.0934, + "rewards/chosen": 1.9741126696268718, + "rewards/margins": 6.27033535639445, + "rewards/rejected": -4.296222686767578, + "step": 221 + }, + { + "epoch": 0.08195284020119054, + "grad_norm": 9.9375, + "kl": 0.0, + "learning_rate": 9.886953683155964e-06, + "logits/chosen": 370723247.15789473, + "logits/rejected": 356402333.53846157, + "logps/chosen": -370.5067845394737, + "logps/rejected": -472.1794621394231, + "loss": 0.1843, + "rewards/chosen": 1.2398699710243626, + "rewards/margins": 6.535412174487404, + "rewards/rejected": -5.295542203463041, + "step": 222 + }, + { + "epoch": 0.08232199713903374, + "grad_norm": 8.125, + "kl": 0.0, + "learning_rate": 9.885705895874188e-06, + "logits/chosen": 386441472.0, + "logits/rejected": 321921045.3333333, + "logps/chosen": -387.0752197265625, + "logps/rejected": -353.2488606770833, + "loss": 0.1617, + "rewards/chosen": 1.7003358840942382, + "rewards/margins": 5.532213592529297, + "rewards/rejected": -3.8318777084350586, + "step": 223 + }, + { + "epoch": 0.08269115407687694, + "grad_norm": 6.78125, + "kl": 0.16900396347045898, + "learning_rate": 9.884451339508967e-06, + "logits/chosen": 288825984.0, + "logits/rejected": 318399872.0, + "logps/chosen": -421.4795227050781, + "logps/rejected": -377.2930603027344, + "loss": 0.1213, + "rewards/chosen": 1.9154726266860962, + "rewards/margins": 6.309888958930969, + "rewards/rejected": -4.394416332244873, + "step": 224 + }, + { + "epoch": 0.08306031101472014, + "grad_norm": 7.5, + "kl": 1.446150779724121, + "learning_rate": 9.883190015798469e-06, + "logits/chosen": 312760285.8666667, + "logits/rejected": 390341451.2941176, + "logps/chosen": -400.2993489583333, + "logps/rejected": -593.2247242647059, + "loss": 0.1288, + "rewards/chosen": 1.4746532440185547, + "rewards/margins": 7.456032584695255, + "rewards/rejected": -5.981379340676701, + "step": 225 + }, + { + "epoch": 0.08342946795256333, + "grad_norm": 5.75, + "kl": 0.0, + "learning_rate": 9.881921926490245e-06, + "logits/chosen": 300618519.27272725, + "logits/rejected": 477332284.95238096, + "logps/chosen": -409.2745916193182, + "logps/rejected": -449.3092447916667, + "loss": 0.086, + "rewards/chosen": 2.028586647727273, + "rewards/margins": 7.114793356362876, + "rewards/rejected": -5.086206708635602, + "step": 226 + }, + { + "epoch": 0.08379862489040653, + "grad_norm": 8.0, + "kl": 0.0, + "learning_rate": 9.880647073341219e-06, + "logits/chosen": 302757000.53333336, + "logits/rejected": 269813790.11764705, + "logps/chosen": -386.09541015625, + "logps/rejected": -355.4373563878676, + "loss": 0.1682, + "rewards/chosen": 1.2750361124674479, + "rewards/margins": 5.051844907274433, + "rewards/rejected": -3.7768087948069855, + "step": 227 + }, + { + "epoch": 0.08416778182824973, + "grad_norm": 6.71875, + "kl": 0.0, + "learning_rate": 9.879365458117678e-06, + "logits/chosen": 391553768.72727275, + "logits/rejected": 269850867.8095238, + "logps/chosen": -327.4871715198864, + "logps/rejected": -302.3335890997024, + "loss": 0.1381, + "rewards/chosen": 1.5168881849809126, + "rewards/margins": 4.863814919541925, + "rewards/rejected": -3.346926734561012, + "step": 228 + }, + { + "epoch": 0.08453693876609293, + "grad_norm": 8.9375, + "kl": 0.9157342910766602, + "learning_rate": 9.878077082595287e-06, + "logits/chosen": 310198784.0, + "logits/rejected": 279201865.14285713, + "logps/chosen": -444.70073784722223, + "logps/rejected": -401.39097377232144, + "loss": 0.1819, + "rewards/chosen": 1.2462444305419922, + "rewards/margins": 6.087709971836635, + "rewards/rejected": -4.841465541294643, + "step": 229 + }, + { + "epoch": 0.08490609570393613, + "grad_norm": 7.1875, + "kl": 0.3046445846557617, + "learning_rate": 9.876781948559073e-06, + "logits/chosen": 239302009.2631579, + "logits/rejected": 273398350.7692308, + "logps/chosen": -399.6632658305921, + "logps/rejected": -374.7043644831731, + "loss": 0.1259, + "rewards/chosen": 2.1923709668611226, + "rewards/margins": 6.128685476326266, + "rewards/rejected": -3.936314509465144, + "step": 230 + }, + { + "epoch": 0.08527525264177933, + "grad_norm": 5.84375, + "kl": 0.3180809020996094, + "learning_rate": 9.87548005780343e-06, + "logits/chosen": 218852627.69230768, + "logits/rejected": 298185027.3684211, + "logps/chosen": -388.47216796875, + "logps/rejected": -524.3092619243421, + "loss": 0.0822, + "rewards/chosen": 2.473157442533053, + "rewards/margins": 7.762142443946498, + "rewards/rejected": -5.288985001413446, + "step": 231 + }, + { + "epoch": 0.08564440957962254, + "grad_norm": 7.28125, + "kl": 0.1190347671508789, + "learning_rate": 9.874171412132107e-06, + "logits/chosen": 390813354.6666667, + "logits/rejected": 277218102.85714287, + "logps/chosen": -314.0807834201389, + "logps/rejected": -442.63180106026783, + "loss": 0.1746, + "rewards/chosen": 1.7192440032958984, + "rewards/margins": 6.2123821803501675, + "rewards/rejected": -4.493138177054269, + "step": 232 + }, + { + "epoch": 0.08601356651746574, + "grad_norm": 6.4375, + "kl": 0.0, + "learning_rate": 9.872856013358219e-06, + "logits/chosen": 262075864.6153846, + "logits/rejected": 201301153.68421054, + "logps/chosen": -342.3477313701923, + "logps/rejected": -381.6757555509868, + "loss": 0.1134, + "rewards/chosen": 1.7533968411959135, + "rewards/margins": 6.750025451907262, + "rewards/rejected": -4.9966286107113485, + "step": 233 + }, + { + "epoch": 0.08638272345530894, + "grad_norm": 6.71875, + "kl": 0.0, + "learning_rate": 9.871533863304234e-06, + "logits/chosen": 222726772.36363637, + "logits/rejected": 332418169.9047619, + "logps/chosen": -312.68532492897725, + "logps/rejected": -431.15057663690476, + "loss": 0.1185, + "rewards/chosen": 1.5259949077259412, + "rewards/margins": 6.078866248523003, + "rewards/rejected": -4.552871340797061, + "step": 234 + }, + { + "epoch": 0.08675188039315214, + "grad_norm": 6.96875, + "kl": 0.0, + "learning_rate": 9.870204963801974e-06, + "logits/chosen": 269976768.0, + "logits/rejected": 287968096.0, + "logps/chosen": -334.24359130859375, + "logps/rejected": -361.884521484375, + "loss": 0.1534, + "rewards/chosen": 1.958896279335022, + "rewards/margins": 6.151900887489319, + "rewards/rejected": -4.193004608154297, + "step": 235 + }, + { + "epoch": 0.08712103733099534, + "grad_norm": 6.53125, + "kl": 0.8565692901611328, + "learning_rate": 9.86886931669261e-06, + "logits/chosen": 266113655.46666667, + "logits/rejected": 256797575.52941176, + "logps/chosen": -381.57545572916666, + "logps/rejected": -401.9200654871324, + "loss": 0.1013, + "rewards/chosen": 2.3346232096354167, + "rewards/margins": 7.002576700846355, + "rewards/rejected": -4.6679534912109375, + "step": 236 + }, + { + "epoch": 0.08749019426883854, + "grad_norm": 6.375, + "kl": 0.0, + "learning_rate": 9.867526923826668e-06, + "logits/chosen": 232444893.86666667, + "logits/rejected": 246863314.82352942, + "logps/chosen": -381.7184244791667, + "logps/rejected": -428.70295266544116, + "loss": 0.1177, + "rewards/chosen": 1.9802268981933593, + "rewards/margins": 7.807276602352367, + "rewards/rejected": -5.827049704159007, + "step": 237 + }, + { + "epoch": 0.08785935120668174, + "grad_norm": 7.40625, + "kl": 0.0, + "learning_rate": 9.866177787064013e-06, + "logits/chosen": 243263696.0, + "logits/rejected": 462747008.0, + "logps/chosen": -386.62335205078125, + "logps/rejected": -401.83282470703125, + "loss": 0.1082, + "rewards/chosen": 1.6147058010101318, + "rewards/margins": 7.056537389755249, + "rewards/rejected": -5.441831588745117, + "step": 238 + }, + { + "epoch": 0.08822850814452494, + "grad_norm": 7.25, + "kl": 0.0, + "learning_rate": 9.864821908273861e-06, + "logits/chosen": 379021363.2, + "logits/rejected": 227518766.54545453, + "logps/chosen": -404.366015625, + "logps/rejected": -407.6180308948864, + "loss": 0.1267, + "rewards/chosen": 0.8897915840148926, + "rewards/margins": 5.334950455752286, + "rewards/rejected": -4.4451588717373935, + "step": 239 + }, + { + "epoch": 0.08859766508236815, + "grad_norm": 7.96875, + "kl": 0.0, + "learning_rate": 9.863459289334758e-06, + "logits/chosen": 343362976.0, + "logits/rejected": 268954240.0, + "logps/chosen": -384.75286865234375, + "logps/rejected": -363.1307067871094, + "loss": 0.1396, + "rewards/chosen": 1.6335111856460571, + "rewards/margins": 6.413908123970032, + "rewards/rejected": -4.780396938323975, + "step": 240 + }, + { + "epoch": 0.08896682202021135, + "grad_norm": 8.9375, + "kl": 0.0, + "learning_rate": 9.862089932134601e-06, + "logits/chosen": 335116544.0, + "logits/rejected": 241675904.0, + "logps/chosen": -366.75445556640625, + "logps/rejected": -463.53179931640625, + "loss": 0.1738, + "rewards/chosen": 0.8079825639724731, + "rewards/margins": 6.114123463630676, + "rewards/rejected": -5.306140899658203, + "step": 241 + }, + { + "epoch": 0.08933597895805455, + "grad_norm": 7.59375, + "kl": 0.0, + "learning_rate": 9.860713838570616e-06, + "logits/chosen": 283325683.8095238, + "logits/rejected": 324306525.09090906, + "logps/chosen": -349.12713913690476, + "logps/rejected": -388.3113458806818, + "loss": 0.1618, + "rewards/chosen": 1.66170410882859, + "rewards/margins": 6.557942956040948, + "rewards/rejected": -4.896238847212358, + "step": 242 + }, + { + "epoch": 0.08970513589589775, + "grad_norm": 8.3125, + "kl": 0.3953697681427002, + "learning_rate": 9.859331010549362e-06, + "logits/chosen": 240968362.66666666, + "logits/rejected": 284759241.14285713, + "logps/chosen": -296.115478515625, + "logps/rejected": -326.13602120535717, + "loss": 0.1969, + "rewards/chosen": 1.1243310504489474, + "rewards/margins": 5.456543498569065, + "rewards/rejected": -4.332212448120117, + "step": 243 + }, + { + "epoch": 0.09007429283374094, + "grad_norm": 5.75, + "kl": 0.0, + "learning_rate": 9.85794144998673e-06, + "logits/chosen": 304890647.27272725, + "logits/rejected": 298002944.0, + "logps/chosen": -343.71732954545456, + "logps/rejected": -465.47344680059524, + "loss": 0.1178, + "rewards/chosen": 1.1868700547651811, + "rewards/margins": 6.173799085410643, + "rewards/rejected": -4.986929030645461, + "step": 244 + }, + { + "epoch": 0.09044344977158414, + "grad_norm": 6.21875, + "kl": 0.0, + "learning_rate": 9.856545158807938e-06, + "logits/chosen": 352201412.9230769, + "logits/rejected": 266927373.47368422, + "logps/chosen": -408.33837890625, + "logps/rejected": -448.5361328125, + "loss": 0.0993, + "rewards/chosen": 1.7782639723557692, + "rewards/margins": 7.4994931240313445, + "rewards/rejected": -5.721229151675575, + "step": 245 + }, + { + "epoch": 0.09081260670942734, + "grad_norm": 8.4375, + "kl": 0.0, + "learning_rate": 9.855142138947532e-06, + "logits/chosen": 332627456.0, + "logits/rejected": 316708416.0, + "logps/chosen": -499.35748291015625, + "logps/rejected": -388.88116455078125, + "loss": 0.142, + "rewards/chosen": 1.616234540939331, + "rewards/margins": 5.916297197341919, + "rewards/rejected": -4.300062656402588, + "step": 246 + }, + { + "epoch": 0.09118176364727054, + "grad_norm": 6.4375, + "kl": 0.0, + "learning_rate": 9.853732392349376e-06, + "logits/chosen": 231394629.8181818, + "logits/rejected": 231744243.80952382, + "logps/chosen": -318.44027432528407, + "logps/rejected": -360.240234375, + "loss": 0.1564, + "rewards/chosen": 0.9920940399169922, + "rewards/margins": 5.051144463675363, + "rewards/rejected": -4.059050423758371, + "step": 247 + }, + { + "epoch": 0.09155092058511374, + "grad_norm": 7.03125, + "kl": 0.1504230499267578, + "learning_rate": 9.852315920966653e-06, + "logits/chosen": 389621316.26666665, + "logits/rejected": 206375273.4117647, + "logps/chosen": -374.55104166666666, + "logps/rejected": -418.61290785845586, + "loss": 0.1204, + "rewards/chosen": 1.815285619099935, + "rewards/margins": 6.925124217014687, + "rewards/rejected": -5.109838597914752, + "step": 248 + }, + { + "epoch": 0.09192007752295694, + "grad_norm": 7.9375, + "kl": 1.464273452758789, + "learning_rate": 9.850892726761874e-06, + "logits/chosen": 300329728.0, + "logits/rejected": 249191040.0, + "logps/chosen": -351.0330403645833, + "logps/rejected": -406.296875, + "loss": 0.1444, + "rewards/chosen": 1.9587766859266493, + "rewards/margins": 7.153210170685298, + "rewards/rejected": -5.194433484758649, + "step": 249 + }, + { + "epoch": 0.09228923446080015, + "grad_norm": 7.125, + "kl": 0.0, + "learning_rate": 9.84946281170685e-06, + "logits/chosen": 276625569.68421054, + "logits/rejected": 451933499.0769231, + "logps/chosen": -301.2851819490132, + "logps/rejected": -336.0502178485577, + "loss": 0.1495, + "rewards/chosen": 1.5007415570710834, + "rewards/margins": 6.395835621636889, + "rewards/rejected": -4.895094064565805, + "step": 250 + }, + { + "epoch": 0.09265839139864335, + "grad_norm": 8.125, + "kl": 0.012690305709838867, + "learning_rate": 9.848026177782713e-06, + "logits/chosen": 287583232.0, + "logits/rejected": 293657395.2, + "logps/chosen": -391.80060891544116, + "logps/rejected": -389.09947916666664, + "loss": 0.1527, + "rewards/chosen": 1.710372251622817, + "rewards/margins": 5.922461535883885, + "rewards/rejected": -4.212089284261068, + "step": 251 + }, + { + "epoch": 0.09302754833648655, + "grad_norm": 7.15625, + "kl": 0.39983654022216797, + "learning_rate": 9.846582826979899e-06, + "logits/chosen": 293454813.8666667, + "logits/rejected": 276105216.0, + "logps/chosen": -389.4912434895833, + "logps/rejected": -361.69801240808823, + "loss": 0.1463, + "rewards/chosen": 1.5782159169514973, + "rewards/margins": 5.842160206215054, + "rewards/rejected": -4.263944289263557, + "step": 252 + }, + { + "epoch": 0.09339670527432975, + "grad_norm": 7.03125, + "kl": 0.0, + "learning_rate": 9.845132761298154e-06, + "logits/chosen": 320463904.0, + "logits/rejected": 280111008.0, + "logps/chosen": -241.38551330566406, + "logps/rejected": -404.7611083984375, + "loss": 0.149, + "rewards/chosen": 1.4376113414764404, + "rewards/margins": 6.299758672714233, + "rewards/rejected": -4.862147331237793, + "step": 253 + }, + { + "epoch": 0.09376586221217295, + "grad_norm": 8.875, + "kl": 0.31492137908935547, + "learning_rate": 9.843675982746526e-06, + "logits/chosen": 322533728.0, + "logits/rejected": 255570432.0, + "logps/chosen": -382.48516845703125, + "logps/rejected": -305.25469970703125, + "loss": 0.1732, + "rewards/chosen": 1.3217122554779053, + "rewards/margins": 5.60805869102478, + "rewards/rejected": -4.286346435546875, + "step": 254 + }, + { + "epoch": 0.09413501915001615, + "grad_norm": 8.4375, + "kl": 0.0, + "learning_rate": 9.84221249334336e-06, + "logits/chosen": 388694381.71428573, + "logits/rejected": 236523761.7777778, + "logps/chosen": -414.79771205357144, + "logps/rejected": -397.1962076822917, + "loss": 0.1466, + "rewards/chosen": 1.1980846949986048, + "rewards/margins": 5.312008085704985, + "rewards/rejected": -4.11392339070638, + "step": 255 + }, + { + "epoch": 0.09450417608785935, + "grad_norm": 6.78125, + "kl": 1.383697509765625, + "learning_rate": 9.840742295116306e-06, + "logits/chosen": 337817024.0, + "logits/rejected": 272713856.0, + "logps/chosen": -345.030029296875, + "logps/rejected": -516.3422241210938, + "loss": 0.1316, + "rewards/chosen": 1.8937877416610718, + "rewards/margins": 6.681235671043396, + "rewards/rejected": -4.787447929382324, + "step": 256 + }, + { + "epoch": 0.09487333302570256, + "grad_norm": 5.6875, + "kl": 0.0, + "learning_rate": 9.839265390102301e-06, + "logits/chosen": 368794862.93333334, + "logits/rejected": 192503552.0, + "logps/chosen": -355.03704427083335, + "logps/rejected": -368.16486672794116, + "loss": 0.1004, + "rewards/chosen": 2.396204376220703, + "rewards/margins": 7.004463240679572, + "rewards/rejected": -4.608258864458869, + "step": 257 + }, + { + "epoch": 0.09524248996354576, + "grad_norm": 6.5, + "kl": 0.0, + "learning_rate": 9.837781780347584e-06, + "logits/chosen": 297754339.5555556, + "logits/rejected": 381829120.0, + "logps/chosen": -350.24107530381946, + "logps/rejected": -428.58506556919644, + "loss": 0.1158, + "rewards/chosen": 2.265862570868598, + "rewards/margins": 6.944121739220997, + "rewards/rejected": -4.678259168352399, + "step": 258 + }, + { + "epoch": 0.09561164690138896, + "grad_norm": 7.1875, + "kl": 0.0, + "learning_rate": 9.836291467907671e-06, + "logits/chosen": 301438634.6666667, + "logits/rejected": 266660352.0, + "logps/chosen": -373.72889539930554, + "logps/rejected": -320.34256417410717, + "loss": 0.1429, + "rewards/chosen": 1.7668958240085177, + "rewards/margins": 6.037871754358685, + "rewards/rejected": -4.2709759303501675, + "step": 259 + }, + { + "epoch": 0.09598080383923216, + "grad_norm": 6.34375, + "kl": 0.0, + "learning_rate": 9.834794454847373e-06, + "logits/chosen": 270245412.5714286, + "logits/rejected": 246572743.1111111, + "logps/chosen": -361.88455636160717, + "logps/rejected": -352.18256293402777, + "loss": 0.1046, + "rewards/chosen": 2.2469516481672014, + "rewards/margins": 6.502481687636603, + "rewards/rejected": -4.255530039469401, + "step": 260 + }, + { + "epoch": 0.09634996077707536, + "grad_norm": 6.5, + "kl": 0.0, + "learning_rate": 9.833290743240785e-06, + "logits/chosen": 205479568.0, + "logits/rejected": 282314784.0, + "logps/chosen": -290.31500244140625, + "logps/rejected": -455.24786376953125, + "loss": 0.1034, + "rewards/chosen": 2.81850266456604, + "rewards/margins": 8.061152696609497, + "rewards/rejected": -5.242650032043457, + "step": 261 + }, + { + "epoch": 0.09671911771491855, + "grad_norm": 7.8125, + "kl": 0.565821647644043, + "learning_rate": 9.83178033517128e-06, + "logits/chosen": 247140582.4, + "logits/rejected": 213855850.66666666, + "logps/chosen": -289.9948974609375, + "logps/rejected": -470.5503336588542, + "loss": 0.1846, + "rewards/chosen": 1.3770435333251954, + "rewards/margins": 6.580580202738444, + "rewards/rejected": -5.203536669413249, + "step": 262 + }, + { + "epoch": 0.09708827465276175, + "grad_norm": 6.375, + "kl": 0.0, + "learning_rate": 9.830263232731505e-06, + "logits/chosen": 254526706.52631578, + "logits/rejected": 332515485.53846157, + "logps/chosen": -293.96846731085526, + "logps/rejected": -501.08706430288464, + "loss": 0.1229, + "rewards/chosen": 1.7560898630242598, + "rewards/margins": 7.685320186228887, + "rewards/rejected": -5.929230323204627, + "step": 263 + }, + { + "epoch": 0.09745743159060495, + "grad_norm": 8.625, + "kl": 0.0, + "learning_rate": 9.82873943802339e-06, + "logits/chosen": 417544465.06666666, + "logits/rejected": 224183747.7647059, + "logps/chosen": -388.4838541666667, + "logps/rejected": -373.04328469669116, + "loss": 0.154, + "rewards/chosen": 1.384753672281901, + "rewards/margins": 5.942265843410118, + "rewards/rejected": -4.5575121711282165, + "step": 264 + }, + { + "epoch": 0.09782658852844815, + "grad_norm": 5.90625, + "kl": 0.048673152923583984, + "learning_rate": 9.827208953158132e-06, + "logits/chosen": 294331989.3333333, + "logits/rejected": 274610523.4285714, + "logps/chosen": -338.4851888020833, + "logps/rejected": -437.8081752232143, + "loss": 0.1408, + "rewards/chosen": 1.6367513868543837, + "rewards/margins": 7.1409322950575085, + "rewards/rejected": -5.504180908203125, + "step": 265 + }, + { + "epoch": 0.09819574546629135, + "grad_norm": 5.5, + "kl": 0.0, + "learning_rate": 9.825671780256197e-06, + "logits/chosen": 183115707.73333332, + "logits/rejected": 376294640.9411765, + "logps/chosen": -264.134228515625, + "logps/rejected": -471.39085477941177, + "loss": 0.1116, + "rewards/chosen": 2.3028895060221353, + "rewards/margins": 7.937460357067632, + "rewards/rejected": -5.634570851045496, + "step": 266 + }, + { + "epoch": 0.09856490240413456, + "grad_norm": 6.21875, + "kl": 0.5784311890602112, + "learning_rate": 9.824127921447321e-06, + "logits/chosen": 281956328.72727275, + "logits/rejected": 247907123.2, + "logps/chosen": -340.96231356534093, + "logps/rejected": -400.12412109375, + "loss": 0.1142, + "rewards/chosen": 2.412613782015714, + "rewards/margins": 8.096811017123136, + "rewards/rejected": -5.684197235107422, + "step": 267 + }, + { + "epoch": 0.09893405934197776, + "grad_norm": 8.3125, + "kl": 2.7989091873168945, + "learning_rate": 9.822577378870502e-06, + "logits/chosen": 314006341.8181818, + "logits/rejected": 238676864.0, + "logps/chosen": -447.56161221590907, + "logps/rejected": -441.839453125, + "loss": 0.1956, + "rewards/chosen": 1.9591395638205789, + "rewards/margins": 8.369098975441672, + "rewards/rejected": -6.409959411621093, + "step": 268 + }, + { + "epoch": 0.09930321627982096, + "grad_norm": 6.0625, + "kl": 1.452056884765625, + "learning_rate": 9.82102015467399e-06, + "logits/chosen": 350976402.28571427, + "logits/rejected": 328555520.0, + "logps/chosen": -437.37890625, + "logps/rejected": -457.06787109375, + "loss": 0.1364, + "rewards/chosen": 1.817681176321847, + "rewards/margins": 6.64472280229841, + "rewards/rejected": -4.8270416259765625, + "step": 269 + }, + { + "epoch": 0.09967237321766416, + "grad_norm": 6.15625, + "kl": 0.0, + "learning_rate": 9.81945625101531e-06, + "logits/chosen": 290498368.0, + "logits/rejected": 244787024.0, + "logps/chosen": -282.5541076660156, + "logps/rejected": -458.66229248046875, + "loss": 0.1296, + "rewards/chosen": 1.5960369110107422, + "rewards/margins": 7.408041954040527, + "rewards/rejected": -5.812005043029785, + "step": 270 + }, + { + "epoch": 0.10004153015550736, + "grad_norm": 7.25, + "kl": 0.0, + "learning_rate": 9.81788567006122e-06, + "logits/chosen": 234486400.0, + "logits/rejected": 195798976.0, + "logps/chosen": -320.8009338378906, + "logps/rejected": -409.2419128417969, + "loss": 0.1454, + "rewards/chosen": 1.4441261291503906, + "rewards/margins": 6.269505977630615, + "rewards/rejected": -4.825379848480225, + "step": 271 + }, + { + "epoch": 0.10041068709335056, + "grad_norm": 6.9375, + "kl": 0.0, + "learning_rate": 9.816308413987747e-06, + "logits/chosen": 269648835.7647059, + "logits/rejected": 313158929.06666666, + "logps/chosen": -345.65642233455884, + "logps/rejected": -368.42161458333334, + "loss": 0.1365, + "rewards/chosen": 2.0296660030589386, + "rewards/margins": 6.402596911262064, + "rewards/rejected": -4.372930908203125, + "step": 272 + }, + { + "epoch": 0.10077984403119376, + "grad_norm": 8.3125, + "kl": 0.0, + "learning_rate": 9.814724484980156e-06, + "logits/chosen": 296096837.8181818, + "logits/rejected": 211911116.8, + "logps/chosen": -345.60469193892044, + "logps/rejected": -345.489013671875, + "loss": 0.1302, + "rewards/chosen": 2.149020108309659, + "rewards/margins": 6.516524228182706, + "rewards/rejected": -4.367504119873047, + "step": 273 + }, + { + "epoch": 0.10114900096903696, + "grad_norm": 8.125, + "kl": 0.3693046569824219, + "learning_rate": 9.813133885232962e-06, + "logits/chosen": 290780370.8235294, + "logits/rejected": 243505834.66666666, + "logps/chosen": -419.6095760569853, + "logps/rejected": -391.21708984375, + "loss": 0.1205, + "rewards/chosen": 1.8248745413387524, + "rewards/margins": 6.270085316078335, + "rewards/rejected": -4.445210774739583, + "step": 274 + }, + { + "epoch": 0.10151815790688017, + "grad_norm": 6.09375, + "kl": 0.0, + "learning_rate": 9.811536616949921e-06, + "logits/chosen": 286155025.06666666, + "logits/rejected": 220757790.11764705, + "logps/chosen": -358.022265625, + "logps/rejected": -385.1070772058824, + "loss": 0.0968, + "rewards/chosen": 2.235581715901693, + "rewards/margins": 6.856246424656289, + "rewards/rejected": -4.620664708754596, + "step": 275 + }, + { + "epoch": 0.10188731484472337, + "grad_norm": 7.3125, + "kl": 0.0, + "learning_rate": 9.809932682344026e-06, + "logits/chosen": 277972066.46153843, + "logits/rejected": 255133669.0526316, + "logps/chosen": -414.9138371394231, + "logps/rejected": -398.00899465460526, + "loss": 0.1417, + "rewards/chosen": 1.5957758976862981, + "rewards/margins": 5.817282518394563, + "rewards/rejected": -4.221506620708265, + "step": 276 + }, + { + "epoch": 0.10225647178256657, + "grad_norm": 7.96875, + "kl": 0.0, + "learning_rate": 9.80832208363751e-06, + "logits/chosen": 305340476.2352941, + "logits/rejected": 215621205.33333334, + "logps/chosen": -440.6845703125, + "logps/rejected": -432.57477213541665, + "loss": 0.1394, + "rewards/chosen": 2.041439505184398, + "rewards/margins": 6.635841055477367, + "rewards/rejected": -4.594401550292969, + "step": 277 + }, + { + "epoch": 0.10262562872040977, + "grad_norm": 8.0625, + "kl": 0.6402225494384766, + "learning_rate": 9.806704823061837e-06, + "logits/chosen": 242505337.9047619, + "logits/rejected": 247100276.36363637, + "logps/chosen": -381.43387276785717, + "logps/rejected": -314.19477982954544, + "loss": 0.1483, + "rewards/chosen": 2.0247522989908853, + "rewards/margins": 5.991402134750828, + "rewards/rejected": -3.9666498357599433, + "step": 278 + }, + { + "epoch": 0.10299478565825297, + "grad_norm": 3.6875, + "kl": 0.0, + "learning_rate": 9.8050809028577e-06, + "logits/chosen": 223183722.66666666, + "logits/rejected": 258744345.6, + "logps/chosen": -353.7753499348958, + "logps/rejected": -352.996630859375, + "loss": 0.0517, + "rewards/chosen": 2.985027313232422, + "rewards/margins": 7.568247985839844, + "rewards/rejected": -4.583220672607422, + "step": 279 + }, + { + "epoch": 0.10336394259609616, + "grad_norm": 5.96875, + "kl": 0.0, + "learning_rate": 9.803450325275018e-06, + "logits/chosen": 347778596.5714286, + "logits/rejected": 233389397.33333334, + "logps/chosen": -335.8633510044643, + "logps/rejected": -445.4435763888889, + "loss": 0.1005, + "rewards/chosen": 1.6704310008457728, + "rewards/margins": 6.513539707849897, + "rewards/rejected": -4.843108707004124, + "step": 280 + }, + { + "epoch": 0.10373309953393936, + "grad_norm": 7.625, + "kl": 0.0, + "learning_rate": 9.801813092572938e-06, + "logits/chosen": 294609165.4736842, + "logits/rejected": 234853710.76923078, + "logps/chosen": -350.6049547697368, + "logps/rejected": -454.2666015625, + "loss": 0.1307, + "rewards/chosen": 1.7987438001130756, + "rewards/margins": 7.486834784750996, + "rewards/rejected": -5.68809098463792, + "step": 281 + }, + { + "epoch": 0.10410225647178256, + "grad_norm": 7.6875, + "kl": 1.0224857330322266, + "learning_rate": 9.800169207019826e-06, + "logits/chosen": 255080986.9473684, + "logits/rejected": 377982700.3076923, + "logps/chosen": -337.3111122532895, + "logps/rejected": -551.0763596754807, + "loss": 0.152, + "rewards/chosen": 2.2137425071314762, + "rewards/margins": 7.009772111529763, + "rewards/rejected": -4.796029604398287, + "step": 282 + }, + { + "epoch": 0.10447141340962576, + "grad_norm": 7.84375, + "kl": 0.10712814331054688, + "learning_rate": 9.798518670893263e-06, + "logits/chosen": 318751260.4444444, + "logits/rejected": 295405568.0, + "logps/chosen": -338.7742513020833, + "logps/rejected": -419.30859375, + "loss": 0.1541, + "rewards/chosen": 1.387531386481391, + "rewards/margins": 6.240966902838813, + "rewards/rejected": -4.853435516357422, + "step": 283 + }, + { + "epoch": 0.10484057034746896, + "grad_norm": 8.0625, + "kl": 0.0, + "learning_rate": 9.796861486480045e-06, + "logits/chosen": 301895372.8, + "logits/rejected": 314494283.2941176, + "logps/chosen": -377.33330078125, + "logps/rejected": -355.42086971507354, + "loss": 0.1291, + "rewards/chosen": 1.7706251780192057, + "rewards/margins": 6.2541901158351525, + "rewards/rejected": -4.483564937815947, + "step": 284 + }, + { + "epoch": 0.10520972728531217, + "grad_norm": 9.25, + "kl": 0.38208961486816406, + "learning_rate": 9.795197656076182e-06, + "logits/chosen": 251295288.8888889, + "logits/rejected": 346669129.14285713, + "logps/chosen": -453.69677734375, + "logps/rejected": -464.7334681919643, + "loss": 0.107, + "rewards/chosen": 2.006988101535373, + "rewards/margins": 7.091583918011377, + "rewards/rejected": -5.084595816476004, + "step": 285 + }, + { + "epoch": 0.10557888422315537, + "grad_norm": 7.53125, + "kl": 1.4684171676635742, + "learning_rate": 9.793527181986888e-06, + "logits/chosen": 258355008.0, + "logits/rejected": 306531104.0, + "logps/chosen": -365.3017272949219, + "logps/rejected": -387.84271240234375, + "loss": 0.1719, + "rewards/chosen": 1.563455581665039, + "rewards/margins": 5.471706390380859, + "rewards/rejected": -3.9082508087158203, + "step": 286 + }, + { + "epoch": 0.10594804116099857, + "grad_norm": 11.625, + "kl": 0.11474037170410156, + "learning_rate": 9.791850066526584e-06, + "logits/chosen": 369921760.0, + "logits/rejected": 212704880.0, + "logps/chosen": -287.26348876953125, + "logps/rejected": -351.359619140625, + "loss": 0.2021, + "rewards/chosen": 0.7203973531723022, + "rewards/margins": 5.163435339927673, + "rewards/rejected": -4.443037986755371, + "step": 287 + }, + { + "epoch": 0.10631719809884177, + "grad_norm": 6.375, + "kl": 0.0, + "learning_rate": 9.790166312018894e-06, + "logits/chosen": 261667293.86666667, + "logits/rejected": 301293266.8235294, + "logps/chosen": -365.90270182291664, + "logps/rejected": -443.9335363051471, + "loss": 0.1389, + "rewards/chosen": 1.5727582295735678, + "rewards/margins": 6.101599136053347, + "rewards/rejected": -4.528840906479779, + "step": 288 + }, + { + "epoch": 0.10668635503668497, + "grad_norm": 8.125, + "kl": 0.7326822280883789, + "learning_rate": 9.788475920796638e-06, + "logits/chosen": 397366693.64705884, + "logits/rejected": 248488174.93333334, + "logps/chosen": -339.2681525735294, + "logps/rejected": -414.38938802083334, + "loss": 0.1973, + "rewards/chosen": 1.3333422716926127, + "rewards/margins": 5.797185045130113, + "rewards/rejected": -4.4638427734375, + "step": 289 + }, + { + "epoch": 0.10705551197452817, + "grad_norm": 6.0625, + "kl": 0.0, + "learning_rate": 9.78677889520183e-06, + "logits/chosen": 360584704.0, + "logits/rejected": 337553578.6666667, + "logps/chosen": -257.4214564732143, + "logps/rejected": -464.2155490451389, + "loss": 0.1437, + "rewards/chosen": 1.3613267626081194, + "rewards/margins": 6.599175256396096, + "rewards/rejected": -5.237848493787977, + "step": 290 + }, + { + "epoch": 0.10742466891237137, + "grad_norm": 7.625, + "kl": 1.4937095642089844, + "learning_rate": 9.785075237585678e-06, + "logits/chosen": 325305241.6, + "logits/rejected": 316507196.2352941, + "logps/chosen": -304.19928385416665, + "logps/rejected": -367.25930606617646, + "loss": 0.1711, + "rewards/chosen": 1.146826680501302, + "rewards/margins": 5.027793315812653, + "rewards/rejected": -3.880966635311351, + "step": 291 + }, + { + "epoch": 0.10779382585021458, + "grad_norm": 8.25, + "kl": 1.4761104583740234, + "learning_rate": 9.783364950308578e-06, + "logits/chosen": 321928265.14285713, + "logits/rejected": 321731185.7777778, + "logps/chosen": -451.0172642299107, + "logps/rejected": -395.6738009982639, + "loss": 0.1161, + "rewards/chosen": 2.0073723111833846, + "rewards/margins": 6.400631995428176, + "rewards/rejected": -4.393259684244792, + "step": 292 + }, + { + "epoch": 0.10816298278805778, + "grad_norm": 6.375, + "kl": 0.0, + "learning_rate": 9.78164803574011e-06, + "logits/chosen": 267314084.57142857, + "logits/rejected": 254794837.33333334, + "logps/chosen": -395.99281529017856, + "logps/rejected": -403.1513400607639, + "loss": 0.1273, + "rewards/chosen": 1.4890704836164201, + "rewards/margins": 6.782819974990119, + "rewards/rejected": -5.293749491373698, + "step": 293 + }, + { + "epoch": 0.10853213972590098, + "grad_norm": 6.25, + "kl": 1.3248915672302246, + "learning_rate": 9.77992449625904e-06, + "logits/chosen": 287841008.9411765, + "logits/rejected": 224754961.06666666, + "logps/chosen": -388.3971737132353, + "logps/rejected": -383.56500651041665, + "loss": 0.1084, + "rewards/chosen": 2.068014705882353, + "rewards/margins": 6.9571026371974565, + "rewards/rejected": -4.889087931315104, + "step": 294 + }, + { + "epoch": 0.10890129666374418, + "grad_norm": 5.21875, + "kl": 0.0, + "learning_rate": 9.778194334253308e-06, + "logits/chosen": 270682550.85714287, + "logits/rejected": 318347520.0, + "logps/chosen": -291.91451590401783, + "logps/rejected": -474.6227756076389, + "loss": 0.1097, + "rewards/chosen": 1.910341807774135, + "rewards/margins": 6.641097083924308, + "rewards/rejected": -4.730755276150173, + "step": 295 + }, + { + "epoch": 0.10927045360158738, + "grad_norm": 8.375, + "kl": 0.0, + "learning_rate": 9.776457552120034e-06, + "logits/chosen": 343849712.9411765, + "logits/rejected": 288968499.2, + "logps/chosen": -369.1659294577206, + "logps/rejected": -390.1064778645833, + "loss": 0.1295, + "rewards/chosen": 1.715309928445255, + "rewards/margins": 7.233453645893172, + "rewards/rejected": -5.518143717447916, + "step": 296 + }, + { + "epoch": 0.10963961053943058, + "grad_norm": 7.53125, + "kl": 0.6564140319824219, + "learning_rate": 9.774714152265504e-06, + "logits/chosen": 378602797.1764706, + "logits/rejected": 363503820.8, + "logps/chosen": -389.9817899816176, + "logps/rejected": -440.4994791666667, + "loss": 0.1139, + "rewards/chosen": 2.5613984500660614, + "rewards/margins": 8.25343641393325, + "rewards/rejected": -5.692037963867188, + "step": 297 + }, + { + "epoch": 0.11000876747727377, + "grad_norm": 6.375, + "kl": 0.0, + "learning_rate": 9.772964137105179e-06, + "logits/chosen": 350261174.85714287, + "logits/rejected": 393938602.6666667, + "logps/chosen": -364.90523856026783, + "logps/rejected": -459.142578125, + "loss": 0.1136, + "rewards/chosen": 1.7261739458356584, + "rewards/margins": 7.120006682380797, + "rewards/rejected": -5.393832736545139, + "step": 298 + }, + { + "epoch": 0.11037792441511697, + "grad_norm": 6.625, + "kl": 0.0, + "learning_rate": 9.771207509063682e-06, + "logits/chosen": 247245366.85714287, + "logits/rejected": 300369351.1111111, + "logps/chosen": -355.44907924107144, + "logps/rejected": -419.1153971354167, + "loss": 0.1124, + "rewards/chosen": 2.253937448774065, + "rewards/margins": 7.142214714534699, + "rewards/rejected": -4.8882772657606335, + "step": 299 + }, + { + "epoch": 0.11074708135296017, + "grad_norm": 7.3125, + "kl": 0.0, + "learning_rate": 9.769444270574799e-06, + "logits/chosen": 384592835.7647059, + "logits/rejected": 171360290.13333333, + "logps/chosen": -418.3955652573529, + "logps/rejected": -287.6421223958333, + "loss": 0.115, + "rewards/chosen": 2.0639922198127296, + "rewards/margins": 6.221265067306219, + "rewards/rejected": -4.1572728474934895, + "step": 300 + }, + { + "epoch": 0.11111623829080337, + "grad_norm": 6.875, + "kl": 0.6311101913452148, + "learning_rate": 9.767674424081472e-06, + "logits/chosen": 288888892.2352941, + "logits/rejected": 239422873.6, + "logps/chosen": -405.34461167279414, + "logps/rejected": -380.8423177083333, + "loss": 0.1124, + "rewards/chosen": 1.8444845536175896, + "rewards/margins": 6.308811539294673, + "rewards/rejected": -4.464326985677084, + "step": 301 + }, + { + "epoch": 0.11148539522864657, + "grad_norm": 6.3125, + "kl": 0.33696556091308594, + "learning_rate": 9.765897972035806e-06, + "logits/chosen": 329467017.84615386, + "logits/rejected": 317333099.7894737, + "logps/chosen": -388.9645432692308, + "logps/rejected": -454.37777549342104, + "loss": 0.0922, + "rewards/chosen": 2.3028562985933743, + "rewards/margins": 7.2743637223958, + "rewards/rejected": -4.971507423802426, + "step": 302 + }, + { + "epoch": 0.11185455216648978, + "grad_norm": 6.625, + "kl": 0.577385425567627, + "learning_rate": 9.764114916899049e-06, + "logits/chosen": 305303076.5714286, + "logits/rejected": 234174876.44444445, + "logps/chosen": -293.1815708705357, + "logps/rejected": -394.4946560329861, + "loss": 0.1685, + "rewards/chosen": 1.2608593532017298, + "rewards/margins": 6.224813431028336, + "rewards/rejected": -4.963954077826606, + "step": 303 + }, + { + "epoch": 0.11222370910433298, + "grad_norm": 7.3125, + "kl": 0.0, + "learning_rate": 9.762325261141602e-06, + "logits/chosen": 389631488.0, + "logits/rejected": 183112732.44444445, + "logps/chosen": -340.81319754464283, + "logps/rejected": -351.52948676215277, + "loss": 0.1602, + "rewards/chosen": 1.5892043794904436, + "rewards/margins": 6.274685556926425, + "rewards/rejected": -4.685481177435981, + "step": 304 + }, + { + "epoch": 0.11259286604217618, + "grad_norm": 7.40625, + "kl": 2.022796869277954, + "learning_rate": 9.760529007243011e-06, + "logits/chosen": 285595443.2, + "logits/rejected": 246709525.33333334, + "logps/chosen": -310.5384521484375, + "logps/rejected": -521.7769368489584, + "loss": 0.1432, + "rewards/chosen": 1.9062309265136719, + "rewards/margins": 8.00025749206543, + "rewards/rejected": -6.094026565551758, + "step": 305 + }, + { + "epoch": 0.11296202298001938, + "grad_norm": 5.96875, + "kl": 0.0, + "learning_rate": 9.758726157691961e-06, + "logits/chosen": 412737629.09090906, + "logits/rejected": 189291910.0952381, + "logps/chosen": -363.6226251775568, + "logps/rejected": -422.1598307291667, + "loss": 0.0826, + "rewards/chosen": 1.950720873746005, + "rewards/margins": 6.7431624689143455, + "rewards/rejected": -4.792441595168341, + "step": 306 + }, + { + "epoch": 0.11333117991786258, + "grad_norm": 7.28125, + "kl": 0.0, + "learning_rate": 9.75691671498628e-06, + "logits/chosen": 287464209.06666666, + "logits/rejected": 296532088.4705882, + "logps/chosen": -457.52301432291665, + "logps/rejected": -461.43870634191177, + "loss": 0.1007, + "rewards/chosen": 1.9564284006754558, + "rewards/margins": 7.707352903777478, + "rewards/rejected": -5.750924503102022, + "step": 307 + }, + { + "epoch": 0.11370033685570578, + "grad_norm": 7.5625, + "kl": 0.0, + "learning_rate": 9.755100681632924e-06, + "logits/chosen": 217193699.55555555, + "logits/rejected": 203240813.7142857, + "logps/chosen": -315.5524088541667, + "logps/rejected": -370.50149972098217, + "loss": 0.1339, + "rewards/chosen": 1.727256351047092, + "rewards/margins": 6.382042052253844, + "rewards/rejected": -4.654785701206753, + "step": 308 + }, + { + "epoch": 0.11406949379354898, + "grad_norm": 6.9375, + "kl": 0.0, + "learning_rate": 9.75327806014799e-06, + "logits/chosen": 266431434.10526314, + "logits/rejected": 230958592.0, + "logps/chosen": -336.3356291118421, + "logps/rejected": -399.64009915865387, + "loss": 0.1223, + "rewards/chosen": 1.7971235576428866, + "rewards/margins": 6.994560550581588, + "rewards/rejected": -5.197436992938702, + "step": 309 + }, + { + "epoch": 0.11443865073139219, + "grad_norm": 8.0, + "kl": 0.0, + "learning_rate": 9.75144885305669e-06, + "logits/chosen": 463078496.0, + "logits/rejected": 290342752.0, + "logps/chosen": -424.9921875, + "logps/rejected": -470.57440185546875, + "loss": 0.1196, + "rewards/chosen": 1.7232091426849365, + "rewards/margins": 6.951017141342163, + "rewards/rejected": -5.227807998657227, + "step": 310 + }, + { + "epoch": 0.11480780766923539, + "grad_norm": 7.4375, + "kl": 0.0, + "learning_rate": 9.749613062893373e-06, + "logits/chosen": 303507456.0, + "logits/rejected": 213723016.53333333, + "logps/chosen": -329.10411879595586, + "logps/rejected": -363.803515625, + "loss": 0.1467, + "rewards/chosen": 1.4101939481847428, + "rewards/margins": 5.7408447864008885, + "rewards/rejected": -4.330650838216146, + "step": 311 + }, + { + "epoch": 0.11517696460707859, + "grad_norm": 6.53125, + "kl": 0.0, + "learning_rate": 9.7477706922015e-06, + "logits/chosen": 362836608.0, + "logits/rejected": 355143680.0, + "logps/chosen": -386.30902099609375, + "logps/rejected": -380.79156494140625, + "loss": 0.1358, + "rewards/chosen": 1.7478898763656616, + "rewards/margins": 6.25598418712616, + "rewards/rejected": -4.508094310760498, + "step": 312 + }, + { + "epoch": 0.11554612154492179, + "grad_norm": 6.84375, + "kl": 0.0, + "learning_rate": 9.745921743533653e-06, + "logits/chosen": 283426541.71428573, + "logits/rejected": 318698666.6666667, + "logps/chosen": -276.9054478236607, + "logps/rejected": -460.9111328125, + "loss": 0.1396, + "rewards/chosen": 1.2740707397460938, + "rewards/margins": 5.996425628662109, + "rewards/rejected": -4.722354888916016, + "step": 313 + }, + { + "epoch": 0.11591527848276499, + "grad_norm": 8.3125, + "kl": 0.0, + "learning_rate": 9.744066219451526e-06, + "logits/chosen": 300549536.0, + "logits/rejected": 188430464.0, + "logps/chosen": -374.172119140625, + "logps/rejected": -361.7240295410156, + "loss": 0.1254, + "rewards/chosen": 1.49176824092865, + "rewards/margins": 6.670312762260437, + "rewards/rejected": -5.178544521331787, + "step": 314 + }, + { + "epoch": 0.11628443542060819, + "grad_norm": 8.375, + "kl": 0.0, + "learning_rate": 9.742204122525925e-06, + "logits/chosen": 240916148.70588234, + "logits/rejected": 251461495.46666667, + "logps/chosen": -325.62678079044116, + "logps/rejected": -343.77109375, + "loss": 0.1813, + "rewards/chosen": 1.373621099135455, + "rewards/margins": 5.5605926364075895, + "rewards/rejected": -4.186971537272135, + "step": 315 + }, + { + "epoch": 0.11665359235845138, + "grad_norm": 6.9375, + "kl": 0.0, + "learning_rate": 9.740335455336762e-06, + "logits/chosen": 241375385.6, + "logits/rejected": 266240240.94117647, + "logps/chosen": -321.4020182291667, + "logps/rejected": -490.69801240808823, + "loss": 0.1143, + "rewards/chosen": 1.5778377532958985, + "rewards/margins": 7.7236722833970015, + "rewards/rejected": -6.145834530101103, + "step": 316 + }, + { + "epoch": 0.11702274929629458, + "grad_norm": 6.0625, + "kl": 0.0, + "learning_rate": 9.73846022047305e-06, + "logits/chosen": 287949942.15384614, + "logits/rejected": 292553647.15789473, + "logps/chosen": -285.4080153245192, + "logps/rejected": -501.7545230263158, + "loss": 0.1073, + "rewards/chosen": 1.44635376563439, + "rewards/margins": 6.95319693098184, + "rewards/rejected": -5.50684316534745, + "step": 317 + }, + { + "epoch": 0.11739190623413778, + "grad_norm": 6.90625, + "kl": 0.0, + "learning_rate": 9.736578420532904e-06, + "logits/chosen": 310145145.9047619, + "logits/rejected": 375745256.72727275, + "logps/chosen": -385.6967075892857, + "logps/rejected": -561.18603515625, + "loss": 0.1479, + "rewards/chosen": 1.8145423162551153, + "rewards/margins": 8.828934128666337, + "rewards/rejected": -7.014391812411222, + "step": 318 + }, + { + "epoch": 0.11776106317198098, + "grad_norm": 6.1875, + "kl": 0.3671226501464844, + "learning_rate": 9.734690058123534e-06, + "logits/chosen": 305233152.0, + "logits/rejected": 251928277.33333334, + "logps/chosen": -349.33701171875, + "logps/rejected": -385.4793294270833, + "loss": 0.1395, + "rewards/chosen": 1.8039188385009766, + "rewards/margins": 6.060803095499675, + "rewards/rejected": -4.256884256998698, + "step": 319 + }, + { + "epoch": 0.11813022010982419, + "grad_norm": 7.5, + "kl": 0.0, + "learning_rate": 9.732795135861245e-06, + "logits/chosen": 275267745.68421054, + "logits/rejected": 303825939.6923077, + "logps/chosen": -370.9303556743421, + "logps/rejected": -590.3593374399038, + "loss": 0.1279, + "rewards/chosen": 1.9328400461297286, + "rewards/margins": 7.909168382405269, + "rewards/rejected": -5.976328336275541, + "step": 320 + }, + { + "epoch": 0.11849937704766739, + "grad_norm": 7.78125, + "kl": 0.0, + "learning_rate": 9.73089365637142e-06, + "logits/chosen": 427165967.0588235, + "logits/rejected": 220365824.0, + "logps/chosen": -427.9051872702206, + "logps/rejected": -340.78160807291664, + "loss": 0.1597, + "rewards/chosen": 1.3136251112994026, + "rewards/margins": 6.1095245660520066, + "rewards/rejected": -4.795899454752604, + "step": 321 + }, + { + "epoch": 0.11886853398551059, + "grad_norm": 9.4375, + "kl": 0.41103076934814453, + "learning_rate": 9.728985622288542e-06, + "logits/chosen": 338415932.95238096, + "logits/rejected": 257872826.1818182, + "logps/chosen": -454.1246744791667, + "logps/rejected": -482.75204190340907, + "loss": 0.1645, + "rewards/chosen": 1.3875965845017206, + "rewards/margins": 7.397480811907615, + "rewards/rejected": -6.009884227405895, + "step": 322 + }, + { + "epoch": 0.11923769092335379, + "grad_norm": 6.65625, + "kl": 0.23540401458740234, + "learning_rate": 9.727071036256166e-06, + "logits/chosen": 279064515.7647059, + "logits/rejected": 202728362.66666666, + "logps/chosen": -380.31413717830884, + "logps/rejected": -397.46884765625, + "loss": 0.1, + "rewards/chosen": 2.0928533217486214, + "rewards/margins": 7.587544789033777, + "rewards/rejected": -5.494691467285156, + "step": 323 + }, + { + "epoch": 0.11960684786119699, + "grad_norm": 5.65625, + "kl": 0.0, + "learning_rate": 9.725149900926925e-06, + "logits/chosen": 346013269.3333333, + "logits/rejected": 331039692.8, + "logps/chosen": -345.3718668619792, + "logps/rejected": -457.136865234375, + "loss": 0.106, + "rewards/chosen": 1.4097843170166016, + "rewards/margins": 6.6951652526855465, + "rewards/rejected": -5.285380935668945, + "step": 324 + }, + { + "epoch": 0.11997600479904019, + "grad_norm": 5.71875, + "kl": 0.0, + "learning_rate": 9.723222218962529e-06, + "logits/chosen": 289851880.72727275, + "logits/rejected": 329748626.28571427, + "logps/chosen": -396.67724609375, + "logps/rejected": -475.2079148065476, + "loss": 0.0956, + "rewards/chosen": 1.5264674100008877, + "rewards/margins": 7.309286158838313, + "rewards/rejected": -5.782818748837426, + "step": 325 + }, + { + "epoch": 0.1203451617368834, + "grad_norm": 6.28125, + "kl": 0.0, + "learning_rate": 9.721287993033757e-06, + "logits/chosen": 159331114.66666666, + "logits/rejected": 232187373.7142857, + "logps/chosen": -363.5739474826389, + "logps/rejected": -352.77713448660717, + "loss": 0.0707, + "rewards/chosen": 2.9778917100694446, + "rewards/margins": 7.885377671983507, + "rewards/rejected": -4.9074859619140625, + "step": 326 + }, + { + "epoch": 0.1207143186747266, + "grad_norm": 6.0, + "kl": 0.0, + "learning_rate": 9.719347225820453e-06, + "logits/chosen": 283653993.4117647, + "logits/rejected": 242594030.93333334, + "logps/chosen": -399.9459443933824, + "logps/rejected": -346.9923828125, + "loss": 0.1017, + "rewards/chosen": 2.226092170266544, + "rewards/margins": 6.3985164866727935, + "rewards/rejected": -4.17242431640625, + "step": 327 + }, + { + "epoch": 0.1210834756125698, + "grad_norm": 7.4375, + "kl": 0.0, + "learning_rate": 9.717399920011527e-06, + "logits/chosen": 346189824.0, + "logits/rejected": 234528013.47368422, + "logps/chosen": -398.28470552884613, + "logps/rejected": -405.6351768092105, + "loss": 0.106, + "rewards/chosen": 1.8161189739520733, + "rewards/margins": 7.179256470097221, + "rewards/rejected": -5.363137496145148, + "step": 328 + }, + { + "epoch": 0.121452632550413, + "grad_norm": 7.84375, + "kl": 0.8902816772460938, + "learning_rate": 9.715446078304946e-06, + "logits/chosen": 257811392.0, + "logits/rejected": 372637600.0, + "logps/chosen": -355.8140869140625, + "logps/rejected": -510.0860900878906, + "loss": 0.1337, + "rewards/chosen": 1.577518343925476, + "rewards/margins": 7.331411719322205, + "rewards/rejected": -5.7538933753967285, + "step": 329 + }, + { + "epoch": 0.1218217894882562, + "grad_norm": 7.53125, + "kl": 0.0, + "learning_rate": 9.713485703407732e-06, + "logits/chosen": 403173649.06666666, + "logits/rejected": 246202654.11764705, + "logps/chosen": -434.7783203125, + "logps/rejected": -406.12405215992646, + "loss": 0.1134, + "rewards/chosen": 1.5502545674641928, + "rewards/margins": 7.968171706854128, + "rewards/rejected": -6.417917139389935, + "step": 330 + }, + { + "epoch": 0.1221909464260994, + "grad_norm": 6.03125, + "kl": 0.0, + "learning_rate": 9.71151879803596e-06, + "logits/chosen": 256802176.0, + "logits/rejected": 281814677.3333333, + "logps/chosen": -274.77764892578125, + "logps/rejected": -421.7360432942708, + "loss": 0.0969, + "rewards/chosen": 1.4513224363327026, + "rewards/margins": 6.544086654980977, + "rewards/rejected": -5.092764218648274, + "step": 331 + }, + { + "epoch": 0.1225601033639426, + "grad_norm": 6.5625, + "kl": 0.0, + "learning_rate": 9.709545364914754e-06, + "logits/chosen": 358819876.5714286, + "logits/rejected": 254632334.2222222, + "logps/chosen": -355.007080078125, + "logps/rejected": -434.6929524739583, + "loss": 0.1109, + "rewards/chosen": 1.6219208581107003, + "rewards/margins": 6.454940538557748, + "rewards/rejected": -4.833019680447048, + "step": 332 + }, + { + "epoch": 0.1229292603017858, + "grad_norm": 6.46875, + "kl": 0.0, + "learning_rate": 9.707565406778277e-06, + "logits/chosen": 366777270.85714287, + "logits/rejected": 304572899.5555556, + "logps/chosen": -376.780517578125, + "logps/rejected": -418.2180989583333, + "loss": 0.1087, + "rewards/chosen": 1.9535281317574638, + "rewards/margins": 7.0968090087648426, + "rewards/rejected": -5.143280877007379, + "step": 333 + }, + { + "epoch": 0.12329841723962899, + "grad_norm": 8.125, + "kl": 0.0, + "learning_rate": 9.70557892636974e-06, + "logits/chosen": 332240323.7647059, + "logits/rejected": 313597098.6666667, + "logps/chosen": -384.7532169117647, + "logps/rejected": -481.62392578125, + "loss": 0.1442, + "rewards/chosen": 1.2593172858743107, + "rewards/margins": 6.104841374415978, + "rewards/rejected": -4.845524088541667, + "step": 334 + }, + { + "epoch": 0.12366757417747219, + "grad_norm": 8.1875, + "kl": 0.7468481063842773, + "learning_rate": 9.703585926441383e-06, + "logits/chosen": 319313111.57894737, + "logits/rejected": 301624300.3076923, + "logps/chosen": -465.3569592927632, + "logps/rejected": -489.9001277043269, + "loss": 0.1398, + "rewards/chosen": 1.7998207493832237, + "rewards/margins": 6.56924137316252, + "rewards/rejected": -4.769420623779297, + "step": 335 + }, + { + "epoch": 0.1240367311153154, + "grad_norm": 6.15625, + "kl": 0.0, + "learning_rate": 9.701586409754484e-06, + "logits/chosen": 229870158.76923078, + "logits/rejected": 276956752.84210527, + "logps/chosen": -296.02518404447113, + "logps/rejected": -449.96474095394734, + "loss": 0.1261, + "rewards/chosen": 1.5754977006178637, + "rewards/margins": 6.90237188146182, + "rewards/rejected": -5.326874180843956, + "step": 336 + }, + { + "epoch": 0.1244058880531586, + "grad_norm": 5.84375, + "kl": 0.0, + "learning_rate": 9.699580379079348e-06, + "logits/chosen": 283128685.71428573, + "logits/rejected": 283609258.6666667, + "logps/chosen": -389.39536830357144, + "logps/rejected": -480.9720052083333, + "loss": 0.0788, + "rewards/chosen": 2.3266168321881975, + "rewards/margins": 7.795124386984204, + "rewards/rejected": -5.468507554796007, + "step": 337 + }, + { + "epoch": 0.1247750449910018, + "grad_norm": 4.65625, + "kl": 0.0, + "learning_rate": 9.697567837195303e-06, + "logits/chosen": 232332032.0, + "logits/rejected": 295556494.2222222, + "logps/chosen": -292.5155726841518, + "logps/rejected": -482.90049913194446, + "loss": 0.0792, + "rewards/chosen": 2.7382104056222096, + "rewards/margins": 7.819018833220952, + "rewards/rejected": -5.0808084275987415, + "step": 338 + }, + { + "epoch": 0.125144201928845, + "grad_norm": 5.5, + "kl": 0.0, + "learning_rate": 9.695548786890701e-06, + "logits/chosen": 386158961.7777778, + "logits/rejected": 421513362.28571427, + "logps/chosen": -351.69878472222223, + "logps/rejected": -582.1599469866071, + "loss": 0.0989, + "rewards/chosen": 2.197071287367079, + "rewards/margins": 8.55944463941786, + "rewards/rejected": -6.362373352050781, + "step": 339 + }, + { + "epoch": 0.125144201928845, + "eval_kl": 0.3058510720729828, + "eval_logits/chosen": 296557832.4768212, + "eval_logits/rejected": 256327738.0992908, + "eval_logps/chosen": -363.73961782560707, + "eval_logps/rejected": -427.5158096926714, + "eval_loss": 0.12238284200429916, + "eval_rewards/chosen": 1.9805920329314983, + "eval_rewards/margins": 6.906813659519855, + "eval_rewards/rejected": -4.926221626588357, + "eval_runtime": 46.6093, + "eval_samples_per_second": 18.795, + "eval_steps_per_second": 4.699, + "step": 339 + }, + { + "epoch": 0.1255133588666882, + "grad_norm": 4.65625, + "kl": 0.0, + "learning_rate": 9.693523230962914e-06, + "logits/chosen": 302060608.0, + "logits/rejected": 219931328.0, + "logps/chosen": -316.9400634765625, + "logps/rejected": -387.8214416503906, + "loss": 0.1167, + "rewards/chosen": 2.1675126552581787, + "rewards/margins": 7.494145631790161, + "rewards/rejected": -5.326632976531982, + "step": 340 + }, + { + "epoch": 0.12588251580453141, + "grad_norm": 8.0625, + "kl": 0.3138308525085449, + "learning_rate": 9.691491172218318e-06, + "logits/chosen": 291169696.0, + "logits/rejected": 216062880.0, + "logps/chosen": -346.2454833984375, + "logps/rejected": -262.3503723144531, + "loss": 0.1568, + "rewards/chosen": 1.211580753326416, + "rewards/margins": 5.060364007949829, + "rewards/rejected": -3.848783254623413, + "step": 341 + }, + { + "epoch": 0.12625167274237462, + "grad_norm": 6.3125, + "kl": 0.0, + "learning_rate": 9.689452613472309e-06, + "logits/chosen": 279381191.1111111, + "logits/rejected": 184942372.57142857, + "logps/chosen": -311.42738172743054, + "logps/rejected": -367.51736886160717, + "loss": 0.1157, + "rewards/chosen": 2.0281825595431857, + "rewards/margins": 7.001857333713108, + "rewards/rejected": -4.973674774169922, + "step": 342 + }, + { + "epoch": 0.1266208296802178, + "grad_norm": 5.875, + "kl": 0.0, + "learning_rate": 9.687407557549282e-06, + "logits/chosen": 298712832.0, + "logits/rejected": 350737600.0, + "logps/chosen": -359.62353515625, + "logps/rejected": -502.478515625, + "loss": 0.0925, + "rewards/chosen": 2.3487753868103027, + "rewards/margins": 7.222583293914795, + "rewards/rejected": -4.873807907104492, + "step": 343 + }, + { + "epoch": 0.126989986618061, + "grad_norm": 5.375, + "kl": 0.0, + "learning_rate": 9.685356007282639e-06, + "logits/chosen": 252870144.0, + "logits/rejected": 193254553.6, + "logps/chosen": -404.8876091452206, + "logps/rejected": -360.8185546875, + "loss": 0.0906, + "rewards/chosen": 2.1898828394272747, + "rewards/margins": 7.935839260325713, + "rewards/rejected": -5.7459564208984375, + "step": 344 + }, + { + "epoch": 0.1273591435559042, + "grad_norm": 10.0, + "kl": 0.0, + "learning_rate": 9.683297965514774e-06, + "logits/chosen": 185748359.52941176, + "logits/rejected": 299615880.53333336, + "logps/chosen": -381.0862247242647, + "logps/rejected": -331.22822265625, + "loss": 0.1288, + "rewards/chosen": 2.2207013298483456, + "rewards/margins": 6.468247462253945, + "rewards/rejected": -4.247546132405599, + "step": 345 + }, + { + "epoch": 0.1277283004937474, + "grad_norm": 7.34375, + "kl": 0.0, + "learning_rate": 9.681233435097078e-06, + "logits/chosen": 311048320.0, + "logits/rejected": 196840426.66666666, + "logps/chosen": -400.254833984375, + "logps/rejected": -472.3328043619792, + "loss": 0.1366, + "rewards/chosen": 1.739175796508789, + "rewards/margins": 7.512531280517578, + "rewards/rejected": -5.773355484008789, + "step": 346 + }, + { + "epoch": 0.1280974574315906, + "grad_norm": 3.15625, + "kl": 0.0, + "learning_rate": 9.679162418889932e-06, + "logits/chosen": 282210329.6, + "logits/rejected": 265046295.27272728, + "logps/chosen": -397.0747314453125, + "logps/rejected": -459.17795632102275, + "loss": 0.0403, + "rewards/chosen": 2.5627490997314455, + "rewards/margins": 8.445556744662198, + "rewards/rejected": -5.882807644930753, + "step": 347 + }, + { + "epoch": 0.1284666143694338, + "grad_norm": 7.34375, + "kl": 0.07346868515014648, + "learning_rate": 9.677084919762703e-06, + "logits/chosen": 290337945.6, + "logits/rejected": 315902485.3333333, + "logps/chosen": -421.1623046875, + "logps/rejected": -462.0775553385417, + "loss": 0.1476, + "rewards/chosen": 1.5664203643798829, + "rewards/margins": 6.787648646036784, + "rewards/rejected": -5.221228281656901, + "step": 348 + }, + { + "epoch": 0.128835771307277, + "grad_norm": 7.03125, + "kl": 0.03321552276611328, + "learning_rate": 9.675000940593738e-06, + "logits/chosen": 234521059.55555555, + "logits/rejected": 238920064.0, + "logps/chosen": -358.06841362847223, + "logps/rejected": -441.4900599888393, + "loss": 0.1287, + "rewards/chosen": 1.8145455254448786, + "rewards/margins": 7.0673019166976685, + "rewards/rejected": -5.25275639125279, + "step": 349 + }, + { + "epoch": 0.1292049282451202, + "grad_norm": 6.09375, + "kl": 0.04551982879638672, + "learning_rate": 9.672910484270367e-06, + "logits/chosen": 317492256.0, + "logits/rejected": 368276672.0, + "logps/chosen": -371.986572265625, + "logps/rejected": -390.28826904296875, + "loss": 0.0947, + "rewards/chosen": 2.0842208862304688, + "rewards/margins": 7.650793075561523, + "rewards/rejected": -5.566572189331055, + "step": 350 + }, + { + "epoch": 0.1295740851829634, + "grad_norm": 6.0, + "kl": 0.0, + "learning_rate": 9.670813553688888e-06, + "logits/chosen": 353806404.26666665, + "logits/rejected": 246404005.6470588, + "logps/chosen": -388.5052734375, + "logps/rejected": -521.5241268382352, + "loss": 0.0856, + "rewards/chosen": 2.0199000040690103, + "rewards/margins": 7.788184521245022, + "rewards/rejected": -5.768284517176011, + "step": 351 + }, + { + "epoch": 0.1299432421208066, + "grad_norm": 5.9375, + "kl": 0.07115840911865234, + "learning_rate": 9.668710151754572e-06, + "logits/chosen": 267330579.69230768, + "logits/rejected": 252685150.31578946, + "logps/chosen": -346.66225961538464, + "logps/rejected": -464.56178042763156, + "loss": 0.089, + "rewards/chosen": 2.5139559232271633, + "rewards/margins": 7.289638472954754, + "rewards/rejected": -4.77568254972759, + "step": 352 + }, + { + "epoch": 0.1303123990586498, + "grad_norm": 7.5625, + "kl": 0.0, + "learning_rate": 9.666600281381657e-06, + "logits/chosen": 215329728.0, + "logits/rejected": 402952000.0, + "logps/chosen": -329.8309326171875, + "logps/rejected": -328.46392822265625, + "loss": 0.1254, + "rewards/chosen": 1.9891334772109985, + "rewards/margins": 6.531978249549866, + "rewards/rejected": -4.542844772338867, + "step": 353 + }, + { + "epoch": 0.130681555996493, + "grad_norm": 5.34375, + "kl": 0.0, + "learning_rate": 9.664483945493342e-06, + "logits/chosen": 248813115.07692307, + "logits/rejected": 294765244.6315789, + "logps/chosen": -258.64806189903845, + "logps/rejected": -362.8381990131579, + "loss": 0.1007, + "rewards/chosen": 2.1083529545710635, + "rewards/margins": 5.9990956078656765, + "rewards/rejected": -3.8907426532946134, + "step": 354 + }, + { + "epoch": 0.1310507129343362, + "grad_norm": 7.0, + "kl": 0.0, + "learning_rate": 9.66236114702178e-06, + "logits/chosen": 239671324.44444445, + "logits/rejected": 221293933.7142857, + "logps/chosen": -308.2656521267361, + "logps/rejected": -408.92919921875, + "loss": 0.1329, + "rewards/chosen": 2.6674777136908636, + "rewards/margins": 6.983458897424122, + "rewards/rejected": -4.315981183733259, + "step": 355 + }, + { + "epoch": 0.1314198698721794, + "grad_norm": 6.625, + "kl": 1.1833086013793945, + "learning_rate": 9.660231888908085e-06, + "logits/chosen": 208685251.04761904, + "logits/rejected": 195325602.9090909, + "logps/chosen": -271.44015066964283, + "logps/rejected": -379.70339133522725, + "loss": 0.1645, + "rewards/chosen": 1.846649169921875, + "rewards/margins": 6.26857480135831, + "rewards/rejected": -4.421925631436435, + "step": 356 + }, + { + "epoch": 0.1317890268100226, + "grad_norm": 5.78125, + "kl": 2.21799898147583, + "learning_rate": 9.658096174102314e-06, + "logits/chosen": 262908800.0, + "logits/rejected": 322849280.0, + "logps/chosen": -404.6955261230469, + "logps/rejected": -362.8702697753906, + "loss": 0.1065, + "rewards/chosen": 2.2610950469970703, + "rewards/margins": 7.315790176391602, + "rewards/rejected": -5.054695129394531, + "step": 357 + }, + { + "epoch": 0.1321581837478658, + "grad_norm": 6.4375, + "kl": 0.0, + "learning_rate": 9.655954005563475e-06, + "logits/chosen": 322066212.5714286, + "logits/rejected": 244183096.8888889, + "logps/chosen": -368.76077706473217, + "logps/rejected": -467.7179904513889, + "loss": 0.0907, + "rewards/chosen": 1.8358097076416016, + "rewards/margins": 7.304883533053928, + "rewards/rejected": -5.469073825412327, + "step": 358 + }, + { + "epoch": 0.132527340685709, + "grad_norm": 5.65625, + "kl": 0.0, + "learning_rate": 9.653805386259514e-06, + "logits/chosen": 234148352.0, + "logits/rejected": 236890316.8, + "logps/chosen": -317.5182100183824, + "logps/rejected": -436.39576822916666, + "loss": 0.1049, + "rewards/chosen": 2.557288450353286, + "rewards/margins": 7.734292437983495, + "rewards/rejected": -5.177003987630209, + "step": 359 + }, + { + "epoch": 0.1328964976235522, + "grad_norm": 4.84375, + "kl": 0.0, + "learning_rate": 9.651650319167318e-06, + "logits/chosen": 186530432.0, + "logits/rejected": 214931057.7777778, + "logps/chosen": -249.89754813058036, + "logps/rejected": -438.92711046006946, + "loss": 0.0886, + "rewards/chosen": 2.3764727456229076, + "rewards/margins": 7.825192194136362, + "rewards/rejected": -5.4487194485134545, + "step": 360 + }, + { + "epoch": 0.1332656545613954, + "grad_norm": 7.0625, + "kl": 0.0, + "learning_rate": 9.649488807272703e-06, + "logits/chosen": 307174976.0, + "logits/rejected": 304003680.0, + "logps/chosen": -414.68701171875, + "logps/rejected": -399.31011962890625, + "loss": 0.1475, + "rewards/chosen": 1.8143943548202515, + "rewards/margins": 6.633835196495056, + "rewards/rejected": -4.819440841674805, + "step": 361 + }, + { + "epoch": 0.13363481149923861, + "grad_norm": 6.03125, + "kl": 0.4556403160095215, + "learning_rate": 9.647320853570415e-06, + "logits/chosen": 273277280.0, + "logits/rejected": 292123872.0, + "logps/chosen": -346.0985107421875, + "logps/rejected": -388.2326965332031, + "loss": 0.0867, + "rewards/chosen": 2.707688808441162, + "rewards/margins": 7.914984703063965, + "rewards/rejected": -5.207295894622803, + "step": 362 + }, + { + "epoch": 0.13400396843708182, + "grad_norm": 5.84375, + "kl": 0.0, + "learning_rate": 9.645146461064129e-06, + "logits/chosen": 457766610.8235294, + "logits/rejected": 240866884.26666668, + "logps/chosen": -332.52404067095586, + "logps/rejected": -473.6763671875, + "loss": 0.1483, + "rewards/chosen": 1.6503498974968405, + "rewards/margins": 6.983688825719497, + "rewards/rejected": -5.333338928222656, + "step": 363 + }, + { + "epoch": 0.13437312537492502, + "grad_norm": 7.0625, + "kl": 0.0, + "learning_rate": 9.642965632766437e-06, + "logits/chosen": 211187273.14285713, + "logits/rejected": 183762901.33333334, + "logps/chosen": -298.66831752232144, + "logps/rejected": -275.602294921875, + "loss": 0.1331, + "rewards/chosen": 1.3281547001429967, + "rewards/margins": 5.648894870091998, + "rewards/rejected": -4.320740169949001, + "step": 364 + }, + { + "epoch": 0.13474228231276822, + "grad_norm": 6.15625, + "kl": 0.0, + "learning_rate": 9.64077837169885e-06, + "logits/chosen": 302319917.1764706, + "logits/rejected": 240145851.73333332, + "logps/chosen": -390.12327665441177, + "logps/rejected": -434.79772135416664, + "loss": 0.1119, + "rewards/chosen": 2.1586265563964844, + "rewards/margins": 7.862704213460287, + "rewards/rejected": -5.704077657063802, + "step": 365 + }, + { + "epoch": 0.13511143925061142, + "grad_norm": 5.1875, + "kl": 0.0, + "learning_rate": 9.638584680891787e-06, + "logits/chosen": 217033294.76923078, + "logits/rejected": 291944043.7894737, + "logps/chosen": -298.38269981971155, + "logps/rejected": -474.4635074013158, + "loss": 0.1029, + "rewards/chosen": 2.5579745952899637, + "rewards/margins": 8.236469098913524, + "rewards/rejected": -5.678494503623561, + "step": 366 + }, + { + "epoch": 0.13548059618845462, + "grad_norm": 5.6875, + "kl": 0.13333559036254883, + "learning_rate": 9.636384563384584e-06, + "logits/chosen": 323313603.7647059, + "logits/rejected": 383403315.2, + "logps/chosen": -306.75163717830884, + "logps/rejected": -389.75071614583334, + "loss": 0.1179, + "rewards/chosen": 2.1439451329848347, + "rewards/margins": 7.581081973805148, + "rewards/rejected": -5.437136840820313, + "step": 367 + }, + { + "epoch": 0.13584975312629782, + "grad_norm": 6.1875, + "kl": 0.02844715118408203, + "learning_rate": 9.634178022225474e-06, + "logits/chosen": 311796766.11764705, + "logits/rejected": 232878182.4, + "logps/chosen": -366.01295381433823, + "logps/rejected": -391.98522135416664, + "loss": 0.1265, + "rewards/chosen": 1.7345584420596851, + "rewards/margins": 6.665622793459425, + "rewards/rejected": -4.93106435139974, + "step": 368 + }, + { + "epoch": 0.13621891006414102, + "grad_norm": 5.875, + "kl": 0.0, + "learning_rate": 9.631965060471587e-06, + "logits/chosen": 255958287.05882353, + "logits/rejected": 243069525.33333334, + "logps/chosen": -300.0275448069853, + "logps/rejected": -472.3465169270833, + "loss": 0.0983, + "rewards/chosen": 2.1819076538085938, + "rewards/margins": 8.322706604003907, + "rewards/rejected": -6.140798950195313, + "step": 369 + }, + { + "epoch": 0.13658806700198423, + "grad_norm": 4.84375, + "kl": 0.0, + "learning_rate": 9.62974568118896e-06, + "logits/chosen": 321804128.0, + "logits/rejected": 257965568.0, + "logps/chosen": -388.5257568359375, + "logps/rejected": -422.61767578125, + "loss": 0.0662, + "rewards/chosen": 1.9005005359649658, + "rewards/margins": 7.457368771235148, + "rewards/rejected": -5.556868235270183, + "step": 370 + }, + { + "epoch": 0.13695722393982743, + "grad_norm": 5.21875, + "kl": 0.0, + "learning_rate": 9.627519887452512e-06, + "logits/chosen": 221193808.0, + "logits/rejected": 278764448.0, + "logps/chosen": -372.63153076171875, + "logps/rejected": -404.45550537109375, + "loss": 0.0901, + "rewards/chosen": 2.3672144412994385, + "rewards/margins": 7.656309366226196, + "rewards/rejected": -5.289094924926758, + "step": 371 + }, + { + "epoch": 0.13732638087767063, + "grad_norm": 5.1875, + "kl": 0.15140581130981445, + "learning_rate": 9.625287682346051e-06, + "logits/chosen": 166844450.13333333, + "logits/rejected": 288486731.2941176, + "logps/chosen": -348.64475911458334, + "logps/rejected": -425.8829561121324, + "loss": 0.0943, + "rewards/chosen": 2.78929443359375, + "rewards/margins": 8.240407517377069, + "rewards/rejected": -5.451113083783318, + "step": 372 + }, + { + "epoch": 0.13769553781551383, + "grad_norm": 6.875, + "kl": 0.44831085205078125, + "learning_rate": 9.62304906896227e-06, + "logits/chosen": 274243584.0, + "logits/rejected": 371939986.28571427, + "logps/chosen": -379.6617838541667, + "logps/rejected": -412.4147251674107, + "loss": 0.0908, + "rewards/chosen": 2.3740656110975475, + "rewards/margins": 7.546873577057369, + "rewards/rejected": -5.172807965959821, + "step": 373 + }, + { + "epoch": 0.13806469475335703, + "grad_norm": 10.1875, + "kl": 0.24500751495361328, + "learning_rate": 9.620804050402738e-06, + "logits/chosen": 322827264.0, + "logits/rejected": 348556361.14285713, + "logps/chosen": -340.29296875, + "logps/rejected": -393.86729213169644, + "loss": 0.1687, + "rewards/chosen": 1.4711328082614474, + "rewards/margins": 6.350449410695878, + "rewards/rejected": -4.8793166024344305, + "step": 374 + }, + { + "epoch": 0.13843385169120023, + "grad_norm": 8.625, + "kl": 2.158534526824951, + "learning_rate": 9.618552629777904e-06, + "logits/chosen": 238371061.76, + "logits/rejected": 309618797.71428573, + "logps/chosen": -348.7252734375, + "logps/rejected": -267.11685616629467, + "loss": 0.2019, + "rewards/chosen": 1.9250161743164063, + "rewards/margins": 5.7909949602399555, + "rewards/rejected": -3.865978785923549, + "step": 375 + }, + { + "epoch": 0.13880300862904343, + "grad_norm": 6.6875, + "kl": 0.0, + "learning_rate": 9.616294810207077e-06, + "logits/chosen": 239504042.66666666, + "logits/rejected": 213159544.47058824, + "logps/chosen": -250.92516276041667, + "logps/rejected": -316.5790153952206, + "loss": 0.151, + "rewards/chosen": 1.9627112070719401, + "rewards/margins": 5.806253418267942, + "rewards/rejected": -3.843542211196002, + "step": 376 + }, + { + "epoch": 0.13917216556688664, + "grad_norm": 6.84375, + "kl": 0.03418874740600586, + "learning_rate": 9.61403059481844e-06, + "logits/chosen": 297559398.4, + "logits/rejected": 179879338.66666666, + "logps/chosen": -333.0562255859375, + "logps/rejected": -362.36181640625, + "loss": 0.1537, + "rewards/chosen": 2.3532806396484376, + "rewards/margins": 6.801932589213054, + "rewards/rejected": -4.448651949564616, + "step": 377 + }, + { + "epoch": 0.13954132250472984, + "grad_norm": 5.8125, + "kl": 0.5580158233642578, + "learning_rate": 9.611759986749036e-06, + "logits/chosen": 397703733.8947368, + "logits/rejected": 270759049.84615386, + "logps/chosen": -387.4747892680921, + "logps/rejected": -374.92724609375, + "loss": 0.097, + "rewards/chosen": 2.4318275451660156, + "rewards/margins": 6.924285888671875, + "rewards/rejected": -4.492458343505859, + "step": 378 + }, + { + "epoch": 0.139910479442573, + "grad_norm": 6.6875, + "kl": 0.0, + "learning_rate": 9.60948298914476e-06, + "logits/chosen": 271444960.0, + "logits/rejected": 255748128.0, + "logps/chosen": -394.79315185546875, + "logps/rejected": -459.151123046875, + "loss": 0.1011, + "rewards/chosen": 2.242133617401123, + "rewards/margins": 8.21304988861084, + "rewards/rejected": -5.970916271209717, + "step": 379 + }, + { + "epoch": 0.1402796363804162, + "grad_norm": 8.5625, + "kl": 0.566807746887207, + "learning_rate": 9.607199605160367e-06, + "logits/chosen": 291323723.2941176, + "logits/rejected": 199482163.2, + "logps/chosen": -433.36980124080884, + "logps/rejected": -367.93011067708335, + "loss": 0.1578, + "rewards/chosen": 1.3059402914608227, + "rewards/margins": 6.534701074338427, + "rewards/rejected": -5.228760782877604, + "step": 380 + }, + { + "epoch": 0.1406487933182594, + "grad_norm": 6.46875, + "kl": 0.0, + "learning_rate": 9.604909837959456e-06, + "logits/chosen": 279671756.8, + "logits/rejected": 219192440.47058824, + "logps/chosen": -440.92272135416664, + "logps/rejected": -342.32683249080884, + "loss": 0.1084, + "rewards/chosen": 2.196954091389974, + "rewards/margins": 7.0462915009143305, + "rewards/rejected": -4.849337409524357, + "step": 381 + }, + { + "epoch": 0.14101795025610261, + "grad_norm": 7.0, + "kl": 0.1768054962158203, + "learning_rate": 9.602613690714468e-06, + "logits/chosen": 319243616.0, + "logits/rejected": 373890272.0, + "logps/chosen": -354.90606689453125, + "logps/rejected": -580.3153686523438, + "loss": 0.1122, + "rewards/chosen": 2.261608123779297, + "rewards/margins": 9.107210636138916, + "rewards/rejected": -6.845602512359619, + "step": 382 + }, + { + "epoch": 0.14138710719394582, + "grad_norm": 4.9375, + "kl": 0.06438732147216797, + "learning_rate": 9.600311166606687e-06, + "logits/chosen": 195652215.46666667, + "logits/rejected": 337334362.35294116, + "logps/chosen": -302.76865234375, + "logps/rejected": -416.13229549632354, + "loss": 0.1096, + "rewards/chosen": 2.249004109700521, + "rewards/margins": 6.898670391007966, + "rewards/rejected": -4.649666281307445, + "step": 383 + }, + { + "epoch": 0.14175626413178902, + "grad_norm": 6.78125, + "kl": 0.35001659393310547, + "learning_rate": 9.59800226882623e-06, + "logits/chosen": 283369280.0, + "logits/rejected": 213697040.0, + "logps/chosen": -370.1119689941406, + "logps/rejected": -371.980712890625, + "loss": 0.1427, + "rewards/chosen": 1.7972640991210938, + "rewards/margins": 6.698160171508789, + "rewards/rejected": -4.900896072387695, + "step": 384 + }, + { + "epoch": 0.14212542106963222, + "grad_norm": 8.8125, + "kl": 0.0, + "learning_rate": 9.595687000572049e-06, + "logits/chosen": 310046369.68421054, + "logits/rejected": 246655172.92307693, + "logps/chosen": -409.0834189967105, + "logps/rejected": -462.56595552884613, + "loss": 0.1439, + "rewards/chosen": 1.8485420628597862, + "rewards/margins": 7.466230755392838, + "rewards/rejected": -5.6176886925330525, + "step": 385 + }, + { + "epoch": 0.14249457800747542, + "grad_norm": 6.90625, + "kl": 0.0, + "learning_rate": 9.593365365051915e-06, + "logits/chosen": 305573171.2, + "logits/rejected": 279050119.5294118, + "logps/chosen": -364.37718098958334, + "logps/rejected": -399.4931066176471, + "loss": 0.1176, + "rewards/chosen": 2.081070963541667, + "rewards/margins": 6.980310447543275, + "rewards/rejected": -4.899239484001608, + "step": 386 + }, + { + "epoch": 0.14286373494531862, + "grad_norm": 9.9375, + "kl": 0.0, + "learning_rate": 9.591037365482424e-06, + "logits/chosen": 269426151.61904764, + "logits/rejected": 342887819.6363636, + "logps/chosen": -302.23214285714283, + "logps/rejected": -505.97145774147725, + "loss": 0.2232, + "rewards/chosen": 1.1202251797630673, + "rewards/margins": 7.2554541170855105, + "rewards/rejected": -6.135228937322443, + "step": 387 + }, + { + "epoch": 0.14323289188316182, + "grad_norm": 5.71875, + "kl": 0.0, + "learning_rate": 9.588703005088994e-06, + "logits/chosen": 270651501.71428573, + "logits/rejected": 241479168.0, + "logps/chosen": -351.89536830357144, + "logps/rejected": -436.7297092013889, + "loss": 0.1054, + "rewards/chosen": 2.3149403163364957, + "rewards/margins": 7.006067669580854, + "rewards/rejected": -4.691127353244358, + "step": 388 + }, + { + "epoch": 0.14360204882100502, + "grad_norm": 5.875, + "kl": 0.0, + "learning_rate": 9.58636228710585e-06, + "logits/chosen": 262592960.0, + "logits/rejected": 293187763.2, + "logps/chosen": -303.02227783203125, + "logps/rejected": -351.029248046875, + "loss": 0.1108, + "rewards/chosen": 1.8170806566874187, + "rewards/margins": 6.218141523996989, + "rewards/rejected": -4.40106086730957, + "step": 389 + }, + { + "epoch": 0.14397120575884823, + "grad_norm": 6.125, + "kl": 0.0, + "learning_rate": 9.584015214776025e-06, + "logits/chosen": 241902245.6470588, + "logits/rejected": 298369297.06666666, + "logps/chosen": -361.03708065257354, + "logps/rejected": -379.96083984375, + "loss": 0.0974, + "rewards/chosen": 2.469573077033548, + "rewards/margins": 6.746836972704121, + "rewards/rejected": -4.277263895670573, + "step": 390 + }, + { + "epoch": 0.14434036269669143, + "grad_norm": 8.5, + "kl": 0.0, + "learning_rate": 9.58166179135136e-06, + "logits/chosen": 370010180.26666665, + "logits/rejected": 320474232.4705882, + "logps/chosen": -488.97747395833335, + "logps/rejected": -434.6992761948529, + "loss": 0.1329, + "rewards/chosen": 1.7900536855061848, + "rewards/margins": 6.402845307892444, + "rewards/rejected": -4.6127916223862595, + "step": 391 + }, + { + "epoch": 0.14470951963453463, + "grad_norm": 8.625, + "kl": 0.047497332096099854, + "learning_rate": 9.579302020092491e-06, + "logits/chosen": 274919606.85714287, + "logits/rejected": 243244600.8888889, + "logps/chosen": -315.992919921875, + "logps/rejected": -439.6481119791667, + "loss": 0.1399, + "rewards/chosen": 1.0886754308428084, + "rewards/margins": 6.536679835546584, + "rewards/rejected": -5.448004404703776, + "step": 392 + }, + { + "epoch": 0.14507867657237783, + "grad_norm": 6.03125, + "kl": 0.29087162017822266, + "learning_rate": 9.576935904268853e-06, + "logits/chosen": 298953824.0, + "logits/rejected": 313519936.0, + "logps/chosen": -374.343017578125, + "logps/rejected": -439.4222412109375, + "loss": 0.107, + "rewards/chosen": 2.042370319366455, + "rewards/margins": 7.74931001663208, + "rewards/rejected": -5.706939697265625, + "step": 393 + }, + { + "epoch": 0.14544783351022103, + "grad_norm": 6.0, + "kl": 1.2467050552368164, + "learning_rate": 9.574563447158671e-06, + "logits/chosen": 305277558.15384614, + "logits/rejected": 323834880.0, + "logps/chosen": -399.2101487379808, + "logps/rejected": -345.2930972450658, + "loss": 0.0858, + "rewards/chosen": 2.4697142380934496, + "rewards/margins": 7.334943392981402, + "rewards/rejected": -4.8652291548879525, + "step": 394 + }, + { + "epoch": 0.14581699044806423, + "grad_norm": 7.03125, + "kl": 0.4538395404815674, + "learning_rate": 9.57218465204895e-06, + "logits/chosen": 282699648.0, + "logits/rejected": 263705344.0, + "logps/chosen": -367.387939453125, + "logps/rejected": -379.5290934244792, + "loss": 0.1667, + "rewards/chosen": 1.9326324462890625, + "rewards/margins": 6.6468353271484375, + "rewards/rejected": -4.714202880859375, + "step": 395 + }, + { + "epoch": 0.14618614738590743, + "grad_norm": 6.71875, + "kl": 0.0, + "learning_rate": 9.569799522235484e-06, + "logits/chosen": 242691002.1818182, + "logits/rejected": 346121830.4, + "logps/chosen": -321.68803267045456, + "logps/rejected": -477.962744140625, + "loss": 0.1249, + "rewards/chosen": 2.1649402271617544, + "rewards/margins": 9.00930210460316, + "rewards/rejected": -6.844361877441406, + "step": 396 + }, + { + "epoch": 0.14655530432375063, + "grad_norm": 8.0, + "kl": 0.35939931869506836, + "learning_rate": 9.567408061022838e-06, + "logits/chosen": 357195360.0, + "logits/rejected": 283995584.0, + "logps/chosen": -403.69085693359375, + "logps/rejected": -458.83349609375, + "loss": 0.1408, + "rewards/chosen": 1.5763146877288818, + "rewards/margins": 6.790769338607788, + "rewards/rejected": -5.214454650878906, + "step": 397 + }, + { + "epoch": 0.14692446126159384, + "grad_norm": 7.21875, + "kl": 0.0, + "learning_rate": 9.565010271724353e-06, + "logits/chosen": 342959672.8888889, + "logits/rejected": 187049600.0, + "logps/chosen": -339.0373263888889, + "logps/rejected": -362.8732212611607, + "loss": 0.1393, + "rewards/chosen": 1.7913449605305989, + "rewards/margins": 7.220369793119884, + "rewards/rejected": -5.429024832589286, + "step": 398 + }, + { + "epoch": 0.14729361819943704, + "grad_norm": 7.59375, + "kl": 1.372391700744629, + "learning_rate": 9.562606157662132e-06, + "logits/chosen": 362078748.4444444, + "logits/rejected": 281582262.85714287, + "logps/chosen": -423.21674262152777, + "logps/rejected": -499.27235630580356, + "loss": 0.1441, + "rewards/chosen": 1.9018546210394964, + "rewards/margins": 7.34707992795914, + "rewards/rejected": -5.445225306919643, + "step": 399 + }, + { + "epoch": 0.14766277513728024, + "grad_norm": 6.125, + "kl": 0.0, + "learning_rate": 9.56019572216705e-06, + "logits/chosen": 234631443.69230768, + "logits/rejected": 202293760.0, + "logps/chosen": -378.0862379807692, + "logps/rejected": -399.0844212582237, + "loss": 0.1028, + "rewards/chosen": 1.9243479508620043, + "rewards/margins": 7.01773575539531, + "rewards/rejected": -5.093387804533306, + "step": 400 + }, + { + "epoch": 0.14803193207512344, + "grad_norm": 7.125, + "kl": 1.362654685974121, + "learning_rate": 9.557778968578728e-06, + "logits/chosen": 205160738.13333333, + "logits/rejected": 415112192.0, + "logps/chosen": -351.52060546875, + "logps/rejected": -494.3483455882353, + "loss": 0.1147, + "rewards/chosen": 2.085920715332031, + "rewards/margins": 7.789873280244715, + "rewards/rejected": -5.703952564912684, + "step": 401 + }, + { + "epoch": 0.14840108901296664, + "grad_norm": 6.65625, + "kl": 0.0, + "learning_rate": 9.555355900245553e-06, + "logits/chosen": 216040379.73333332, + "logits/rejected": 314316227.7647059, + "logps/chosen": -326.44420572916664, + "logps/rejected": -464.0573299632353, + "loss": 0.1098, + "rewards/chosen": 1.91368776957194, + "rewards/margins": 7.430783896352731, + "rewards/rejected": -5.517096126780791, + "step": 402 + }, + { + "epoch": 0.14877024595080984, + "grad_norm": 7.21875, + "kl": 0.0, + "learning_rate": 9.552926520524654e-06, + "logits/chosen": 300476484.26666665, + "logits/rejected": 258580600.47058824, + "logps/chosen": -407.42493489583336, + "logps/rejected": -488.5114315257353, + "loss": 0.1232, + "rewards/chosen": 1.9989527384440104, + "rewards/margins": 7.56056458716299, + "rewards/rejected": -5.56161184871898, + "step": 403 + }, + { + "epoch": 0.14913940288865304, + "grad_norm": 5.0625, + "kl": 0.0, + "learning_rate": 9.550490832781905e-06, + "logits/chosen": 372691114.6666667, + "logits/rejected": 171448448.0, + "logps/chosen": -371.3794759114583, + "logps/rejected": -293.51220703125, + "loss": 0.0841, + "rewards/chosen": 2.642810344696045, + "rewards/margins": 7.4120707511901855, + "rewards/rejected": -4.769260406494141, + "step": 404 + }, + { + "epoch": 0.14950855982649625, + "grad_norm": 7.5625, + "kl": 1.825098991394043, + "learning_rate": 9.54804884039192e-06, + "logits/chosen": 265462067.2, + "logits/rejected": 325643029.3333333, + "logps/chosen": -353.009326171875, + "logps/rejected": -436.9382731119792, + "loss": 0.1637, + "rewards/chosen": 1.904961395263672, + "rewards/margins": 6.7379609425862625, + "rewards/rejected": -4.832999547322591, + "step": 405 + }, + { + "epoch": 0.14987771676433945, + "grad_norm": 6.28125, + "kl": 0.0, + "learning_rate": 9.545600546738047e-06, + "logits/chosen": 332452044.8, + "logits/rejected": 262157733.6470588, + "logps/chosen": -320.34296875, + "logps/rejected": -474.5977424172794, + "loss": 0.1175, + "rewards/chosen": 1.643945566813151, + "rewards/margins": 6.672582723580154, + "rewards/rejected": -5.028637156767004, + "step": 406 + }, + { + "epoch": 0.15024687370218265, + "grad_norm": 6.0625, + "kl": 0.0, + "learning_rate": 9.54314595521237e-06, + "logits/chosen": 257797686.85714287, + "logits/rejected": 236860188.44444445, + "logps/chosen": -427.32059151785717, + "logps/rejected": -402.5011393229167, + "loss": 0.0937, + "rewards/chosen": 2.2921289716448103, + "rewards/margins": 7.493867964971633, + "rewards/rejected": -5.201738993326823, + "step": 407 + }, + { + "epoch": 0.15061603064002585, + "grad_norm": 7.0625, + "kl": 0.0, + "learning_rate": 9.540685069215693e-06, + "logits/chosen": 195577941.33333334, + "logits/rejected": 181451504.94117647, + "logps/chosen": -321.0412109375, + "logps/rejected": -296.9152401194853, + "loss": 0.1344, + "rewards/chosen": 1.8047523498535156, + "rewards/margins": 6.470291362089269, + "rewards/rejected": -4.665539012235754, + "step": 408 + }, + { + "epoch": 0.15098518757786905, + "grad_norm": 5.0625, + "kl": 0.0, + "learning_rate": 9.53821789215754e-06, + "logits/chosen": 223603554.46153846, + "logits/rejected": 317918450.5263158, + "logps/chosen": -258.7773625300481, + "logps/rejected": -459.27040501644734, + "loss": 0.0865, + "rewards/chosen": 2.0098838806152344, + "rewards/margins": 8.268759677284642, + "rewards/rejected": -6.2588757966694075, + "step": 409 + }, + { + "epoch": 0.15135434451571225, + "grad_norm": 6.28125, + "kl": 0.0, + "learning_rate": 9.535744427456156e-06, + "logits/chosen": 282864864.0, + "logits/rejected": 272002976.0, + "logps/chosen": -434.949462890625, + "logps/rejected": -386.6890869140625, + "loss": 0.0774, + "rewards/chosen": 2.4693427085876465, + "rewards/margins": 7.339344501495361, + "rewards/rejected": -4.870001792907715, + "step": 410 + }, + { + "epoch": 0.15172350145355545, + "grad_norm": 7.875, + "kl": 0.8378276824951172, + "learning_rate": 9.533264678538493e-06, + "logits/chosen": 408183381.3333333, + "logits/rejected": 294605238.85714287, + "logps/chosen": -356.44151475694446, + "logps/rejected": -541.9028669084821, + "loss": 0.1376, + "rewards/chosen": 1.5896232393052843, + "rewards/margins": 7.699521397787428, + "rewards/rejected": -6.109898158482143, + "step": 411 + }, + { + "epoch": 0.15209265839139866, + "grad_norm": 6.84375, + "kl": 0.0, + "learning_rate": 9.530778648840213e-06, + "logits/chosen": 253940992.0, + "logits/rejected": 201301376.0, + "logps/chosen": -373.77794053819446, + "logps/rejected": -316.24155970982144, + "loss": 0.1166, + "rewards/chosen": 2.7091151343451605, + "rewards/margins": 6.538237950158498, + "rewards/rejected": -3.8291228158133372, + "step": 412 + }, + { + "epoch": 0.15246181532924186, + "grad_norm": 6.15625, + "kl": 0.0, + "learning_rate": 9.528286341805675e-06, + "logits/chosen": 309193947.4285714, + "logits/rejected": 198476757.33333334, + "logps/chosen": -429.3058384486607, + "logps/rejected": -398.8471950954861, + "loss": 0.0945, + "rewards/chosen": 1.9174834660121374, + "rewards/margins": 8.076225311037094, + "rewards/rejected": -6.158741845024957, + "step": 413 + }, + { + "epoch": 0.15283097226708506, + "grad_norm": 6.21875, + "kl": 0.5471572875976562, + "learning_rate": 9.525787760887945e-06, + "logits/chosen": 266627128.8888889, + "logits/rejected": 414209572.5714286, + "logps/chosen": -396.9416232638889, + "logps/rejected": -458.30140904017856, + "loss": 0.1124, + "rewards/chosen": 2.271378411187066, + "rewards/margins": 6.972975352453807, + "rewards/rejected": -4.701596941266741, + "step": 414 + }, + { + "epoch": 0.15320012920492823, + "grad_norm": 6.4375, + "kl": 0.0, + "learning_rate": 9.523282909548773e-06, + "logits/chosen": 230228373.33333334, + "logits/rejected": 262207539.2, + "logps/chosen": -295.1545003255208, + "logps/rejected": -473.530078125, + "loss": 0.0962, + "rewards/chosen": 2.061567465464274, + "rewards/margins": 6.805103079477945, + "rewards/rejected": -4.7435356140136715, + "step": 415 + }, + { + "epoch": 0.15356928614277143, + "grad_norm": 5.71875, + "kl": 0.07666015625, + "learning_rate": 9.520771791258593e-06, + "logits/chosen": 255104800.0, + "logits/rejected": 270640768.0, + "logps/chosen": -282.4766845703125, + "logps/rejected": -427.85821533203125, + "loss": 0.1067, + "rewards/chosen": 2.2505686283111572, + "rewards/margins": 7.6662938594818115, + "rewards/rejected": -5.415725231170654, + "step": 416 + }, + { + "epoch": 0.15393844308061463, + "grad_norm": 6.59375, + "kl": 4.1584954261779785, + "learning_rate": 9.518254409496536e-06, + "logits/chosen": 203701873.7777778, + "logits/rejected": 204115748.57142857, + "logps/chosen": -346.7218967013889, + "logps/rejected": -395.66737583705356, + "loss": 0.1513, + "rewards/chosen": 2.394937939114041, + "rewards/margins": 7.458893518599252, + "rewards/rejected": -5.063955579485212, + "step": 417 + }, + { + "epoch": 0.15430760001845784, + "grad_norm": 6.65625, + "kl": 0.0, + "learning_rate": 9.515730767750397e-06, + "logits/chosen": 281349148.4444444, + "logits/rejected": 239195026.2857143, + "logps/chosen": -337.06146918402777, + "logps/rejected": -457.64132254464283, + "loss": 0.1219, + "rewards/chosen": 2.1853824191623263, + "rewards/margins": 7.474371350000775, + "rewards/rejected": -5.288988930838449, + "step": 418 + }, + { + "epoch": 0.15467675695630104, + "grad_norm": 7.96875, + "kl": 0.0, + "learning_rate": 9.513200869516651e-06, + "logits/chosen": 308390400.0, + "logits/rejected": 332980960.0, + "logps/chosen": -369.51263427734375, + "logps/rejected": -483.0381164550781, + "loss": 0.1161, + "rewards/chosen": 1.6450670957565308, + "rewards/margins": 6.7649534940719604, + "rewards/rejected": -5.11988639831543, + "step": 419 + }, + { + "epoch": 0.15504591389414424, + "grad_norm": 8.875, + "kl": 0.3588371276855469, + "learning_rate": 9.51066471830044e-06, + "logits/chosen": 290319692.8, + "logits/rejected": 261832384.0, + "logps/chosen": -369.948486328125, + "logps/rejected": -411.2499186197917, + "loss": 0.1871, + "rewards/chosen": 1.5267781257629394, + "rewards/margins": 6.631012503306071, + "rewards/rejected": -5.104234377543132, + "step": 420 + }, + { + "epoch": 0.15541507083198744, + "grad_norm": 6.5625, + "kl": 0.0, + "learning_rate": 9.50812231761557e-06, + "logits/chosen": 232776925.86666667, + "logits/rejected": 174761984.0, + "logps/chosen": -308.7384440104167, + "logps/rejected": -435.4175666360294, + "loss": 0.1235, + "rewards/chosen": 1.5359368642171225, + "rewards/margins": 7.859973855112113, + "rewards/rejected": -6.3240369908949905, + "step": 421 + }, + { + "epoch": 0.15578422776983064, + "grad_norm": 7.625, + "kl": 1.7235403060913086, + "learning_rate": 9.505573670984502e-06, + "logits/chosen": 295054787.7647059, + "logits/rejected": 354200576.0, + "logps/chosen": -332.83903952205884, + "logps/rejected": -412.6393229166667, + "loss": 0.1676, + "rewards/chosen": 1.9621297050924862, + "rewards/margins": 7.236791977227902, + "rewards/rejected": -5.274662272135417, + "step": 422 + }, + { + "epoch": 0.15615338470767384, + "grad_norm": 6.25, + "kl": 0.0, + "learning_rate": 9.503018781938358e-06, + "logits/chosen": 219661084.44444445, + "logits/rejected": 308021613.71428573, + "logps/chosen": -295.0574001736111, + "logps/rejected": -496.0848911830357, + "loss": 0.1206, + "rewards/chosen": 2.4076627095540366, + "rewards/margins": 8.019649687267485, + "rewards/rejected": -5.611986977713449, + "step": 423 + }, + { + "epoch": 0.15652254164551704, + "grad_norm": 5.65625, + "kl": 0.0, + "learning_rate": 9.5004576540169e-06, + "logits/chosen": 291810107.0769231, + "logits/rejected": 271790457.2631579, + "logps/chosen": -417.6418269230769, + "logps/rejected": -421.662109375, + "loss": 0.1043, + "rewards/chosen": 1.6790777353140025, + "rewards/margins": 7.0864416269155655, + "rewards/rejected": -5.4073638916015625, + "step": 424 + }, + { + "epoch": 0.15689169858336025, + "grad_norm": 6.1875, + "kl": 1.061579704284668, + "learning_rate": 9.49789029076854e-06, + "logits/chosen": 307431660.3076923, + "logits/rejected": 225510103.57894737, + "logps/chosen": -350.0855243389423, + "logps/rejected": -459.4458778782895, + "loss": 0.1114, + "rewards/chosen": 1.5211830139160156, + "rewards/margins": 7.078492616352282, + "rewards/rejected": -5.557309602436266, + "step": 425 + }, + { + "epoch": 0.15726085552120345, + "grad_norm": 6.1875, + "kl": 2.9442124366760254, + "learning_rate": 9.49531669575033e-06, + "logits/chosen": 228943906.13333333, + "logits/rejected": 258305897.4117647, + "logps/chosen": -372.2540690104167, + "logps/rejected": -374.44873046875, + "loss": 0.1142, + "rewards/chosen": 2.006291707356771, + "rewards/margins": 7.200306312710632, + "rewards/rejected": -5.1940146053538605, + "step": 426 + }, + { + "epoch": 0.15763001245904665, + "grad_norm": 6.375, + "kl": 0.0, + "learning_rate": 9.492736872527948e-06, + "logits/chosen": 246232064.0, + "logits/rejected": 210165288.42105263, + "logps/chosen": -311.50052584134613, + "logps/rejected": -420.6880139802632, + "loss": 0.133, + "rewards/chosen": 1.218516203073355, + "rewards/margins": 6.983914730519901, + "rewards/rejected": -5.765398527446546, + "step": 427 + }, + { + "epoch": 0.15799916939688985, + "grad_norm": 3.921875, + "kl": 0.0, + "learning_rate": 9.49015082467571e-06, + "logits/chosen": 287204625.06666666, + "logits/rejected": 218276909.17647058, + "logps/chosen": -345.49583333333334, + "logps/rejected": -405.62838924632354, + "loss": 0.0856, + "rewards/chosen": 2.4236178080240887, + "rewards/margins": 7.6168883230171955, + "rewards/rejected": -5.193270514993107, + "step": 428 + }, + { + "epoch": 0.15836832633473305, + "grad_norm": 5.28125, + "kl": 2.2899703979492188, + "learning_rate": 9.48755855577655e-06, + "logits/chosen": 335645617.2307692, + "logits/rejected": 210674526.31578946, + "logps/chosen": -303.10054837740387, + "logps/rejected": -384.58652857730266, + "loss": 0.1126, + "rewards/chosen": 2.1726374992957482, + "rewards/margins": 6.861768985084193, + "rewards/rejected": -4.689131485788446, + "step": 429 + }, + { + "epoch": 0.15873748327257625, + "grad_norm": 5.46875, + "kl": 0.41540050506591797, + "learning_rate": 9.484960069422026e-06, + "logits/chosen": 199338821.8181818, + "logits/rejected": 211337433.6, + "logps/chosen": -277.64912553267044, + "logps/rejected": -448.26884765625, + "loss": 0.125, + "rewards/chosen": 2.4529795213179155, + "rewards/margins": 8.262115339799362, + "rewards/rejected": -5.809135818481446, + "step": 430 + }, + { + "epoch": 0.15910664021041945, + "grad_norm": 6.59375, + "kl": 0.0, + "learning_rate": 9.482355369212307e-06, + "logits/chosen": 317846560.0, + "logits/rejected": 177499264.0, + "logps/chosen": -351.70947265625, + "logps/rejected": -288.9054870605469, + "loss": 0.1386, + "rewards/chosen": 1.4940779209136963, + "rewards/margins": 5.46350622177124, + "rewards/rejected": -3.969428300857544, + "step": 431 + }, + { + "epoch": 0.15947579714826265, + "grad_norm": 8.0, + "kl": 0.0, + "learning_rate": 9.47974445875617e-06, + "logits/chosen": 252479162.1818182, + "logits/rejected": 162929548.8, + "logps/chosen": -413.8483220880682, + "logps/rejected": -353.560546875, + "loss": 0.14, + "rewards/chosen": 2.235454385930842, + "rewards/margins": 8.547144525701349, + "rewards/rejected": -6.311690139770508, + "step": 432 + }, + { + "epoch": 0.15984495408610586, + "grad_norm": 7.21875, + "kl": 1.267807960510254, + "learning_rate": 9.477127341671e-06, + "logits/chosen": 262180924.2352941, + "logits/rejected": 221173657.6, + "logps/chosen": -293.89694393382354, + "logps/rejected": -383.8988932291667, + "loss": 0.1496, + "rewards/chosen": 1.529168409459731, + "rewards/margins": 5.784915378046971, + "rewards/rejected": -4.25574696858724, + "step": 433 + }, + { + "epoch": 0.16021411102394906, + "grad_norm": 5.53125, + "kl": 0.1871638298034668, + "learning_rate": 9.47450402158278e-06, + "logits/chosen": 361096160.0, + "logits/rejected": 185613056.0, + "logps/chosen": -393.8422546386719, + "logps/rejected": -423.7720642089844, + "loss": 0.0892, + "rewards/chosen": 2.325225353240967, + "rewards/margins": 7.487844467163086, + "rewards/rejected": -5.162619113922119, + "step": 434 + }, + { + "epoch": 0.16058326796179226, + "grad_norm": 5.375, + "kl": 0.0, + "learning_rate": 9.471874502126087e-06, + "logits/chosen": 287358656.0, + "logits/rejected": 225245696.0, + "logps/chosen": -360.0821940104167, + "logps/rejected": -409.881201171875, + "loss": 0.0737, + "rewards/chosen": 2.152045726776123, + "rewards/margins": 8.177573680877686, + "rewards/rejected": -6.0255279541015625, + "step": 435 + }, + { + "epoch": 0.16095242489963546, + "grad_norm": 6.59375, + "kl": 0.18212175369262695, + "learning_rate": 9.469238786944086e-06, + "logits/chosen": 280858714.35294116, + "logits/rejected": 337913651.2, + "logps/chosen": -296.06494140625, + "logps/rejected": -518.6565104166667, + "loss": 0.1393, + "rewards/chosen": 1.74408519969267, + "rewards/margins": 7.894153355617149, + "rewards/rejected": -6.150068155924479, + "step": 436 + }, + { + "epoch": 0.16132158183747866, + "grad_norm": 6.875, + "kl": 0.3702554702758789, + "learning_rate": 9.466596879688525e-06, + "logits/chosen": 273248073.14285713, + "logits/rejected": 249223196.44444445, + "logps/chosen": -371.27242606026783, + "logps/rejected": -463.93793402777777, + "loss": 0.1145, + "rewards/chosen": 1.5945487703595842, + "rewards/margins": 7.343840932089185, + "rewards/rejected": -5.749292161729601, + "step": 437 + }, + { + "epoch": 0.16169073877532186, + "grad_norm": 5.9375, + "kl": 0.0, + "learning_rate": 9.463948784019736e-06, + "logits/chosen": 264161737.14285713, + "logits/rejected": 245257898.66666666, + "logps/chosen": -453.38950892857144, + "logps/rejected": -368.24823676215277, + "loss": 0.0964, + "rewards/chosen": 2.3465063912527904, + "rewards/margins": 7.505466945587642, + "rewards/rejected": -5.158960554334852, + "step": 438 + }, + { + "epoch": 0.16205989571316506, + "grad_norm": 7.15625, + "kl": 0.0, + "learning_rate": 9.461294503606621e-06, + "logits/chosen": 241805677.7142857, + "logits/rejected": 297118122.6666667, + "logps/chosen": -347.14174107142856, + "logps/rejected": -490.83208550347223, + "loss": 0.1151, + "rewards/chosen": 1.7744019372122628, + "rewards/margins": 8.13958672114781, + "rewards/rejected": -6.365184783935547, + "step": 439 + }, + { + "epoch": 0.16242905265100827, + "grad_norm": 5.71875, + "kl": 0.0, + "learning_rate": 9.458634042126651e-06, + "logits/chosen": 208725563.07692307, + "logits/rejected": 174674418.52631578, + "logps/chosen": -284.826171875, + "logps/rejected": -340.7593287417763, + "loss": 0.1076, + "rewards/chosen": 1.793784655057467, + "rewards/margins": 6.423471111035058, + "rewards/rejected": -4.62968645597759, + "step": 440 + }, + { + "epoch": 0.16279820958885147, + "grad_norm": 5.59375, + "kl": 0.4483938217163086, + "learning_rate": 9.455967403265861e-06, + "logits/chosen": 389228774.4, + "logits/rejected": 279337066.6666667, + "logps/chosen": -340.0239501953125, + "logps/rejected": -598.7413736979166, + "loss": 0.1058, + "rewards/chosen": 2.3523561477661135, + "rewards/margins": 8.766907564798991, + "rewards/rejected": -6.414551417032878, + "step": 441 + }, + { + "epoch": 0.16316736652669467, + "grad_norm": 4.09375, + "kl": 0.0, + "learning_rate": 9.453294590718846e-06, + "logits/chosen": 335948055.27272725, + "logits/rejected": 212625017.9047619, + "logps/chosen": -302.38236860795456, + "logps/rejected": -352.27662295386904, + "loss": 0.0542, + "rewards/chosen": 2.948759599165483, + "rewards/margins": 7.386372586865445, + "rewards/rejected": -4.437612987699962, + "step": 442 + }, + { + "epoch": 0.16353652346453787, + "grad_norm": 6.28125, + "kl": 0.0, + "learning_rate": 9.450615608188755e-06, + "logits/chosen": 234187136.0, + "logits/rejected": 339975232.0, + "logps/chosen": -356.8456726074219, + "logps/rejected": -461.3004455566406, + "loss": 0.0989, + "rewards/chosen": 2.424544334411621, + "rewards/margins": 6.990416526794434, + "rewards/rejected": -4.5658721923828125, + "step": 443 + }, + { + "epoch": 0.16390568040238107, + "grad_norm": 5.90625, + "kl": 0.0, + "learning_rate": 9.447930459387284e-06, + "logits/chosen": 306272157.53846157, + "logits/rejected": 230451900.63157895, + "logps/chosen": -265.3181903545673, + "logps/rejected": -399.5413240131579, + "loss": 0.0984, + "rewards/chosen": 2.030687478872446, + "rewards/margins": 7.926465934104765, + "rewards/rejected": -5.895778455232319, + "step": 444 + }, + { + "epoch": 0.16427483734022427, + "grad_norm": 5.71875, + "kl": 0.03672218322753906, + "learning_rate": 9.445239148034673e-06, + "logits/chosen": 365983195.4285714, + "logits/rejected": 166209109.33333334, + "logps/chosen": -265.11741420200894, + "logps/rejected": -333.44623480902777, + "loss": 0.1147, + "rewards/chosen": 1.7386861528669084, + "rewards/margins": 6.562457735576327, + "rewards/rejected": -4.823771582709418, + "step": 445 + }, + { + "epoch": 0.16464399427806747, + "grad_norm": 5.8125, + "kl": 0.0, + "learning_rate": 9.442541677859695e-06, + "logits/chosen": 423333329.45454544, + "logits/rejected": 249172211.80952382, + "logps/chosen": -397.40780362215907, + "logps/rejected": -464.8510044642857, + "loss": 0.0808, + "rewards/chosen": 1.9811990911310369, + "rewards/margins": 7.908331833876573, + "rewards/rejected": -5.927132742745536, + "step": 446 + }, + { + "epoch": 0.16501315121591068, + "grad_norm": 5.84375, + "kl": 0.0, + "learning_rate": 9.439838052599668e-06, + "logits/chosen": 310439384.61538464, + "logits/rejected": 250738903.57894737, + "logps/chosen": -342.61666165865387, + "logps/rejected": -496.7777549342105, + "loss": 0.0689, + "rewards/chosen": 2.646845890925481, + "rewards/margins": 7.77558148341623, + "rewards/rejected": -5.128735592490749, + "step": 447 + }, + { + "epoch": 0.16538230815375388, + "grad_norm": 5.875, + "kl": 1.23126220703125, + "learning_rate": 9.437128276000424e-06, + "logits/chosen": 297110601.14285713, + "logits/rejected": 309995520.0, + "logps/chosen": -361.05001395089283, + "logps/rejected": -518.0955946180555, + "loss": 0.0924, + "rewards/chosen": 2.7760396684919084, + "rewards/margins": 7.5998900882781495, + "rewards/rejected": -4.8238504197862415, + "step": 448 + }, + { + "epoch": 0.16575146509159708, + "grad_norm": 6.71875, + "kl": 0.9746894836425781, + "learning_rate": 9.434412351816329e-06, + "logits/chosen": 268453708.8, + "logits/rejected": 248329344.0, + "logps/chosen": -376.9782470703125, + "logps/rejected": -376.08935546875, + "loss": 0.115, + "rewards/chosen": 2.6422279357910154, + "rewards/margins": 7.410822041829427, + "rewards/rejected": -4.768594106038411, + "step": 449 + }, + { + "epoch": 0.16612062202944028, + "grad_norm": 7.21875, + "kl": 0.0, + "learning_rate": 9.431690283810257e-06, + "logits/chosen": 158334464.0, + "logits/rejected": 202646480.0, + "logps/chosen": -279.44146728515625, + "logps/rejected": -418.60113525390625, + "loss": 0.1053, + "rewards/chosen": 2.531107187271118, + "rewards/margins": 7.196195840835571, + "rewards/rejected": -4.665088653564453, + "step": 450 + }, + { + "epoch": 0.16648977896728345, + "grad_norm": 5.09375, + "kl": 0.0, + "learning_rate": 9.428962075753602e-06, + "logits/chosen": 236908986.1818182, + "logits/rejected": 245382729.14285713, + "logps/chosen": -336.4706365411932, + "logps/rejected": -486.5836123511905, + "loss": 0.0714, + "rewards/chosen": 2.2514721263538706, + "rewards/margins": 8.251662134608148, + "rewards/rejected": -6.000190008254278, + "step": 451 + }, + { + "epoch": 0.16685893590512665, + "grad_norm": 6.0625, + "kl": 0.0, + "learning_rate": 9.42622773142626e-06, + "logits/chosen": 217350103.57894737, + "logits/rejected": 245520305.23076922, + "logps/chosen": -361.9578279194079, + "logps/rejected": -570.9224008413462, + "loss": 0.1081, + "rewards/chosen": 2.485917141563014, + "rewards/margins": 9.007365130219865, + "rewards/rejected": -6.521447988656851, + "step": 452 + }, + { + "epoch": 0.16722809284296986, + "grad_norm": 7.40625, + "kl": 0.0, + "learning_rate": 9.423487254616632e-06, + "logits/chosen": 272377324.3076923, + "logits/rejected": 310890037.8947368, + "logps/chosen": -361.7336989182692, + "logps/rejected": -623.4495271381579, + "loss": 0.1266, + "rewards/chosen": 1.2000911419208233, + "rewards/margins": 9.595174117609558, + "rewards/rejected": -8.395082975688734, + "step": 453 + }, + { + "epoch": 0.16759724978081306, + "grad_norm": 5.0, + "kl": 0.0, + "learning_rate": 9.420740649121611e-06, + "logits/chosen": 373652749.4736842, + "logits/rejected": 301063522.46153843, + "logps/chosen": -339.40211245888156, + "logps/rejected": -550.8355994591346, + "loss": 0.0985, + "rewards/chosen": 2.9127809624922905, + "rewards/margins": 9.49672132175461, + "rewards/rejected": -6.58394035926232, + "step": 454 + }, + { + "epoch": 0.16796640671865626, + "grad_norm": 4.78125, + "kl": 0.0, + "learning_rate": 9.417987918746587e-06, + "logits/chosen": 231908693.33333334, + "logits/rejected": 215591526.4, + "logps/chosen": -272.97812906901044, + "logps/rejected": -404.05009765625, + "loss": 0.1055, + "rewards/chosen": 2.02288818359375, + "rewards/margins": 7.815377807617187, + "rewards/rejected": -5.792489624023437, + "step": 455 + }, + { + "epoch": 0.16833556365649946, + "grad_norm": 6.28125, + "kl": 0.0, + "learning_rate": 9.41522906730543e-06, + "logits/chosen": 248894659.7647059, + "logits/rejected": 178988817.06666666, + "logps/chosen": -317.49290556066177, + "logps/rejected": -365.11920572916665, + "loss": 0.1211, + "rewards/chosen": 1.746459399952608, + "rewards/margins": 6.73781987358542, + "rewards/rejected": -4.991360473632812, + "step": 456 + }, + { + "epoch": 0.16870472059434266, + "grad_norm": 4.125, + "kl": 0.4420294761657715, + "learning_rate": 9.412464098620495e-06, + "logits/chosen": 239777698.9090909, + "logits/rejected": 304437808.7619048, + "logps/chosen": -358.7623401988636, + "logps/rejected": -499.00664992559524, + "loss": 0.0578, + "rewards/chosen": 2.347758553244851, + "rewards/margins": 8.083507570869479, + "rewards/rejected": -5.735749017624628, + "step": 457 + }, + { + "epoch": 0.16907387753218586, + "grad_norm": 7.03125, + "kl": 0.0, + "learning_rate": 9.409693016522613e-06, + "logits/chosen": 219925742.93333334, + "logits/rejected": 211889121.88235295, + "logps/chosen": -378.48570963541664, + "logps/rejected": -506.8111787683824, + "loss": 0.1124, + "rewards/chosen": 1.6273361206054688, + "rewards/margins": 8.113765671673944, + "rewards/rejected": -6.486429551068475, + "step": 458 + }, + { + "epoch": 0.16944303447002906, + "grad_norm": 4.84375, + "kl": 0.0, + "learning_rate": 9.40691582485108e-06, + "logits/chosen": 201170197.33333334, + "logits/rejected": 293785548.8, + "logps/chosen": -289.25636800130206, + "logps/rejected": -487.581103515625, + "loss": 0.072, + "rewards/chosen": 2.6682141621907554, + "rewards/margins": 8.066062672932944, + "rewards/rejected": -5.397848510742188, + "step": 459 + }, + { + "epoch": 0.16981219140787226, + "grad_norm": 5.65625, + "kl": 0.0, + "learning_rate": 9.404132527453662e-06, + "logits/chosen": 199064704.0, + "logits/rejected": 295865453.71428573, + "logps/chosen": -275.44325086805554, + "logps/rejected": -472.58056640625, + "loss": 0.1161, + "rewards/chosen": 2.0718665652804904, + "rewards/margins": 9.06004536341107, + "rewards/rejected": -6.988178798130581, + "step": 460 + }, + { + "epoch": 0.17018134834571547, + "grad_norm": 9.0625, + "kl": 0.7454996109008789, + "learning_rate": 9.40134312818658e-06, + "logits/chosen": 218166116.17391303, + "logits/rejected": 168073813.33333334, + "logps/chosen": -353.5689750339674, + "logps/rejected": -451.93329535590277, + "loss": 0.1731, + "rewards/chosen": 1.524544093919837, + "rewards/margins": 8.131089436259247, + "rewards/rejected": -6.60654534233941, + "step": 461 + }, + { + "epoch": 0.17055050528355867, + "grad_norm": 5.96875, + "kl": 0.0, + "learning_rate": 9.398547630914512e-06, + "logits/chosen": 208447890.2857143, + "logits/rejected": 247851861.33333334, + "logps/chosen": -387.29188755580356, + "logps/rejected": -371.76288519965277, + "loss": 0.1158, + "rewards/chosen": 2.2528273718697682, + "rewards/margins": 7.312669799441382, + "rewards/rejected": -5.059842427571614, + "step": 462 + }, + { + "epoch": 0.17091966222140187, + "grad_norm": 6.90625, + "kl": 0.0, + "learning_rate": 9.395746039510585e-06, + "logits/chosen": 336463250.28571427, + "logits/rejected": 245600483.55555555, + "logps/chosen": -418.09256417410717, + "logps/rejected": -422.27685546875, + "loss": 0.12, + "rewards/chosen": 1.7601983206612724, + "rewards/margins": 6.758690243675596, + "rewards/rejected": -4.998491923014323, + "step": 463 + }, + { + "epoch": 0.17128881915924507, + "grad_norm": 5.40625, + "kl": 0.0, + "learning_rate": 9.392938357856367e-06, + "logits/chosen": 307443602.28571427, + "logits/rejected": 262577720.8888889, + "logps/chosen": -365.51778738839283, + "logps/rejected": -515.1072591145834, + "loss": 0.0833, + "rewards/chosen": 2.530372074672154, + "rewards/margins": 7.65017349000961, + "rewards/rejected": -5.119801415337457, + "step": 464 + }, + { + "epoch": 0.17165797609708827, + "grad_norm": 5.875, + "kl": 0.0, + "learning_rate": 9.390124589841866e-06, + "logits/chosen": 273046784.0, + "logits/rejected": 374383835.4285714, + "logps/chosen": -355.70147026909723, + "logps/rejected": -376.383056640625, + "loss": 0.0992, + "rewards/chosen": 2.4207753075493708, + "rewards/margins": 7.515393968612429, + "rewards/rejected": -5.094618661063058, + "step": 465 + }, + { + "epoch": 0.17202713303493147, + "grad_norm": 6.3125, + "kl": 0.0, + "learning_rate": 9.387304739365524e-06, + "logits/chosen": 311567579.4285714, + "logits/rejected": 246641976.8888889, + "logps/chosen": -372.06100027901783, + "logps/rejected": -438.9164767795139, + "loss": 0.0822, + "rewards/chosen": 1.7947066170828683, + "rewards/margins": 8.453020792158823, + "rewards/rejected": -6.6583141750759545, + "step": 466 + }, + { + "epoch": 0.17239628997277467, + "grad_norm": 5.75, + "kl": 0.0, + "learning_rate": 9.384478810334202e-06, + "logits/chosen": 263662772.70588234, + "logits/rejected": 162149870.93333334, + "logps/chosen": -274.7057674632353, + "logps/rejected": -406.25709635416666, + "loss": 0.1058, + "rewards/chosen": 2.2494504591997933, + "rewards/margins": 7.6516797682818245, + "rewards/rejected": -5.402229309082031, + "step": 467 + }, + { + "epoch": 0.17276544691061788, + "grad_norm": 6.625, + "kl": 0.0, + "learning_rate": 9.381646806663195e-06, + "logits/chosen": 293742061.71428573, + "logits/rejected": 256005091.55555555, + "logps/chosen": -358.10400390625, + "logps/rejected": -458.06304253472223, + "loss": 0.0984, + "rewards/chosen": 1.6062047140938895, + "rewards/margins": 7.800089245750791, + "rewards/rejected": -6.193884531656901, + "step": 468 + }, + { + "epoch": 0.17313460384846108, + "grad_norm": 7.4375, + "kl": 1.498281478881836, + "learning_rate": 9.378808732276206e-06, + "logits/chosen": 314019356.4444444, + "logits/rejected": 229385453.7142857, + "logps/chosen": -396.5556640625, + "logps/rejected": -438.9259556361607, + "loss": 0.1254, + "rewards/chosen": 1.9623228708902996, + "rewards/margins": 8.076791581653413, + "rewards/rejected": -6.114468710763114, + "step": 469 + }, + { + "epoch": 0.17350376078630428, + "grad_norm": 8.875, + "kl": 0.0, + "learning_rate": 9.37596459110535e-06, + "logits/chosen": 217445472.0, + "logits/rejected": 249412352.0, + "logps/chosen": -324.67193603515625, + "logps/rejected": -359.6192932128906, + "loss": 0.1619, + "rewards/chosen": 1.0337212085723877, + "rewards/margins": 6.022506952285767, + "rewards/rejected": -4.988785743713379, + "step": 470 + }, + { + "epoch": 0.17387291772414748, + "grad_norm": 6.375, + "kl": 0.6642475128173828, + "learning_rate": 9.373114387091148e-06, + "logits/chosen": 282730104.4705882, + "logits/rejected": 219925299.2, + "logps/chosen": -368.9143497242647, + "logps/rejected": -498.1264973958333, + "loss": 0.1131, + "rewards/chosen": 2.3399312636431526, + "rewards/margins": 8.761796599743413, + "rewards/rejected": -6.42186533610026, + "step": 471 + }, + { + "epoch": 0.17424207466199068, + "grad_norm": 6.8125, + "kl": 0.0, + "learning_rate": 9.370258124182525e-06, + "logits/chosen": 276918980.9230769, + "logits/rejected": 201041340.63157895, + "logps/chosen": -382.8835637019231, + "logps/rejected": -398.6702302631579, + "loss": 0.1209, + "rewards/chosen": 1.3195230043851411, + "rewards/margins": 6.5306211232173785, + "rewards/rejected": -5.211098118832237, + "step": 472 + }, + { + "epoch": 0.17461123159983388, + "grad_norm": 7.125, + "kl": 0.0, + "learning_rate": 9.367395806336793e-06, + "logits/chosen": 257838646.85714287, + "logits/rejected": 197235868.44444445, + "logps/chosen": -347.930419921875, + "logps/rejected": -303.90370008680554, + "loss": 0.1118, + "rewards/chosen": 2.400005885532924, + "rewards/margins": 6.099857875279017, + "rewards/rejected": -3.6998519897460938, + "step": 473 + }, + { + "epoch": 0.17498038853767708, + "grad_norm": 6.96875, + "kl": 0.0, + "learning_rate": 9.364527437519658e-06, + "logits/chosen": 339051212.8, + "logits/rejected": 264560880.94117647, + "logps/chosen": -293.22620442708336, + "logps/rejected": -473.62712545955884, + "loss": 0.1071, + "rewards/chosen": 1.718830362955729, + "rewards/margins": 7.605822424795114, + "rewards/rejected": -5.8869920618393845, + "step": 474 + }, + { + "epoch": 0.17534954547552029, + "grad_norm": 6.65625, + "kl": 0.0, + "learning_rate": 9.361653021705211e-06, + "logits/chosen": 390092234.1052632, + "logits/rejected": 269573513.84615386, + "logps/chosen": -376.5907689144737, + "logps/rejected": -279.1181640625, + "loss": 0.1624, + "rewards/chosen": 1.662230039897718, + "rewards/margins": 6.38481586471743, + "rewards/rejected": -4.722585824819712, + "step": 475 + }, + { + "epoch": 0.1757187024133635, + "grad_norm": 6.125, + "kl": 0.0, + "learning_rate": 9.358772562875914e-06, + "logits/chosen": 210649915.07692307, + "logits/rejected": 191388375.57894737, + "logps/chosen": -348.12950721153845, + "logps/rejected": -407.1107113486842, + "loss": 0.1006, + "rewards/chosen": 2.4172307527982273, + "rewards/margins": 7.223524842667676, + "rewards/rejected": -4.806294089869449, + "step": 476 + }, + { + "epoch": 0.1760878593512067, + "grad_norm": 7.34375, + "kl": 1.310225486755371, + "learning_rate": 9.355886065022611e-06, + "logits/chosen": 318699904.0, + "logits/rejected": 197828757.33333334, + "logps/chosen": -348.643505859375, + "logps/rejected": -423.6169026692708, + "loss": 0.1459, + "rewards/chosen": 1.898191261291504, + "rewards/margins": 7.077956962585449, + "rewards/rejected": -5.179765701293945, + "step": 477 + }, + { + "epoch": 0.1764570162890499, + "grad_norm": 6.78125, + "kl": 0.5322685241699219, + "learning_rate": 9.352993532144505e-06, + "logits/chosen": 210506598.4, + "logits/rejected": 222599680.0, + "logps/chosen": -387.3800455729167, + "logps/rejected": -423.60472196691177, + "loss": 0.1153, + "rewards/chosen": 2.0464855194091798, + "rewards/margins": 7.230749309764189, + "rewards/rejected": -5.1842637903550095, + "step": 478 + }, + { + "epoch": 0.1768261732268931, + "grad_norm": 7.53125, + "kl": 0.0, + "learning_rate": 9.350094968249163e-06, + "logits/chosen": 243250733.17647058, + "logits/rejected": 301554449.06666666, + "logps/chosen": -317.6409122242647, + "logps/rejected": -408.52259114583336, + "loss": 0.1464, + "rewards/chosen": 1.702374402214499, + "rewards/margins": 6.798911674349916, + "rewards/rejected": -5.096537272135417, + "step": 479 + }, + { + "epoch": 0.1771953301647363, + "grad_norm": 7.46875, + "kl": 0.32808589935302734, + "learning_rate": 9.347190377352512e-06, + "logits/chosen": 285722784.0, + "logits/rejected": 285270784.0, + "logps/chosen": -324.42962646484375, + "logps/rejected": -393.74554443359375, + "loss": 0.1572, + "rewards/chosen": 1.6733344793319702, + "rewards/margins": 6.4214266538619995, + "rewards/rejected": -4.748092174530029, + "step": 480 + }, + { + "epoch": 0.1775644871025795, + "grad_norm": 6.4375, + "kl": 0.0, + "learning_rate": 9.344279763478823e-06, + "logits/chosen": 140828016.0, + "logits/rejected": 253502368.0, + "logps/chosen": -275.6158447265625, + "logps/rejected": -325.40472412109375, + "loss": 0.1501, + "rewards/chosen": 2.079955577850342, + "rewards/margins": 6.51678466796875, + "rewards/rejected": -4.436829090118408, + "step": 481 + }, + { + "epoch": 0.1779336440404227, + "grad_norm": 4.96875, + "kl": 0.0, + "learning_rate": 9.341363130660714e-06, + "logits/chosen": 344695881.14285713, + "logits/rejected": 252977863.1111111, + "logps/chosen": -292.11300223214283, + "logps/rejected": -457.4259440104167, + "loss": 0.0944, + "rewards/chosen": 2.0470473425728932, + "rewards/margins": 7.875413516211131, + "rewards/rejected": -5.828366173638238, + "step": 482 + }, + { + "epoch": 0.1783028009782659, + "grad_norm": 6.03125, + "kl": 0.5108613967895508, + "learning_rate": 9.338440482939146e-06, + "logits/chosen": 256058247.52941176, + "logits/rejected": 213306009.6, + "logps/chosen": -329.0048828125, + "logps/rejected": -432.5283203125, + "loss": 0.1344, + "rewards/chosen": 1.7693156074075138, + "rewards/margins": 7.489958205877565, + "rewards/rejected": -5.720642598470052, + "step": 483 + }, + { + "epoch": 0.1786719579161091, + "grad_norm": 5.875, + "kl": 0.0, + "learning_rate": 9.335511824363407e-06, + "logits/chosen": 264605749.89473686, + "logits/rejected": 454398070.15384614, + "logps/chosen": -318.55044716282896, + "logps/rejected": -496.7064678485577, + "loss": 0.1032, + "rewards/chosen": 2.759919417531867, + "rewards/margins": 7.54252011380215, + "rewards/rejected": -4.782600696270283, + "step": 484 + }, + { + "epoch": 0.1790411148539523, + "grad_norm": 7.1875, + "kl": 0.0, + "learning_rate": 9.332577158991118e-06, + "logits/chosen": 341885411.5555556, + "logits/rejected": 420265325.71428573, + "logps/chosen": -380.068359375, + "logps/rejected": -447.59364536830356, + "loss": 0.1317, + "rewards/chosen": 1.9326434665256076, + "rewards/margins": 7.892312852163164, + "rewards/rejected": -5.9596693856375555, + "step": 485 + }, + { + "epoch": 0.1794102717917955, + "grad_norm": 3.90625, + "kl": 0.09189033508300781, + "learning_rate": 9.32963649088822e-06, + "logits/chosen": 282619624.72727275, + "logits/rejected": 351962648.38095236, + "logps/chosen": -341.1024724786932, + "logps/rejected": -363.30029296875, + "loss": 0.0625, + "rewards/chosen": 2.509073430841619, + "rewards/margins": 7.876412792123241, + "rewards/rejected": -5.367339361281622, + "step": 486 + }, + { + "epoch": 0.17977942872963867, + "grad_norm": 5.71875, + "kl": 0.0, + "learning_rate": 9.326689824128971e-06, + "logits/chosen": 199222030.2222222, + "logits/rejected": 317454482.28571427, + "logps/chosen": -329.23573133680554, + "logps/rejected": -387.49278041294644, + "loss": 0.1224, + "rewards/chosen": 2.0570216708713107, + "rewards/margins": 7.75237776741149, + "rewards/rejected": -5.695356096540179, + "step": 487 + }, + { + "epoch": 0.18014858566748188, + "grad_norm": 7.375, + "kl": 0.8265438079833984, + "learning_rate": 9.323737162795941e-06, + "logits/chosen": 258496572.2352941, + "logits/rejected": 211159483.73333332, + "logps/chosen": -395.69318704044116, + "logps/rejected": -447.4322916666667, + "loss": 0.1232, + "rewards/chosen": 2.012852163875804, + "rewards/margins": 6.678296092912262, + "rewards/rejected": -4.665443929036458, + "step": 488 + }, + { + "epoch": 0.18051774260532508, + "grad_norm": 7.21875, + "kl": 0.0, + "learning_rate": 9.320778510980004e-06, + "logits/chosen": 191662890.66666666, + "logits/rejected": 255325988.57142857, + "logps/chosen": -342.8654513888889, + "logps/rejected": -391.17184012276783, + "loss": 0.1212, + "rewards/chosen": 2.55446285671658, + "rewards/margins": 7.259615095834883, + "rewards/rejected": -4.705152239118304, + "step": 489 + }, + { + "epoch": 0.18088689954316828, + "grad_norm": 6.75, + "kl": 0.0, + "learning_rate": 9.317813872780336e-06, + "logits/chosen": 245606242.46153846, + "logits/rejected": 262641717.89473686, + "logps/chosen": -393.5801532451923, + "logps/rejected": -535.6276212993421, + "loss": 0.1058, + "rewards/chosen": 1.8082457322340746, + "rewards/margins": 7.474115194096739, + "rewards/rejected": -5.665869461862664, + "step": 490 + }, + { + "epoch": 0.18125605648101148, + "grad_norm": 5.46875, + "kl": 0.0, + "learning_rate": 9.314843252304405e-06, + "logits/chosen": 303593198.93333334, + "logits/rejected": 227134900.70588234, + "logps/chosen": -357.1730143229167, + "logps/rejected": -448.0652860753676, + "loss": 0.081, + "rewards/chosen": 2.1534299214680988, + "rewards/margins": 8.718902692607804, + "rewards/rejected": -6.565472771139706, + "step": 491 + }, + { + "epoch": 0.18162521341885468, + "grad_norm": 7.21875, + "kl": 0.0, + "learning_rate": 9.311866653667967e-06, + "logits/chosen": 278158392.8888889, + "logits/rejected": 367644050.28571427, + "logps/chosen": -383.42523871527777, + "logps/rejected": -587.2656947544643, + "loss": 0.1007, + "rewards/chosen": 2.125075446234809, + "rewards/margins": 8.50968303377666, + "rewards/rejected": -6.384607587541852, + "step": 492 + }, + { + "epoch": 0.18199437035669788, + "grad_norm": 6.78125, + "kl": 1.5189378261566162, + "learning_rate": 9.30888408099506e-06, + "logits/chosen": 244310897.7777778, + "logits/rejected": 181777152.0, + "logps/chosen": -317.40977647569446, + "logps/rejected": -358.30517578125, + "loss": 0.1397, + "rewards/chosen": 1.7469857533772786, + "rewards/margins": 6.4495178404308495, + "rewards/rejected": -4.702532087053571, + "step": 493 + }, + { + "epoch": 0.18236352729454108, + "grad_norm": 5.46875, + "kl": 0.9644412994384766, + "learning_rate": 9.305895538418004e-06, + "logits/chosen": 265584418.13333333, + "logits/rejected": 219951405.17647058, + "logps/chosen": -416.1052734375, + "logps/rejected": -390.5291532628676, + "loss": 0.0772, + "rewards/chosen": 2.463695780436198, + "rewards/margins": 7.867428529028799, + "rewards/rejected": -5.403732748592601, + "step": 494 + }, + { + "epoch": 0.18273268423238428, + "grad_norm": 5.0, + "kl": 0.7281956672668457, + "learning_rate": 9.302901030077384e-06, + "logits/chosen": 322764074.6666667, + "logits/rejected": 262544921.6, + "logps/chosen": -420.54345703125, + "logps/rejected": -440.090185546875, + "loss": 0.0701, + "rewards/chosen": 2.3968516985575357, + "rewards/margins": 8.423374907175699, + "rewards/rejected": -6.026523208618164, + "step": 495 + }, + { + "epoch": 0.1831018411702275, + "grad_norm": 5.46875, + "kl": 0.0, + "learning_rate": 9.299900560122057e-06, + "logits/chosen": 349270493.8666667, + "logits/rejected": 160548111.05882353, + "logps/chosen": -400.84189453125, + "logps/rejected": -310.8091681985294, + "loss": 0.0862, + "rewards/chosen": 2.030033747355143, + "rewards/margins": 6.519555895936255, + "rewards/rejected": -4.489522148581112, + "step": 496 + }, + { + "epoch": 0.1834709981080707, + "grad_norm": 4.90625, + "kl": 0.30380821228027344, + "learning_rate": 9.296894132709134e-06, + "logits/chosen": 287451062.85714287, + "logits/rejected": 236389333.33333334, + "logps/chosen": -357.52089146205356, + "logps/rejected": -426.29725477430554, + "loss": 0.0655, + "rewards/chosen": 2.788566861833845, + "rewards/margins": 8.20830224052308, + "rewards/rejected": -5.419735378689236, + "step": 497 + }, + { + "epoch": 0.1838401550459139, + "grad_norm": 5.6875, + "kl": 0.3002662658691406, + "learning_rate": 9.29388175200398e-06, + "logits/chosen": 198678328.8888889, + "logits/rejected": 310029494.85714287, + "logps/chosen": -368.08745659722223, + "logps/rejected": -421.98514229910717, + "loss": 0.0887, + "rewards/chosen": 2.2835761176215277, + "rewards/margins": 7.8620226421053445, + "rewards/rejected": -5.578446524483817, + "step": 498 + }, + { + "epoch": 0.1842093119837571, + "grad_norm": 4.84375, + "kl": 0.0, + "learning_rate": 9.290863422180211e-06, + "logits/chosen": 240776806.4, + "logits/rejected": 258073460.36363637, + "logps/chosen": -371.06962890625, + "logps/rejected": -377.6982421875, + "loss": 0.0753, + "rewards/chosen": 1.8100519180297852, + "rewards/margins": 6.826808149164373, + "rewards/rejected": -5.016756231134588, + "step": 499 + }, + { + "epoch": 0.1845784689216003, + "grad_norm": 5.34375, + "kl": 0.0, + "learning_rate": 9.287839147419685e-06, + "logits/chosen": 269458581.3333333, + "logits/rejected": 337324646.4, + "logps/chosen": -424.7006022135417, + "logps/rejected": -463.02421875, + "loss": 0.0679, + "rewards/chosen": 1.8417903582255046, + "rewards/margins": 8.115584150950113, + "rewards/rejected": -6.273793792724609, + "step": 500 + }, + { + "epoch": 0.1849476258594435, + "grad_norm": 6.6875, + "kl": 0.0, + "learning_rate": 9.284808931912501e-06, + "logits/chosen": 308586091.7894737, + "logits/rejected": 234149021.53846154, + "logps/chosen": -363.41822574013156, + "logps/rejected": -427.9821589543269, + "loss": 0.1155, + "rewards/chosen": 2.185941595780222, + "rewards/margins": 8.09233479364681, + "rewards/rejected": -5.906393197866587, + "step": 501 + }, + { + "epoch": 0.1853167827972867, + "grad_norm": 5.75, + "kl": 0.025266170501708984, + "learning_rate": 9.281772779856977e-06, + "logits/chosen": 192885878.15384614, + "logits/rejected": 178751285.89473686, + "logps/chosen": -344.3254582331731, + "logps/rejected": -388.3770816200658, + "loss": 0.0804, + "rewards/chosen": 2.413318780752329, + "rewards/margins": 7.682337255130412, + "rewards/rejected": -5.2690184743780835, + "step": 502 + }, + { + "epoch": 0.1856859397351299, + "grad_norm": 6.53125, + "kl": 0.0, + "learning_rate": 9.278730695459664e-06, + "logits/chosen": 244627401.14285713, + "logits/rejected": 178205368.8888889, + "logps/chosen": -346.7113560267857, + "logps/rejected": -405.2513020833333, + "loss": 0.1162, + "rewards/chosen": 1.5345453534807478, + "rewards/margins": 7.502114492749412, + "rewards/rejected": -5.967569139268663, + "step": 503 + }, + { + "epoch": 0.1860550966729731, + "grad_norm": 7.125, + "kl": 0.0, + "learning_rate": 9.275682682935336e-06, + "logits/chosen": 191006498.13333333, + "logits/rejected": 214142960.94117647, + "logps/chosen": -304.25270182291666, + "logps/rejected": -396.3329503676471, + "loss": 0.1299, + "rewards/chosen": 1.7386871337890626, + "rewards/margins": 8.013882536046646, + "rewards/rejected": -6.275195402257583, + "step": 504 + }, + { + "epoch": 0.1864242536108163, + "grad_norm": 8.125, + "kl": 0.7009754180908203, + "learning_rate": 9.27262874650697e-06, + "logits/chosen": 221394588.44444445, + "logits/rejected": 196212096.0, + "logps/chosen": -401.3142361111111, + "logps/rejected": -333.34054129464283, + "loss": 0.1566, + "rewards/chosen": 1.8601979149712458, + "rewards/margins": 6.199339760674371, + "rewards/rejected": -4.339141845703125, + "step": 505 + }, + { + "epoch": 0.1867934105486595, + "grad_norm": 6.25, + "kl": 0.0, + "learning_rate": 9.269568890405762e-06, + "logits/chosen": 186798208.0, + "logits/rejected": 252703943.1111111, + "logps/chosen": -292.42972237723217, + "logps/rejected": -431.4443359375, + "loss": 0.1086, + "rewards/chosen": 2.1799891335623607, + "rewards/margins": 7.790805665273515, + "rewards/rejected": -5.610816531711155, + "step": 506 + }, + { + "epoch": 0.1871625674865027, + "grad_norm": 4.1875, + "kl": 0.0, + "learning_rate": 9.2665031188711e-06, + "logits/chosen": 256041693.86666667, + "logits/rejected": 159601423.05882353, + "logps/chosen": -394.778125, + "logps/rejected": -375.22532743566177, + "loss": 0.0692, + "rewards/chosen": 2.737346394856771, + "rewards/margins": 8.869593422085632, + "rewards/rejected": -6.1322470272288605, + "step": 507 + }, + { + "epoch": 0.1875317244243459, + "grad_norm": 6.34375, + "kl": 0.0, + "learning_rate": 9.263431436150571e-06, + "logits/chosen": 270500633.6, + "logits/rejected": 258368832.0, + "logps/chosen": -265.71162109375, + "logps/rejected": -284.83681233723956, + "loss": 0.1682, + "rewards/chosen": 1.890840721130371, + "rewards/margins": 5.989145787556966, + "rewards/rejected": -4.098305066426595, + "step": 508 + }, + { + "epoch": 0.1879008813621891, + "grad_norm": 4.96875, + "kl": 0.0, + "learning_rate": 9.260353846499954e-06, + "logits/chosen": 248089819.42857143, + "logits/rejected": 197829546.66666666, + "logps/chosen": -374.83091517857144, + "logps/rejected": -374.9072265625, + "loss": 0.0891, + "rewards/chosen": 2.2118606567382812, + "rewards/margins": 6.8900095621744795, + "rewards/rejected": -4.678148905436198, + "step": 509 + }, + { + "epoch": 0.1882700383000323, + "grad_norm": 7.78125, + "kl": 1.412858486175537, + "learning_rate": 9.257270354183212e-06, + "logits/chosen": 188374800.0, + "logits/rejected": 238655008.0, + "logps/chosen": -327.92919921875, + "logps/rejected": -477.0848388671875, + "loss": 0.1288, + "rewards/chosen": 1.9290143251419067, + "rewards/margins": 9.185001015663147, + "rewards/rejected": -7.25598669052124, + "step": 510 + }, + { + "epoch": 0.1886391952378755, + "grad_norm": 8.4375, + "kl": 0.0, + "learning_rate": 9.254180963472478e-06, + "logits/chosen": 269686272.0, + "logits/rejected": 173675169.68421054, + "logps/chosen": -420.9778019831731, + "logps/rejected": -423.7371761924342, + "loss": 0.1038, + "rewards/chosen": 1.704438723050631, + "rewards/margins": 6.997542392869709, + "rewards/rejected": -5.293103669819079, + "step": 511 + }, + { + "epoch": 0.1890083521757187, + "grad_norm": 4.75, + "kl": 0.0, + "learning_rate": 9.251085678648072e-06, + "logits/chosen": 244076181.33333334, + "logits/rejected": 207983539.2, + "logps/chosen": -380.5265299479167, + "logps/rejected": -352.510595703125, + "loss": 0.0812, + "rewards/chosen": 2.565444310506185, + "rewards/margins": 7.946262486775716, + "rewards/rejected": -5.380818176269531, + "step": 512 + }, + { + "epoch": 0.1893775091135619, + "grad_norm": 5.78125, + "kl": 0.0, + "learning_rate": 9.247984503998466e-06, + "logits/chosen": 253158400.0, + "logits/rejected": 301128429.71428573, + "logps/chosen": -355.61328125, + "logps/rejected": -402.229248046875, + "loss": 0.1005, + "rewards/chosen": 2.292848375108507, + "rewards/margins": 8.550623273092604, + "rewards/rejected": -6.257774897984096, + "step": 513 + }, + { + "epoch": 0.1897466660514051, + "grad_norm": 7.03125, + "kl": 0.0, + "learning_rate": 9.2448774438203e-06, + "logits/chosen": 211895153.7777778, + "logits/rejected": 330391917.71428573, + "logps/chosen": -376.256103515625, + "logps/rejected": -451.09242466517856, + "loss": 0.1185, + "rewards/chosen": 2.2306876712375217, + "rewards/margins": 8.169677038041373, + "rewards/rejected": -5.938989366803851, + "step": 514 + }, + { + "epoch": 0.1901158229892483, + "grad_norm": 4.875, + "kl": 0.0, + "learning_rate": 9.241764502418365e-06, + "logits/chosen": 279632000.0, + "logits/rejected": 333099520.0, + "logps/chosen": -418.77508544921875, + "logps/rejected": -394.67266845703125, + "loss": 0.0711, + "rewards/chosen": 2.7097818851470947, + "rewards/margins": 7.538362264633179, + "rewards/rejected": -4.828580379486084, + "step": 515 + }, + { + "epoch": 0.1904849799270915, + "grad_norm": 5.6875, + "kl": 0.7824039459228516, + "learning_rate": 9.238645684105606e-06, + "logits/chosen": 147481804.8, + "logits/rejected": 301676664.4705882, + "logps/chosen": -321.0570963541667, + "logps/rejected": -579.7385110294117, + "loss": 0.1149, + "rewards/chosen": 1.7886885325113933, + "rewards/margins": 8.435037373561485, + "rewards/rejected": -6.6463488410500915, + "step": 516 + }, + { + "epoch": 0.19085413686493471, + "grad_norm": 6.09375, + "kl": 0.0, + "learning_rate": 9.2355209932031e-06, + "logits/chosen": 238947809.88235295, + "logits/rejected": 358825233.06666666, + "logps/chosen": -364.2244657628676, + "logps/rejected": -453.4830078125, + "loss": 0.121, + "rewards/chosen": 1.872667200425092, + "rewards/margins": 7.291865808823529, + "rewards/rejected": -5.419198608398437, + "step": 517 + }, + { + "epoch": 0.19122329380277792, + "grad_norm": 5.4375, + "kl": 0.13137292861938477, + "learning_rate": 9.232390434040071e-06, + "logits/chosen": 186478224.0, + "logits/rejected": 188399584.0, + "logps/chosen": -296.5047607421875, + "logps/rejected": -509.7666015625, + "loss": 0.1009, + "rewards/chosen": 2.373495101928711, + "rewards/margins": 9.438036918640137, + "rewards/rejected": -7.064541816711426, + "step": 518 + }, + { + "epoch": 0.19159245074062112, + "grad_norm": 5.65625, + "kl": 0.051570892333984375, + "learning_rate": 9.229254010953868e-06, + "logits/chosen": 243076656.0, + "logits/rejected": 187462784.0, + "logps/chosen": -405.5072021484375, + "logps/rejected": -448.31884765625, + "loss": 0.0844, + "rewards/chosen": 2.5171303749084473, + "rewards/margins": 9.886941909790039, + "rewards/rejected": -7.369811534881592, + "step": 519 + }, + { + "epoch": 0.19196160767846432, + "grad_norm": 6.6875, + "kl": 0.6098427772521973, + "learning_rate": 9.226111728289963e-06, + "logits/chosen": 294128835.04761904, + "logits/rejected": 197789207.27272728, + "logps/chosen": -364.1410202752976, + "logps/rejected": -427.01708984375, + "loss": 0.1145, + "rewards/chosen": 2.389256795247396, + "rewards/margins": 7.357904145211885, + "rewards/rejected": -4.968647349964488, + "step": 520 + }, + { + "epoch": 0.19233076461630752, + "grad_norm": 8.3125, + "kl": 0.6603727340698242, + "learning_rate": 9.222963590401953e-06, + "logits/chosen": 430834580.2105263, + "logits/rejected": 287416123.0769231, + "logps/chosen": -503.7515419407895, + "logps/rejected": -294.60606971153845, + "loss": 0.1191, + "rewards/chosen": 2.6866730137875208, + "rewards/margins": 7.243344156365646, + "rewards/rejected": -4.556671142578125, + "step": 521 + }, + { + "epoch": 0.19269992155415072, + "grad_norm": 6.53125, + "kl": 0.0, + "learning_rate": 9.21980960165154e-06, + "logits/chosen": 341790354.28571427, + "logits/rejected": 231389496.8888889, + "logps/chosen": -423.91005161830356, + "logps/rejected": -467.44867621527777, + "loss": 0.0929, + "rewards/chosen": 2.1487221036638533, + "rewards/margins": 9.053135962713332, + "rewards/rejected": -6.9044138590494795, + "step": 522 + }, + { + "epoch": 0.1930690784919939, + "grad_norm": 7.59375, + "kl": 0.4860267639160156, + "learning_rate": 9.216649766408536e-06, + "logits/chosen": 265279002.9473684, + "logits/rejected": 354364258.46153843, + "logps/chosen": -399.1707699424342, + "logps/rejected": -407.61606069711536, + "loss": 0.1189, + "rewards/chosen": 2.129996651097348, + "rewards/margins": 7.134969255702216, + "rewards/rejected": -5.004972604604868, + "step": 523 + }, + { + "epoch": 0.1934382354298371, + "grad_norm": 5.5, + "kl": 0.0, + "learning_rate": 9.213484089050853e-06, + "logits/chosen": 224070008.47058824, + "logits/rejected": 225506696.53333333, + "logps/chosen": -348.20387178308823, + "logps/rejected": -415.0614908854167, + "loss": 0.088, + "rewards/chosen": 2.096437790814568, + "rewards/margins": 7.781609808229932, + "rewards/rejected": -5.685172017415365, + "step": 524 + }, + { + "epoch": 0.1938073923676803, + "grad_norm": 5.96875, + "kl": 0.47811317443847656, + "learning_rate": 9.210312573964496e-06, + "logits/chosen": 231092377.6, + "logits/rejected": 197205312.0, + "logps/chosen": -321.3078369140625, + "logps/rejected": -426.0077718098958, + "loss": 0.1005, + "rewards/chosen": 2.5920843124389648, + "rewards/margins": 7.784676933288575, + "rewards/rejected": -5.192592620849609, + "step": 525 + }, + { + "epoch": 0.1941765493055235, + "grad_norm": 4.5625, + "kl": 0.0, + "learning_rate": 9.207135225543557e-06, + "logits/chosen": 158778062.76923078, + "logits/rejected": 283112636.6315789, + "logps/chosen": -338.35836087740387, + "logps/rejected": -450.56172902960526, + "loss": 0.0828, + "rewards/chosen": 1.8222462580754206, + "rewards/margins": 8.001604087922255, + "rewards/rejected": -6.1793578298468335, + "step": 526 + }, + { + "epoch": 0.1945457062433667, + "grad_norm": 5.8125, + "kl": 1.702651023864746, + "learning_rate": 9.203952048190217e-06, + "logits/chosen": 233342313.4117647, + "logits/rejected": 246847283.2, + "logps/chosen": -276.14372702205884, + "logps/rejected": -301.2805989583333, + "loss": 0.1379, + "rewards/chosen": 2.0486252728630516, + "rewards/margins": 6.809937121821385, + "rewards/rejected": -4.761311848958333, + "step": 527 + }, + { + "epoch": 0.1949148631812099, + "grad_norm": 6.78125, + "kl": 0.9805755615234375, + "learning_rate": 9.200763046314725e-06, + "logits/chosen": 311499366.4, + "logits/rejected": 231560448.0, + "logps/chosen": -396.811962890625, + "logps/rejected": -413.3304036458333, + "loss": 0.1167, + "rewards/chosen": 2.5519771575927734, + "rewards/margins": 7.601826032002767, + "rewards/rejected": -5.049848874409993, + "step": 528 + }, + { + "epoch": 0.1952840201190531, + "grad_norm": 5.90625, + "kl": 0.0, + "learning_rate": 9.197568224335401e-06, + "logits/chosen": 251696782.2222222, + "logits/rejected": 251896502.85714287, + "logps/chosen": -379.048828125, + "logps/rejected": -499.88692801339283, + "loss": 0.0951, + "rewards/chosen": 2.2381070454915366, + "rewards/margins": 8.229967571440197, + "rewards/rejected": -5.991860525948661, + "step": 529 + }, + { + "epoch": 0.1956531770568963, + "grad_norm": 6.75, + "kl": 0.6507587432861328, + "learning_rate": 9.194367586678634e-06, + "logits/chosen": 211687001.6, + "logits/rejected": 228108501.33333334, + "logps/chosen": -381.39189453125, + "logps/rejected": -400.9236653645833, + "loss": 0.1488, + "rewards/chosen": 2.0507839202880858, + "rewards/margins": 7.386147308349609, + "rewards/rejected": -5.335363388061523, + "step": 530 + }, + { + "epoch": 0.1960223339947395, + "grad_norm": 6.5625, + "kl": 0.0, + "learning_rate": 9.191161137778861e-06, + "logits/chosen": 244563396.92307693, + "logits/rejected": 243599818.10526314, + "logps/chosen": -446.55649038461536, + "logps/rejected": -387.90869140625, + "loss": 0.101, + "rewards/chosen": 2.4650594271146336, + "rewards/margins": 7.71226947799868, + "rewards/rejected": -5.247210050884046, + "step": 531 + }, + { + "epoch": 0.1963914909325827, + "grad_norm": 6.5625, + "kl": 0.0, + "learning_rate": 9.187948882078582e-06, + "logits/chosen": 224677997.7142857, + "logits/rejected": 255489280.0, + "logps/chosen": -325.8087681361607, + "logps/rejected": -365.719970703125, + "loss": 0.1082, + "rewards/chosen": 2.168647221156529, + "rewards/margins": 6.354063548738994, + "rewards/rejected": -4.185416327582465, + "step": 532 + }, + { + "epoch": 0.1967606478704259, + "grad_norm": 5.28125, + "kl": 0.0, + "learning_rate": 9.184730824028334e-06, + "logits/chosen": 256606811.42857143, + "logits/rejected": 203495690.24, + "logps/chosen": -405.671875, + "logps/rejected": -457.25703125, + "loss": 0.0645, + "rewards/chosen": 2.0884545189993724, + "rewards/margins": 8.249377980913435, + "rewards/rejected": -6.160923461914063, + "step": 533 + }, + { + "epoch": 0.1971298048082691, + "grad_norm": 5.6875, + "kl": 0.14470195770263672, + "learning_rate": 9.181506968086696e-06, + "logits/chosen": 410019900.2352941, + "logits/rejected": 207905621.33333334, + "logps/chosen": -347.72429342830884, + "logps/rejected": -468.55091145833336, + "loss": 0.0989, + "rewards/chosen": 1.9890805412741268, + "rewards/margins": 8.752472163181679, + "rewards/rejected": -6.7633916219075525, + "step": 534 + }, + { + "epoch": 0.1974989617461123, + "grad_norm": 5.90625, + "kl": 0.5842757225036621, + "learning_rate": 9.178277318720279e-06, + "logits/chosen": 272377696.0, + "logits/rejected": 129608744.0, + "logps/chosen": -395.0940856933594, + "logps/rejected": -276.3193054199219, + "loss": 0.1064, + "rewards/chosen": 2.0551719665527344, + "rewards/margins": 6.77003288269043, + "rewards/rejected": -4.714860916137695, + "step": 535 + }, + { + "epoch": 0.1978681186839555, + "grad_norm": 5.59375, + "kl": 0.0, + "learning_rate": 9.175041880403721e-06, + "logits/chosen": 268840256.0, + "logits/rejected": 296045344.0, + "logps/chosen": -356.02655029296875, + "logps/rejected": -511.52130126953125, + "loss": 0.0918, + "rewards/chosen": 2.301058292388916, + "rewards/margins": 8.278623104095459, + "rewards/rejected": -5.977564811706543, + "step": 536 + }, + { + "epoch": 0.19823727562179871, + "grad_norm": 9.0625, + "kl": 1.4804887771606445, + "learning_rate": 9.171800657619683e-06, + "logits/chosen": 274188515.5555556, + "logits/rejected": 230203282.2857143, + "logps/chosen": -360.36322699652777, + "logps/rejected": -312.72970145089283, + "loss": 0.1874, + "rewards/chosen": 1.4214090771145291, + "rewards/margins": 6.028331226772732, + "rewards/rejected": -4.606922149658203, + "step": 537 + }, + { + "epoch": 0.19860643255964192, + "grad_norm": 5.125, + "kl": 0.3712949752807617, + "learning_rate": 9.168553654858834e-06, + "logits/chosen": 256492463.15789473, + "logits/rejected": 308764987.0769231, + "logps/chosen": -320.0368009868421, + "logps/rejected": -447.89547025240387, + "loss": 0.0965, + "rewards/chosen": 2.259200246710526, + "rewards/margins": 9.006433710878195, + "rewards/rejected": -6.747233464167668, + "step": 538 + }, + { + "epoch": 0.19897558949748512, + "grad_norm": 4.3125, + "kl": 0.27567100524902344, + "learning_rate": 9.165300876619857e-06, + "logits/chosen": 306834144.0, + "logits/rejected": 187828800.0, + "logps/chosen": -461.0538024902344, + "logps/rejected": -481.90045166015625, + "loss": 0.0917, + "rewards/chosen": 2.6439037322998047, + "rewards/margins": 8.898477554321289, + "rewards/rejected": -6.254573822021484, + "step": 539 + }, + { + "epoch": 0.19934474643532832, + "grad_norm": 7.625, + "kl": 0.0, + "learning_rate": 9.162042327409437e-06, + "logits/chosen": 205788266.66666666, + "logits/rejected": 138154752.0, + "logps/chosen": -344.28662109375, + "logps/rejected": -368.27020263671875, + "loss": 0.1742, + "rewards/chosen": 2.126056353251139, + "rewards/margins": 8.09397808710734, + "rewards/rejected": -5.967921733856201, + "step": 540 + }, + { + "epoch": 0.19971390337317152, + "grad_norm": 5.59375, + "kl": 0.0, + "learning_rate": 9.15877801174225e-06, + "logits/chosen": 314241632.0, + "logits/rejected": 238153552.0, + "logps/chosen": -325.1726989746094, + "logps/rejected": -398.5402526855469, + "loss": 0.1039, + "rewards/chosen": 1.8748257160186768, + "rewards/margins": 7.808844804763794, + "rewards/rejected": -5.934019088745117, + "step": 541 + }, + { + "epoch": 0.20008306031101472, + "grad_norm": 10.4375, + "kl": 0.0, + "learning_rate": 9.155507934140962e-06, + "logits/chosen": 199345117.86666667, + "logits/rejected": 236321310.11764705, + "logps/chosen": -397.6712890625, + "logps/rejected": -367.55919692095586, + "loss": 0.1521, + "rewards/chosen": 1.4151311238606772, + "rewards/margins": 6.803704265519684, + "rewards/rejected": -5.388573141659007, + "step": 542 + }, + { + "epoch": 0.20045221724885792, + "grad_norm": 6.59375, + "kl": 0.0, + "learning_rate": 9.152232099136227e-06, + "logits/chosen": 243292416.0, + "logits/rejected": 267985408.0, + "logps/chosen": -392.2191685267857, + "logps/rejected": -394.48895941840277, + "loss": 0.1049, + "rewards/chosen": 2.0161119188581194, + "rewards/margins": 7.655087698073615, + "rewards/rejected": -5.638975779215495, + "step": 543 + }, + { + "epoch": 0.20082137418670112, + "grad_norm": 3.484375, + "kl": 0.0, + "learning_rate": 9.148950511266674e-06, + "logits/chosen": 375437152.0, + "logits/rejected": 237537648.0, + "logps/chosen": -429.880859375, + "logps/rejected": -420.59112548828125, + "loss": 0.0564, + "rewards/chosen": 2.8675899505615234, + "rewards/margins": 8.344575881958008, + "rewards/rejected": -5.476985931396484, + "step": 544 + }, + { + "epoch": 0.20119053112454433, + "grad_norm": 5.125, + "kl": 0.4317600727081299, + "learning_rate": 9.1456631750789e-06, + "logits/chosen": 339709248.0, + "logits/rejected": 343722393.6, + "logps/chosen": -372.1802571614583, + "logps/rejected": -461.2220703125, + "loss": 0.1116, + "rewards/chosen": 1.6292831103007, + "rewards/margins": 7.157878081003825, + "rewards/rejected": -5.528594970703125, + "step": 545 + }, + { + "epoch": 0.20155968806238753, + "grad_norm": 6.6875, + "kl": 0.1512908935546875, + "learning_rate": 9.142370095127465e-06, + "logits/chosen": 219915444.70588234, + "logits/rejected": 397078289.06666666, + "logps/chosen": -372.24299172794116, + "logps/rejected": -343.68971354166666, + "loss": 0.1031, + "rewards/chosen": 2.2991790771484375, + "rewards/margins": 7.597134399414062, + "rewards/rejected": -5.297955322265625, + "step": 546 + }, + { + "epoch": 0.20192884500023073, + "grad_norm": 5.28125, + "kl": 0.0, + "learning_rate": 9.139071275974894e-06, + "logits/chosen": 215469568.0, + "logits/rejected": 253675975.1111111, + "logps/chosen": -274.4943150111607, + "logps/rejected": -549.2864583333334, + "loss": 0.093, + "rewards/chosen": 2.27297306060791, + "rewards/margins": 8.881853421529133, + "rewards/rejected": -6.608880360921224, + "step": 547 + }, + { + "epoch": 0.20229800193807393, + "grad_norm": 6.53125, + "kl": 0.0, + "learning_rate": 9.135766722191655e-06, + "logits/chosen": 297010261.3333333, + "logits/rejected": 178706212.57142857, + "logps/chosen": -319.52113172743054, + "logps/rejected": -372.7483607700893, + "loss": 0.1279, + "rewards/chosen": 1.9188272688123915, + "rewards/margins": 7.77733524261959, + "rewards/rejected": -5.858507973807199, + "step": 548 + }, + { + "epoch": 0.20266715887591713, + "grad_norm": 5.03125, + "kl": 0.0, + "learning_rate": 9.132456438356165e-06, + "logits/chosen": 316816042.6666667, + "logits/rejected": 339655643.4285714, + "logps/chosen": -333.8794759114583, + "logps/rejected": -461.44559151785717, + "loss": 0.0908, + "rewards/chosen": 2.5337130228678384, + "rewards/margins": 9.044483003162203, + "rewards/rejected": -6.510769980294364, + "step": 549 + }, + { + "epoch": 0.20303631581376033, + "grad_norm": 8.875, + "kl": 0.0, + "learning_rate": 9.129140429054785e-06, + "logits/chosen": 283954278.4, + "logits/rejected": 378178861.1764706, + "logps/chosen": -424.80579427083336, + "logps/rejected": -518.1093175551471, + "loss": 0.1264, + "rewards/chosen": 1.2332733154296875, + "rewards/margins": 7.695723230698529, + "rewards/rejected": -6.4624499152688415, + "step": 550 + }, + { + "epoch": 0.20340547275160353, + "grad_norm": 6.5, + "kl": 0.0, + "learning_rate": 9.125818698881798e-06, + "logits/chosen": 225644361.14285713, + "logits/rejected": 171678606.2222222, + "logps/chosen": -332.2567661830357, + "logps/rejected": -348.93438042534723, + "loss": 0.0793, + "rewards/chosen": 2.4853613717215404, + "rewards/margins": 7.668324001251705, + "rewards/rejected": -5.182962629530165, + "step": 551 + }, + { + "epoch": 0.20377462968944673, + "grad_norm": 8.8125, + "kl": 0.0, + "learning_rate": 9.122491252439425e-06, + "logits/chosen": 298891702.85714287, + "logits/rejected": 276671032.8888889, + "logps/chosen": -386.2049037388393, + "logps/rejected": -410.0693359375, + "loss": 0.1107, + "rewards/chosen": 1.8749747957502092, + "rewards/margins": 7.8223841076805485, + "rewards/rejected": -5.947409311930339, + "step": 552 + }, + { + "epoch": 0.20414378662728994, + "grad_norm": 7.125, + "kl": 0.7667427062988281, + "learning_rate": 9.119158094337794e-06, + "logits/chosen": 258255579.42857143, + "logits/rejected": 183624622.54545453, + "logps/chosen": -309.2681361607143, + "logps/rejected": -401.9002574573864, + "loss": 0.1294, + "rewards/chosen": 2.3183646429152716, + "rewards/margins": 8.920717825621237, + "rewards/rejected": -6.602353182705966, + "step": 553 + }, + { + "epoch": 0.20451294356513314, + "grad_norm": 6.0625, + "kl": 0.10532188415527344, + "learning_rate": 9.11581922919496e-06, + "logits/chosen": 331141312.0, + "logits/rejected": 275662496.0, + "logps/chosen": -304.8453063964844, + "logps/rejected": -308.6689758300781, + "loss": 0.1395, + "rewards/chosen": 1.8569306135177612, + "rewards/margins": 6.132055163383484, + "rewards/rejected": -4.275124549865723, + "step": 554 + }, + { + "epoch": 0.20488210050297634, + "grad_norm": 5.96875, + "kl": 0.0, + "learning_rate": 9.112474661636871e-06, + "logits/chosen": 215161795.7647059, + "logits/rejected": 297513608.53333336, + "logps/chosen": -274.62767118566177, + "logps/rejected": -400.25693359375, + "loss": 0.1401, + "rewards/chosen": 1.4811054678524243, + "rewards/margins": 7.118391844805549, + "rewards/rejected": -5.637286376953125, + "step": 555 + }, + { + "epoch": 0.20525125744081954, + "grad_norm": 6.59375, + "kl": 0.4914684295654297, + "learning_rate": 9.109124396297391e-06, + "logits/chosen": 266781824.0, + "logits/rejected": 259351779.55555555, + "logps/chosen": -343.3323451450893, + "logps/rejected": -422.9975857204861, + "loss": 0.1013, + "rewards/chosen": 2.0430682046072826, + "rewards/margins": 7.731871559506372, + "rewards/rejected": -5.688803354899089, + "step": 556 + }, + { + "epoch": 0.20562041437866274, + "grad_norm": 5.125, + "kl": 0.0, + "learning_rate": 9.105768437818263e-06, + "logits/chosen": 378272981.3333333, + "logits/rejected": 248389785.6, + "logps/chosen": -441.2136637369792, + "logps/rejected": -468.968408203125, + "loss": 0.0608, + "rewards/chosen": 2.1501267751057944, + "rewards/margins": 8.12724469502767, + "rewards/rejected": -5.977117919921875, + "step": 557 + }, + { + "epoch": 0.20598957131650594, + "grad_norm": 6.34375, + "kl": 0.0, + "learning_rate": 9.102406790849129e-06, + "logits/chosen": 278680780.8, + "logits/rejected": 178874944.0, + "logps/chosen": -369.99482421875, + "logps/rejected": -316.7263997395833, + "loss": 0.1114, + "rewards/chosen": 2.186244773864746, + "rewards/margins": 7.378487714131673, + "rewards/rejected": -5.192242940266927, + "step": 558 + }, + { + "epoch": 0.20635872825434912, + "grad_norm": 5.375, + "kl": 0.0, + "learning_rate": 9.099039460047506e-06, + "logits/chosen": 209423402.66666666, + "logits/rejected": 217676851.2, + "logps/chosen": -344.5362955729167, + "logps/rejected": -385.839306640625, + "loss": 0.0701, + "rewards/chosen": 2.675194422403971, + "rewards/margins": 8.243097941080729, + "rewards/rejected": -5.567903518676758, + "step": 559 + }, + { + "epoch": 0.20672788519219232, + "grad_norm": 7.03125, + "kl": 0.0719447135925293, + "learning_rate": 9.09566645007879e-06, + "logits/chosen": 183981648.0, + "logits/rejected": 197786336.0, + "logps/chosen": -312.45770263671875, + "logps/rejected": -420.138427734375, + "loss": 0.1067, + "rewards/chosen": 2.5014231204986572, + "rewards/margins": 7.541155099868774, + "rewards/rejected": -5.039731979370117, + "step": 560 + }, + { + "epoch": 0.20709704213003552, + "grad_norm": 6.40625, + "kl": 0.0, + "learning_rate": 9.09228776561624e-06, + "logits/chosen": 259046301.53846154, + "logits/rejected": 334042704.84210527, + "logps/chosen": -338.12853064903845, + "logps/rejected": -452.4052220394737, + "loss": 0.1127, + "rewards/chosen": 1.7483802208533654, + "rewards/margins": 7.644726865204722, + "rewards/rejected": -5.896346644351357, + "step": 561 + }, + { + "epoch": 0.20746619906787872, + "grad_norm": 7.03125, + "kl": 0.0, + "learning_rate": 9.088903411340985e-06, + "logits/chosen": 208022357.33333334, + "logits/rejected": 256560402.2857143, + "logps/chosen": -314.2344021267361, + "logps/rejected": -515.4737723214286, + "loss": 0.1208, + "rewards/chosen": 1.882229487101237, + "rewards/margins": 8.765389669509162, + "rewards/rejected": -6.8831601824079245, + "step": 562 + }, + { + "epoch": 0.20783535600572192, + "grad_norm": 5.53125, + "kl": 0.0, + "learning_rate": 9.085513391942003e-06, + "logits/chosen": 184903601.23076922, + "logits/rejected": 195004294.7368421, + "logps/chosen": -380.58946814903845, + "logps/rejected": -411.79569284539474, + "loss": 0.0723, + "rewards/chosen": 2.713974879338191, + "rewards/margins": 8.586609365486423, + "rewards/rejected": -5.872634486148232, + "step": 563 + }, + { + "epoch": 0.20820451294356512, + "grad_norm": 7.78125, + "kl": 0.26439905166625977, + "learning_rate": 9.08211771211612e-06, + "logits/chosen": 300035152.84210527, + "logits/rejected": 370143271.38461536, + "logps/chosen": -423.49740439967104, + "logps/rejected": -388.71724759615387, + "loss": 0.099, + "rewards/chosen": 2.628094321803043, + "rewards/margins": 8.715154516552142, + "rewards/rejected": -6.087060194749099, + "step": 564 + }, + { + "epoch": 0.20857366988140832, + "grad_norm": 6.53125, + "kl": 0.0, + "learning_rate": 9.078716376568011e-06, + "logits/chosen": 294091081.14285713, + "logits/rejected": 211520739.55555555, + "logps/chosen": -466.30538504464283, + "logps/rejected": -379.7277018229167, + "loss": 0.0862, + "rewards/chosen": 2.501655306134905, + "rewards/margins": 7.999262219383603, + "rewards/rejected": -5.497606913248698, + "step": 565 + }, + { + "epoch": 0.20894282681925153, + "grad_norm": 6.15625, + "kl": 0.0, + "learning_rate": 9.075309390010182e-06, + "logits/chosen": 264134412.19047618, + "logits/rejected": 178798417.45454547, + "logps/chosen": -285.80103701636904, + "logps/rejected": -400.54789595170456, + "loss": 0.1294, + "rewards/chosen": 1.9277118501209078, + "rewards/margins": 7.7286789799149425, + "rewards/rejected": -5.800967129794034, + "step": 566 + }, + { + "epoch": 0.20931198375709473, + "grad_norm": 5.25, + "kl": 0.0, + "learning_rate": 9.07189675716297e-06, + "logits/chosen": 251726708.36363637, + "logits/rejected": 230425088.0, + "logps/chosen": -418.666015625, + "logps/rejected": -399.9847935267857, + "loss": 0.0799, + "rewards/chosen": 2.9661456021395596, + "rewards/margins": 9.297416819122446, + "rewards/rejected": -6.331271216982887, + "step": 567 + }, + { + "epoch": 0.20968114069493793, + "grad_norm": 5.8125, + "kl": 0.0, + "learning_rate": 9.068478482754532e-06, + "logits/chosen": 232950144.0, + "logits/rejected": 231923399.1111111, + "logps/chosen": -353.10777064732144, + "logps/rejected": -480.20258246527777, + "loss": 0.1042, + "rewards/chosen": 1.935596329825265, + "rewards/margins": 8.33882508959089, + "rewards/rejected": -6.403228759765625, + "step": 568 + }, + { + "epoch": 0.21005029763278113, + "grad_norm": 7.03125, + "kl": 0.0, + "learning_rate": 9.065054571520846e-06, + "logits/chosen": 274762456.61538464, + "logits/rejected": 307967919.15789473, + "logps/chosen": -388.2457932692308, + "logps/rejected": -550.402189555921, + "loss": 0.0779, + "rewards/chosen": 2.0546405498798075, + "rewards/margins": 9.543640569154068, + "rewards/rejected": -7.48900001927426, + "step": 569 + }, + { + "epoch": 0.21041945457062433, + "grad_norm": 7.34375, + "kl": 1.083662986755371, + "learning_rate": 9.061625028205699e-06, + "logits/chosen": 255840221.86666667, + "logits/rejected": 225114669.17647058, + "logps/chosen": -403.0685221354167, + "logps/rejected": -408.77392578125, + "loss": 0.1644, + "rewards/chosen": 1.6967002868652343, + "rewards/margins": 7.430306513169232, + "rewards/rejected": -5.733606226303998, + "step": 570 + }, + { + "epoch": 0.21078861150846753, + "grad_norm": 6.34375, + "kl": 0.5623207092285156, + "learning_rate": 9.058189857560675e-06, + "logits/chosen": 357355878.4, + "logits/rejected": 235210112.0, + "logps/chosen": -411.28349609375, + "logps/rejected": -318.0943196614583, + "loss": 0.0938, + "rewards/chosen": 2.198734664916992, + "rewards/margins": 8.060792668660481, + "rewards/rejected": -5.862058003743489, + "step": 571 + }, + { + "epoch": 0.21115776844631073, + "grad_norm": 6.4375, + "kl": 0.31542110443115234, + "learning_rate": 9.054749064345165e-06, + "logits/chosen": 214144976.0, + "logits/rejected": 236701664.0, + "logps/chosen": -338.9067687988281, + "logps/rejected": -362.7032470703125, + "loss": 0.1076, + "rewards/chosen": 2.3220343589782715, + "rewards/margins": 7.098622798919678, + "rewards/rejected": -4.776588439941406, + "step": 572 + }, + { + "epoch": 0.21152692538415394, + "grad_norm": 5.1875, + "kl": 0.0, + "learning_rate": 9.05130265332634e-06, + "logits/chosen": 266583827.69230768, + "logits/rejected": 277049424.84210527, + "logps/chosen": -338.49008413461536, + "logps/rejected": -549.2341694078947, + "loss": 0.0997, + "rewards/chosen": 1.7780669285700872, + "rewards/margins": 7.720287114502447, + "rewards/rejected": -5.94222018593236, + "step": 573 + }, + { + "epoch": 0.21189608232199714, + "grad_norm": 5.1875, + "kl": 0.0, + "learning_rate": 9.04785062927916e-06, + "logits/chosen": 287376926.11764705, + "logits/rejected": 206319291.73333332, + "logps/chosen": -376.30615234375, + "logps/rejected": -414.71959635416664, + "loss": 0.071, + "rewards/chosen": 2.206255520091337, + "rewards/margins": 8.035018337474149, + "rewards/rejected": -5.828762817382812, + "step": 574 + }, + { + "epoch": 0.21226523925984034, + "grad_norm": 6.9375, + "kl": 0.0, + "learning_rate": 9.04439299698636e-06, + "logits/chosen": 270816902.7368421, + "logits/rejected": 195309863.3846154, + "logps/chosen": -312.04515316611844, + "logps/rejected": -432.87684044471155, + "loss": 0.1215, + "rewards/chosen": 1.8441981265419407, + "rewards/margins": 7.948844075685571, + "rewards/rejected": -6.10464594914363, + "step": 575 + }, + { + "epoch": 0.21263439619768354, + "grad_norm": 5.90625, + "kl": 0.0, + "learning_rate": 9.040929761238448e-06, + "logits/chosen": 191808240.0, + "logits/rejected": 268483584.0, + "logps/chosen": -336.2958679199219, + "logps/rejected": -522.0059204101562, + "loss": 0.0999, + "rewards/chosen": 1.857661485671997, + "rewards/margins": 8.060078859329224, + "rewards/rejected": -6.202417373657227, + "step": 576 + }, + { + "epoch": 0.21300355313552674, + "grad_norm": 7.125, + "kl": 0.0, + "learning_rate": 9.03746092683369e-06, + "logits/chosen": 151292777.4117647, + "logits/rejected": 273296947.2, + "logps/chosen": -284.70280905330884, + "logps/rejected": -571.2288411458334, + "loss": 0.1299, + "rewards/chosen": 1.7348085291245405, + "rewards/margins": 8.410486019358915, + "rewards/rejected": -6.675677490234375, + "step": 577 + }, + { + "epoch": 0.21337271007336994, + "grad_norm": 6.90625, + "kl": 2.0806264877319336, + "learning_rate": 9.033986498578113e-06, + "logits/chosen": 197432539.42857143, + "logits/rejected": 243243377.7777778, + "logps/chosen": -372.02371651785717, + "logps/rejected": -442.4001193576389, + "loss": 0.1407, + "rewards/chosen": 2.0086570467267717, + "rewards/margins": 7.626232798137362, + "rewards/rejected": -5.61757575141059, + "step": 578 + }, + { + "epoch": 0.21374186701121314, + "grad_norm": 8.75, + "kl": 0.30951976776123047, + "learning_rate": 9.030506481285495e-06, + "logits/chosen": 194825638.4, + "logits/rejected": 198317376.0, + "logps/chosen": -321.896728515625, + "logps/rejected": -324.95753987630206, + "loss": 0.1519, + "rewards/chosen": 1.7721256256103515, + "rewards/margins": 6.901899719238282, + "rewards/rejected": -5.12977409362793, + "step": 579 + }, + { + "epoch": 0.21411102394905634, + "grad_norm": 9.25, + "kl": 0.0, + "learning_rate": 9.027020879777354e-06, + "logits/chosen": 154219520.0, + "logits/rejected": 306319803.73333335, + "logps/chosen": -326.1991613051471, + "logps/rejected": -504.44055989583336, + "loss": 0.1486, + "rewards/chosen": 1.5924489638384651, + "rewards/margins": 7.113646982230392, + "rewards/rejected": -5.521198018391927, + "step": 580 + }, + { + "epoch": 0.21448018088689955, + "grad_norm": 6.59375, + "kl": 0.0, + "learning_rate": 9.023529698882946e-06, + "logits/chosen": 252990947.55555555, + "logits/rejected": 401360713.14285713, + "logps/chosen": -307.4970431857639, + "logps/rejected": -487.55991908482144, + "loss": 0.1471, + "rewards/chosen": 1.7112599478827581, + "rewards/margins": 6.700070895845927, + "rewards/rejected": -4.988810947963169, + "step": 581 + }, + { + "epoch": 0.21484933782474275, + "grad_norm": 8.0, + "kl": 1.2236003875732422, + "learning_rate": 9.020032943439258e-06, + "logits/chosen": 236069017.6, + "logits/rejected": 149148672.0, + "logps/chosen": -409.1723307291667, + "logps/rejected": -390.80563534007354, + "loss": 0.1414, + "rewards/chosen": 1.801980972290039, + "rewards/margins": 5.89524897407083, + "rewards/rejected": -4.093268001780791, + "step": 582 + }, + { + "epoch": 0.21521849476258595, + "grad_norm": 4.46875, + "kl": 0.2696542739868164, + "learning_rate": 9.016530618291001e-06, + "logits/chosen": 246316202.66666666, + "logits/rejected": 217903360.0, + "logps/chosen": -429.8184000651042, + "logps/rejected": -455.6431640625, + "loss": 0.0569, + "rewards/chosen": 3.018610636393229, + "rewards/margins": 9.257837931315104, + "rewards/rejected": -6.239227294921875, + "step": 583 + }, + { + "epoch": 0.21558765170042915, + "grad_norm": 4.75, + "kl": 0.7364864349365234, + "learning_rate": 9.013022728290604e-06, + "logits/chosen": 209810295.46666667, + "logits/rejected": 301686964.7058824, + "logps/chosen": -255.885498046875, + "logps/rejected": -474.47776884191177, + "loss": 0.0946, + "rewards/chosen": 2.64783935546875, + "rewards/margins": 8.60216127283433, + "rewards/rejected": -5.954321917365579, + "step": 584 + }, + { + "epoch": 0.21595680863827235, + "grad_norm": 6.65625, + "kl": 0.0, + "learning_rate": 9.009509278298201e-06, + "logits/chosen": 278436181.3333333, + "logits/rejected": 262281938.82352942, + "logps/chosen": -371.39039713541666, + "logps/rejected": -451.75700827205884, + "loss": 0.1081, + "rewards/chosen": 1.6019323984781901, + "rewards/margins": 7.636592289045746, + "rewards/rejected": -6.034659890567555, + "step": 585 + }, + { + "epoch": 0.21632596557611555, + "grad_norm": 5.125, + "kl": 0.0, + "learning_rate": 9.005990273181631e-06, + "logits/chosen": 287066654.11764705, + "logits/rejected": 202604612.26666668, + "logps/chosen": -365.9876493566176, + "logps/rejected": -503.35986328125, + "loss": 0.0881, + "rewards/chosen": 2.064251843620749, + "rewards/margins": 9.480747103223614, + "rewards/rejected": -7.416495259602865, + "step": 586 + }, + { + "epoch": 0.21669512251395875, + "grad_norm": 6.96875, + "kl": 1.475473403930664, + "learning_rate": 9.002465717816436e-06, + "logits/chosen": 197994390.5882353, + "logits/rejected": 246471355.73333332, + "logps/chosen": -322.24339384191177, + "logps/rejected": -346.72682291666666, + "loss": 0.0979, + "rewards/chosen": 3.644957149730009, + "rewards/margins": 8.322829362457874, + "rewards/rejected": -4.6778722127278645, + "step": 587 + }, + { + "epoch": 0.21706427945180196, + "grad_norm": 5.875, + "kl": 0.0, + "learning_rate": 8.998935617085837e-06, + "logits/chosen": 248987363.55555555, + "logits/rejected": 266659163.42857143, + "logps/chosen": -350.6269259982639, + "logps/rejected": -494.96250697544644, + "loss": 0.0845, + "rewards/chosen": 2.591462665134006, + "rewards/margins": 9.875449437943717, + "rewards/rejected": -7.28398677280971, + "step": 588 + }, + { + "epoch": 0.21743343638964516, + "grad_norm": 7.78125, + "kl": 4.928298473358154, + "learning_rate": 8.995399975880749e-06, + "logits/chosen": 302048135.5294118, + "logits/rejected": 270544571.73333335, + "logps/chosen": -432.26453354779414, + "logps/rejected": -435.10670572916666, + "loss": 0.1552, + "rewards/chosen": 2.285678639131434, + "rewards/margins": 9.073534168916591, + "rewards/rejected": -6.787855529785157, + "step": 589 + }, + { + "epoch": 0.21780259332748836, + "grad_norm": 5.90625, + "kl": 0.0, + "learning_rate": 8.991858799099755e-06, + "logits/chosen": 172446190.93333334, + "logits/rejected": 272748182.5882353, + "logps/chosen": -330.40618489583335, + "logps/rejected": -488.72581571691177, + "loss": 0.0899, + "rewards/chosen": 2.4321390787760415, + "rewards/margins": 8.377520692114736, + "rewards/rejected": -5.945381613338695, + "step": 590 + }, + { + "epoch": 0.21817175026533156, + "grad_norm": 5.96875, + "kl": 0.1259021759033203, + "learning_rate": 8.98831209164911e-06, + "logits/chosen": 242237842.2857143, + "logits/rejected": 202341034.66666666, + "logps/chosen": -378.18348911830356, + "logps/rejected": -354.3832736545139, + "loss": 0.1048, + "rewards/chosen": 1.9605975832257951, + "rewards/margins": 6.402655722602965, + "rewards/rejected": -4.4420581393771705, + "step": 591 + }, + { + "epoch": 0.21854090720317476, + "grad_norm": 5.84375, + "kl": 0.0, + "learning_rate": 8.984759858442734e-06, + "logits/chosen": 168665141.89473686, + "logits/rejected": 181949380.92307693, + "logps/chosen": -284.9285824424342, + "logps/rejected": -422.5637770432692, + "loss": 0.1292, + "rewards/chosen": 2.4855356718364514, + "rewards/margins": 7.840690442907665, + "rewards/rejected": -5.355154771071214, + "step": 592 + }, + { + "epoch": 0.21891006414101796, + "grad_norm": 7.59375, + "kl": 2.881913661956787, + "learning_rate": 8.9812021044022e-06, + "logits/chosen": 259634127.23809522, + "logits/rejected": 280123112.72727275, + "logps/chosen": -442.4046688988095, + "logps/rejected": -623.0033291903409, + "loss": 0.1194, + "rewards/chosen": 2.5154393513997397, + "rewards/margins": 10.329852248683121, + "rewards/rejected": -7.814412897283381, + "step": 593 + }, + { + "epoch": 0.21927922107886116, + "grad_norm": 5.0, + "kl": 0.02950000762939453, + "learning_rate": 8.97763883445673e-06, + "logits/chosen": 242735917.17647058, + "logits/rejected": 303357508.26666665, + "logps/chosen": -333.01217830882354, + "logps/rejected": -429.9146484375, + "loss": 0.0968, + "rewards/chosen": 2.2207004322725186, + "rewards/margins": 7.966295728496476, + "rewards/rejected": -5.745595296223958, + "step": 594 + }, + { + "epoch": 0.21964837801670434, + "grad_norm": 6.5625, + "kl": 0.0, + "learning_rate": 8.97407005354319e-06, + "logits/chosen": 306345531.0769231, + "logits/rejected": 304355678.31578946, + "logps/chosen": -308.9633976862981, + "logps/rejected": -425.99912623355266, + "loss": 0.1381, + "rewards/chosen": 1.127007190997784, + "rewards/margins": 6.5473675554097905, + "rewards/rejected": -5.420360364412007, + "step": 595 + }, + { + "epoch": 0.22001753495454754, + "grad_norm": 7.0625, + "kl": 0.4356689453125, + "learning_rate": 8.970495766606083e-06, + "logits/chosen": 299097728.0, + "logits/rejected": 305278691.5555556, + "logps/chosen": -288.51168387276783, + "logps/rejected": -469.23828125, + "loss": 0.0983, + "rewards/chosen": 2.1504693712506975, + "rewards/margins": 7.26667352706667, + "rewards/rejected": -5.116204155815972, + "step": 596 + }, + { + "epoch": 0.22038669189239074, + "grad_norm": 7.5, + "kl": 0.0, + "learning_rate": 8.966915978597532e-06, + "logits/chosen": 257463637.33333334, + "logits/rejected": 175346779.42857143, + "logps/chosen": -345.4289279513889, + "logps/rejected": -423.05831473214283, + "loss": 0.119, + "rewards/chosen": 1.797292709350586, + "rewards/margins": 7.548916680472238, + "rewards/rejected": -5.751623971121652, + "step": 597 + }, + { + "epoch": 0.22075584883023394, + "grad_norm": 6.375, + "kl": 0.0, + "learning_rate": 8.963330694477295e-06, + "logits/chosen": 319540976.9411765, + "logits/rejected": 165248614.4, + "logps/chosen": -356.50442325367646, + "logps/rejected": -353.29436848958335, + "loss": 0.1104, + "rewards/chosen": 2.662677989286535, + "rewards/margins": 8.437950612984451, + "rewards/rejected": -5.775272623697917, + "step": 598 + }, + { + "epoch": 0.22112500576807714, + "grad_norm": 4.9375, + "kl": 0.0, + "learning_rate": 8.959739919212734e-06, + "logits/chosen": 159417719.46666667, + "logits/rejected": 194992022.5882353, + "logps/chosen": -288.52890625, + "logps/rejected": -420.2361270680147, + "loss": 0.0804, + "rewards/chosen": 2.660376485188802, + "rewards/margins": 8.52504897772097, + "rewards/rejected": -5.864672492532169, + "step": 599 + }, + { + "epoch": 0.22149416270592034, + "grad_norm": 5.9375, + "kl": 0.0, + "learning_rate": 8.956143657778822e-06, + "logits/chosen": 293821166.93333334, + "logits/rejected": 177579670.5882353, + "logps/chosen": -230.51647135416667, + "logps/rejected": -288.5982306985294, + "loss": 0.1297, + "rewards/chosen": 1.7334730784098307, + "rewards/margins": 5.767430167104684, + "rewards/rejected": -4.033957088694853, + "step": 600 + }, + { + "epoch": 0.22186331964376355, + "grad_norm": 6.3125, + "kl": 0.0, + "learning_rate": 8.952541915158137e-06, + "logits/chosen": 231130031.15789473, + "logits/rejected": 215037341.53846154, + "logps/chosen": -347.6015625, + "logps/rejected": -390.68956580528845, + "loss": 0.1243, + "rewards/chosen": 2.52884252447831, + "rewards/margins": 7.74675270420337, + "rewards/rejected": -5.21791017972506, + "step": 601 + }, + { + "epoch": 0.22223247658160675, + "grad_norm": 7.375, + "kl": 0.46017587184906006, + "learning_rate": 8.948934696340842e-06, + "logits/chosen": 290637958.7368421, + "logits/rejected": 243131431.3846154, + "logps/chosen": -322.6025390625, + "logps/rejected": -466.98230919471155, + "loss": 0.1675, + "rewards/chosen": 1.4211559295654297, + "rewards/margins": 8.04086523789626, + "rewards/rejected": -6.61970930833083, + "step": 602 + }, + { + "epoch": 0.22260163351944995, + "grad_norm": 5.53125, + "kl": 0.0, + "learning_rate": 8.945322006324698e-06, + "logits/chosen": 258895904.0, + "logits/rejected": 388928224.0, + "logps/chosen": -313.1680908203125, + "logps/rejected": -518.5355834960938, + "loss": 0.093, + "rewards/chosen": 2.425848960876465, + "rewards/margins": 8.388179779052734, + "rewards/rejected": -5.9623308181762695, + "step": 603 + }, + { + "epoch": 0.22297079045729315, + "grad_norm": 5.59375, + "kl": 0.0, + "learning_rate": 8.941703850115037e-06, + "logits/chosen": 282191655.38461536, + "logits/rejected": 202967821.47368422, + "logps/chosen": -408.7102614182692, + "logps/rejected": -328.39149876644734, + "loss": 0.0827, + "rewards/chosen": 2.510679391714243, + "rewards/margins": 7.562989732997138, + "rewards/rejected": -5.052310341282895, + "step": 604 + }, + { + "epoch": 0.22333994739513635, + "grad_norm": 6.15625, + "kl": 0.0, + "learning_rate": 8.938080232724773e-06, + "logits/chosen": 351176305.7777778, + "logits/rejected": 192146230.85714287, + "logps/chosen": -400.49131944444446, + "logps/rejected": -368.53086635044644, + "loss": 0.0991, + "rewards/chosen": 2.547666761610243, + "rewards/margins": 7.733082846989707, + "rewards/rejected": -5.185416085379464, + "step": 605 + }, + { + "epoch": 0.22370910433297955, + "grad_norm": 5.78125, + "kl": 0.0, + "learning_rate": 8.934451159174377e-06, + "logits/chosen": 314365586.28571427, + "logits/rejected": 212525198.2222222, + "logps/chosen": -338.6675502232143, + "logps/rejected": -361.9044596354167, + "loss": 0.0969, + "rewards/chosen": 2.0253519330705916, + "rewards/margins": 6.847225491962735, + "rewards/rejected": -4.821873558892144, + "step": 606 + }, + { + "epoch": 0.22407826127082275, + "grad_norm": 5.46875, + "kl": 0.0, + "learning_rate": 8.930816634491887e-06, + "logits/chosen": 209705728.0, + "logits/rejected": 311922090.6666667, + "logps/chosen": -360.47129603794644, + "logps/rejected": -439.876708984375, + "loss": 0.0816, + "rewards/chosen": 2.9176712036132812, + "rewards/margins": 7.900797526041667, + "rewards/rejected": -4.983126322428386, + "step": 607 + }, + { + "epoch": 0.22444741820866596, + "grad_norm": 6.03125, + "kl": 0.0, + "learning_rate": 8.927176663712892e-06, + "logits/chosen": 278072064.0, + "logits/rejected": 271908288.0, + "logps/chosen": -405.250244140625, + "logps/rejected": -511.9877014160156, + "loss": 0.0971, + "rewards/chosen": 1.9580694437026978, + "rewards/margins": 8.503496289253235, + "rewards/rejected": -6.545426845550537, + "step": 608 + }, + { + "epoch": 0.22481657514650916, + "grad_norm": 5.34375, + "kl": 0.0, + "learning_rate": 8.923531251880524e-06, + "logits/chosen": 292143320.61538464, + "logits/rejected": 379625013.8947368, + "logps/chosen": -450.2707707331731, + "logps/rejected": -465.5058079769737, + "loss": 0.087, + "rewards/chosen": 2.1622384878305287, + "rewards/margins": 7.894859530182503, + "rewards/rejected": -5.7326210423519735, + "step": 609 + }, + { + "epoch": 0.22518573208435236, + "grad_norm": 5.9375, + "kl": 0.0, + "learning_rate": 8.919880404045452e-06, + "logits/chosen": 264948336.0, + "logits/rejected": 213354944.0, + "logps/chosen": -400.77496337890625, + "logps/rejected": -413.5911865234375, + "loss": 0.0838, + "rewards/chosen": 2.5652577877044678, + "rewards/margins": 8.938915014266968, + "rewards/rejected": -6.3736572265625, + "step": 610 + }, + { + "epoch": 0.22555488902219556, + "grad_norm": 5.625, + "kl": 0.0, + "learning_rate": 8.916224125265883e-06, + "logits/chosen": 257712187.07692307, + "logits/rejected": 300357874.5263158, + "logps/chosen": -331.0742938701923, + "logps/rejected": -498.7460423519737, + "loss": 0.0978, + "rewards/chosen": 1.6374821296105018, + "rewards/margins": 7.5622438515728785, + "rewards/rejected": -5.924761721962376, + "step": 611 + }, + { + "epoch": 0.22592404596003876, + "grad_norm": 7.96875, + "kl": 0.1933884620666504, + "learning_rate": 8.912562420607545e-06, + "logits/chosen": 152347008.0, + "logits/rejected": 176148256.0, + "logps/chosen": -319.906103515625, + "logps/rejected": -340.6811116536458, + "loss": 0.1051, + "rewards/chosen": 2.9192461013793944, + "rewards/margins": 8.387189292907715, + "rewards/rejected": -5.46794319152832, + "step": 612 + }, + { + "epoch": 0.22629320289788196, + "grad_norm": 5.875, + "kl": 0.0, + "learning_rate": 8.90889529514368e-06, + "logits/chosen": 308028245.3333333, + "logits/rejected": 263539079.52941176, + "logps/chosen": -462.9316731770833, + "logps/rejected": -432.4774816176471, + "loss": 0.0784, + "rewards/chosen": 2.5438685099283855, + "rewards/margins": 7.913045486749388, + "rewards/rejected": -5.369176976821002, + "step": 613 + }, + { + "epoch": 0.22666235983572516, + "grad_norm": 5.40625, + "kl": 0.0, + "learning_rate": 8.905222753955045e-06, + "logits/chosen": 267010496.0, + "logits/rejected": 159467232.0, + "logps/chosen": -430.49871826171875, + "logps/rejected": -447.39678955078125, + "loss": 0.0908, + "rewards/chosen": 2.304558753967285, + "rewards/margins": 8.331232070922852, + "rewards/rejected": -6.026673316955566, + "step": 614 + }, + { + "epoch": 0.22703151677356836, + "grad_norm": 5.78125, + "kl": 0.0, + "learning_rate": 8.901544802129903e-06, + "logits/chosen": 245394102.85714287, + "logits/rejected": 254322432.0, + "logps/chosen": -409.19796316964283, + "logps/rejected": -356.0056966145833, + "loss": 0.1021, + "rewards/chosen": 2.5361695970807756, + "rewards/margins": 7.44199319869753, + "rewards/rejected": -4.905823601616754, + "step": 615 + }, + { + "epoch": 0.22740067371141157, + "grad_norm": 5.5625, + "kl": 0.0, + "learning_rate": 8.897861444764004e-06, + "logits/chosen": 142142021.8181818, + "logits/rejected": 285889267.8095238, + "logps/chosen": -343.46244673295456, + "logps/rejected": -372.03111049107144, + "loss": 0.089, + "rewards/chosen": 3.1494216918945312, + "rewards/margins": 7.778755551292782, + "rewards/rejected": -4.629333859398251, + "step": 616 + }, + { + "epoch": 0.22776983064925477, + "grad_norm": 6.15625, + "kl": 0.0, + "learning_rate": 8.894172686960594e-06, + "logits/chosen": 230159781.6470588, + "logits/rejected": 276843707.73333335, + "logps/chosen": -277.38893037683823, + "logps/rejected": -480.11337890625, + "loss": 0.1406, + "rewards/chosen": 1.6920503728529985, + "rewards/margins": 7.467993874643363, + "rewards/rejected": -5.775943501790365, + "step": 617 + }, + { + "epoch": 0.22813898758709797, + "grad_norm": 6.59375, + "kl": 0.0, + "learning_rate": 8.890478533830403e-06, + "logits/chosen": 242346928.0, + "logits/rejected": 226274448.0, + "logps/chosen": -349.39703369140625, + "logps/rejected": -551.4635009765625, + "loss": 0.1208, + "rewards/chosen": 2.515986919403076, + "rewards/margins": 10.475740909576416, + "rewards/rejected": -7.95975399017334, + "step": 618 + }, + { + "epoch": 0.22850814452494117, + "grad_norm": 5.6875, + "kl": 0.0, + "learning_rate": 8.886778990491632e-06, + "logits/chosen": 240144512.0, + "logits/rejected": 224600800.0, + "logps/chosen": -331.592041015625, + "logps/rejected": -379.4638671875, + "loss": 0.13, + "rewards/chosen": 1.7414315938949585, + "rewards/margins": 7.712577223777771, + "rewards/rejected": -5.9711456298828125, + "step": 619 + }, + { + "epoch": 0.22887730146278437, + "grad_norm": 5.71875, + "kl": 1.2134761810302734, + "learning_rate": 8.883074062069948e-06, + "logits/chosen": 258928725.33333334, + "logits/rejected": 273339366.4, + "logps/chosen": -362.37744140625, + "logps/rejected": -507.8765625, + "loss": 0.0984, + "rewards/chosen": 1.8865005175272624, + "rewards/margins": 7.776360289255778, + "rewards/rejected": -5.889859771728515, + "step": 620 + }, + { + "epoch": 0.22924645840062757, + "grad_norm": 4.71875, + "kl": 0.0, + "learning_rate": 8.879363753698487e-06, + "logits/chosen": 320254361.6, + "logits/rejected": 324728289.88235295, + "logps/chosen": -366.9464518229167, + "logps/rejected": -555.7404641544117, + "loss": 0.0667, + "rewards/chosen": 2.3642262776692706, + "rewards/margins": 8.606461349188113, + "rewards/rejected": -6.2422350715188415, + "step": 621 + }, + { + "epoch": 0.22961561533847077, + "grad_norm": 5.96875, + "kl": 2.6215457916259766, + "learning_rate": 8.875648070517832e-06, + "logits/chosen": 289587264.0, + "logits/rejected": 184642336.0, + "logps/chosen": -381.8103332519531, + "logps/rejected": -350.90301513671875, + "loss": 0.1244, + "rewards/chosen": 2.617448568344116, + "rewards/margins": 7.528487920761108, + "rewards/rejected": -4.911039352416992, + "step": 622 + }, + { + "epoch": 0.22998477227631398, + "grad_norm": 6.125, + "kl": 0.3554849624633789, + "learning_rate": 8.871927017676013e-06, + "logits/chosen": 247136691.2, + "logits/rejected": 225926464.0, + "logps/chosen": -327.4931884765625, + "logps/rejected": -392.2202555338542, + "loss": 0.1108, + "rewards/chosen": 2.3504682540893556, + "rewards/margins": 9.864697074890136, + "rewards/rejected": -7.514228820800781, + "step": 623 + }, + { + "epoch": 0.23035392921415718, + "grad_norm": 6.78125, + "kl": 1.0904216766357422, + "learning_rate": 8.868200600328505e-06, + "logits/chosen": 268229632.0, + "logits/rejected": 255324074.66666666, + "logps/chosen": -414.44144112723217, + "logps/rejected": -428.04454210069446, + "loss": 0.1102, + "rewards/chosen": 2.03185544695173, + "rewards/margins": 7.1161132085891, + "rewards/rejected": -5.08425776163737, + "step": 624 + }, + { + "epoch": 0.23072308615200038, + "grad_norm": 5.28125, + "kl": 0.0, + "learning_rate": 8.864468823638211e-06, + "logits/chosen": 182760990.11764705, + "logits/rejected": 245796300.8, + "logps/chosen": -319.6599551930147, + "logps/rejected": -418.4078776041667, + "loss": 0.1013, + "rewards/chosen": 2.569435344022863, + "rewards/margins": 8.08989839740828, + "rewards/rejected": -5.520463053385416, + "step": 625 + }, + { + "epoch": 0.23109224308984358, + "grad_norm": 6.1875, + "kl": 0.0, + "learning_rate": 8.860731692775459e-06, + "logits/chosen": 300999953.06666666, + "logits/rejected": 252825178.3529412, + "logps/chosen": -429.037109375, + "logps/rejected": -413.1538660386029, + "loss": 0.1004, + "rewards/chosen": 2.324603525797526, + "rewards/margins": 8.19882667391908, + "rewards/rejected": -5.874223148121553, + "step": 626 + }, + { + "epoch": 0.23146140002768678, + "grad_norm": 5.71875, + "kl": 1.5649490356445312, + "learning_rate": 8.856989212917994e-06, + "logits/chosen": 175527736.8888889, + "logits/rejected": 368434505.14285713, + "logps/chosen": -343.4176974826389, + "logps/rejected": -349.76388113839283, + "loss": 0.0866, + "rewards/chosen": 3.643090989854601, + "rewards/margins": 7.863541799878317, + "rewards/rejected": -4.220450810023716, + "step": 627 + }, + { + "epoch": 0.23183055696552998, + "grad_norm": 6.40625, + "kl": 1.2854466438293457, + "learning_rate": 8.853241389250981e-06, + "logits/chosen": 230422432.0, + "logits/rejected": 222821744.0, + "logps/chosen": -318.00469970703125, + "logps/rejected": -437.8396301269531, + "loss": 0.1281, + "rewards/chosen": 2.078754186630249, + "rewards/margins": 8.359339475631714, + "rewards/rejected": -6.280585289001465, + "step": 628 + }, + { + "epoch": 0.23219971390337318, + "grad_norm": 5.90625, + "kl": 3.8596253395080566, + "learning_rate": 8.849488226966975e-06, + "logits/chosen": 279470865.06666666, + "logits/rejected": 193229839.05882353, + "logps/chosen": -317.9427083333333, + "logps/rejected": -345.8791934742647, + "loss": 0.128, + "rewards/chosen": 2.5792195638020834, + "rewards/margins": 7.135888911228554, + "rewards/rejected": -4.556669347426471, + "step": 629 + }, + { + "epoch": 0.23256887084121639, + "grad_norm": 5.25, + "kl": 1.2317075729370117, + "learning_rate": 8.84572973126594e-06, + "logits/chosen": 215481190.4, + "logits/rejected": 254320579.7647059, + "logps/chosen": -386.1049479166667, + "logps/rejected": -391.14036649816177, + "loss": 0.0855, + "rewards/chosen": 2.352142588297526, + "rewards/margins": 7.841773792341644, + "rewards/rejected": -5.489631204044118, + "step": 630 + }, + { + "epoch": 0.2329380277790596, + "grad_norm": 5.9375, + "kl": 0.0, + "learning_rate": 8.84196590735522e-06, + "logits/chosen": 204443728.0, + "logits/rejected": 331297344.0, + "logps/chosen": -273.8930969238281, + "logps/rejected": -522.4186401367188, + "loss": 0.1198, + "rewards/chosen": 1.8016976118087769, + "rewards/margins": 7.855640769004822, + "rewards/rejected": -6.053943157196045, + "step": 631 + }, + { + "epoch": 0.23330718471690276, + "grad_norm": 5.15625, + "kl": 0.0, + "learning_rate": 8.83819676044955e-06, + "logits/chosen": 201241134.54545453, + "logits/rejected": 241420092.95238096, + "logps/chosen": -308.71855024857956, + "logps/rejected": -417.8501209077381, + "loss": 0.0807, + "rewards/chosen": 2.6589263569224966, + "rewards/margins": 7.93707918604731, + "rewards/rejected": -5.278152829124814, + "step": 632 + }, + { + "epoch": 0.23367634165474596, + "grad_norm": 5.5625, + "kl": 0.0, + "learning_rate": 8.83442229577103e-06, + "logits/chosen": 296536726.5882353, + "logits/rejected": 294826837.3333333, + "logps/chosen": -369.41139131433823, + "logps/rejected": -429.77630208333335, + "loss": 0.0994, + "rewards/chosen": 1.9242143069996553, + "rewards/margins": 7.891331107943666, + "rewards/rejected": -5.9671168009440105, + "step": 633 + }, + { + "epoch": 0.23404549859258916, + "grad_norm": 7.34375, + "kl": 1.138209342956543, + "learning_rate": 8.830642518549135e-06, + "logits/chosen": 249792843.29411766, + "logits/rejected": 223018120.53333333, + "logps/chosen": -359.60153377757354, + "logps/rejected": -449.9572265625, + "loss": 0.1305, + "rewards/chosen": 2.130841647877413, + "rewards/margins": 7.517199781829236, + "rewards/rejected": -5.3863581339518225, + "step": 634 + }, + { + "epoch": 0.23441465553043236, + "grad_norm": 11.125, + "kl": 0.0, + "learning_rate": 8.8268574340207e-06, + "logits/chosen": 256865890.46153846, + "logits/rejected": 199784501.89473686, + "logps/chosen": -364.82711087740387, + "logps/rejected": -405.92349403782896, + "loss": 0.1256, + "rewards/chosen": 1.4762042118952825, + "rewards/margins": 6.623967915894049, + "rewards/rejected": -5.147763703998766, + "step": 635 + }, + { + "epoch": 0.23478381246827557, + "grad_norm": 6.5625, + "kl": 0.7939519882202148, + "learning_rate": 8.823067047429908e-06, + "logits/chosen": 305144516.9230769, + "logits/rejected": 344084857.2631579, + "logps/chosen": -332.68209134615387, + "logps/rejected": -399.22543174342104, + "loss": 0.1358, + "rewards/chosen": 1.2324594350961537, + "rewards/margins": 6.419079243895496, + "rewards/rejected": -5.1866198087993425, + "step": 636 + }, + { + "epoch": 0.23515296940611877, + "grad_norm": 5.84375, + "kl": 0.0, + "learning_rate": 8.819271364028294e-06, + "logits/chosen": 256437418.66666666, + "logits/rejected": 222678031.05882353, + "logps/chosen": -334.06725260416664, + "logps/rejected": -395.59495634191177, + "loss": 0.122, + "rewards/chosen": 2.220800272623698, + "rewards/margins": 7.431268789253982, + "rewards/rejected": -5.210468516630285, + "step": 637 + }, + { + "epoch": 0.23552212634396197, + "grad_norm": 8.3125, + "kl": 0.2581338882446289, + "learning_rate": 8.815470389074727e-06, + "logits/chosen": 292856295.61904764, + "logits/rejected": 229870196.36363637, + "logps/chosen": -427.66489955357144, + "logps/rejected": -355.2411443536932, + "loss": 0.1428, + "rewards/chosen": 1.9724515279134114, + "rewards/margins": 8.12531685106682, + "rewards/rejected": -6.152865323153409, + "step": 638 + }, + { + "epoch": 0.23589128328180517, + "grad_norm": 5.1875, + "kl": 0.25444793701171875, + "learning_rate": 8.811664127835412e-06, + "logits/chosen": 191686414.2222222, + "logits/rejected": 328315977.14285713, + "logps/chosen": -339.81287977430554, + "logps/rejected": -411.6770717075893, + "loss": 0.0858, + "rewards/chosen": 2.4699403974745007, + "rewards/margins": 8.212714815896653, + "rewards/rejected": -5.742774418422154, + "step": 639 + }, + { + "epoch": 0.23626044021964837, + "grad_norm": 8.5, + "kl": 0.3148970603942871, + "learning_rate": 8.807852585583876e-06, + "logits/chosen": 270964897.68421054, + "logits/rejected": 207437213.53846154, + "logps/chosen": -445.89633018092104, + "logps/rejected": -389.32729867788464, + "loss": 0.1652, + "rewards/chosen": 1.6122248800177323, + "rewards/margins": 6.226153161361633, + "rewards/rejected": -4.613928281343901, + "step": 640 + }, + { + "epoch": 0.23662959715749157, + "grad_norm": 5.6875, + "kl": 0.0, + "learning_rate": 8.80403576760096e-06, + "logits/chosen": 232595840.0, + "logits/rejected": 233405107.2, + "logps/chosen": -330.7978108723958, + "logps/rejected": -471.369384765625, + "loss": 0.112, + "rewards/chosen": 1.5663385391235352, + "rewards/margins": 8.07342700958252, + "rewards/rejected": -6.507088470458984, + "step": 641 + }, + { + "epoch": 0.23699875409533477, + "grad_norm": 5.59375, + "kl": 0.0, + "learning_rate": 8.800213679174818e-06, + "logits/chosen": 229954523.42857143, + "logits/rejected": 243742208.0, + "logps/chosen": -351.77978515625, + "logps/rejected": -414.7880859375, + "loss": 0.0845, + "rewards/chosen": 3.1428140912737166, + "rewards/margins": 8.340511200919984, + "rewards/rejected": -5.197697109646267, + "step": 642 + }, + { + "epoch": 0.23736791103317798, + "grad_norm": 4.8125, + "kl": 0.0, + "learning_rate": 8.796386325600906e-06, + "logits/chosen": 226165956.92307693, + "logits/rejected": 222519013.0526316, + "logps/chosen": -377.0988581730769, + "logps/rejected": -460.0328947368421, + "loss": 0.062, + "rewards/chosen": 2.2981862288254957, + "rewards/margins": 9.015208773284789, + "rewards/rejected": -6.717022544459293, + "step": 643 + }, + { + "epoch": 0.23773706797102118, + "grad_norm": 4.9375, + "kl": 0.787755012512207, + "learning_rate": 8.79255371218197e-06, + "logits/chosen": 269329106.8235294, + "logits/rejected": 239363737.6, + "logps/chosen": -389.00295840992646, + "logps/rejected": -517.0658854166667, + "loss": 0.0661, + "rewards/chosen": 2.663448782528148, + "rewards/margins": 9.794053844377107, + "rewards/rejected": -7.130605061848958, + "step": 644 + }, + { + "epoch": 0.23810622490886438, + "grad_norm": 5.28125, + "kl": 0.11092996597290039, + "learning_rate": 8.78871584422805e-06, + "logits/chosen": 284443306.6666667, + "logits/rejected": 209552665.6, + "logps/chosen": -287.14475504557294, + "logps/rejected": -544.59609375, + "loss": 0.0783, + "rewards/chosen": 1.7842168807983398, + "rewards/margins": 9.276698875427247, + "rewards/rejected": -7.492481994628906, + "step": 645 + }, + { + "epoch": 0.23847538184670758, + "grad_norm": 6.28125, + "kl": 0.0, + "learning_rate": 8.784872727056464e-06, + "logits/chosen": 211407134.11764705, + "logits/rejected": 165529804.8, + "logps/chosen": -259.1180778952206, + "logps/rejected": -379.19404296875, + "loss": 0.1418, + "rewards/chosen": 1.4909994461957146, + "rewards/margins": 6.7585277258181105, + "rewards/rejected": -5.267528279622396, + "step": 646 + }, + { + "epoch": 0.23884453878455078, + "grad_norm": 4.3125, + "kl": 0.0, + "learning_rate": 8.781024365991802e-06, + "logits/chosen": 332978969.6, + "logits/rejected": 225566277.8181818, + "logps/chosen": -377.3185302734375, + "logps/rejected": -408.8728693181818, + "loss": 0.0583, + "rewards/chosen": 2.701150131225586, + "rewards/margins": 8.767627126520331, + "rewards/rejected": -6.066476995294744, + "step": 647 + }, + { + "epoch": 0.23921369572239398, + "grad_norm": 5.96875, + "kl": 0.0, + "learning_rate": 8.777170766365916e-06, + "logits/chosen": 300885026.1333333, + "logits/rejected": 205106808.47058824, + "logps/chosen": -336.1438802083333, + "logps/rejected": -312.74755859375, + "loss": 0.113, + "rewards/chosen": 1.744888432820638, + "rewards/margins": 6.685568992764342, + "rewards/rejected": -4.940680559943704, + "step": 648 + }, + { + "epoch": 0.23958285266023718, + "grad_norm": 6.6875, + "kl": 0.0, + "learning_rate": 8.773311933517923e-06, + "logits/chosen": 254754816.0, + "logits/rejected": 231922852.57142857, + "logps/chosen": -469.62830946180554, + "logps/rejected": -414.50191824776783, + "loss": 0.1099, + "rewards/chosen": 2.1709406110975475, + "rewards/margins": 9.551620695326063, + "rewards/rejected": -7.380680084228516, + "step": 649 + }, + { + "epoch": 0.23995200959808038, + "grad_norm": 3.96875, + "kl": 0.0, + "learning_rate": 8.769447872794185e-06, + "logits/chosen": 231047862.85714287, + "logits/rejected": 152310343.1111111, + "logps/chosen": -377.09092494419644, + "logps/rejected": -378.4485677083333, + "loss": 0.0559, + "rewards/chosen": 3.582268033708845, + "rewards/margins": 8.941230198693654, + "rewards/rejected": -5.358962164984809, + "step": 650 + }, + { + "epoch": 0.24032116653592359, + "grad_norm": 5.3125, + "kl": 0.0, + "learning_rate": 8.765578589548309e-06, + "logits/chosen": 296009955.5555556, + "logits/rejected": 282066194.28571427, + "logps/chosen": -422.12299262152777, + "logps/rejected": -423.55796595982144, + "loss": 0.0815, + "rewards/chosen": 2.3016526963975696, + "rewards/margins": 8.829375736297123, + "rewards/rejected": -6.527723039899554, + "step": 651 + }, + { + "epoch": 0.2406903234737668, + "grad_norm": 5.09375, + "kl": 0.0, + "learning_rate": 8.76170408914114e-06, + "logits/chosen": 254014179.55555555, + "logits/rejected": 227544978.2857143, + "logps/chosen": -238.81434461805554, + "logps/rejected": -462.85128348214283, + "loss": 0.1443, + "rewards/chosen": 1.665324952867296, + "rewards/margins": 7.80042845105368, + "rewards/rejected": -6.135103498186384, + "step": 652 + }, + { + "epoch": 0.24105948041161, + "grad_norm": 5.6875, + "kl": 0.6342678070068359, + "learning_rate": 8.757824376940748e-06, + "logits/chosen": 290617398.85714287, + "logits/rejected": 266496853.33333334, + "logps/chosen": -380.8541782924107, + "logps/rejected": -450.8310275607639, + "loss": 0.0923, + "rewards/chosen": 2.6433045523507253, + "rewards/margins": 7.574207245357453, + "rewards/rejected": -4.930902693006727, + "step": 653 + }, + { + "epoch": 0.2414286373494532, + "grad_norm": 6.03125, + "kl": 0.0, + "learning_rate": 8.75393945832242e-06, + "logits/chosen": 250830405.8181818, + "logits/rejected": 393430553.6, + "logps/chosen": -411.47243430397725, + "logps/rejected": -439.0123046875, + "loss": 0.1069, + "rewards/chosen": 2.3715974634343926, + "rewards/margins": 8.624555379694158, + "rewards/rejected": -6.252957916259765, + "step": 654 + }, + { + "epoch": 0.2417977942872964, + "grad_norm": 5.875, + "kl": 0.0, + "learning_rate": 8.75004933866867e-06, + "logits/chosen": 230379136.0, + "logits/rejected": 240649523.2, + "logps/chosen": -285.0461832682292, + "logps/rejected": -401.93681640625, + "loss": 0.0899, + "rewards/chosen": 2.666788101196289, + "rewards/margins": 8.464592361450196, + "rewards/rejected": -5.797804260253907, + "step": 655 + }, + { + "epoch": 0.2421669512251396, + "grad_norm": 7.21875, + "kl": 0.0, + "learning_rate": 8.746154023369204e-06, + "logits/chosen": 307703239.1111111, + "logits/rejected": 237602962.2857143, + "logps/chosen": -379.4772135416667, + "logps/rejected": -416.4843052455357, + "loss": 0.1184, + "rewards/chosen": 1.8218498229980469, + "rewards/margins": 7.63097163609096, + "rewards/rejected": -5.809121813092913, + "step": 656 + }, + { + "epoch": 0.2425361081629828, + "grad_norm": 6.34375, + "kl": 0.0, + "learning_rate": 8.742253517820933e-06, + "logits/chosen": 160108684.8, + "logits/rejected": 231276906.66666666, + "logps/chosen": -242.341650390625, + "logps/rejected": -480.0375162760417, + "loss": 0.13, + "rewards/chosen": 2.0768001556396483, + "rewards/margins": 8.510437774658204, + "rewards/rejected": -6.433637619018555, + "step": 657 + }, + { + "epoch": 0.242905265100826, + "grad_norm": 5.09375, + "kl": 0.0, + "learning_rate": 8.738347827427957e-06, + "logits/chosen": 284124603.73333335, + "logits/rejected": 230583672.47058824, + "logps/chosen": -412.85169270833336, + "logps/rejected": -412.63752297794116, + "loss": 0.0964, + "rewards/chosen": 2.010995101928711, + "rewards/margins": 7.771058722103343, + "rewards/rejected": -5.760063620174632, + "step": 658 + }, + { + "epoch": 0.2432744220386692, + "grad_norm": 4.625, + "kl": 1.6405730247497559, + "learning_rate": 8.734436957601564e-06, + "logits/chosen": 182978480.0, + "logits/rejected": 221504080.0, + "logps/chosen": -317.68951416015625, + "logps/rejected": -415.6993713378906, + "loss": 0.083, + "rewards/chosen": 2.9818661212921143, + "rewards/margins": 8.683639764785767, + "rewards/rejected": -5.701773643493652, + "step": 659 + }, + { + "epoch": 0.2436435789765124, + "grad_norm": 7.375, + "kl": 0.6098418235778809, + "learning_rate": 8.730520913760209e-06, + "logits/chosen": 280490792.42105263, + "logits/rejected": 277149656.61538464, + "logps/chosen": -355.0436369243421, + "logps/rejected": -489.5424053485577, + "loss": 0.1328, + "rewards/chosen": 1.9525640387284129, + "rewards/margins": 7.855667206922524, + "rewards/rejected": -5.903103168194111, + "step": 660 + }, + { + "epoch": 0.2440127359143556, + "grad_norm": 5.8125, + "kl": 0.0, + "learning_rate": 8.726599701329526e-06, + "logits/chosen": 178690375.1111111, + "logits/rejected": 226672512.0, + "logps/chosen": -308.4845377604167, + "logps/rejected": -378.48988560267856, + "loss": 0.1148, + "rewards/chosen": 1.9904085795084636, + "rewards/margins": 7.311896551223029, + "rewards/rejected": -5.321487971714565, + "step": 661 + }, + { + "epoch": 0.2443818928521988, + "grad_norm": 3.515625, + "kl": 0.0, + "learning_rate": 8.722673325742302e-06, + "logits/chosen": 230523136.0, + "logits/rejected": 229628550.7368421, + "logps/chosen": -327.83939302884613, + "logps/rejected": -493.2783717105263, + "loss": 0.0456, + "rewards/chosen": 3.3722454951359677, + "rewards/margins": 9.206348326524742, + "rewards/rejected": -5.834102831388774, + "step": 662 + }, + { + "epoch": 0.244751049790042, + "grad_norm": 4.9375, + "kl": 0.0, + "learning_rate": 8.718741792438481e-06, + "logits/chosen": 260324538.1818182, + "logits/rejected": 182649831.6190476, + "logps/chosen": -343.72432084517044, + "logps/rejected": -405.7005208333333, + "loss": 0.0749, + "rewards/chosen": 2.950458526611328, + "rewards/margins": 9.164986201695033, + "rewards/rejected": -6.214527675083706, + "step": 663 + }, + { + "epoch": 0.2451202067278852, + "grad_norm": 3.875, + "kl": 0.0, + "learning_rate": 8.714805106865151e-06, + "logits/chosen": 205380498.2857143, + "logits/rejected": 312604241.92, + "logps/chosen": -225.52001953125, + "logps/rejected": -411.2094140625, + "loss": 0.0624, + "rewards/chosen": 1.374983787536621, + "rewards/margins": 7.578246726989746, + "rewards/rejected": -6.203262939453125, + "step": 664 + }, + { + "epoch": 0.2454893636657284, + "grad_norm": 5.46875, + "kl": 0.0, + "learning_rate": 8.710863274476544e-06, + "logits/chosen": 142174116.57142857, + "logits/rejected": 303447324.4444444, + "logps/chosen": -235.75840541294642, + "logps/rejected": -371.57085503472223, + "loss": 0.1215, + "rewards/chosen": 2.1140287944248746, + "rewards/margins": 7.094079199291411, + "rewards/rejected": -4.980050404866536, + "step": 665 + }, + { + "epoch": 0.2458585206035716, + "grad_norm": 7.40625, + "kl": 1.50592041015625, + "learning_rate": 8.706916300734017e-06, + "logits/chosen": 225265592.8888889, + "logits/rejected": 228347721.14285713, + "logps/chosen": -350.87969292534723, + "logps/rejected": -436.73193359375, + "loss": 0.1614, + "rewards/chosen": 2.2046529981825085, + "rewards/margins": 7.731110648503379, + "rewards/rejected": -5.526457650320871, + "step": 666 + }, + { + "epoch": 0.2462276775414148, + "grad_norm": 5.21875, + "kl": 0.0, + "learning_rate": 8.70296419110605e-06, + "logits/chosen": 239506568.53333333, + "logits/rejected": 272546394.35294116, + "logps/chosen": -347.77734375, + "logps/rejected": -458.2992589613971, + "loss": 0.0815, + "rewards/chosen": 2.4604838053385416, + "rewards/margins": 8.225256108302695, + "rewards/rejected": -5.764772302964154, + "step": 667 + }, + { + "epoch": 0.24659683447925798, + "grad_norm": 5.5, + "kl": 0.0, + "learning_rate": 8.69900695106824e-06, + "logits/chosen": 233097020.2352941, + "logits/rejected": 180090368.0, + "logps/chosen": -369.4953182444853, + "logps/rejected": -420.23059895833336, + "loss": 0.0922, + "rewards/chosen": 2.2258466832778034, + "rewards/margins": 7.793456343108533, + "rewards/rejected": -5.567609659830729, + "step": 668 + }, + { + "epoch": 0.24696599141710118, + "grad_norm": 6.40625, + "kl": 0.0, + "learning_rate": 8.695044586103297e-06, + "logits/chosen": 231458261.33333334, + "logits/rejected": 224249764.57142857, + "logps/chosen": -316.2102322048611, + "logps/rejected": -445.98172433035717, + "loss": 0.1258, + "rewards/chosen": 1.9141195085313585, + "rewards/margins": 7.741897371080187, + "rewards/rejected": -5.827777862548828, + "step": 669 + }, + { + "epoch": 0.24733514835494438, + "grad_norm": 5.1875, + "kl": 0.0, + "learning_rate": 8.691077101701024e-06, + "logits/chosen": 208209271.46666667, + "logits/rejected": 233190415.05882353, + "logps/chosen": -327.9370442708333, + "logps/rejected": -471.7634708180147, + "loss": 0.0788, + "rewards/chosen": 2.364660898844401, + "rewards/margins": 8.697049384023629, + "rewards/rejected": -6.332388485179228, + "step": 670 + }, + { + "epoch": 0.24770430529278759, + "grad_norm": 6.34375, + "kl": 0.5507364273071289, + "learning_rate": 8.68710450335832e-06, + "logits/chosen": 257241600.0, + "logits/rejected": 258203407.05882353, + "logps/chosen": -403.3157552083333, + "logps/rejected": -461.5848173253676, + "loss": 0.073, + "rewards/chosen": 2.789012145996094, + "rewards/margins": 7.968857170553768, + "rewards/rejected": -5.179845024557674, + "step": 671 + }, + { + "epoch": 0.2480734622306308, + "grad_norm": 6.46875, + "kl": 0.7336206436157227, + "learning_rate": 8.683126796579173e-06, + "logits/chosen": 299602176.0, + "logits/rejected": 223047789.7142857, + "logps/chosen": -372.7840983072917, + "logps/rejected": -352.0560825892857, + "loss": 0.1142, + "rewards/chosen": 2.2932427724202475, + "rewards/margins": 8.186340241205125, + "rewards/rejected": -5.893097468784878, + "step": 672 + }, + { + "epoch": 0.248442619168474, + "grad_norm": 5.90625, + "kl": 1.0212020874023438, + "learning_rate": 8.679143986874643e-06, + "logits/chosen": 257720432.0, + "logits/rejected": 151473344.0, + "logps/chosen": -379.28387451171875, + "logps/rejected": -371.5189514160156, + "loss": 0.0964, + "rewards/chosen": 2.6678109169006348, + "rewards/margins": 8.272007942199707, + "rewards/rejected": -5.604197025299072, + "step": 673 + }, + { + "epoch": 0.2488117761063172, + "grad_norm": 5.71875, + "kl": 0.0, + "learning_rate": 8.67515607976286e-06, + "logits/chosen": 309384252.2352941, + "logits/rejected": 195349128.53333333, + "logps/chosen": -325.1136259191176, + "logps/rejected": -397.98333333333335, + "loss": 0.0953, + "rewards/chosen": 2.2222761266371784, + "rewards/margins": 8.074968794280407, + "rewards/rejected": -5.852692667643229, + "step": 674 + }, + { + "epoch": 0.2491809330441604, + "grad_norm": 6.03125, + "kl": 0.5933456420898438, + "learning_rate": 8.671163080769025e-06, + "logits/chosen": 256980496.0, + "logits/rejected": 290762752.0, + "logps/chosen": -358.3360900878906, + "logps/rejected": -445.25341796875, + "loss": 0.1032, + "rewards/chosen": 2.3379106521606445, + "rewards/margins": 7.752274036407471, + "rewards/rejected": -5.414363384246826, + "step": 675 + }, + { + "epoch": 0.2495500899820036, + "grad_norm": 4.0, + "kl": 1.7412166595458984, + "learning_rate": 8.66716499542538e-06, + "logits/chosen": 251589947.07692307, + "logits/rejected": 218006972.63157895, + "logps/chosen": -409.2912785456731, + "logps/rejected": -427.16776315789474, + "loss": 0.0517, + "rewards/chosen": 3.0900946397047777, + "rewards/margins": 8.438400592881177, + "rewards/rejected": -5.348305953176398, + "step": 676 + }, + { + "epoch": 0.2499192469198468, + "grad_norm": 5.40625, + "kl": 1.6449594497680664, + "learning_rate": 8.663161829271226e-06, + "logits/chosen": 276152427.7894737, + "logits/rejected": 193944083.69230768, + "logps/chosen": -362.53536184210526, + "logps/rejected": -389.19764122596155, + "loss": 0.0736, + "rewards/chosen": 3.1345333300138774, + "rewards/margins": 8.93555930751538, + "rewards/rejected": -5.801025977501502, + "step": 677 + }, + { + "epoch": 0.25028840385769, + "grad_norm": 4.5625, + "kl": 0.0, + "learning_rate": 8.659153587852895e-06, + "logits/chosen": 259090318.2222222, + "logits/rejected": 224200228.57142857, + "logps/chosen": -266.048828125, + "logps/rejected": -426.73960658482144, + "loss": 0.098, + "rewards/chosen": 2.4377318488226996, + "rewards/margins": 9.749561552017454, + "rewards/rejected": -7.311829703194754, + "step": 678 + }, + { + "epoch": 0.25028840385769, + "eval_kl": 0.3567507565021515, + "eval_logits/chosen": 256374499.17880794, + "eval_logits/rejected": 219713719.98108748, + "eval_logps/chosen": -359.5354235651214, + "eval_logps/rejected": -436.6929669030733, + "eval_loss": 0.09856116026639938, + "eval_rewards/chosen": 2.4010131027524833, + "eval_rewards/margins": 8.244950681246868, + "eval_rewards/rejected": -5.843937578494385, + "eval_runtime": 46.8243, + "eval_samples_per_second": 18.708, + "eval_steps_per_second": 4.677, + "step": 678 + }, + { + "epoch": 0.2506575607955332, + "grad_norm": 8.3125, + "kl": 0.0, + "learning_rate": 8.65514027672376e-06, + "logits/chosen": 240621184.0, + "logits/rejected": 249775573.33333334, + "logps/chosen": -346.5876708984375, + "logps/rejected": -390.3330485026042, + "loss": 0.1577, + "rewards/chosen": 1.7507843017578124, + "rewards/margins": 6.004586283365885, + "rewards/rejected": -4.253801981608073, + "step": 679 + }, + { + "epoch": 0.2510267177333764, + "grad_norm": 8.25, + "kl": 1.4420185089111328, + "learning_rate": 8.651121901444208e-06, + "logits/chosen": 257785372.44444445, + "logits/rejected": 260636013.7142857, + "logps/chosen": -347.09795464409723, + "logps/rejected": -486.88211495535717, + "loss": 0.1588, + "rewards/chosen": 1.6079223420884874, + "rewards/margins": 7.502420894683354, + "rewards/rejected": -5.894498552594866, + "step": 680 + }, + { + "epoch": 0.2513958746712196, + "grad_norm": 6.625, + "kl": 0.0, + "learning_rate": 8.64709846758165e-06, + "logits/chosen": 295729186.1333333, + "logits/rejected": 287852604.2352941, + "logps/chosen": -344.12721354166666, + "logps/rejected": -436.21375229779414, + "loss": 0.109, + "rewards/chosen": 2.347617340087891, + "rewards/margins": 7.754132124956916, + "rewards/rejected": -5.406514784869025, + "step": 681 + }, + { + "epoch": 0.25176503160906283, + "grad_norm": 6.28125, + "kl": 2.039674758911133, + "learning_rate": 8.643069980710502e-06, + "logits/chosen": 226842142.11764705, + "logits/rejected": 207527065.6, + "logps/chosen": -377.37916475183823, + "logps/rejected": -403.60325520833334, + "loss": 0.1146, + "rewards/chosen": 2.494303086224724, + "rewards/margins": 7.64957550647212, + "rewards/rejected": -5.155272420247396, + "step": 682 + }, + { + "epoch": 0.252134188546906, + "grad_norm": 5.1875, + "kl": 0.0, + "learning_rate": 8.639036446412177e-06, + "logits/chosen": 186376719.05882353, + "logits/rejected": 213091618.13333333, + "logps/chosen": -315.4688074448529, + "logps/rejected": -334.1505533854167, + "loss": 0.0941, + "rewards/chosen": 2.75016627592199, + "rewards/margins": 7.611318416221469, + "rewards/rejected": -4.861152140299479, + "step": 683 + }, + { + "epoch": 0.25250334548474923, + "grad_norm": 4.65625, + "kl": 0.0, + "learning_rate": 8.634997870275092e-06, + "logits/chosen": 219403712.0, + "logits/rejected": 251867408.0, + "logps/chosen": -299.425048828125, + "logps/rejected": -356.34014892578125, + "loss": 0.07, + "rewards/chosen": 2.538137912750244, + "rewards/margins": 7.900886058807373, + "rewards/rejected": -5.362748146057129, + "step": 684 + }, + { + "epoch": 0.2528725024225924, + "grad_norm": 5.875, + "kl": 0.5854701995849609, + "learning_rate": 8.63095425789464e-06, + "logits/chosen": 298388894.47619045, + "logits/rejected": 317316677.8181818, + "logps/chosen": -413.6670851934524, + "logps/rejected": -539.2784978693181, + "loss": 0.1045, + "rewards/chosen": 2.1769080389113653, + "rewards/margins": 10.267081933620172, + "rewards/rejected": -8.090173894708807, + "step": 685 + }, + { + "epoch": 0.2532416593604356, + "grad_norm": 5.0, + "kl": 0.0, + "learning_rate": 8.62690561487319e-06, + "logits/chosen": 267728672.0, + "logits/rejected": 184569280.0, + "logps/chosen": -425.9144287109375, + "logps/rejected": -415.23284912109375, + "loss": 0.0623, + "rewards/chosen": 3.1339004039764404, + "rewards/margins": 9.212045431137085, + "rewards/rejected": -6.0781450271606445, + "step": 686 + }, + { + "epoch": 0.2536108162982788, + "grad_norm": 5.875, + "kl": 0.0, + "learning_rate": 8.622851946820094e-06, + "logits/chosen": 282989670.4, + "logits/rejected": 165964679.52941176, + "logps/chosen": -409.5865234375, + "logps/rejected": -371.44723690257354, + "loss": 0.1225, + "rewards/chosen": 1.7769744873046875, + "rewards/margins": 6.700179245892693, + "rewards/rejected": -4.923204758588006, + "step": 687 + }, + { + "epoch": 0.253979973236122, + "grad_norm": 4.8125, + "kl": 0.5401926040649414, + "learning_rate": 8.618793259351655e-06, + "logits/chosen": 276346758.0952381, + "logits/rejected": 220736605.0909091, + "logps/chosen": -382.1841982886905, + "logps/rejected": -377.8084161931818, + "loss": 0.0898, + "rewards/chosen": 2.423812684558687, + "rewards/margins": 8.506316957019624, + "rewards/rejected": -6.0825042724609375, + "step": 688 + }, + { + "epoch": 0.2543491301739652, + "grad_norm": 5.03125, + "kl": 0.0, + "learning_rate": 8.614729558091129e-06, + "logits/chosen": 291690688.0, + "logits/rejected": 292689376.0, + "logps/chosen": -350.90399169921875, + "logps/rejected": -541.0474243164062, + "loss": 0.0892, + "rewards/chosen": 1.9876635074615479, + "rewards/margins": 8.484615564346313, + "rewards/rejected": -6.496952056884766, + "step": 689 + }, + { + "epoch": 0.2547182871118084, + "grad_norm": 6.53125, + "kl": 1.7120561599731445, + "learning_rate": 8.610660848668723e-06, + "logits/chosen": 320211638.85714287, + "logits/rejected": 200796686.2222222, + "logps/chosen": -435.6629115513393, + "logps/rejected": -463.94428168402777, + "loss": 0.0929, + "rewards/chosen": 1.9466639927455358, + "rewards/margins": 8.651975661989242, + "rewards/rejected": -6.705311669243707, + "step": 690 + }, + { + "epoch": 0.2550874440496516, + "grad_norm": 5.46875, + "kl": 0.0, + "learning_rate": 8.606587136721586e-06, + "logits/chosen": 266496451.7647059, + "logits/rejected": 238826530.13333333, + "logps/chosen": -340.5887810202206, + "logps/rejected": -402.15439453125, + "loss": 0.0981, + "rewards/chosen": 2.7018798379337086, + "rewards/margins": 8.502733312868605, + "rewards/rejected": -5.800853474934896, + "step": 691 + }, + { + "epoch": 0.2554566009874948, + "grad_norm": 5.5, + "kl": 0.27271270751953125, + "learning_rate": 8.602508427893794e-06, + "logits/chosen": 169122252.8, + "logits/rejected": 200492672.0, + "logps/chosen": -364.3890380859375, + "logps/rejected": -438.8294270833333, + "loss": 0.0693, + "rewards/chosen": 3.6504676818847654, + "rewards/margins": 12.0618465423584, + "rewards/rejected": -8.411378860473633, + "step": 692 + }, + { + "epoch": 0.255825757925338, + "grad_norm": 5.21875, + "kl": 0.0, + "learning_rate": 8.598424727836343e-06, + "logits/chosen": 248184362.66666666, + "logits/rejected": 273720115.2, + "logps/chosen": -279.5220133463542, + "logps/rejected": -440.602783203125, + "loss": 0.0935, + "rewards/chosen": 2.149454116821289, + "rewards/margins": 8.451185607910157, + "rewards/rejected": -6.3017314910888675, + "step": 693 + }, + { + "epoch": 0.2561949148631812, + "grad_norm": 5.15625, + "kl": 0.4198617935180664, + "learning_rate": 8.59433604220715e-06, + "logits/chosen": 216245009.06666666, + "logits/rejected": 292569750.5882353, + "logps/chosen": -315.58564453125, + "logps/rejected": -355.8133329503676, + "loss": 0.0818, + "rewards/chosen": 2.799352010091146, + "rewards/margins": 8.587788002163757, + "rewards/rejected": -5.7884359920726105, + "step": 694 + }, + { + "epoch": 0.2565640718010244, + "grad_norm": 5.03125, + "kl": 0.330350399017334, + "learning_rate": 8.590242376671035e-06, + "logits/chosen": 223786345.4117647, + "logits/rejected": 185790941.86666667, + "logps/chosen": -304.2479032628676, + "logps/rejected": -352.3444010416667, + "loss": 0.1124, + "rewards/chosen": 2.268825755399816, + "rewards/margins": 7.478222117704504, + "rewards/rejected": -5.2093963623046875, + "step": 695 + }, + { + "epoch": 0.2569332287388676, + "grad_norm": 4.15625, + "kl": 0.0, + "learning_rate": 8.586143736899721e-06, + "logits/chosen": 306490496.0, + "logits/rejected": 199027456.0, + "logps/chosen": -405.380859375, + "logps/rejected": -398.3295654296875, + "loss": 0.0542, + "rewards/chosen": 2.45389191309611, + "rewards/margins": 7.9956288973490395, + "rewards/rejected": -5.54173698425293, + "step": 696 + }, + { + "epoch": 0.2573023856767108, + "grad_norm": 4.125, + "kl": 0.0, + "learning_rate": 8.582040128571822e-06, + "logits/chosen": 194943385.6, + "logits/rejected": 321518351.0588235, + "logps/chosen": -279.17216796875, + "logps/rejected": -334.6399931066176, + "loss": 0.0749, + "rewards/chosen": 2.816547648111979, + "rewards/margins": 8.073737230487898, + "rewards/rejected": -5.257189582375919, + "step": 697 + }, + { + "epoch": 0.257671542614554, + "grad_norm": 5.96875, + "kl": 0.0, + "learning_rate": 8.577931557372832e-06, + "logits/chosen": 289170944.0, + "logits/rejected": 261968412.44444445, + "logps/chosen": -305.54854910714283, + "logps/rejected": -326.7439236111111, + "loss": 0.1024, + "rewards/chosen": 1.9438599177769251, + "rewards/margins": 7.537712914603097, + "rewards/rejected": -5.593852996826172, + "step": 698 + }, + { + "epoch": 0.2580406995523972, + "grad_norm": 5.8125, + "kl": 0.0, + "learning_rate": 8.573818028995129e-06, + "logits/chosen": 187767632.0, + "logits/rejected": 271255456.0, + "logps/chosen": -401.1234130859375, + "logps/rejected": -416.5828857421875, + "loss": 0.0699, + "rewards/chosen": 3.108492851257324, + "rewards/margins": 8.473710536956787, + "rewards/rejected": -5.365217685699463, + "step": 699 + }, + { + "epoch": 0.2584098564902404, + "grad_norm": 2.75, + "kl": 0.0, + "learning_rate": 8.56969954913795e-06, + "logits/chosen": 285968738.46153843, + "logits/rejected": 251515365.0526316, + "logps/chosen": -332.4595477764423, + "logps/rejected": -479.9418174342105, + "loss": 0.0506, + "rewards/chosen": 2.691746638371394, + "rewards/margins": 9.462260875624683, + "rewards/rejected": -6.770514237253289, + "step": 700 + }, + { + "epoch": 0.2587790134280836, + "grad_norm": 4.0, + "kl": 0.0, + "learning_rate": 8.565576123507398e-06, + "logits/chosen": 208341312.0, + "logits/rejected": 186656528.0, + "logps/chosen": -304.7563171386719, + "logps/rejected": -414.6758117675781, + "loss": 0.0813, + "rewards/chosen": 2.7538599967956543, + "rewards/margins": 9.122286796569824, + "rewards/rejected": -6.36842679977417, + "step": 701 + }, + { + "epoch": 0.2591481703659268, + "grad_norm": 5.09375, + "kl": 0.5111942291259766, + "learning_rate": 8.561447757816428e-06, + "logits/chosen": 314290651.4285714, + "logits/rejected": 187252366.2222222, + "logps/chosen": -373.680908203125, + "logps/rejected": -438.12822808159723, + "loss": 0.0645, + "rewards/chosen": 2.2566590990339006, + "rewards/margins": 7.644941451057555, + "rewards/rejected": -5.388282352023655, + "step": 702 + }, + { + "epoch": 0.25951732730377003, + "grad_norm": 6.78125, + "kl": 0.0, + "learning_rate": 8.557314457784838e-06, + "logits/chosen": 201849600.0, + "logits/rejected": 264852720.94117647, + "logps/chosen": -438.8167317708333, + "logps/rejected": -421.25890395220586, + "loss": 0.1076, + "rewards/chosen": 2.1320510864257813, + "rewards/margins": 8.322165276022519, + "rewards/rejected": -6.190114189596737, + "step": 703 + }, + { + "epoch": 0.2598864842416132, + "grad_norm": 5.96875, + "kl": 0.0, + "learning_rate": 8.553176229139262e-06, + "logits/chosen": 309569805.4736842, + "logits/rejected": 334176374.15384614, + "logps/chosen": -335.9902857730263, + "logps/rejected": -445.01697716346155, + "loss": 0.1085, + "rewards/chosen": 2.0675805744371916, + "rewards/margins": 9.13987495638581, + "rewards/rejected": -7.072294381948618, + "step": 704 + }, + { + "epoch": 0.26025564117945643, + "grad_norm": 5.53125, + "kl": 0.0, + "learning_rate": 8.54903307761316e-06, + "logits/chosen": 259897571.55555555, + "logits/rejected": 238577133.7142857, + "logps/chosen": -351.6988932291667, + "logps/rejected": -543.4986746651786, + "loss": 0.0906, + "rewards/chosen": 2.6584498087565103, + "rewards/margins": 11.123358953566777, + "rewards/rejected": -8.464909144810267, + "step": 705 + }, + { + "epoch": 0.2606247981172996, + "grad_norm": 6.53125, + "kl": 0.17580223083496094, + "learning_rate": 8.544885008946822e-06, + "logits/chosen": 237119514.9473684, + "logits/rejected": 289043633.2307692, + "logps/chosen": -398.6000462582237, + "logps/rejected": -509.0920973557692, + "loss": 0.1097, + "rewards/chosen": 2.3695042258814762, + "rewards/margins": 8.804903794879372, + "rewards/rejected": -6.435399568997896, + "step": 706 + }, + { + "epoch": 0.26099395505514283, + "grad_norm": 5.25, + "kl": 0.0, + "learning_rate": 8.54073202888734e-06, + "logits/chosen": 352865129.4117647, + "logits/rejected": 188430677.33333334, + "logps/chosen": -398.76438993566177, + "logps/rejected": -334.21142578125, + "loss": 0.1215, + "rewards/chosen": 2.4545507992015168, + "rewards/margins": 8.0462947770661, + "rewards/rejected": -5.591743977864583, + "step": 707 + }, + { + "epoch": 0.261363111992986, + "grad_norm": 5.375, + "kl": 0.25768566131591797, + "learning_rate": 8.536574143188619e-06, + "logits/chosen": 337944214.5882353, + "logits/rejected": 332001348.26666665, + "logps/chosen": -341.24661075367646, + "logps/rejected": -395.9751953125, + "loss": 0.0949, + "rewards/chosen": 2.6459640054141773, + "rewards/margins": 8.606043961468865, + "rewards/rejected": -5.960079956054687, + "step": 708 + }, + { + "epoch": 0.26173226893082924, + "grad_norm": 6.5, + "kl": 0.0, + "learning_rate": 8.532411357611352e-06, + "logits/chosen": 244664652.8, + "logits/rejected": 248945728.0, + "logps/chosen": -333.1787841796875, + "logps/rejected": -457.7737630208333, + "loss": 0.1298, + "rewards/chosen": 2.1174720764160155, + "rewards/margins": 7.262501462300618, + "rewards/rejected": -5.145029385884603, + "step": 709 + }, + { + "epoch": 0.2621014258686724, + "grad_norm": 5.75, + "kl": 0.0, + "learning_rate": 8.528243677923028e-06, + "logits/chosen": 211674717.0909091, + "logits/rejected": 234779501.7142857, + "logps/chosen": -317.97611860795456, + "logps/rejected": -486.2318638392857, + "loss": 0.0844, + "rewards/chosen": 1.9157035134055398, + "rewards/margins": 9.001936016660748, + "rewards/rejected": -7.086232503255208, + "step": 710 + }, + { + "epoch": 0.26247058280651564, + "grad_norm": 4.1875, + "kl": 0.0, + "learning_rate": 8.524071109897915e-06, + "logits/chosen": 290438144.0, + "logits/rejected": 163599239.52941176, + "logps/chosen": -348.1847330729167, + "logps/rejected": -373.81497012867646, + "loss": 0.0744, + "rewards/chosen": 2.67490234375, + "rewards/margins": 8.511414741067325, + "rewards/rejected": -5.836512397317326, + "step": 711 + }, + { + "epoch": 0.2628397397443588, + "grad_norm": 4.8125, + "kl": 0.0, + "learning_rate": 8.519893659317052e-06, + "logits/chosen": 247677801.4117647, + "logits/rejected": 213795276.8, + "logps/chosen": -318.5609777113971, + "logps/rejected": -442.70286458333334, + "loss": 0.1027, + "rewards/chosen": 2.815192503087661, + "rewards/margins": 8.565964089187922, + "rewards/rejected": -5.75077158610026, + "step": 712 + }, + { + "epoch": 0.26320889668220204, + "grad_norm": 5.75, + "kl": 0.7089872360229492, + "learning_rate": 8.515711331968242e-06, + "logits/chosen": 181130917.6470588, + "logits/rejected": 311619072.0, + "logps/chosen": -343.2169404871324, + "logps/rejected": -509.96373697916664, + "loss": 0.0889, + "rewards/chosen": 2.6210302465102253, + "rewards/margins": 8.253234938079236, + "rewards/rejected": -5.632204691569011, + "step": 713 + }, + { + "epoch": 0.2635780536200452, + "grad_norm": 5.53125, + "kl": 1.452826976776123, + "learning_rate": 8.51152413364605e-06, + "logits/chosen": 267353429.33333334, + "logits/rejected": 198712996.57142857, + "logps/chosen": -368.75830078125, + "logps/rejected": -453.78390066964283, + "loss": 0.0978, + "rewards/chosen": 2.6669798956976996, + "rewards/margins": 9.480709318130735, + "rewards/rejected": -6.813729422433036, + "step": 714 + }, + { + "epoch": 0.26394721055788845, + "grad_norm": 6.5, + "kl": 0.0, + "learning_rate": 8.507332070151784e-06, + "logits/chosen": 253710546.82352942, + "logits/rejected": 204260130.13333333, + "logps/chosen": -394.86695772058823, + "logps/rejected": -468.2548828125, + "loss": 0.1038, + "rewards/chosen": 2.1987051122328816, + "rewards/margins": 9.48358975578757, + "rewards/rejected": -7.284884643554688, + "step": 715 + }, + { + "epoch": 0.2643163674957316, + "grad_norm": 4.8125, + "kl": 0.0, + "learning_rate": 8.503135147293496e-06, + "logits/chosen": 233594282.66666666, + "logits/rejected": 257929892.57142857, + "logps/chosen": -374.47577582465277, + "logps/rejected": -458.4471958705357, + "loss": 0.0832, + "rewards/chosen": 2.454294628567166, + "rewards/margins": 8.291431033422077, + "rewards/rejected": -5.837136404854911, + "step": 716 + }, + { + "epoch": 0.26468552443357485, + "grad_norm": 5.5625, + "kl": 0.0, + "learning_rate": 8.498933370885967e-06, + "logits/chosen": 225555507.2, + "logits/rejected": 326200681.4117647, + "logps/chosen": -356.16546223958335, + "logps/rejected": -499.30152803308823, + "loss": 0.1107, + "rewards/chosen": 1.7224960327148438, + "rewards/margins": 8.971439585966223, + "rewards/rejected": -7.248943553251379, + "step": 717 + }, + { + "epoch": 0.265054681371418, + "grad_norm": 5.96875, + "kl": 1.4775772094726562, + "learning_rate": 8.494726746750705e-06, + "logits/chosen": 174797718.5882353, + "logits/rejected": 188767300.26666668, + "logps/chosen": -271.40538832720586, + "logps/rejected": -422.7184244791667, + "loss": 0.1365, + "rewards/chosen": 2.237364825080423, + "rewards/margins": 8.238037229051777, + "rewards/rejected": -6.000672403971354, + "step": 718 + }, + { + "epoch": 0.26542383830926125, + "grad_norm": 4.78125, + "kl": 0.0, + "learning_rate": 8.490515280715937e-06, + "logits/chosen": 366374314.6666667, + "logits/rejected": 356152429.71428573, + "logps/chosen": -418.87730577256946, + "logps/rejected": -422.019287109375, + "loss": 0.0952, + "rewards/chosen": 2.453215069240994, + "rewards/margins": 7.772558091178773, + "rewards/rejected": -5.319343021937779, + "step": 719 + }, + { + "epoch": 0.2657929952471044, + "grad_norm": 5.5625, + "kl": 0.5961647033691406, + "learning_rate": 8.486298978616593e-06, + "logits/chosen": 218003438.93333334, + "logits/rejected": 200772020.70588234, + "logps/chosen": -440.07763671875, + "logps/rejected": -426.75267118566177, + "loss": 0.0813, + "rewards/chosen": 2.876020304361979, + "rewards/margins": 8.42999288521561, + "rewards/rejected": -5.553972580853631, + "step": 720 + }, + { + "epoch": 0.26616215218494765, + "grad_norm": 6.0625, + "kl": 0.0, + "learning_rate": 8.48207784629431e-06, + "logits/chosen": 260467772.2352941, + "logits/rejected": 281303603.2, + "logps/chosen": -309.97248391544116, + "logps/rejected": -339.99085286458336, + "loss": 0.0983, + "rewards/chosen": 2.1440739351160385, + "rewards/margins": 7.567372071509268, + "rewards/rejected": -5.423298136393229, + "step": 721 + }, + { + "epoch": 0.2665313091227908, + "grad_norm": 4.75, + "kl": 0.0, + "learning_rate": 8.477851889597408e-06, + "logits/chosen": 220617767.3846154, + "logits/rejected": 282693254.7368421, + "logps/chosen": -351.3605769230769, + "logps/rejected": -455.81630345394734, + "loss": 0.0684, + "rewards/chosen": 2.449261738703801, + "rewards/margins": 8.662262789150965, + "rewards/rejected": -6.213001050447163, + "step": 722 + }, + { + "epoch": 0.266900466060634, + "grad_norm": 6.03125, + "kl": 0.7202465534210205, + "learning_rate": 8.473621114380899e-06, + "logits/chosen": 325410936.4705882, + "logits/rejected": 227816977.06666666, + "logps/chosen": -392.47093290441177, + "logps/rejected": -371.30491536458334, + "loss": 0.0943, + "rewards/chosen": 2.0740892746869255, + "rewards/margins": 7.503762024524165, + "rewards/rejected": -5.4296727498372395, + "step": 723 + }, + { + "epoch": 0.26726962299847723, + "grad_norm": 6.59375, + "kl": 0.19774913787841797, + "learning_rate": 8.469385526506466e-06, + "logits/chosen": 243267020.8, + "logits/rejected": 298773443.7647059, + "logps/chosen": -359.3557942708333, + "logps/rejected": -422.3475700827206, + "loss": 0.1234, + "rewards/chosen": 1.8886937459309896, + "rewards/margins": 7.03247013466031, + "rewards/rejected": -5.14377638872932, + "step": 724 + }, + { + "epoch": 0.2676387799363204, + "grad_norm": 5.65625, + "kl": 0.8858447074890137, + "learning_rate": 8.465145131842467e-06, + "logits/chosen": 304109329.06666666, + "logits/rejected": 268162469.6470588, + "logps/chosen": -400.5693033854167, + "logps/rejected": -502.20559512867646, + "loss": 0.101, + "rewards/chosen": 2.0783917744954428, + "rewards/margins": 9.613169247496362, + "rewards/rejected": -7.534777473000919, + "step": 725 + }, + { + "epoch": 0.26800793687416363, + "grad_norm": 4.65625, + "kl": 0.0, + "learning_rate": 8.46089993626391e-06, + "logits/chosen": 171869476.57142857, + "logits/rejected": 254324394.66666666, + "logps/chosen": -387.35630580357144, + "logps/rejected": -435.4052463107639, + "loss": 0.0801, + "rewards/chosen": 2.983088629586356, + "rewards/margins": 8.294113613310314, + "rewards/rejected": -5.311024983723958, + "step": 726 + }, + { + "epoch": 0.2683770938120068, + "grad_norm": 6.40625, + "kl": 0.0, + "learning_rate": 8.456649945652463e-06, + "logits/chosen": 377866069.3333333, + "logits/rejected": 332641701.64705884, + "logps/chosen": -371.1646484375, + "logps/rejected": -468.95085592830884, + "loss": 0.0955, + "rewards/chosen": 2.0112635294596353, + "rewards/margins": 8.416237176633349, + "rewards/rejected": -6.404973647173713, + "step": 727 + }, + { + "epoch": 0.26874625074985004, + "grad_norm": 6.1875, + "kl": 0.0, + "learning_rate": 8.452395165896433e-06, + "logits/chosen": 248017078.85714287, + "logits/rejected": 253810659.55555555, + "logps/chosen": -320.67710658482144, + "logps/rejected": -375.5655110677083, + "loss": 0.1426, + "rewards/chosen": 1.4967459269932337, + "rewards/margins": 6.612790713234554, + "rewards/rejected": -5.11604478624132, + "step": 728 + }, + { + "epoch": 0.2691154076876932, + "grad_norm": 5.3125, + "kl": 0.0, + "learning_rate": 8.448135602890763e-06, + "logits/chosen": 202775552.0, + "logits/rejected": 207678236.44444445, + "logps/chosen": -332.77650669642856, + "logps/rejected": -272.0021701388889, + "loss": 0.0938, + "rewards/chosen": 2.153000695364816, + "rewards/margins": 7.038323266165597, + "rewards/rejected": -4.885322570800781, + "step": 729 + }, + { + "epoch": 0.26948456462553644, + "grad_norm": 5.9375, + "kl": 0.47258853912353516, + "learning_rate": 8.443871262537023e-06, + "logits/chosen": 262464917.33333334, + "logits/rejected": 260237760.0, + "logps/chosen": -350.8126220703125, + "logps/rejected": -500.7622375488281, + "loss": 0.118, + "rewards/chosen": 2.405032475789388, + "rewards/margins": 8.84138854344686, + "rewards/rejected": -6.436356067657471, + "step": 730 + }, + { + "epoch": 0.2698537215633796, + "grad_norm": 5.28125, + "kl": 0.0, + "learning_rate": 8.4396021507434e-06, + "logits/chosen": 253358816.0, + "logits/rejected": 245797488.0, + "logps/chosen": -398.142333984375, + "logps/rejected": -424.6960754394531, + "loss": 0.0994, + "rewards/chosen": 2.387129545211792, + "rewards/margins": 7.5161988735198975, + "rewards/rejected": -5.1290693283081055, + "step": 731 + }, + { + "epoch": 0.27022287850122284, + "grad_norm": 6.375, + "kl": 0.0, + "learning_rate": 8.4353282734247e-06, + "logits/chosen": 207017144.8888889, + "logits/rejected": 276921380.5714286, + "logps/chosen": -336.82085503472223, + "logps/rejected": -581.1847446986607, + "loss": 0.1127, + "rewards/chosen": 2.2115099165174694, + "rewards/margins": 9.759867743840292, + "rewards/rejected": -7.548357827322824, + "step": 732 + }, + { + "epoch": 0.270592035439066, + "grad_norm": 5.34375, + "kl": 0.016203880310058594, + "learning_rate": 8.431049636502322e-06, + "logits/chosen": 197340213.89473686, + "logits/rejected": 205576723.69230768, + "logps/chosen": -308.6588199013158, + "logps/rejected": -442.9746844951923, + "loss": 0.1127, + "rewards/chosen": 2.4347530164216695, + "rewards/margins": 8.889958493622691, + "rewards/rejected": -6.455205477201021, + "step": 733 + }, + { + "epoch": 0.27096119237690924, + "grad_norm": 6.25, + "kl": 0.0, + "learning_rate": 8.426766245904263e-06, + "logits/chosen": 223430734.76923078, + "logits/rejected": 257445106.52631578, + "logps/chosen": -286.7685359074519, + "logps/rejected": -344.46127158717104, + "loss": 0.1023, + "rewards/chosen": 2.413998383742112, + "rewards/margins": 8.201237056902063, + "rewards/rejected": -5.78723867315995, + "step": 734 + }, + { + "epoch": 0.2713303493147524, + "grad_norm": 5.15625, + "kl": 0.3857088088989258, + "learning_rate": 8.422478107565108e-06, + "logits/chosen": 133875344.0, + "logits/rejected": 230957056.0, + "logps/chosen": -316.09820556640625, + "logps/rejected": -465.5191650390625, + "loss": 0.0717, + "rewards/chosen": 3.2499260902404785, + "rewards/margins": 9.71707534790039, + "rewards/rejected": -6.467149257659912, + "step": 735 + }, + { + "epoch": 0.27169950625259565, + "grad_norm": 5.03125, + "kl": 0.043150901794433594, + "learning_rate": 8.418185227426016e-06, + "logits/chosen": 222350144.0, + "logits/rejected": 235212432.0, + "logps/chosen": -268.96575927734375, + "logps/rejected": -383.253662109375, + "loss": 0.0828, + "rewards/chosen": 2.886469841003418, + "rewards/margins": 7.2354230880737305, + "rewards/rejected": -4.3489532470703125, + "step": 736 + }, + { + "epoch": 0.2720686631904388, + "grad_norm": 6.375, + "kl": 0.03530406951904297, + "learning_rate": 8.41388761143472e-06, + "logits/chosen": 173272484.57142857, + "logits/rejected": 229654812.44444445, + "logps/chosen": -346.125, + "logps/rejected": -424.41514756944446, + "loss": 0.0961, + "rewards/chosen": 1.9092987605503626, + "rewards/margins": 8.596126102265858, + "rewards/rejected": -6.686827341715495, + "step": 737 + }, + { + "epoch": 0.27243782012828205, + "grad_norm": 6.03125, + "kl": 0.0, + "learning_rate": 8.409585265545509e-06, + "logits/chosen": 347876384.0, + "logits/rejected": 225610512.0, + "logps/chosen": -348.552978515625, + "logps/rejected": -429.8204345703125, + "loss": 0.1134, + "rewards/chosen": 1.8456971645355225, + "rewards/margins": 7.867223024368286, + "rewards/rejected": -6.021525859832764, + "step": 738 + }, + { + "epoch": 0.2728069770661252, + "grad_norm": 5.75, + "kl": 0.1256885528564453, + "learning_rate": 8.405278195719233e-06, + "logits/chosen": 321146538.6666667, + "logits/rejected": 289559625.14285713, + "logps/chosen": -390.9485677083333, + "logps/rejected": -410.8606654575893, + "loss": 0.0875, + "rewards/chosen": 2.5283985137939453, + "rewards/margins": 7.791637693132673, + "rewards/rejected": -5.263239179338727, + "step": 739 + }, + { + "epoch": 0.27317613400396845, + "grad_norm": 5.0, + "kl": 0.0, + "learning_rate": 8.40096640792328e-06, + "logits/chosen": 254019456.0, + "logits/rejected": 280817638.4, + "logps/chosen": -332.69854736328125, + "logps/rejected": -510.11416015625, + "loss": 0.0862, + "rewards/chosen": 3.2085158030192056, + "rewards/margins": 9.863972345987955, + "rewards/rejected": -6.65545654296875, + "step": 740 + }, + { + "epoch": 0.2735452909418116, + "grad_norm": 4.4375, + "kl": 0.45699310302734375, + "learning_rate": 8.396649908131578e-06, + "logits/chosen": 199625284.26666668, + "logits/rejected": 200278256.94117647, + "logps/chosen": -349.39720052083334, + "logps/rejected": -413.41868681066177, + "loss": 0.0714, + "rewards/chosen": 2.398865509033203, + "rewards/margins": 8.812079799876493, + "rewards/rejected": -6.413214290843291, + "step": 741 + }, + { + "epoch": 0.27391444787965485, + "grad_norm": 5.6875, + "kl": 0.0, + "learning_rate": 8.39232870232458e-06, + "logits/chosen": 207360128.0, + "logits/rejected": 222878048.0, + "logps/chosen": -319.7125549316406, + "logps/rejected": -417.67437744140625, + "loss": 0.094, + "rewards/chosen": 2.149592399597168, + "rewards/margins": 8.441375255584717, + "rewards/rejected": -6.291782855987549, + "step": 742 + }, + { + "epoch": 0.27428360481749803, + "grad_norm": 4.875, + "kl": 0.2862682342529297, + "learning_rate": 8.388002796489267e-06, + "logits/chosen": 211508361.84615386, + "logits/rejected": 180659348.21052632, + "logps/chosen": -301.4758112980769, + "logps/rejected": -429.4033974095395, + "loss": 0.0755, + "rewards/chosen": 2.9847347552959738, + "rewards/margins": 9.750941998562833, + "rewards/rejected": -6.766207243266859, + "step": 743 + }, + { + "epoch": 0.27465276175534126, + "grad_norm": 6.78125, + "kl": 0.0, + "learning_rate": 8.383672196619123e-06, + "logits/chosen": 237717148.44444445, + "logits/rejected": 276293979.4285714, + "logps/chosen": -348.1155598958333, + "logps/rejected": -440.7263881138393, + "loss": 0.1503, + "rewards/chosen": 1.846641116672092, + "rewards/margins": 7.4280522058880525, + "rewards/rejected": -5.58141108921596, + "step": 744 + }, + { + "epoch": 0.27502191869318443, + "grad_norm": 6.375, + "kl": 0.0, + "learning_rate": 8.37933690871414e-06, + "logits/chosen": 268253003.29411766, + "logits/rejected": 387163955.2, + "logps/chosen": -393.2119715073529, + "logps/rejected": -459.86481119791665, + "loss": 0.0934, + "rewards/chosen": 2.633119695326861, + "rewards/margins": 8.247949203790403, + "rewards/rejected": -5.614829508463542, + "step": 745 + }, + { + "epoch": 0.27539107563102766, + "grad_norm": 4.6875, + "kl": 0.04340749979019165, + "learning_rate": 8.374996938780804e-06, + "logits/chosen": 160728982.5882353, + "logits/rejected": 178809378.13333333, + "logps/chosen": -247.6847426470588, + "logps/rejected": -348.02418619791666, + "loss": 0.0816, + "rewards/chosen": 2.720382241641774, + "rewards/margins": 8.579155028100107, + "rewards/rejected": -5.858772786458333, + "step": 746 + }, + { + "epoch": 0.27576023256887083, + "grad_norm": 6.03125, + "kl": 0.0, + "learning_rate": 8.370652292832087e-06, + "logits/chosen": 295428608.0, + "logits/rejected": 203033618.2857143, + "logps/chosen": -347.5049641927083, + "logps/rejected": -477.1664341517857, + "loss": 0.1121, + "rewards/chosen": 2.1811205546061196, + "rewards/margins": 8.449366796584357, + "rewards/rejected": -6.268246241978237, + "step": 747 + }, + { + "epoch": 0.27612938950671406, + "grad_norm": 4.25, + "kl": 0.0, + "learning_rate": 8.366302976887442e-06, + "logits/chosen": 321709129.14285713, + "logits/rejected": 241249336.8888889, + "logps/chosen": -408.60379464285717, + "logps/rejected": -344.1001247829861, + "loss": 0.0856, + "rewards/chosen": 2.726864678519113, + "rewards/margins": 6.937811503334651, + "rewards/rejected": -4.210946824815538, + "step": 748 + }, + { + "epoch": 0.27649854644455724, + "grad_norm": 7.15625, + "kl": 0.0, + "learning_rate": 8.361948996972792e-06, + "logits/chosen": 283450641.06666666, + "logits/rejected": 328407160.4705882, + "logps/chosen": -339.40569661458335, + "logps/rejected": -554.8844784007352, + "loss": 0.1058, + "rewards/chosen": 1.9913188934326171, + "rewards/margins": 8.960572882259592, + "rewards/rejected": -6.969253988826976, + "step": 749 + }, + { + "epoch": 0.27686770338240047, + "grad_norm": 6.1875, + "kl": 0.0, + "learning_rate": 8.357590359120518e-06, + "logits/chosen": 305609109.3333333, + "logits/rejected": 253146291.2, + "logps/chosen": -282.8706868489583, + "logps/rejected": -426.92099609375, + "loss": 0.1382, + "rewards/chosen": 0.9289507071177164, + "rewards/margins": 7.169762245814006, + "rewards/rejected": -6.240811538696289, + "step": 750 + }, + { + "epoch": 0.27723686032024364, + "grad_norm": 5.90625, + "kl": 0.3520522117614746, + "learning_rate": 8.353227069369461e-06, + "logits/chosen": 214803524.26666668, + "logits/rejected": 299525451.2941176, + "logps/chosen": -333.37268880208336, + "logps/rejected": -496.150390625, + "loss": 0.0893, + "rewards/chosen": 1.813582992553711, + "rewards/margins": 8.12028195998248, + "rewards/rejected": -6.306698967428768, + "step": 751 + }, + { + "epoch": 0.27760601725808687, + "grad_norm": 6.3125, + "kl": 0.8799362182617188, + "learning_rate": 8.348859133764902e-06, + "logits/chosen": 275418624.0, + "logits/rejected": 246709201.45454547, + "logps/chosen": -341.37534877232144, + "logps/rejected": -624.6654829545455, + "loss": 0.1309, + "rewards/chosen": 2.2827037629627047, + "rewards/margins": 9.644613109109722, + "rewards/rejected": -7.361909346147017, + "step": 752 + }, + { + "epoch": 0.27797517419593004, + "grad_norm": 4.625, + "kl": 0.0, + "learning_rate": 8.34448655835856e-06, + "logits/chosen": 313075200.0, + "logits/rejected": 281688206.2222222, + "logps/chosen": -266.02711704799106, + "logps/rejected": -355.4070638020833, + "loss": 0.1135, + "rewards/chosen": 1.7862649645124162, + "rewards/margins": 7.562609263828822, + "rewards/rejected": -5.776344299316406, + "step": 753 + }, + { + "epoch": 0.27834433113377327, + "grad_norm": 4.75, + "kl": 0.3806171417236328, + "learning_rate": 8.34010934920858e-06, + "logits/chosen": 308918723.7647059, + "logits/rejected": 210730547.2, + "logps/chosen": -356.52458639705884, + "logps/rejected": -494.0107096354167, + "loss": 0.0791, + "rewards/chosen": 2.249682482551126, + "rewards/margins": 8.425593432258157, + "rewards/rejected": -6.175910949707031, + "step": 754 + }, + { + "epoch": 0.27871348807161644, + "grad_norm": 4.8125, + "kl": 0.0, + "learning_rate": 8.335727512379535e-06, + "logits/chosen": 331514486.15384614, + "logits/rejected": 217551467.78947368, + "logps/chosen": -420.69786658653845, + "logps/rejected": -331.6268760279605, + "loss": 0.0836, + "rewards/chosen": 1.8621854048508863, + "rewards/margins": 8.289526665258986, + "rewards/rejected": -6.427341260408101, + "step": 755 + }, + { + "epoch": 0.2790826450094597, + "grad_norm": 4.96875, + "kl": 0.0, + "learning_rate": 8.3313410539424e-06, + "logits/chosen": 232429056.0, + "logits/rejected": 306732453.64705884, + "logps/chosen": -350.04860026041666, + "logps/rejected": -404.54831112132354, + "loss": 0.0673, + "rewards/chosen": 2.502319844563802, + "rewards/margins": 9.050104986452588, + "rewards/rejected": -6.547785141888787, + "step": 756 + }, + { + "epoch": 0.27945180194730285, + "grad_norm": 5.3125, + "kl": 0.0, + "learning_rate": 8.32694997997456e-06, + "logits/chosen": 207852629.33333334, + "logits/rejected": 332638310.4, + "logps/chosen": -346.1641438802083, + "logps/rejected": -794.596923828125, + "loss": 0.0853, + "rewards/chosen": 2.7173967361450195, + "rewards/margins": 12.493794441223145, + "rewards/rejected": -9.776397705078125, + "step": 757 + }, + { + "epoch": 0.279820958885146, + "grad_norm": 6.15625, + "kl": 0.45763111114501953, + "learning_rate": 8.322554296559792e-06, + "logits/chosen": 338126080.0, + "logits/rejected": 191676672.0, + "logps/chosen": -374.8341552734375, + "logps/rejected": -426.4644368489583, + "loss": 0.0968, + "rewards/chosen": 2.4591041564941407, + "rewards/margins": 9.00241305033366, + "rewards/rejected": -6.5433088938395185, + "step": 758 + }, + { + "epoch": 0.28019011582298925, + "grad_norm": 5.75, + "kl": 0.0, + "learning_rate": 8.318154009788257e-06, + "logits/chosen": 272902144.0, + "logits/rejected": 285375307.2941176, + "logps/chosen": -358.35553385416665, + "logps/rejected": -407.7252987132353, + "loss": 0.1017, + "rewards/chosen": 1.8469764709472656, + "rewards/margins": 7.78140406889074, + "rewards/rejected": -5.934427597943475, + "step": 759 + }, + { + "epoch": 0.2805592727608324, + "grad_norm": 6.65625, + "kl": 0.0, + "learning_rate": 8.3137491257565e-06, + "logits/chosen": 165849307.42857143, + "logits/rejected": 202783374.2222222, + "logps/chosen": -316.30991908482144, + "logps/rejected": -413.68565538194446, + "loss": 0.1018, + "rewards/chosen": 2.54909542628697, + "rewards/margins": 8.284065095205156, + "rewards/rejected": -5.734969668918186, + "step": 760 + }, + { + "epoch": 0.28092842969867565, + "grad_norm": 6.59375, + "kl": 1.1700210571289062, + "learning_rate": 8.30933965056743e-06, + "logits/chosen": 241133966.2222222, + "logits/rejected": 208942774.85714287, + "logps/chosen": -365.2458224826389, + "logps/rejected": -387.03585379464283, + "loss": 0.1253, + "rewards/chosen": 2.582380082872179, + "rewards/margins": 6.84053699553959, + "rewards/rejected": -4.258156912667411, + "step": 761 + }, + { + "epoch": 0.2812975866365188, + "grad_norm": 6.5625, + "kl": 0.0, + "learning_rate": 8.304925590330318e-06, + "logits/chosen": 168151552.0, + "logits/rejected": 274050816.0, + "logps/chosen": -322.73272705078125, + "logps/rejected": -475.64263916015625, + "loss": 0.1139, + "rewards/chosen": 2.0318005084991455, + "rewards/margins": 8.638540506362915, + "rewards/rejected": -6.6067399978637695, + "step": 762 + }, + { + "epoch": 0.28166674357436206, + "grad_norm": 6.25, + "kl": 0.0, + "learning_rate": 8.300506951160789e-06, + "logits/chosen": 247194336.0, + "logits/rejected": 166862608.0, + "logps/chosen": -380.8706970214844, + "logps/rejected": -448.81182861328125, + "loss": 0.1191, + "rewards/chosen": 2.002765417098999, + "rewards/margins": 8.582216024398804, + "rewards/rejected": -6.579450607299805, + "step": 763 + }, + { + "epoch": 0.28203590051220523, + "grad_norm": 6.21875, + "kl": 0.0, + "learning_rate": 8.296083739180812e-06, + "logits/chosen": 229902776.8888889, + "logits/rejected": 347696676.5714286, + "logps/chosen": -325.7649197048611, + "logps/rejected": -578.8180803571429, + "loss": 0.1337, + "rewards/chosen": 1.7535581588745117, + "rewards/margins": 7.539313316345215, + "rewards/rejected": -5.785755157470703, + "step": 764 + }, + { + "epoch": 0.28240505745004846, + "grad_norm": 6.0625, + "kl": 0.0, + "learning_rate": 8.29165596051869e-06, + "logits/chosen": 300836785.2307692, + "logits/rejected": 290330085.05263156, + "logps/chosen": -328.29244290865387, + "logps/rejected": -477.5605982730263, + "loss": 0.0789, + "rewards/chosen": 2.0778528360220103, + "rewards/margins": 8.335287881766254, + "rewards/rejected": -6.257435045744243, + "step": 765 + }, + { + "epoch": 0.28277421438789163, + "grad_norm": 5.125, + "kl": 0.0, + "learning_rate": 8.287223621309055e-06, + "logits/chosen": 270765056.0, + "logits/rejected": 226523776.0, + "logps/chosen": -316.4104309082031, + "logps/rejected": -343.7183837890625, + "loss": 0.0975, + "rewards/chosen": 2.01396107673645, + "rewards/margins": 7.10781455039978, + "rewards/rejected": -5.09385347366333, + "step": 766 + }, + { + "epoch": 0.28314337132573486, + "grad_norm": 5.8125, + "kl": 1.2220687866210938, + "learning_rate": 8.282786727692856e-06, + "logits/chosen": 195303648.0, + "logits/rejected": 184769216.0, + "logps/chosen": -358.1578369140625, + "logps/rejected": -400.37896728515625, + "loss": 0.1201, + "rewards/chosen": 2.6368861198425293, + "rewards/margins": 8.07764482498169, + "rewards/rejected": -5.44075870513916, + "step": 767 + }, + { + "epoch": 0.28351252826357803, + "grad_norm": 6.28125, + "kl": 0.0, + "learning_rate": 8.278345285817353e-06, + "logits/chosen": 334284642.46153843, + "logits/rejected": 225333557.89473686, + "logps/chosen": -483.2758037860577, + "logps/rejected": -434.7428042763158, + "loss": 0.0978, + "rewards/chosen": 1.6607259603647084, + "rewards/margins": 7.35846099390192, + "rewards/rejected": -5.697735033537212, + "step": 768 + }, + { + "epoch": 0.28388168520142126, + "grad_norm": 6.875, + "kl": 0.0, + "learning_rate": 8.273899301836111e-06, + "logits/chosen": 209704917.33333334, + "logits/rejected": 248486144.0, + "logps/chosen": -393.8550618489583, + "logps/rejected": -441.928076171875, + "loss": 0.0712, + "rewards/chosen": 2.3406616846720376, + "rewards/margins": 8.177373949686686, + "rewards/rejected": -5.836712265014649, + "step": 769 + }, + { + "epoch": 0.28425084213926444, + "grad_norm": 4.78125, + "kl": 0.0, + "learning_rate": 8.269448781908977e-06, + "logits/chosen": 201850693.8181818, + "logits/rejected": 238859117.7142857, + "logps/chosen": -371.35964133522725, + "logps/rejected": -557.8484468005952, + "loss": 0.0659, + "rewards/chosen": 2.2769818739457564, + "rewards/margins": 9.529102845625443, + "rewards/rejected": -7.2521209716796875, + "step": 770 + }, + { + "epoch": 0.28461999907710767, + "grad_norm": 4.375, + "kl": 0.5063080787658691, + "learning_rate": 8.264993732202094e-06, + "logits/chosen": 279161651.2, + "logits/rejected": 162664512.0, + "logps/chosen": -332.97001953125, + "logps/rejected": -379.1267496744792, + "loss": 0.0886, + "rewards/chosen": 2.393497085571289, + "rewards/margins": 8.1410582224528, + "rewards/rejected": -5.747561136881511, + "step": 771 + }, + { + "epoch": 0.28498915601495084, + "grad_norm": 4.71875, + "kl": 0.0, + "learning_rate": 8.260534158887878e-06, + "logits/chosen": 227260652.30769232, + "logits/rejected": 260657717.89473686, + "logps/chosen": -339.0392503004808, + "logps/rejected": -414.3467053865132, + "loss": 0.0669, + "rewards/chosen": 3.0221000084510217, + "rewards/margins": 8.531424734756532, + "rewards/rejected": -5.50932472630551, + "step": 772 + }, + { + "epoch": 0.28535831295279407, + "grad_norm": 4.78125, + "kl": 0.0, + "learning_rate": 8.256070068145009e-06, + "logits/chosen": 306942765.1764706, + "logits/rejected": 221972258.13333333, + "logps/chosen": -396.33263442095586, + "logps/rejected": -534.8915690104167, + "loss": 0.0682, + "rewards/chosen": 2.796603932100184, + "rewards/margins": 9.812010222790288, + "rewards/rejected": -7.0154062906901045, + "step": 773 + }, + { + "epoch": 0.28572746989063724, + "grad_norm": 5.15625, + "kl": 0.7940554618835449, + "learning_rate": 8.251601466158428e-06, + "logits/chosen": 229255054.2222222, + "logits/rejected": 278487734.85714287, + "logps/chosen": -335.2052951388889, + "logps/rejected": -421.02451869419644, + "loss": 0.0836, + "rewards/chosen": 3.2092617882622614, + "rewards/margins": 8.623149902101547, + "rewards/rejected": -5.413888113839286, + "step": 774 + }, + { + "epoch": 0.28609662682848047, + "grad_norm": 5.8125, + "kl": 0.0, + "learning_rate": 8.247128359119326e-06, + "logits/chosen": 247727232.0, + "logits/rejected": 265669088.0, + "logps/chosen": -298.2471618652344, + "logps/rejected": -448.49652099609375, + "loss": 0.1122, + "rewards/chosen": 1.995009422302246, + "rewards/margins": 7.803788185119629, + "rewards/rejected": -5.808778762817383, + "step": 775 + }, + { + "epoch": 0.28646578376632365, + "grad_norm": 7.15625, + "kl": 1.2444217205047607, + "learning_rate": 8.242650753225137e-06, + "logits/chosen": 311276544.0, + "logits/rejected": 181172553.14285713, + "logps/chosen": -299.7892795138889, + "logps/rejected": -329.3767787388393, + "loss": 0.2073, + "rewards/chosen": 1.8167487250434027, + "rewards/margins": 6.626832386804005, + "rewards/rejected": -4.810083661760602, + "step": 776 + }, + { + "epoch": 0.2868349407041669, + "grad_norm": 4.625, + "kl": 0.0, + "learning_rate": 8.238168654679528e-06, + "logits/chosen": 212808560.0, + "logits/rejected": 210908176.0, + "logps/chosen": -328.28302001953125, + "logps/rejected": -395.70989990234375, + "loss": 0.0743, + "rewards/chosen": 2.6572372913360596, + "rewards/margins": 8.928382635116577, + "rewards/rejected": -6.271145343780518, + "step": 777 + }, + { + "epoch": 0.28720409764201005, + "grad_norm": 5.0, + "kl": 0.0, + "learning_rate": 8.233682069692388e-06, + "logits/chosen": 243641429.33333334, + "logits/rejected": 290913228.8, + "logps/chosen": -407.6080322265625, + "logps/rejected": -481.281689453125, + "loss": 0.0917, + "rewards/chosen": 2.110375722249349, + "rewards/margins": 7.888924153645833, + "rewards/rejected": -5.7785484313964846, + "step": 778 + }, + { + "epoch": 0.2875732545798533, + "grad_norm": 6.53125, + "kl": 0.40582275390625, + "learning_rate": 8.229191004479825e-06, + "logits/chosen": 189739136.0, + "logits/rejected": 193757354.66666666, + "logps/chosen": -355.0426513671875, + "logps/rejected": -378.0916748046875, + "loss": 0.0982, + "rewards/chosen": 3.5125465393066406, + "rewards/margins": 8.386509259541828, + "rewards/rejected": -4.8739627202351885, + "step": 779 + }, + { + "epoch": 0.28794241151769645, + "grad_norm": 4.25, + "kl": 0.0, + "learning_rate": 8.22469546526415e-06, + "logits/chosen": 201205717.33333334, + "logits/rejected": 296741427.2, + "logps/chosen": -375.1667887369792, + "logps/rejected": -534.503564453125, + "loss": 0.0588, + "rewards/chosen": 2.7311410903930664, + "rewards/margins": 8.544121360778808, + "rewards/rejected": -5.812980270385742, + "step": 780 + }, + { + "epoch": 0.2883115684555397, + "grad_norm": 4.40625, + "kl": 0.5300159454345703, + "learning_rate": 8.220195458273879e-06, + "logits/chosen": 290153636.5714286, + "logits/rejected": 247912277.33333334, + "logps/chosen": -273.11083984375, + "logps/rejected": -454.37657335069446, + "loss": 0.1027, + "rewards/chosen": 2.269123077392578, + "rewards/margins": 7.844224294026692, + "rewards/rejected": -5.575101216634114, + "step": 781 + }, + { + "epoch": 0.28868072539338285, + "grad_norm": 5.65625, + "kl": 0.03036785125732422, + "learning_rate": 8.21569098974371e-06, + "logits/chosen": 325909353.4117647, + "logits/rejected": 158844108.8, + "logps/chosen": -372.6644933363971, + "logps/rejected": -362.9501953125, + "loss": 0.0823, + "rewards/chosen": 2.595255234662224, + "rewards/margins": 8.462342535280714, + "rewards/rejected": -5.86708730061849, + "step": 782 + }, + { + "epoch": 0.2890498823312261, + "grad_norm": 5.6875, + "kl": 0.22827625274658203, + "learning_rate": 8.211182065914531e-06, + "logits/chosen": 209430186.66666666, + "logits/rejected": 174990801.45454547, + "logps/chosen": -347.78590029761904, + "logps/rejected": -424.97713955965907, + "loss": 0.0986, + "rewards/chosen": 2.761286417643229, + "rewards/margins": 8.871406323982008, + "rewards/rejected": -6.110119906338778, + "step": 783 + }, + { + "epoch": 0.28941903926906926, + "grad_norm": 6.78125, + "kl": 0.0, + "learning_rate": 8.206668693033399e-06, + "logits/chosen": 290519200.0, + "logits/rejected": 234245968.0, + "logps/chosen": -349.91302490234375, + "logps/rejected": -464.9971923828125, + "loss": 0.1137, + "rewards/chosen": 2.2974319458007812, + "rewards/margins": 8.629261016845703, + "rewards/rejected": -6.331829071044922, + "step": 784 + }, + { + "epoch": 0.2897881962069125, + "grad_norm": 4.46875, + "kl": 0.0, + "learning_rate": 8.202150877353533e-06, + "logits/chosen": 225988266.66666666, + "logits/rejected": 192844236.8, + "logps/chosen": -357.3885498046875, + "logps/rejected": -399.8020263671875, + "loss": 0.0705, + "rewards/chosen": 2.5694526036580405, + "rewards/margins": 8.51629441579183, + "rewards/rejected": -5.946841812133789, + "step": 785 + }, + { + "epoch": 0.29015735314475566, + "grad_norm": 6.0625, + "kl": 0.5854759216308594, + "learning_rate": 8.197628625134306e-06, + "logits/chosen": 226467131.07692307, + "logits/rejected": 213598908.63157895, + "logps/chosen": -428.0724534254808, + "logps/rejected": -407.41331722861844, + "loss": 0.0863, + "rewards/chosen": 1.7252979278564453, + "rewards/margins": 7.47323317276804, + "rewards/rejected": -5.747935244911595, + "step": 786 + }, + { + "epoch": 0.2905265100825989, + "grad_norm": 5.375, + "kl": 0.0, + "learning_rate": 8.193101942641248e-06, + "logits/chosen": 288847598.93333334, + "logits/rejected": 300863488.0, + "logps/chosen": -374.71917317708335, + "logps/rejected": -516.6767578125, + "loss": 0.0859, + "rewards/chosen": 2.349878692626953, + "rewards/margins": 9.12084830789005, + "rewards/rejected": -6.770969615263097, + "step": 787 + }, + { + "epoch": 0.29089566702044206, + "grad_norm": 5.90625, + "kl": 0.0, + "learning_rate": 8.188570836146015e-06, + "logits/chosen": 292677277.53846157, + "logits/rejected": 248987432.42105263, + "logps/chosen": -458.35190054086536, + "logps/rejected": -467.73524876644734, + "loss": 0.0818, + "rewards/chosen": 2.543948540320763, + "rewards/margins": 8.703528168713033, + "rewards/rejected": -6.15957962839227, + "step": 788 + }, + { + "epoch": 0.2912648239582853, + "grad_norm": 6.09375, + "kl": 0.005974292755126953, + "learning_rate": 8.184035311926397e-06, + "logits/chosen": 317859990.5882353, + "logits/rejected": 220947968.0, + "logps/chosen": -415.67190372242646, + "logps/rejected": -461.45631510416666, + "loss": 0.1064, + "rewards/chosen": 2.2187327216653263, + "rewards/margins": 9.658219924627566, + "rewards/rejected": -7.43948720296224, + "step": 789 + }, + { + "epoch": 0.29163398089612846, + "grad_norm": 3.8125, + "kl": 0.0, + "learning_rate": 8.17949537626631e-06, + "logits/chosen": 220615152.94117647, + "logits/rejected": 173159611.73333332, + "logps/chosen": -404.5530790441176, + "logps/rejected": -460.8078125, + "loss": 0.0456, + "rewards/chosen": 3.6772384643554688, + "rewards/margins": 10.862145487467448, + "rewards/rejected": -7.184907023111979, + "step": 790 + }, + { + "epoch": 0.2920031378339717, + "grad_norm": 7.53125, + "kl": 1.976400375366211, + "learning_rate": 8.174951035455772e-06, + "logits/chosen": 195475968.0, + "logits/rejected": 163638931.69230768, + "logps/chosen": -363.7192896792763, + "logps/rejected": -368.68795072115387, + "loss": 0.1217, + "rewards/chosen": 2.2680742364180717, + "rewards/margins": 7.87898685285437, + "rewards/rejected": -5.610912616436298, + "step": 791 + }, + { + "epoch": 0.29237229477181487, + "grad_norm": 7.65625, + "kl": 0.09655380249023438, + "learning_rate": 8.170402295790913e-06, + "logits/chosen": 195814016.0, + "logits/rejected": 219071360.0, + "logps/chosen": -313.85419379340277, + "logps/rejected": -502.7302943638393, + "loss": 0.1547, + "rewards/chosen": 2.0147298177083335, + "rewards/margins": 8.067030407133556, + "rewards/rejected": -6.052300589425223, + "step": 792 + }, + { + "epoch": 0.2927414517096581, + "grad_norm": 3.296875, + "kl": 0.0, + "learning_rate": 8.165849163573954e-06, + "logits/chosen": 169335552.0, + "logits/rejected": 200295208.42105263, + "logps/chosen": -258.64926382211536, + "logps/rejected": -480.57216282894734, + "loss": 0.0726, + "rewards/chosen": 3.335330082820012, + "rewards/margins": 9.534054126816724, + "rewards/rejected": -6.198724043996711, + "step": 793 + }, + { + "epoch": 0.29311060864750127, + "grad_norm": 6.5625, + "kl": 1.7028961181640625, + "learning_rate": 8.161291645113198e-06, + "logits/chosen": 250533166.54545453, + "logits/rejected": 327962265.6, + "logps/chosen": -298.7818714488636, + "logps/rejected": -395.4207275390625, + "loss": 0.1169, + "rewards/chosen": 2.44302021373402, + "rewards/margins": 8.458299602161754, + "rewards/rejected": -6.015279388427734, + "step": 794 + }, + { + "epoch": 0.29347976558534444, + "grad_norm": 4.9375, + "kl": 0.0, + "learning_rate": 8.156729746723034e-06, + "logits/chosen": 195009979.73333332, + "logits/rejected": 192275456.0, + "logps/chosen": -354.8354166666667, + "logps/rejected": -377.96340762867646, + "loss": 0.0997, + "rewards/chosen": 2.326890055338542, + "rewards/margins": 8.375205664541207, + "rewards/rejected": -6.048315609202666, + "step": 795 + }, + { + "epoch": 0.2938489225231877, + "grad_norm": 5.40625, + "kl": 0.0, + "learning_rate": 8.15216347472391e-06, + "logits/chosen": 289507448.4705882, + "logits/rejected": 268936960.0, + "logps/chosen": -352.0415900735294, + "logps/rejected": -393.07255859375, + "loss": 0.0924, + "rewards/chosen": 1.9727363586425781, + "rewards/margins": 7.546282704671224, + "rewards/rejected": -5.573546346028646, + "step": 796 + }, + { + "epoch": 0.29421807946103085, + "grad_norm": 4.90625, + "kl": 0.0, + "learning_rate": 8.14759283544234e-06, + "logits/chosen": 251952412.44444445, + "logits/rejected": 279984822.85714287, + "logps/chosen": -405.9196506076389, + "logps/rejected": -437.81093052455356, + "loss": 0.08, + "rewards/chosen": 2.9280211130777993, + "rewards/margins": 8.483304795764742, + "rewards/rejected": -5.555283682686942, + "step": 797 + }, + { + "epoch": 0.2945872363988741, + "grad_norm": 4.71875, + "kl": 0.0, + "learning_rate": 8.143017835210886e-06, + "logits/chosen": 225062041.6, + "logits/rejected": 186777569.88235295, + "logps/chosen": -345.27210286458336, + "logps/rejected": -481.9931066176471, + "loss": 0.0683, + "rewards/chosen": 2.273810068766276, + "rewards/margins": 9.488742229985256, + "rewards/rejected": -7.21493216121898, + "step": 798 + }, + { + "epoch": 0.29495639333671725, + "grad_norm": 5.90625, + "kl": 1.9548540115356445, + "learning_rate": 8.138438480368153e-06, + "logits/chosen": 213571041.88235295, + "logits/rejected": 242189294.93333334, + "logps/chosen": -378.73512178308823, + "logps/rejected": -559.1952473958333, + "loss": 0.0836, + "rewards/chosen": 3.085896435905905, + "rewards/margins": 11.001148104200176, + "rewards/rejected": -7.915251668294271, + "step": 799 + }, + { + "epoch": 0.2953255502745605, + "grad_norm": 5.125, + "kl": 0.0, + "learning_rate": 8.133854777258779e-06, + "logits/chosen": 290989781.3333333, + "logits/rejected": 229041075.2, + "logps/chosen": -469.8994140625, + "logps/rejected": -447.719775390625, + "loss": 0.0625, + "rewards/chosen": 2.398653507232666, + "rewards/margins": 9.363356113433838, + "rewards/rejected": -6.964702606201172, + "step": 800 + }, + { + "epoch": 0.29569470721240365, + "grad_norm": 4.25, + "kl": 0.0, + "learning_rate": 8.129266732233427e-06, + "logits/chosen": 203513685.33333334, + "logits/rejected": 191565101.17647058, + "logps/chosen": -351.6177734375, + "logps/rejected": -500.9179113051471, + "loss": 0.0478, + "rewards/chosen": 3.5839004516601562, + "rewards/margins": 10.48675537109375, + "rewards/rejected": -6.902854919433594, + "step": 801 + }, + { + "epoch": 0.2960638641502469, + "grad_norm": 6.5625, + "kl": 1.731429100036621, + "learning_rate": 8.124674351648773e-06, + "logits/chosen": 177242394.9473684, + "logits/rejected": 233847532.30769232, + "logps/chosen": -299.69564658717104, + "logps/rejected": -523.7198768028846, + "loss": 0.1252, + "rewards/chosen": 1.952303033126028, + "rewards/margins": 7.228578667891653, + "rewards/rejected": -5.276275634765625, + "step": 802 + }, + { + "epoch": 0.29643302108809005, + "grad_norm": 6.875, + "kl": 0.0, + "learning_rate": 8.120077641867506e-06, + "logits/chosen": 279326144.0, + "logits/rejected": 211985424.0, + "logps/chosen": -373.06878662109375, + "logps/rejected": -401.65692138671875, + "loss": 0.1212, + "rewards/chosen": 2.065305233001709, + "rewards/margins": 7.846780300140381, + "rewards/rejected": -5.781475067138672, + "step": 803 + }, + { + "epoch": 0.2968021780259333, + "grad_norm": 4.75, + "kl": 0.1984691619873047, + "learning_rate": 8.115476609258303e-06, + "logits/chosen": 222534929.06666666, + "logits/rejected": 171507200.0, + "logps/chosen": -377.37766927083334, + "logps/rejected": -367.10710592830884, + "loss": 0.0577, + "rewards/chosen": 2.680867004394531, + "rewards/margins": 9.266865988338694, + "rewards/rejected": -6.5859989839441635, + "step": 804 + }, + { + "epoch": 0.29717133496377646, + "grad_norm": 4.0625, + "kl": 0.0, + "learning_rate": 8.110871260195843e-06, + "logits/chosen": 219114936.8888889, + "logits/rejected": 189209288.3478261, + "logps/chosen": -322.9238552517361, + "logps/rejected": -415.26821501358694, + "loss": 0.0501, + "rewards/chosen": 2.6480657789442272, + "rewards/margins": 8.270025962792733, + "rewards/rejected": -5.621960183848506, + "step": 805 + }, + { + "epoch": 0.2975404919016197, + "grad_norm": 5.1875, + "kl": 0.0, + "learning_rate": 8.106261601060773e-06, + "logits/chosen": 221603584.0, + "logits/rejected": 266601795.36842105, + "logps/chosen": -333.98257211538464, + "logps/rejected": -479.1025390625, + "loss": 0.0948, + "rewards/chosen": 1.8342899909386268, + "rewards/margins": 8.730924544546768, + "rewards/rejected": -6.896634553608141, + "step": 806 + }, + { + "epoch": 0.29790964883946286, + "grad_norm": 3.328125, + "kl": 0.0, + "learning_rate": 8.10164763823972e-06, + "logits/chosen": 252460846.54545453, + "logits/rejected": 258360295.6190476, + "logps/chosen": -358.48697176846593, + "logps/rejected": -474.5649181547619, + "loss": 0.036, + "rewards/chosen": 3.3361764387650923, + "rewards/margins": 10.06146373996487, + "rewards/rejected": -6.725287301199777, + "step": 807 + }, + { + "epoch": 0.2982788057773061, + "grad_norm": 6.09375, + "kl": 0.0, + "learning_rate": 8.097029378125269e-06, + "logits/chosen": 173072074.10526314, + "logits/rejected": 297103990.15384614, + "logps/chosen": -345.1012541118421, + "logps/rejected": -325.2435772235577, + "loss": 0.1251, + "rewards/chosen": 2.281991255910773, + "rewards/margins": 6.826352957289228, + "rewards/rejected": -4.544361701378455, + "step": 808 + }, + { + "epoch": 0.29864796271514926, + "grad_norm": 5.78125, + "kl": 0.6863951683044434, + "learning_rate": 8.092406827115964e-06, + "logits/chosen": 243503030.85714287, + "logits/rejected": 180368302.54545453, + "logps/chosen": -293.96117001488096, + "logps/rejected": -341.8806818181818, + "loss": 0.1338, + "rewards/chosen": 1.9867849804106212, + "rewards/margins": 6.700087939505969, + "rewards/rejected": -4.713302959095348, + "step": 809 + }, + { + "epoch": 0.2990171196529925, + "grad_norm": 6.6875, + "kl": 0.0, + "learning_rate": 8.087779991616287e-06, + "logits/chosen": 248037978.3529412, + "logits/rejected": 313799406.93333334, + "logps/chosen": -299.4707892922794, + "logps/rejected": -401.67649739583334, + "loss": 0.0923, + "rewards/chosen": 2.8192470774931064, + "rewards/margins": 7.74352025050743, + "rewards/rejected": -4.924273173014323, + "step": 810 + }, + { + "epoch": 0.29938627659083566, + "grad_norm": 6.96875, + "kl": 0.10757589340209961, + "learning_rate": 8.083148878036662e-06, + "logits/chosen": 337232504.4705882, + "logits/rejected": 199652027.73333332, + "logps/chosen": -386.25094784007354, + "logps/rejected": -385.13916015625, + "loss": 0.1384, + "rewards/chosen": 2.015961815329159, + "rewards/margins": 6.908828331442441, + "rewards/rejected": -4.892866516113282, + "step": 811 + }, + { + "epoch": 0.2997554335286789, + "grad_norm": 3.671875, + "kl": 0.0, + "learning_rate": 8.078513492793438e-06, + "logits/chosen": 231760640.0, + "logits/rejected": 250369536.0, + "logps/chosen": -430.1494140625, + "logps/rejected": -465.95811631944446, + "loss": 0.0453, + "rewards/chosen": 2.897709710257394, + "rewards/margins": 10.081025411212256, + "rewards/rejected": -7.183315700954861, + "step": 812 + }, + { + "epoch": 0.30012459046652207, + "grad_norm": 3.71875, + "kl": 0.0, + "learning_rate": 8.073873842308882e-06, + "logits/chosen": 241209623.27272728, + "logits/rejected": 206858971.42857143, + "logps/chosen": -277.3182927911932, + "logps/rejected": -454.7972470238095, + "loss": 0.0683, + "rewards/chosen": 2.7674616033380683, + "rewards/margins": 8.569606698436655, + "rewards/rejected": -5.802145095098586, + "step": 813 + }, + { + "epoch": 0.3004937474043653, + "grad_norm": 5.84375, + "kl": 1.9381217956542969, + "learning_rate": 8.06922993301117e-06, + "logits/chosen": 264743480.8888889, + "logits/rejected": 180764928.0, + "logps/chosen": -385.28168402777777, + "logps/rejected": -432.37367466517856, + "loss": 0.0932, + "rewards/chosen": 2.4384551578097873, + "rewards/margins": 8.640575802515423, + "rewards/rejected": -6.202120644705636, + "step": 814 + }, + { + "epoch": 0.30086290434220847, + "grad_norm": 5.46875, + "kl": 0.033295631408691406, + "learning_rate": 8.06458177133438e-06, + "logits/chosen": 248691898.1818182, + "logits/rejected": 268100169.14285713, + "logps/chosen": -424.22651811079544, + "logps/rejected": -513.3760230654761, + "loss": 0.0667, + "rewards/chosen": 2.650480270385742, + "rewards/margins": 9.164855502900624, + "rewards/rejected": -6.514375232514881, + "step": 815 + }, + { + "epoch": 0.3012320612800517, + "grad_norm": 3.921875, + "kl": 0.0, + "learning_rate": 8.059929363718482e-06, + "logits/chosen": 210493504.0, + "logits/rejected": 345219968.0, + "logps/chosen": -287.2474365234375, + "logps/rejected": -496.5328063964844, + "loss": 0.0692, + "rewards/chosen": 3.0678822994232178, + "rewards/margins": 8.58253264427185, + "rewards/rejected": -5.514650344848633, + "step": 816 + }, + { + "epoch": 0.3016012182178949, + "grad_norm": 4.28125, + "kl": 0.0, + "learning_rate": 8.055272716609325e-06, + "logits/chosen": 425614060.3076923, + "logits/rejected": 182840939.78947368, + "logps/chosen": -347.3449519230769, + "logps/rejected": -402.5709806743421, + "loss": 0.0866, + "rewards/chosen": 2.889071831336388, + "rewards/margins": 8.451706349608386, + "rewards/rejected": -5.562634518271999, + "step": 817 + }, + { + "epoch": 0.3019703751557381, + "grad_norm": 5.65625, + "kl": 0.0, + "learning_rate": 8.050611836458638e-06, + "logits/chosen": 218873178.3529412, + "logits/rejected": 162445312.0, + "logps/chosen": -425.14226217830884, + "logps/rejected": -524.4529622395834, + "loss": 0.0807, + "rewards/chosen": 2.221549763399012, + "rewards/margins": 9.539498676973231, + "rewards/rejected": -7.317948913574218, + "step": 818 + }, + { + "epoch": 0.3023395320935813, + "grad_norm": 5.15625, + "kl": 0.5441875457763672, + "learning_rate": 8.045946729724006e-06, + "logits/chosen": 293093376.0, + "logits/rejected": 193603997.53846154, + "logps/chosen": -433.892578125, + "logps/rejected": -414.0471754807692, + "loss": 0.0793, + "rewards/chosen": 3.0283626757170024, + "rewards/margins": 8.971078834070367, + "rewards/rejected": -5.942716158353365, + "step": 819 + }, + { + "epoch": 0.3027086890314245, + "grad_norm": 7.0625, + "kl": 1.9478137493133545, + "learning_rate": 8.041277402868881e-06, + "logits/chosen": 217663627.63636363, + "logits/rejected": 242415104.0, + "logps/chosen": -363.10220614346593, + "logps/rejected": -592.75947265625, + "loss": 0.1159, + "rewards/chosen": 2.3519092906605112, + "rewards/margins": 9.579879413951527, + "rewards/rejected": -7.227970123291016, + "step": 820 + }, + { + "epoch": 0.3030778459692677, + "grad_norm": 6.125, + "kl": 0.0, + "learning_rate": 8.036603862362553e-06, + "logits/chosen": 257317845.33333334, + "logits/rejected": 273989376.0, + "logps/chosen": -299.85068766276044, + "logps/rejected": -432.381640625, + "loss": 0.0776, + "rewards/chosen": 2.0615053176879883, + "rewards/margins": 8.606272315979004, + "rewards/rejected": -6.544766998291015, + "step": 821 + }, + { + "epoch": 0.3034470029071109, + "grad_norm": 3.9375, + "kl": 0.0, + "learning_rate": 8.031926114680153e-06, + "logits/chosen": 199197900.8, + "logits/rejected": 249855255.27272728, + "logps/chosen": -393.4353515625, + "logps/rejected": -422.67946555397725, + "loss": 0.0685, + "rewards/chosen": 2.8503725051879885, + "rewards/margins": 8.589138915322044, + "rewards/rejected": -5.738766410134056, + "step": 822 + }, + { + "epoch": 0.3038161598449541, + "grad_norm": 5.6875, + "kl": 0.23933172225952148, + "learning_rate": 8.027244166302641e-06, + "logits/chosen": 230534736.84210527, + "logits/rejected": 244186564.92307693, + "logps/chosen": -288.13340357730266, + "logps/rejected": -513.4924879807693, + "loss": 0.1166, + "rewards/chosen": 2.3286544398257605, + "rewards/margins": 9.6552999241632, + "rewards/rejected": -7.32664548433744, + "step": 823 + }, + { + "epoch": 0.3041853167827973, + "grad_norm": 4.96875, + "kl": 0.0, + "learning_rate": 8.022558023716799e-06, + "logits/chosen": 248457944.6153846, + "logits/rejected": 231142615.57894737, + "logps/chosen": -345.6100510817308, + "logps/rejected": -521.9856085526316, + "loss": 0.0652, + "rewards/chosen": 2.434076896080604, + "rewards/margins": 9.357349279921065, + "rewards/rejected": -6.923272383840461, + "step": 824 + }, + { + "epoch": 0.3045544737206405, + "grad_norm": 5.0, + "kl": 0.0, + "learning_rate": 8.017867693415214e-06, + "logits/chosen": 174307252.70588234, + "logits/rejected": 336266513.06666666, + "logps/chosen": -290.40363625919116, + "logps/rejected": -405.95465494791665, + "loss": 0.0826, + "rewards/chosen": 2.9943432527429916, + "rewards/margins": 9.268021332983878, + "rewards/rejected": -6.273678080240885, + "step": 825 + }, + { + "epoch": 0.3049236306584837, + "grad_norm": 7.84375, + "kl": 1.6439266204833984, + "learning_rate": 8.013173181896283e-06, + "logits/chosen": 236446746.9473684, + "logits/rejected": 238890732.30769232, + "logps/chosen": -339.66776315789474, + "logps/rejected": -488.67183743990387, + "loss": 0.1169, + "rewards/chosen": 2.4859888177169, + "rewards/margins": 8.00951669669827, + "rewards/rejected": -5.52352787898137, + "step": 826 + }, + { + "epoch": 0.3052927875963269, + "grad_norm": 7.53125, + "kl": 4.62020206451416, + "learning_rate": 8.008474495664189e-06, + "logits/chosen": 235855220.36363637, + "logits/rejected": 307587379.2, + "logps/chosen": -328.12875088778407, + "logps/rejected": -516.184619140625, + "loss": 0.1847, + "rewards/chosen": 2.366871573708274, + "rewards/margins": 8.952870871803977, + "rewards/rejected": -6.585999298095703, + "step": 827 + }, + { + "epoch": 0.3056619445341701, + "grad_norm": 5.6875, + "kl": 0.0, + "learning_rate": 8.003771641228905e-06, + "logits/chosen": 294135265.88235295, + "logits/rejected": 138502869.33333334, + "logps/chosen": -320.8693416819853, + "logps/rejected": -364.08082682291666, + "loss": 0.0847, + "rewards/chosen": 2.4042793722713696, + "rewards/margins": 8.143328139361213, + "rewards/rejected": -5.739048767089844, + "step": 828 + }, + { + "epoch": 0.3060311014720133, + "grad_norm": 7.90625, + "kl": 0.10834360122680664, + "learning_rate": 7.999064625106174e-06, + "logits/chosen": 161718662.0952381, + "logits/rejected": 198390248.72727272, + "logps/chosen": -362.44549851190476, + "logps/rejected": -448.21639737215907, + "loss": 0.1323, + "rewards/chosen": 2.6034669421968006, + "rewards/margins": 8.380875542050315, + "rewards/rejected": -5.777408599853516, + "step": 829 + }, + { + "epoch": 0.30640025840985646, + "grad_norm": 5.375, + "kl": 0.0, + "learning_rate": 7.994353453817508e-06, + "logits/chosen": 208066858.66666666, + "logits/rejected": 330414694.4, + "logps/chosen": -298.38673909505206, + "logps/rejected": -466.48955078125, + "loss": 0.0791, + "rewards/chosen": 2.7851263682047525, + "rewards/margins": 9.088597361246745, + "rewards/rejected": -6.3034709930419925, + "step": 830 + }, + { + "epoch": 0.3067694153476997, + "grad_norm": 7.125, + "kl": 0.0, + "learning_rate": 7.989638133890174e-06, + "logits/chosen": 238804805.8181818, + "logits/rejected": 262782259.2, + "logps/chosen": -381.7056995738636, + "logps/rejected": -524.555029296875, + "loss": 0.1263, + "rewards/chosen": 2.0243675925514917, + "rewards/margins": 9.590142128684304, + "rewards/rejected": -7.565774536132812, + "step": 831 + }, + { + "epoch": 0.30713857228554287, + "grad_norm": 6.78125, + "kl": 1.5584087371826172, + "learning_rate": 7.984918671857189e-06, + "logits/chosen": 338670912.0, + "logits/rejected": 335899968.0, + "logps/chosen": -380.1864013671875, + "logps/rejected": -578.3619995117188, + "loss": 0.1139, + "rewards/chosen": 2.3998148441314697, + "rewards/margins": 7.942064046859741, + "rewards/rejected": -5.5422492027282715, + "step": 832 + }, + { + "epoch": 0.3075077292233861, + "grad_norm": 5.21875, + "kl": 0.09849119186401367, + "learning_rate": 7.980195074257307e-06, + "logits/chosen": 247494716.2352941, + "logits/rejected": 229093120.0, + "logps/chosen": -370.3976045496324, + "logps/rejected": -415.41529947916666, + "loss": 0.084, + "rewards/chosen": 2.7424262551700367, + "rewards/margins": 9.082247625612744, + "rewards/rejected": -6.339821370442708, + "step": 833 + }, + { + "epoch": 0.30787688616122927, + "grad_norm": 5.46875, + "kl": 0.0, + "learning_rate": 7.975467347635012e-06, + "logits/chosen": 217805451.63636363, + "logits/rejected": 214657625.6, + "logps/chosen": -301.32712624289775, + "logps/rejected": -375.1062744140625, + "loss": 0.1155, + "rewards/chosen": 2.740518743341619, + "rewards/margins": 7.335017186945135, + "rewards/rejected": -4.5944984436035154, + "step": 834 + }, + { + "epoch": 0.3082460430990725, + "grad_norm": 4.625, + "kl": 0.0, + "learning_rate": 7.97073549854051e-06, + "logits/chosen": 227726572.30769232, + "logits/rejected": 262052756.21052632, + "logps/chosen": -345.05870643028845, + "logps/rejected": -462.58182565789474, + "loss": 0.06, + "rewards/chosen": 2.3662041884202223, + "rewards/margins": 9.10629480956537, + "rewards/rejected": -6.740090621145148, + "step": 835 + }, + { + "epoch": 0.30861520003691567, + "grad_norm": 5.09375, + "kl": 0.23682212829589844, + "learning_rate": 7.965999533529718e-06, + "logits/chosen": 329124864.0, + "logits/rejected": 276356788.7058824, + "logps/chosen": -376.84606119791664, + "logps/rejected": -446.4563993566176, + "loss": 0.0735, + "rewards/chosen": 3.095781707763672, + "rewards/margins": 8.843333120907054, + "rewards/rejected": -5.747551413143382, + "step": 836 + }, + { + "epoch": 0.3089843569747589, + "grad_norm": 4.4375, + "kl": 0.0, + "learning_rate": 7.961259459164254e-06, + "logits/chosen": 286554214.4, + "logits/rejected": 293465871.0588235, + "logps/chosen": -368.95751953125, + "logps/rejected": -446.5051700367647, + "loss": 0.0651, + "rewards/chosen": 3.6959823608398437, + "rewards/margins": 8.992609629911534, + "rewards/rejected": -5.296627269071691, + "step": 837 + }, + { + "epoch": 0.3093535139126021, + "grad_norm": 5.21875, + "kl": 0.0, + "learning_rate": 7.956515282011434e-06, + "logits/chosen": 135962948.26666668, + "logits/rejected": 216720670.11764705, + "logps/chosen": -224.350927734375, + "logps/rejected": -386.18701171875, + "loss": 0.0787, + "rewards/chosen": 2.9999059041341147, + "rewards/margins": 7.880343418495327, + "rewards/rejected": -4.880437514361213, + "step": 838 + }, + { + "epoch": 0.3097226708504453, + "grad_norm": 5.40625, + "kl": 0.0, + "learning_rate": 7.951767008644251e-06, + "logits/chosen": 270920248.8888889, + "logits/rejected": 143318765.7142857, + "logps/chosen": -374.531982421875, + "logps/rejected": -400.99135044642856, + "loss": 0.0679, + "rewards/chosen": 3.348568810356988, + "rewards/margins": 10.278667207748171, + "rewards/rejected": -6.930098397391183, + "step": 839 + }, + { + "epoch": 0.3100918277882885, + "grad_norm": 4.875, + "kl": 0.0, + "learning_rate": 7.94701464564138e-06, + "logits/chosen": 320839121.45454544, + "logits/rejected": 143891748.57142857, + "logps/chosen": -319.48848100142044, + "logps/rejected": -386.6476236979167, + "loss": 0.0996, + "rewards/chosen": 1.4141809290105647, + "rewards/margins": 6.833438683381845, + "rewards/rejected": -5.41925775437128, + "step": 840 + }, + { + "epoch": 0.3104609847261317, + "grad_norm": 5.3125, + "kl": 1.4703502655029297, + "learning_rate": 7.942258199587158e-06, + "logits/chosen": 324041984.0, + "logits/rejected": 255034341.0526316, + "logps/chosen": -294.1631610576923, + "logps/rejected": -487.87864925986844, + "loss": 0.0869, + "rewards/chosen": 2.8908122136042667, + "rewards/margins": 8.772208858598098, + "rewards/rejected": -5.881396644993832, + "step": 841 + }, + { + "epoch": 0.3108301416639749, + "grad_norm": 5.625, + "kl": 1.067491054534912, + "learning_rate": 7.937497677071583e-06, + "logits/chosen": 261718016.0, + "logits/rejected": 273483776.0, + "logps/chosen": -362.49288137335526, + "logps/rejected": -523.1572641225962, + "loss": 0.1204, + "rewards/chosen": 2.5455097399259867, + "rewards/margins": 8.973137303402549, + "rewards/rejected": -6.4276275634765625, + "step": 842 + }, + { + "epoch": 0.3111992986018181, + "grad_norm": 2.8125, + "kl": 0.0, + "learning_rate": 7.932733084690296e-06, + "logits/chosen": 178991344.0, + "logits/rejected": 225906496.0, + "logps/chosen": -323.2952575683594, + "logps/rejected": -408.0133056640625, + "loss": 0.0453, + "rewards/chosen": 3.5758068561553955, + "rewards/margins": 9.917983293533325, + "rewards/rejected": -6.34217643737793, + "step": 843 + }, + { + "epoch": 0.3115684555396613, + "grad_norm": 3.828125, + "kl": 1.2270784378051758, + "learning_rate": 7.92796442904458e-06, + "logits/chosen": 233516497.45454547, + "logits/rejected": 231321648.76190478, + "logps/chosen": -423.89870383522725, + "logps/rejected": -404.56761532738096, + "loss": 0.0589, + "rewards/chosen": 3.2065616954456675, + "rewards/margins": 9.149378656824945, + "rewards/rejected": -5.942816961379278, + "step": 844 + }, + { + "epoch": 0.3119376124775045, + "grad_norm": 7.375, + "kl": 1.166337013244629, + "learning_rate": 7.923191716741348e-06, + "logits/chosen": 213149401.6, + "logits/rejected": 210121621.33333334, + "logps/chosen": -379.404345703125, + "logps/rejected": -432.8675537109375, + "loss": 0.1342, + "rewards/chosen": 2.5348495483398437, + "rewards/margins": 7.811997477213541, + "rewards/rejected": -5.277147928873698, + "step": 845 + }, + { + "epoch": 0.3123067694153477, + "grad_norm": 6.84375, + "kl": 0.0, + "learning_rate": 7.91841495439313e-06, + "logits/chosen": 221132813.47368422, + "logits/rejected": 226068716.30769232, + "logps/chosen": -320.27461965460526, + "logps/rejected": -350.45447716346155, + "loss": 0.1213, + "rewards/chosen": 1.863603491532175, + "rewards/margins": 6.935139629039687, + "rewards/rejected": -5.071536137507512, + "step": 846 + }, + { + "epoch": 0.3126759263531909, + "grad_norm": 5.71875, + "kl": 0.0, + "learning_rate": 7.913634148618073e-06, + "logits/chosen": 271743744.0, + "logits/rejected": 235583283.2, + "logps/chosen": -418.8290201822917, + "logps/rejected": -362.0343994140625, + "loss": 0.0811, + "rewards/chosen": 2.1795473098754883, + "rewards/margins": 8.561810493469238, + "rewards/rejected": -6.38226318359375, + "step": 847 + }, + { + "epoch": 0.3130450832910341, + "grad_norm": 5.75, + "kl": 0.0, + "learning_rate": 7.908849306039918e-06, + "logits/chosen": 217292528.0, + "logits/rejected": 216222128.0, + "logps/chosen": -337.4622802734375, + "logps/rejected": -382.5711669921875, + "loss": 0.1035, + "rewards/chosen": 2.1667590141296387, + "rewards/margins": 7.700621604919434, + "rewards/rejected": -5.533862590789795, + "step": 848 + }, + { + "epoch": 0.3134142402288773, + "grad_norm": 5.3125, + "kl": 0.0, + "learning_rate": 7.904060433288007e-06, + "logits/chosen": 224893530.3529412, + "logits/rejected": 246558788.26666668, + "logps/chosen": -265.8526252297794, + "logps/rejected": -476.2955729166667, + "loss": 0.1095, + "rewards/chosen": 2.1645148782169117, + "rewards/margins": 9.184237880332796, + "rewards/rejected": -7.019723002115885, + "step": 849 + }, + { + "epoch": 0.3137833971667205, + "grad_norm": 2.40625, + "kl": 0.0, + "learning_rate": 7.899267536997261e-06, + "logits/chosen": 276194069.3333333, + "logits/rejected": 286924441.6, + "logps/chosen": -357.7791748046875, + "logps/rejected": -424.42451171875, + "loss": 0.027, + "rewards/chosen": 4.056024551391602, + "rewards/margins": 9.808113098144531, + "rewards/rejected": -5.75208854675293, + "step": 850 + }, + { + "epoch": 0.3141525541045637, + "grad_norm": 3.78125, + "kl": 0.0, + "learning_rate": 7.894470623808176e-06, + "logits/chosen": 263590608.0, + "logits/rejected": 276062144.0, + "logps/chosen": -344.7067565917969, + "logps/rejected": -355.78753662109375, + "loss": 0.0597, + "rewards/chosen": 3.0383894443511963, + "rewards/margins": 8.913512945175171, + "rewards/rejected": -5.875123500823975, + "step": 851 + }, + { + "epoch": 0.3145217110424069, + "grad_norm": 5.09375, + "kl": 0.0, + "learning_rate": 7.889669700366818e-06, + "logits/chosen": 197275154.2857143, + "logits/rejected": 208427434.66666666, + "logps/chosen": -257.52183314732144, + "logps/rejected": -443.69091796875, + "loss": 0.084, + "rewards/chosen": 2.3625610896519254, + "rewards/margins": 8.251880191621327, + "rewards/rejected": -5.889319101969401, + "step": 852 + }, + { + "epoch": 0.3148908679802501, + "grad_norm": 6.71875, + "kl": 0.0, + "learning_rate": 7.884864773324802e-06, + "logits/chosen": 177230438.4, + "logits/rejected": 214901639.52941176, + "logps/chosen": -371.8676432291667, + "logps/rejected": -436.4573184742647, + "loss": 0.0612, + "rewards/chosen": 2.636254628499349, + "rewards/margins": 9.216241200764975, + "rewards/rejected": -6.579986572265625, + "step": 853 + }, + { + "epoch": 0.3152600249180933, + "grad_norm": 6.15625, + "kl": 0.2273578643798828, + "learning_rate": 7.880055849339294e-06, + "logits/chosen": 242776775.1111111, + "logits/rejected": 298952265.14285713, + "logps/chosen": -331.00336371527777, + "logps/rejected": -384.38253348214283, + "loss": 0.0843, + "rewards/chosen": 2.628779729207357, + "rewards/margins": 8.551851454235258, + "rewards/rejected": -5.923071725027902, + "step": 854 + }, + { + "epoch": 0.3156291818559365, + "grad_norm": 3.390625, + "kl": 0.0, + "learning_rate": 7.875242935073e-06, + "logits/chosen": 225604132.57142857, + "logits/rejected": 289802609.7777778, + "logps/chosen": -323.92476981026783, + "logps/rejected": -479.0749782986111, + "loss": 0.0666, + "rewards/chosen": 2.888838086809431, + "rewards/margins": 9.883597540476966, + "rewards/rejected": -6.994759453667535, + "step": 855 + }, + { + "epoch": 0.3159983387937797, + "grad_norm": 6.53125, + "kl": 0.4453859329223633, + "learning_rate": 7.870426037194146e-06, + "logits/chosen": 230753426.2857143, + "logits/rejected": 222439493.8181818, + "logps/chosen": -311.49783761160717, + "logps/rejected": -449.27761008522725, + "loss": 0.1355, + "rewards/chosen": 2.1339111328125, + "rewards/margins": 8.87957902388139, + "rewards/rejected": -6.745667891068892, + "step": 856 + }, + { + "epoch": 0.3163674957316229, + "grad_norm": 8.75, + "kl": 0.0, + "learning_rate": 7.865605162376485e-06, + "logits/chosen": 182952760.8888889, + "logits/rejected": 143294372.57142857, + "logps/chosen": -382.24273003472223, + "logps/rejected": -502.501708984375, + "loss": 0.1091, + "rewards/chosen": 2.5318936241997614, + "rewards/margins": 10.215811623467339, + "rewards/rejected": -7.683917999267578, + "step": 857 + }, + { + "epoch": 0.3167366526694661, + "grad_norm": 3.625, + "kl": 0.0, + "learning_rate": 7.860780317299282e-06, + "logits/chosen": 210822746.3529412, + "logits/rejected": 173598566.4, + "logps/chosen": -292.40920840992646, + "logps/rejected": -465.0568359375, + "loss": 0.0481, + "rewards/chosen": 3.2608745799345127, + "rewards/margins": 10.449357245950138, + "rewards/rejected": -7.188482666015625, + "step": 858 + }, + { + "epoch": 0.31710580960730933, + "grad_norm": 5.78125, + "kl": 0.0, + "learning_rate": 7.855951508647295e-06, + "logits/chosen": 257533392.0, + "logits/rejected": 159011712.0, + "logps/chosen": -390.26300048828125, + "logps/rejected": -458.22674560546875, + "loss": 0.0839, + "rewards/chosen": 2.1646997928619385, + "rewards/margins": 8.395770788192749, + "rewards/rejected": -6.2310709953308105, + "step": 859 + }, + { + "epoch": 0.3174749665451525, + "grad_norm": 6.1875, + "kl": 0.0, + "learning_rate": 7.851118743110774e-06, + "logits/chosen": 311122270.31578946, + "logits/rejected": 199764480.0, + "logps/chosen": -529.5024157072369, + "logps/rejected": -550.8712815504807, + "loss": 0.0745, + "rewards/chosen": 2.475686324270148, + "rewards/margins": 10.971596968801398, + "rewards/rejected": -8.49591064453125, + "step": 860 + }, + { + "epoch": 0.31784412348299573, + "grad_norm": 7.03125, + "kl": 0.0, + "learning_rate": 7.846282027385462e-06, + "logits/chosen": 187474261.33333334, + "logits/rejected": 254864775.52941176, + "logps/chosen": -340.13343098958336, + "logps/rejected": -407.7398035386029, + "loss": 0.1026, + "rewards/chosen": 2.128075536092122, + "rewards/margins": 8.479331513947132, + "rewards/rejected": -6.3512559778550095, + "step": 861 + }, + { + "epoch": 0.3182132804208389, + "grad_norm": 5.375, + "kl": 0.0, + "learning_rate": 7.841441368172559e-06, + "logits/chosen": 194577168.0, + "logits/rejected": 155468064.0, + "logps/chosen": -334.3840637207031, + "logps/rejected": -347.1985168457031, + "loss": 0.0879, + "rewards/chosen": 2.5529592037200928, + "rewards/margins": 7.645949125289917, + "rewards/rejected": -5.092989921569824, + "step": 862 + }, + { + "epoch": 0.31858243735868214, + "grad_norm": 5.09375, + "kl": 0.0, + "learning_rate": 7.836596772178741e-06, + "logits/chosen": 251290422.85714287, + "logits/rejected": 202051328.0, + "logps/chosen": -356.31766183035717, + "logps/rejected": -363.75279405381946, + "loss": 0.0982, + "rewards/chosen": 1.9257568631853377, + "rewards/margins": 7.723182875012595, + "rewards/rejected": -5.797426011827257, + "step": 863 + }, + { + "epoch": 0.3189515942965253, + "grad_norm": 4.0, + "kl": 0.0, + "learning_rate": 7.831748246116136e-06, + "logits/chosen": 175833890.13333333, + "logits/rejected": 223947745.88235295, + "logps/chosen": -296.081640625, + "logps/rejected": -454.33883846507354, + "loss": 0.059, + "rewards/chosen": 3.2127756754557293, + "rewards/margins": 8.877152775783165, + "rewards/rejected": -5.664377100327435, + "step": 864 + }, + { + "epoch": 0.31932075123436854, + "grad_norm": 5.59375, + "kl": 0.0, + "learning_rate": 7.826895796702311e-06, + "logits/chosen": 191891456.0, + "logits/rejected": 274759284.3636364, + "logps/chosen": -360.15810546875, + "logps/rejected": -492.40403053977275, + "loss": 0.0766, + "rewards/chosen": 2.11588134765625, + "rewards/margins": 8.95870888449929, + "rewards/rejected": -6.84282753684304, + "step": 865 + }, + { + "epoch": 0.3196899081722117, + "grad_norm": 5.03125, + "kl": 0.13409805297851562, + "learning_rate": 7.822039430660276e-06, + "logits/chosen": 231568093.86666667, + "logits/rejected": 249392489.4117647, + "logps/chosen": -402.855859375, + "logps/rejected": -492.0846737132353, + "loss": 0.0686, + "rewards/chosen": 2.607269795735677, + "rewards/margins": 10.009508978151807, + "rewards/rejected": -7.402239182416131, + "step": 866 + }, + { + "epoch": 0.3200590651100549, + "grad_norm": 7.46875, + "kl": 0.0, + "learning_rate": 7.817179154718463e-06, + "logits/chosen": 276697307.4285714, + "logits/rejected": 142786759.1111111, + "logps/chosen": -397.16029575892856, + "logps/rejected": -451.38975694444446, + "loss": 0.0924, + "rewards/chosen": 1.8178957530430384, + "rewards/margins": 10.172457452804322, + "rewards/rejected": -8.354561699761284, + "step": 867 + }, + { + "epoch": 0.3204282220478981, + "grad_norm": 6.65625, + "kl": 0.0, + "learning_rate": 7.812314975610722e-06, + "logits/chosen": 253007818.10526314, + "logits/rejected": 276034067.6923077, + "logps/chosen": -360.9634303042763, + "logps/rejected": -332.78894981971155, + "loss": 0.1447, + "rewards/chosen": 1.6287980330617804, + "rewards/margins": 7.485187592294052, + "rewards/rejected": -5.856389559232271, + "step": 868 + }, + { + "epoch": 0.3207973789857413, + "grad_norm": 5.5, + "kl": 0.0, + "learning_rate": 7.807446900076314e-06, + "logits/chosen": 218368358.4, + "logits/rejected": 181935146.66666666, + "logps/chosen": -364.6987548828125, + "logps/rejected": -356.2829996744792, + "loss": 0.0804, + "rewards/chosen": 2.7786895751953127, + "rewards/margins": 8.014390500386556, + "rewards/rejected": -5.235700925191243, + "step": 869 + }, + { + "epoch": 0.3211665359235845, + "grad_norm": 6.25, + "kl": 0.2710685729980469, + "learning_rate": 7.802574934859894e-06, + "logits/chosen": 228585328.0, + "logits/rejected": 252534016.0, + "logps/chosen": -417.84576416015625, + "logps/rejected": -479.680908203125, + "loss": 0.0789, + "rewards/chosen": 2.678046226501465, + "rewards/margins": 9.918426990509033, + "rewards/rejected": -7.240380764007568, + "step": 870 + }, + { + "epoch": 0.3215356928614277, + "grad_norm": 5.09375, + "kl": 1.7790355682373047, + "learning_rate": 7.797699086711507e-06, + "logits/chosen": 324244659.2, + "logits/rejected": 190826218.66666666, + "logps/chosen": -443.75087890625, + "logps/rejected": -455.2180582682292, + "loss": 0.0728, + "rewards/chosen": 3.2079113006591795, + "rewards/margins": 10.025934346516927, + "rewards/rejected": -6.818023045857747, + "step": 871 + }, + { + "epoch": 0.3219048497992709, + "grad_norm": 5.46875, + "kl": 0.0, + "learning_rate": 7.792819362386581e-06, + "logits/chosen": 274378944.0, + "logits/rejected": 139927536.0, + "logps/chosen": -342.142333984375, + "logps/rejected": -261.92169189453125, + "loss": 0.0938, + "rewards/chosen": 2.663501262664795, + "rewards/margins": 8.271490097045898, + "rewards/rejected": -5.6079888343811035, + "step": 872 + }, + { + "epoch": 0.3222740067371141, + "grad_norm": 6.5625, + "kl": 1.905698299407959, + "learning_rate": 7.78793576864591e-06, + "logits/chosen": 247324441.6, + "logits/rejected": 180909525.33333334, + "logps/chosen": -336.6058837890625, + "logps/rejected": -396.083251953125, + "loss": 0.1407, + "rewards/chosen": 2.2935226440429686, + "rewards/margins": 7.154430834452311, + "rewards/rejected": -4.860908190409343, + "step": 873 + }, + { + "epoch": 0.3226431636749573, + "grad_norm": 6.8125, + "kl": 0.0, + "learning_rate": 7.783048312255653e-06, + "logits/chosen": 188981435.73333332, + "logits/rejected": 245135134.11764705, + "logps/chosen": -298.6414388020833, + "logps/rejected": -383.2158777573529, + "loss": 0.1434, + "rewards/chosen": 1.6293763478597005, + "rewards/margins": 6.991455743827072, + "rewards/rejected": -5.362079395967371, + "step": 874 + }, + { + "epoch": 0.3230123206128005, + "grad_norm": 4.75, + "kl": 0.0, + "learning_rate": 7.778156999987317e-06, + "logits/chosen": 227873554.2857143, + "logits/rejected": 188991772.44444445, + "logps/chosen": -339.780517578125, + "logps/rejected": -465.5196940104167, + "loss": 0.063, + "rewards/chosen": 2.8655199323381697, + "rewards/margins": 9.399507613409133, + "rewards/rejected": -6.533987681070964, + "step": 875 + }, + { + "epoch": 0.3233814775506437, + "grad_norm": 3.21875, + "kl": 0.0, + "learning_rate": 7.773261838617753e-06, + "logits/chosen": 254361304.6153846, + "logits/rejected": 202036143.15789473, + "logps/chosen": -392.9450495793269, + "logps/rejected": -408.28543893914474, + "loss": 0.0414, + "rewards/chosen": 3.141711014967698, + "rewards/margins": 8.988535660963791, + "rewards/rejected": -5.846824645996094, + "step": 876 + }, + { + "epoch": 0.3237506344884869, + "grad_norm": 7.375, + "kl": 0.5696134567260742, + "learning_rate": 7.768362834929146e-06, + "logits/chosen": 197175973.6470588, + "logits/rejected": 168612420.26666668, + "logps/chosen": -358.2066004136029, + "logps/rejected": -352.6943359375, + "loss": 0.1358, + "rewards/chosen": 2.219957015093635, + "rewards/margins": 8.313392684038948, + "rewards/rejected": -6.093435668945313, + "step": 877 + }, + { + "epoch": 0.32411979142633013, + "grad_norm": 5.5625, + "kl": 0.0, + "learning_rate": 7.763459995709004e-06, + "logits/chosen": 187795541.33333334, + "logits/rejected": 212440521.14285713, + "logps/chosen": -330.00640190972223, + "logps/rejected": -537.2894810267857, + "loss": 0.1182, + "rewards/chosen": 2.0774563683403864, + "rewards/margins": 10.043769866701156, + "rewards/rejected": -7.96631349836077, + "step": 878 + }, + { + "epoch": 0.3244889483641733, + "grad_norm": 6.46875, + "kl": 0.0, + "learning_rate": 7.758553327750146e-06, + "logits/chosen": 211193968.0, + "logits/rejected": 295771264.0, + "logps/chosen": -354.4249267578125, + "logps/rejected": -421.44598388671875, + "loss": 0.0942, + "rewards/chosen": 2.563972234725952, + "rewards/margins": 8.205946683883667, + "rewards/rejected": -5.641974449157715, + "step": 879 + }, + { + "epoch": 0.32485810530201653, + "grad_norm": 4.5625, + "kl": 0.0, + "learning_rate": 7.753642837850698e-06, + "logits/chosen": 184849590.85714287, + "logits/rejected": 351031409.7777778, + "logps/chosen": -334.06033761160717, + "logps/rejected": -524.7770724826389, + "loss": 0.0954, + "rewards/chosen": 2.2621096202305386, + "rewards/margins": 9.718701756189741, + "rewards/rejected": -7.456592135959202, + "step": 880 + }, + { + "epoch": 0.3252272622398597, + "grad_norm": 7.1875, + "kl": 2.6231637001037598, + "learning_rate": 7.748728532814087e-06, + "logits/chosen": 303623270.4, + "logits/rejected": 436374954.6666667, + "logps/chosen": -329.8064453125, + "logps/rejected": -473.91015625, + "loss": 0.1244, + "rewards/chosen": 2.4285486221313475, + "rewards/margins": 9.133198229471843, + "rewards/rejected": -6.704649607340495, + "step": 881 + }, + { + "epoch": 0.32559641917770293, + "grad_norm": 5.9375, + "kl": 0.027818679809570312, + "learning_rate": 7.743810419449014e-06, + "logits/chosen": 286639811.04761904, + "logits/rejected": 400096674.90909094, + "logps/chosen": -370.4042736235119, + "logps/rejected": -389.00301846590907, + "loss": 0.1185, + "rewards/chosen": 2.189906529017857, + "rewards/margins": 8.34116759857574, + "rewards/rejected": -6.151261069557884, + "step": 882 + }, + { + "epoch": 0.3259655761155461, + "grad_norm": 6.5, + "kl": 0.0, + "learning_rate": 7.738888504569463e-06, + "logits/chosen": 282772359.5294118, + "logits/rejected": 207093043.2, + "logps/chosen": -352.0654296875, + "logps/rejected": -433.6720377604167, + "loss": 0.1077, + "rewards/chosen": 1.8483076656565947, + "rewards/margins": 7.609413012336282, + "rewards/rejected": -5.761105346679687, + "step": 883 + }, + { + "epoch": 0.32633473305338934, + "grad_norm": 4.53125, + "kl": 0.0, + "learning_rate": 7.733962794994689e-06, + "logits/chosen": 210683704.8888889, + "logits/rejected": 176783158.85714287, + "logps/chosen": -313.5618489583333, + "logps/rejected": -358.66224888392856, + "loss": 0.09, + "rewards/chosen": 2.115679634941949, + "rewards/margins": 7.4316063229999845, + "rewards/rejected": -5.315926688058036, + "step": 884 + }, + { + "epoch": 0.3267038899912325, + "grad_norm": 6.09375, + "kl": 2.167607307434082, + "learning_rate": 7.729033297549195e-06, + "logits/chosen": 156362598.4, + "logits/rejected": 178901930.66666666, + "logps/chosen": -318.3553955078125, + "logps/rejected": -384.1612955729167, + "loss": 0.1027, + "rewards/chosen": 2.856770133972168, + "rewards/margins": 9.609655316670736, + "rewards/rejected": -6.752885182698567, + "step": 885 + }, + { + "epoch": 0.32707304692907574, + "grad_norm": 6.3125, + "kl": 0.05479145050048828, + "learning_rate": 7.724100019062739e-06, + "logits/chosen": 207010944.0, + "logits/rejected": 196785097.14285713, + "logps/chosen": -388.38294813368054, + "logps/rejected": -400.5967494419643, + "loss": 0.095, + "rewards/chosen": 3.465957005818685, + "rewards/margins": 10.00394376118978, + "rewards/rejected": -6.537986755371094, + "step": 886 + }, + { + "epoch": 0.3274422038669189, + "grad_norm": 4.4375, + "kl": 0.0, + "learning_rate": 7.719162966370318e-06, + "logits/chosen": 164198174.11764705, + "logits/rejected": 239154107.73333332, + "logps/chosen": -307.38927504595586, + "logps/rejected": -520.3485677083333, + "loss": 0.0676, + "rewards/chosen": 3.3088901744169346, + "rewards/margins": 10.666002295998966, + "rewards/rejected": -7.357112121582031, + "step": 887 + }, + { + "epoch": 0.32781136080476214, + "grad_norm": 2.71875, + "kl": 0.0, + "learning_rate": 7.714222146312151e-06, + "logits/chosen": 234546346.66666666, + "logits/rejected": 152473766.4, + "logps/chosen": -375.5565592447917, + "logps/rejected": -359.2637451171875, + "loss": 0.0517, + "rewards/chosen": 4.041544278462728, + "rewards/margins": 9.17703088124593, + "rewards/rejected": -5.135486602783203, + "step": 888 + }, + { + "epoch": 0.3281805177426053, + "grad_norm": 7.5, + "kl": 0.0, + "learning_rate": 7.709277565733686e-06, + "logits/chosen": 267971961.2631579, + "logits/rejected": 186130884.92307693, + "logps/chosen": -272.19325657894734, + "logps/rejected": -428.23914513221155, + "loss": 0.1534, + "rewards/chosen": 1.6778020356830798, + "rewards/margins": 7.4352266103149915, + "rewards/rejected": -5.757424574631911, + "step": 889 + }, + { + "epoch": 0.32854967468044854, + "grad_norm": 5.8125, + "kl": 0.0, + "learning_rate": 7.704329231485576e-06, + "logits/chosen": 157446229.33333334, + "logits/rejected": 269624199.5294118, + "logps/chosen": -291.497265625, + "logps/rejected": -499.9595588235294, + "loss": 0.1303, + "rewards/chosen": 2.055482355753581, + "rewards/margins": 9.766102113910751, + "rewards/rejected": -7.710619758157169, + "step": 890 + }, + { + "epoch": 0.3289188316182917, + "grad_norm": 7.1875, + "kl": 0.10114574432373047, + "learning_rate": 7.699377150423673e-06, + "logits/chosen": 287708752.84210527, + "logits/rejected": 258046779.07692307, + "logps/chosen": -335.41049033717104, + "logps/rejected": -477.42202524038464, + "loss": 0.1371, + "rewards/chosen": 2.128055170962685, + "rewards/margins": 8.77987249274003, + "rewards/rejected": -6.651817321777344, + "step": 891 + }, + { + "epoch": 0.32928798855613495, + "grad_norm": 6.375, + "kl": 0.0, + "learning_rate": 7.69442132940902e-06, + "logits/chosen": 210496165.6470588, + "logits/rejected": 172730265.6, + "logps/chosen": -337.38062959558823, + "logps/rejected": -358.20445963541664, + "loss": 0.1412, + "rewards/chosen": 2.1299853605382584, + "rewards/margins": 7.3100492440018, + "rewards/rejected": -5.180063883463542, + "step": 892 + }, + { + "epoch": 0.3296571454939781, + "grad_norm": 5.375, + "kl": 0.0, + "learning_rate": 7.689461775307852e-06, + "logits/chosen": 258648696.47058824, + "logits/rejected": 222432307.2, + "logps/chosen": -355.60765165441177, + "logps/rejected": -533.514453125, + "loss": 0.1042, + "rewards/chosen": 2.2558687995461857, + "rewards/margins": 7.639896916408166, + "rewards/rejected": -5.3840281168619795, + "step": 893 + }, + { + "epoch": 0.33002630243182135, + "grad_norm": 4.71875, + "kl": 0.0, + "learning_rate": 7.684498494991562e-06, + "logits/chosen": 293390438.4, + "logits/rejected": 181596943.05882353, + "logps/chosen": -382.8228515625, + "logps/rejected": -408.1507352941176, + "loss": 0.0643, + "rewards/chosen": 2.764403788248698, + "rewards/margins": 8.59398779775582, + "rewards/rejected": -5.829584009507123, + "step": 894 + }, + { + "epoch": 0.3303954593696645, + "grad_norm": 6.0, + "kl": 0.03639698028564453, + "learning_rate": 7.679531495336712e-06, + "logits/chosen": 297424704.0, + "logits/rejected": 190918976.0, + "logps/chosen": -329.346435546875, + "logps/rejected": -455.650146484375, + "loss": 0.0867, + "rewards/chosen": 2.3628787994384766, + "rewards/margins": 7.756301403045654, + "rewards/rejected": -5.393422603607178, + "step": 895 + }, + { + "epoch": 0.33076461630750775, + "grad_norm": 6.9375, + "kl": 1.158121109008789, + "learning_rate": 7.674560783225018e-06, + "logits/chosen": 191212303.05882353, + "logits/rejected": 160040038.4, + "logps/chosen": -348.4518612132353, + "logps/rejected": -328.40029296875, + "loss": 0.1508, + "rewards/chosen": 1.9092856014476103, + "rewards/margins": 6.778022885790058, + "rewards/rejected": -4.868737284342448, + "step": 896 + }, + { + "epoch": 0.3311337732453509, + "grad_norm": 3.625, + "kl": 0.0, + "learning_rate": 7.669586365543342e-06, + "logits/chosen": 147133366.85714287, + "logits/rejected": 192086044.44444445, + "logps/chosen": -249.72593470982142, + "logps/rejected": -404.5670572916667, + "loss": 0.0411, + "rewards/chosen": 3.2900567735944475, + "rewards/margins": 9.403815708463155, + "rewards/rejected": -6.113758934868707, + "step": 897 + }, + { + "epoch": 0.33150293018319416, + "grad_norm": 7.84375, + "kl": 0.8591690063476562, + "learning_rate": 7.66460824918367e-06, + "logits/chosen": 256359008.0, + "logits/rejected": 231790752.0, + "logps/chosen": -465.20135498046875, + "logps/rejected": -318.7252502441406, + "loss": 0.0954, + "rewards/chosen": 2.2406156063079834, + "rewards/margins": 6.932123899459839, + "rewards/rejected": -4.6915082931518555, + "step": 898 + }, + { + "epoch": 0.33187208712103733, + "grad_norm": 4.3125, + "kl": 0.12045717239379883, + "learning_rate": 7.659626441043125e-06, + "logits/chosen": 281358738.28571427, + "logits/rejected": 172592881.7777778, + "logps/chosen": -361.60585239955356, + "logps/rejected": -506.22992621527777, + "loss": 0.0775, + "rewards/chosen": 2.029507500784738, + "rewards/margins": 8.962390021672324, + "rewards/rejected": -6.932882520887587, + "step": 899 + }, + { + "epoch": 0.33224124405888056, + "grad_norm": 4.53125, + "kl": 0.0, + "learning_rate": 7.654640948023934e-06, + "logits/chosen": 256085248.0, + "logits/rejected": 164196300.8, + "logps/chosen": -298.0201009114583, + "logps/rejected": -383.0888671875, + "loss": 0.0882, + "rewards/chosen": 2.278025468190511, + "rewards/margins": 7.9890671094258625, + "rewards/rejected": -5.711041641235352, + "step": 900 + }, + { + "epoch": 0.33261040099672373, + "grad_norm": 7.5, + "kl": 2.960944652557373, + "learning_rate": 7.649651777033438e-06, + "logits/chosen": 248514916.17391303, + "logits/rejected": 264979399.1111111, + "logps/chosen": -379.93839164402175, + "logps/rejected": -482.0598958333333, + "loss": 0.149, + "rewards/chosen": 2.176994157874066, + "rewards/margins": 8.095377438310264, + "rewards/rejected": -5.918383280436198, + "step": 901 + }, + { + "epoch": 0.3329795579345669, + "grad_norm": 3.453125, + "kl": 0.0, + "learning_rate": 7.644658934984066e-06, + "logits/chosen": 160674520.6153846, + "logits/rejected": 179189571.36842105, + "logps/chosen": -312.42025991586536, + "logps/rejected": -437.86204769736844, + "loss": 0.0453, + "rewards/chosen": 3.3796392587515025, + "rewards/margins": 10.357342955554545, + "rewards/rejected": -6.977703696803043, + "step": 902 + }, + { + "epoch": 0.33334871487241013, + "grad_norm": 3.703125, + "kl": 0.0, + "learning_rate": 7.639662428793342e-06, + "logits/chosen": 211907840.0, + "logits/rejected": 208488471.27272728, + "logps/chosen": -330.433251953125, + "logps/rejected": -411.04541015625, + "loss": 0.0473, + "rewards/chosen": 3.0509012222290037, + "rewards/margins": 8.631590739163485, + "rewards/rejected": -5.5806895169344815, + "step": 903 + }, + { + "epoch": 0.3337178718102533, + "grad_norm": 7.125, + "kl": 0.022351741790771484, + "learning_rate": 7.634662265383858e-06, + "logits/chosen": 224693504.0, + "logits/rejected": 209494635.78947368, + "logps/chosen": -377.45229867788464, + "logps/rejected": -377.64088199013156, + "loss": 0.0976, + "rewards/chosen": 2.458147929264949, + "rewards/margins": 7.756531140099653, + "rewards/rejected": -5.298383210834704, + "step": 904 + }, + { + "epoch": 0.33408702874809654, + "grad_norm": 5.0625, + "kl": 0.0, + "learning_rate": 7.62965845168328e-06, + "logits/chosen": 257337230.2222222, + "logits/rejected": 212892379.42857143, + "logps/chosen": -305.42621527777777, + "logps/rejected": -449.14571707589283, + "loss": 0.1001, + "rewards/chosen": 2.4904147254096136, + "rewards/margins": 7.680432425604926, + "rewards/rejected": -5.1900177001953125, + "step": 905 + }, + { + "epoch": 0.3344561856859397, + "grad_norm": 4.8125, + "kl": 0.0, + "learning_rate": 7.624650994624325e-06, + "logits/chosen": 216319015.3846154, + "logits/rejected": 249780008.42105263, + "logps/chosen": -315.74891075721155, + "logps/rejected": -472.36893503289474, + "loss": 0.0937, + "rewards/chosen": 2.1513548630934496, + "rewards/margins": 8.866424282552742, + "rewards/rejected": -6.715069419459293, + "step": 906 + }, + { + "epoch": 0.33482534262378294, + "grad_norm": 4.53125, + "kl": 0.0, + "learning_rate": 7.619639901144764e-06, + "logits/chosen": 199125778.2857143, + "logits/rejected": 259367253.33333334, + "logps/chosen": -366.5059291294643, + "logps/rejected": -389.48662651909723, + "loss": 0.0567, + "rewards/chosen": 3.162205014910017, + "rewards/margins": 9.256839176965139, + "rewards/rejected": -6.094634162055121, + "step": 907 + }, + { + "epoch": 0.3351944995616261, + "grad_norm": 4.90625, + "kl": 0.0, + "learning_rate": 7.614625178187402e-06, + "logits/chosen": 252095856.0, + "logits/rejected": 159712688.0, + "logps/chosen": -359.00140380859375, + "logps/rejected": -498.7751159667969, + "loss": 0.0773, + "rewards/chosen": 2.394251585006714, + "rewards/margins": 9.80557131767273, + "rewards/rejected": -7.411319732666016, + "step": 908 + }, + { + "epoch": 0.33556365649946934, + "grad_norm": 5.9375, + "kl": 0.0, + "learning_rate": 7.609606832700074e-06, + "logits/chosen": 298258720.0, + "logits/rejected": 169649200.0, + "logps/chosen": -298.0234680175781, + "logps/rejected": -420.4402770996094, + "loss": 0.1106, + "rewards/chosen": 1.884530782699585, + "rewards/margins": 8.432847738265991, + "rewards/rejected": -6.548316955566406, + "step": 909 + }, + { + "epoch": 0.3359328134373125, + "grad_norm": 4.96875, + "kl": 0.0, + "learning_rate": 7.604584871635634e-06, + "logits/chosen": 228249136.0, + "logits/rejected": 274931840.0, + "logps/chosen": -373.08441162109375, + "logps/rejected": -375.7421875, + "loss": 0.0863, + "rewards/chosen": 2.6593992710113525, + "rewards/margins": 8.017322301864624, + "rewards/rejected": -5.3579230308532715, + "step": 910 + }, + { + "epoch": 0.33630197037515575, + "grad_norm": 5.40625, + "kl": 0.0, + "learning_rate": 7.5995593019519444e-06, + "logits/chosen": 249999634.2857143, + "logits/rejected": 226346880.0, + "logps/chosen": -337.21773856026783, + "logps/rejected": -364.76752387152777, + "loss": 0.0961, + "rewards/chosen": 2.1982421875, + "rewards/margins": 7.711251576741536, + "rewards/rejected": -5.513009389241536, + "step": 911 + }, + { + "epoch": 0.3366711273129989, + "grad_norm": 5.96875, + "kl": 0.0, + "learning_rate": 7.59453013061187e-06, + "logits/chosen": 238783960.6153846, + "logits/rejected": 191927875.36842105, + "logps/chosen": -340.5238506610577, + "logps/rejected": -356.5112818667763, + "loss": 0.1023, + "rewards/chosen": 2.4165735978346605, + "rewards/margins": 7.233165362585894, + "rewards/rejected": -4.816591764751234, + "step": 912 + }, + { + "epoch": 0.33704028425084215, + "grad_norm": 4.6875, + "kl": 0.0, + "learning_rate": 7.589497364583263e-06, + "logits/chosen": 201393408.0, + "logits/rejected": 288333198.2222222, + "logps/chosen": -304.18028041294644, + "logps/rejected": -388.8079427083333, + "loss": 0.0719, + "rewards/chosen": 2.5493262154715404, + "rewards/margins": 8.259083702450708, + "rewards/rejected": -5.709757486979167, + "step": 913 + }, + { + "epoch": 0.3374094411886853, + "grad_norm": 5.71875, + "kl": 0.7934103012084961, + "learning_rate": 7.5844610108389546e-06, + "logits/chosen": 212357440.0, + "logits/rejected": 350429973.3333333, + "logps/chosen": -337.3397705078125, + "logps/rejected": -590.03857421875, + "loss": 0.0915, + "rewards/chosen": 3.1566045761108397, + "rewards/margins": 11.243678855895997, + "rewards/rejected": -8.087074279785156, + "step": 914 + }, + { + "epoch": 0.33777859812652855, + "grad_norm": 3.515625, + "kl": 0.0, + "learning_rate": 7.579421076356753e-06, + "logits/chosen": 178622739.69230768, + "logits/rejected": 150824394.10526314, + "logps/chosen": -301.82173978365387, + "logps/rejected": -376.31563527960526, + "loss": 0.0456, + "rewards/chosen": 3.286656306340144, + "rewards/margins": 9.002468989445614, + "rewards/rejected": -5.715812683105469, + "step": 915 + }, + { + "epoch": 0.3381477550643717, + "grad_norm": 6.3125, + "kl": 0.2558870315551758, + "learning_rate": 7.574377568119421e-06, + "logits/chosen": 287128132.26666665, + "logits/rejected": 188644547.7647059, + "logps/chosen": -400.18828125, + "logps/rejected": -467.0240119485294, + "loss": 0.0838, + "rewards/chosen": 1.9348093668619792, + "rewards/margins": 7.986455879959406, + "rewards/rejected": -6.051646513097427, + "step": 916 + }, + { + "epoch": 0.33851691200221495, + "grad_norm": 5.90625, + "kl": 0.0, + "learning_rate": 7.569330493114675e-06, + "logits/chosen": 274297777.2307692, + "logits/rejected": 266294433.68421054, + "logps/chosen": -372.72318209134613, + "logps/rejected": -471.91334292763156, + "loss": 0.0967, + "rewards/chosen": 1.7450717045710638, + "rewards/margins": 8.088037081575587, + "rewards/rejected": -6.342965377004523, + "step": 917 + }, + { + "epoch": 0.3388860689400581, + "grad_norm": 5.0, + "kl": 0.0, + "learning_rate": 7.564279858335174e-06, + "logits/chosen": 255160280.6153846, + "logits/rejected": 211312869.0526316, + "logps/chosen": -365.56753305288464, + "logps/rejected": -485.52415707236844, + "loss": 0.0733, + "rewards/chosen": 2.235426976130559, + "rewards/margins": 8.44030914615523, + "rewards/rejected": -6.204882170024671, + "step": 918 + }, + { + "epoch": 0.33925522587790136, + "grad_norm": 4.875, + "kl": 0.0, + "learning_rate": 7.5592256707785085e-06, + "logits/chosen": 337787477.3333333, + "logits/rejected": 245497453.7142857, + "logps/chosen": -318.1236165364583, + "logps/rejected": -558.3005022321429, + "loss": 0.0754, + "rewards/chosen": 2.715603722466363, + "rewards/margins": 8.85952135116335, + "rewards/rejected": -6.143917628696987, + "step": 919 + }, + { + "epoch": 0.33962438281574453, + "grad_norm": 3.890625, + "kl": 0.0, + "learning_rate": 7.55416793744719e-06, + "logits/chosen": 165191443.69230768, + "logits/rejected": 184972288.0, + "logps/chosen": -362.5023662860577, + "logps/rejected": -422.4752261513158, + "loss": 0.0478, + "rewards/chosen": 3.4470672607421875, + "rewards/margins": 9.798966257195723, + "rewards/rejected": -6.351898996453536, + "step": 920 + }, + { + "epoch": 0.33999353975358776, + "grad_norm": 5.5, + "kl": 0.0, + "learning_rate": 7.549106665348644e-06, + "logits/chosen": 190899968.0, + "logits/rejected": 275988224.0, + "logps/chosen": -368.9095458984375, + "logps/rejected": -355.57281494140625, + "loss": 0.0887, + "rewards/chosen": 2.6250243186950684, + "rewards/margins": 8.317622184753418, + "rewards/rejected": -5.69259786605835, + "step": 921 + }, + { + "epoch": 0.34036269669143093, + "grad_norm": 5.15625, + "kl": 0.8595705032348633, + "learning_rate": 7.544041861495202e-06, + "logits/chosen": 213903680.0, + "logits/rejected": 134592832.0, + "logps/chosen": -355.0522766113281, + "logps/rejected": -366.5732727050781, + "loss": 0.0885, + "rewards/chosen": 2.6672403812408447, + "rewards/margins": 8.452326536178589, + "rewards/rejected": -5.785086154937744, + "step": 922 + }, + { + "epoch": 0.34073185362927416, + "grad_norm": 5.65625, + "kl": 0.0, + "learning_rate": 7.53897353290408e-06, + "logits/chosen": 249526920.53333333, + "logits/rejected": 208715414.5882353, + "logps/chosen": -376.62701822916665, + "logps/rejected": -407.9324161305147, + "loss": 0.0756, + "rewards/chosen": 2.2591405232747395, + "rewards/margins": 8.956588804955576, + "rewards/rejected": -6.6974482816808365, + "step": 923 + }, + { + "epoch": 0.34110101056711734, + "grad_norm": 5.9375, + "kl": 0.0, + "learning_rate": 7.5339016865973865e-06, + "logits/chosen": 217827721.84615386, + "logits/rejected": 248196688.84210527, + "logps/chosen": -314.3073918269231, + "logps/rejected": -518.9857113486842, + "loss": 0.0912, + "rewards/chosen": 2.2336791111872745, + "rewards/margins": 9.897919898091057, + "rewards/rejected": -7.6642407869037825, + "step": 924 + }, + { + "epoch": 0.34147016750496056, + "grad_norm": 8.4375, + "kl": 1.4241390228271484, + "learning_rate": 7.528826329602099e-06, + "logits/chosen": 228018688.0, + "logits/rejected": 144752779.63636363, + "logps/chosen": -342.8833240327381, + "logps/rejected": -386.2872869318182, + "loss": 0.1742, + "rewards/chosen": 1.6285460335867745, + "rewards/margins": 7.386092198359502, + "rewards/rejected": -5.7575461647727275, + "step": 925 + }, + { + "epoch": 0.34183932444280374, + "grad_norm": 6.0, + "kl": 0.2903294563293457, + "learning_rate": 7.523747468950061e-06, + "logits/chosen": 190556672.0, + "logits/rejected": 225568093.0909091, + "logps/chosen": -322.93980189732144, + "logps/rejected": -381.97971413352275, + "loss": 0.0982, + "rewards/chosen": 2.7609837849934897, + "rewards/margins": 8.620712511467211, + "rewards/rejected": -5.859728726473722, + "step": 926 + }, + { + "epoch": 0.34220848138064697, + "grad_norm": 4.15625, + "kl": 0.0, + "learning_rate": 7.518665111677968e-06, + "logits/chosen": 238478939.42857143, + "logits/rejected": 182954396.44444445, + "logps/chosen": -344.82303292410717, + "logps/rejected": -479.83355034722223, + "loss": 0.048, + "rewards/chosen": 2.985491616385324, + "rewards/margins": 10.3799923488072, + "rewards/rejected": -7.394500732421875, + "step": 927 + }, + { + "epoch": 0.34257763831849014, + "grad_norm": 3.40625, + "kl": 0.0, + "learning_rate": 7.513579264827362e-06, + "logits/chosen": 228157064.53333333, + "logits/rejected": 208544632.47058824, + "logps/chosen": -395.1966796875, + "logps/rejected": -485.1936465992647, + "loss": 0.041, + "rewards/chosen": 3.1761395772298178, + "rewards/margins": 10.181107659433401, + "rewards/rejected": -7.004968082203584, + "step": 928 + }, + { + "epoch": 0.34294679525633337, + "grad_norm": 5.03125, + "kl": 0.0, + "learning_rate": 7.508489935444618e-06, + "logits/chosen": 176958791.1111111, + "logits/rejected": 200593993.14285713, + "logps/chosen": -300.8825954861111, + "logps/rejected": -366.46142578125, + "loss": 0.0838, + "rewards/chosen": 2.80214966668023, + "rewards/margins": 7.887541120014493, + "rewards/rejected": -5.085391453334263, + "step": 929 + }, + { + "epoch": 0.34331595219417654, + "grad_norm": 5.1875, + "kl": 0.0, + "learning_rate": 7.5033971305809405e-06, + "logits/chosen": 159372266.66666666, + "logits/rejected": 190323302.4, + "logps/chosen": -291.45668538411456, + "logps/rejected": -458.55185546875, + "loss": 0.0814, + "rewards/chosen": 2.408255100250244, + "rewards/margins": 9.649884128570557, + "rewards/rejected": -7.241629028320313, + "step": 930 + }, + { + "epoch": 0.3436851091320198, + "grad_norm": 6.3125, + "kl": 0.0, + "learning_rate": 7.498300857292342e-06, + "logits/chosen": 305540130.1333333, + "logits/rejected": 232740728.47058824, + "logps/chosen": -357.4966145833333, + "logps/rejected": -534.1342486213235, + "loss": 0.0754, + "rewards/chosen": 2.799182637532552, + "rewards/margins": 10.848161555271524, + "rewards/rejected": -8.048978917738971, + "step": 931 + }, + { + "epoch": 0.34405426606986295, + "grad_norm": 8.5, + "kl": 0.9538145065307617, + "learning_rate": 7.493201122639648e-06, + "logits/chosen": 199051696.0, + "logits/rejected": 292363648.0, + "logps/chosen": -376.2951965332031, + "logps/rejected": -439.74517822265625, + "loss": 0.1335, + "rewards/chosen": 2.0914158821105957, + "rewards/margins": 8.447134494781494, + "rewards/rejected": -6.355718612670898, + "step": 932 + }, + { + "epoch": 0.3444234230077062, + "grad_norm": 3.796875, + "kl": 0.0, + "learning_rate": 7.488097933688474e-06, + "logits/chosen": 147755562.66666666, + "logits/rejected": 245011609.6, + "logps/chosen": -400.66259765625, + "logps/rejected": -374.45771484375, + "loss": 0.0481, + "rewards/chosen": 2.665795644124349, + "rewards/margins": 8.583266194661459, + "rewards/rejected": -5.917470550537109, + "step": 933 + }, + { + "epoch": 0.34479257994554935, + "grad_norm": 5.78125, + "kl": 0.0, + "learning_rate": 7.482991297509225e-06, + "logits/chosen": 242832042.66666666, + "logits/rejected": 178175608.47058824, + "logps/chosen": -391.00690104166665, + "logps/rejected": -494.33995863970586, + "loss": 0.0674, + "rewards/chosen": 2.7908533732096354, + "rewards/margins": 9.96087532791437, + "rewards/rejected": -7.170021954704733, + "step": 934 + }, + { + "epoch": 0.3451617368833926, + "grad_norm": 6.53125, + "kl": 0.0, + "learning_rate": 7.477881221177077e-06, + "logits/chosen": 264641649.7777778, + "logits/rejected": 196023789.7142857, + "logps/chosen": -271.51158311631946, + "logps/rejected": -441.4329310825893, + "loss": 0.1466, + "rewards/chosen": 1.4577795664469402, + "rewards/margins": 8.838241123017811, + "rewards/rejected": -7.380461556570871, + "step": 935 + }, + { + "epoch": 0.34553089382123575, + "grad_norm": 5.78125, + "kl": 0.0, + "learning_rate": 7.472767711771979e-06, + "logits/chosen": 213678106.9473684, + "logits/rejected": 333620027.0769231, + "logps/chosen": -348.1019222861842, + "logps/rejected": -373.4370868389423, + "loss": 0.0941, + "rewards/chosen": 2.668696353310033, + "rewards/margins": 8.70717725869615, + "rewards/rejected": -6.038480905386118, + "step": 936 + }, + { + "epoch": 0.345900050759079, + "grad_norm": 3.3125, + "kl": 0.0, + "learning_rate": 7.467650776378633e-06, + "logits/chosen": 295533120.0, + "logits/rejected": 195225120.0, + "logps/chosen": -321.77044677734375, + "logps/rejected": -453.59930419921875, + "loss": 0.0526, + "rewards/chosen": 3.365401268005371, + "rewards/margins": 10.446199893951416, + "rewards/rejected": -7.080798625946045, + "step": 937 + }, + { + "epoch": 0.34626920769692215, + "grad_norm": 7.28125, + "kl": 0.0, + "learning_rate": 7.462530422086487e-06, + "logits/chosen": 208180081.7777778, + "logits/rejected": 239870500.57142857, + "logps/chosen": -350.87754991319446, + "logps/rejected": -362.06113978794644, + "loss": 0.1141, + "rewards/chosen": 2.312095430162218, + "rewards/margins": 8.2759822361053, + "rewards/rejected": -5.963886805943081, + "step": 938 + }, + { + "epoch": 0.34663836463476533, + "grad_norm": 5.75, + "kl": 0.0, + "learning_rate": 7.4574066559897276e-06, + "logits/chosen": 360961686.5882353, + "logits/rejected": 167130419.2, + "logps/chosen": -389.7067440257353, + "logps/rejected": -346.01240234375, + "loss": 0.1109, + "rewards/chosen": 2.2687835693359375, + "rewards/margins": 7.981832885742188, + "rewards/rejected": -5.71304931640625, + "step": 939 + }, + { + "epoch": 0.34700752157260856, + "grad_norm": 5.3125, + "kl": 0.0272369384765625, + "learning_rate": 7.452279485187268e-06, + "logits/chosen": 214891203.7647059, + "logits/rejected": 348512324.26666665, + "logps/chosen": -445.76740579044116, + "logps/rejected": -455.59534505208336, + "loss": 0.0672, + "rewards/chosen": 2.7842557570513558, + "rewards/margins": 9.06449867697323, + "rewards/rejected": -6.280242919921875, + "step": 940 + }, + { + "epoch": 0.34737667851045173, + "grad_norm": 5.90625, + "kl": 0.0, + "learning_rate": 7.4471489167827374e-06, + "logits/chosen": 246650400.0, + "logits/rejected": 272541184.0, + "logps/chosen": -337.8834533691406, + "logps/rejected": -442.0582275390625, + "loss": 0.1107, + "rewards/chosen": 2.2545266151428223, + "rewards/margins": 8.774590969085693, + "rewards/rejected": -6.520064353942871, + "step": 941 + }, + { + "epoch": 0.34774583544829496, + "grad_norm": 3.59375, + "kl": 0.0, + "learning_rate": 7.442014957884473e-06, + "logits/chosen": 226627072.0, + "logits/rejected": 178671985.7777778, + "logps/chosen": -304.6160365513393, + "logps/rejected": -459.7706705729167, + "loss": 0.0602, + "rewards/chosen": 2.4325147356305803, + "rewards/margins": 10.066127474345858, + "rewards/rejected": -7.633612738715278, + "step": 942 + }, + { + "epoch": 0.34811499238613813, + "grad_norm": 5.6875, + "kl": 0.0, + "learning_rate": 7.43687761560551e-06, + "logits/chosen": 249885361.23076922, + "logits/rejected": 204293160.42105263, + "logps/chosen": -476.0305739182692, + "logps/rejected": -413.4440275493421, + "loss": 0.0776, + "rewards/chosen": 3.2719207176795373, + "rewards/margins": 9.589686204547341, + "rewards/rejected": -6.317765486867804, + "step": 943 + }, + { + "epoch": 0.34848414932398136, + "grad_norm": 5.78125, + "kl": 0.0, + "learning_rate": 7.43173689706357e-06, + "logits/chosen": 234374836.70588234, + "logits/rejected": 330871671.46666664, + "logps/chosen": -390.5498621323529, + "logps/rejected": -372.5006510416667, + "loss": 0.1112, + "rewards/chosen": 2.370334176456227, + "rewards/margins": 7.215855213240081, + "rewards/rejected": -4.845521036783854, + "step": 944 + }, + { + "epoch": 0.34885330626182454, + "grad_norm": 5.375, + "kl": 0.0, + "learning_rate": 7.4265928093810545e-06, + "logits/chosen": 162951232.0, + "logits/rejected": 184174336.0, + "logps/chosen": -276.4035400390625, + "logps/rejected": -390.6693929036458, + "loss": 0.0922, + "rewards/chosen": 2.3333898544311524, + "rewards/margins": 8.642155901590984, + "rewards/rejected": -6.308766047159831, + "step": 945 + }, + { + "epoch": 0.34922246319966777, + "grad_norm": 2.578125, + "kl": 0.0, + "learning_rate": 7.421445359685031e-06, + "logits/chosen": 169228544.0, + "logits/rejected": 173498368.0, + "logps/chosen": -350.2663981119792, + "logps/rejected": -425.669287109375, + "loss": 0.027, + "rewards/chosen": 3.6435391108194985, + "rewards/margins": 10.46960417429606, + "rewards/rejected": -6.8260650634765625, + "step": 946 + }, + { + "epoch": 0.34959162013751094, + "grad_norm": 4.90625, + "kl": 0.0, + "learning_rate": 7.416294555107226e-06, + "logits/chosen": 188386932.36363637, + "logits/rejected": 202032847.23809522, + "logps/chosen": -304.51662375710225, + "logps/rejected": -397.2016834077381, + "loss": 0.0768, + "rewards/chosen": 1.8678380792791194, + "rewards/margins": 7.860473897033955, + "rewards/rejected": -5.992635817754836, + "step": 947 + }, + { + "epoch": 0.34996077707535417, + "grad_norm": 8.1875, + "kl": 0.22882366180419922, + "learning_rate": 7.411140402784014e-06, + "logits/chosen": 191523632.76190478, + "logits/rejected": 338672826.1818182, + "logps/chosen": -309.6588076636905, + "logps/rejected": -477.38130326704544, + "loss": 0.1678, + "rewards/chosen": 1.9925357273646764, + "rewards/margins": 8.172493724079876, + "rewards/rejected": -6.179957996715199, + "step": 948 + }, + { + "epoch": 0.35032993401319734, + "grad_norm": 6.59375, + "kl": 0.06484222412109375, + "learning_rate": 7.4059829098564075e-06, + "logits/chosen": 278275640.8888889, + "logits/rejected": 244752859.42857143, + "logps/chosen": -323.03095160590277, + "logps/rejected": -392.0946568080357, + "loss": 0.1203, + "rewards/chosen": 1.833234151204427, + "rewards/margins": 8.363872709728422, + "rewards/rejected": -6.530638558523996, + "step": 949 + }, + { + "epoch": 0.35069909095104057, + "grad_norm": 6.59375, + "kl": 0.0, + "learning_rate": 7.400822083470046e-06, + "logits/chosen": 231383883.29411766, + "logits/rejected": 265903564.8, + "logps/chosen": -298.63195082720586, + "logps/rejected": -362.88684895833336, + "loss": 0.1215, + "rewards/chosen": 1.9470986758961397, + "rewards/margins": 7.436140830844056, + "rewards/rejected": -5.489042154947916, + "step": 950 + }, + { + "epoch": 0.35106824788888374, + "grad_norm": 5.46875, + "kl": 0.0, + "learning_rate": 7.395657930775191e-06, + "logits/chosen": 262272271.05882353, + "logits/rejected": 205419144.53333333, + "logps/chosen": -347.2985409007353, + "logps/rejected": -521.8093098958333, + "loss": 0.081, + "rewards/chosen": 2.613529878504136, + "rewards/margins": 9.584531058517157, + "rewards/rejected": -6.971001180013021, + "step": 951 + }, + { + "epoch": 0.351437404826727, + "grad_norm": 3.90625, + "kl": 0.0, + "learning_rate": 7.390490458926708e-06, + "logits/chosen": 232275642.1818182, + "logits/rejected": 173023780.57142857, + "logps/chosen": -359.3814142400568, + "logps/rejected": -323.29999069940476, + "loss": 0.0631, + "rewards/chosen": 2.301491130482067, + "rewards/margins": 7.537760763457328, + "rewards/rejected": -5.236269632975261, + "step": 952 + }, + { + "epoch": 0.35180656176457015, + "grad_norm": 4.40625, + "kl": 0.0, + "learning_rate": 7.385319675084066e-06, + "logits/chosen": 204562525.0909091, + "logits/rejected": 255327890.2857143, + "logps/chosen": -309.82155539772725, + "logps/rejected": -496.99730282738096, + "loss": 0.0697, + "rewards/chosen": 2.3645803278142754, + "rewards/margins": 9.04910268412008, + "rewards/rejected": -6.684522356305804, + "step": 953 + }, + { + "epoch": 0.3521757187024134, + "grad_norm": 5.84375, + "kl": 0.0, + "learning_rate": 7.38014558641132e-06, + "logits/chosen": 216037580.8, + "logits/rejected": 195811734.5882353, + "logps/chosen": -377.54791666666665, + "logps/rejected": -381.36345358455884, + "loss": 0.0863, + "rewards/chosen": 2.313435363769531, + "rewards/margins": 8.773017075482537, + "rewards/rejected": -6.459581711713006, + "step": 954 + }, + { + "epoch": 0.35254487564025655, + "grad_norm": 6.53125, + "kl": 0.13321352005004883, + "learning_rate": 7.3749682000771016e-06, + "logits/chosen": 350521312.0, + "logits/rejected": 170808048.0, + "logps/chosen": -325.315673828125, + "logps/rejected": -365.4849853515625, + "loss": 0.1193, + "rewards/chosen": 2.123203754425049, + "rewards/margins": 7.318618297576904, + "rewards/rejected": -5.1954145431518555, + "step": 955 + }, + { + "epoch": 0.3529140325780998, + "grad_norm": 6.46875, + "kl": 0.6616029739379883, + "learning_rate": 7.369787523254617e-06, + "logits/chosen": 256819638.85714287, + "logits/rejected": 192924928.0, + "logps/chosen": -407.72191220238096, + "logps/rejected": -237.9126642400568, + "loss": 0.1012, + "rewards/chosen": 2.5785922095889138, + "rewards/margins": 7.7193573130157604, + "rewards/rejected": -5.140765103426847, + "step": 956 + }, + { + "epoch": 0.35328318951594295, + "grad_norm": 5.59375, + "kl": 0.0, + "learning_rate": 7.364603563121627e-06, + "logits/chosen": 232946634.10526314, + "logits/rejected": 192344969.84615386, + "logps/chosen": -310.6288548519737, + "logps/rejected": -449.5780498798077, + "loss": 0.1104, + "rewards/chosen": 1.9947999653063322, + "rewards/margins": 8.257221793356212, + "rewards/rejected": -6.26242182804988, + "step": 957 + }, + { + "epoch": 0.3536523464537862, + "grad_norm": 5.5, + "kl": 0.23700284957885742, + "learning_rate": 7.359416326860443e-06, + "logits/chosen": 190084713.4117647, + "logits/rejected": 215100177.06666666, + "logps/chosen": -341.2613740808824, + "logps/rejected": -465.3077799479167, + "loss": 0.0898, + "rewards/chosen": 2.2891194960650276, + "rewards/margins": 8.96619774313534, + "rewards/rejected": -6.6770782470703125, + "step": 958 + }, + { + "epoch": 0.35402150339162936, + "grad_norm": 4.46875, + "kl": 1.4403047561645508, + "learning_rate": 7.3542258216579136e-06, + "logits/chosen": 217280030.11764705, + "logits/rejected": 270800366.93333334, + "logps/chosen": -342.77516084558823, + "logps/rejected": -483.373046875, + "loss": 0.0938, + "rewards/chosen": 2.438746732823989, + "rewards/margins": 9.433783048741958, + "rewards/rejected": -6.995036315917969, + "step": 959 + }, + { + "epoch": 0.3543906603294726, + "grad_norm": 4.4375, + "kl": 0.0, + "learning_rate": 7.349032054705417e-06, + "logits/chosen": 259819922.2857143, + "logits/rejected": 197054919.1111111, + "logps/chosen": -349.02329799107144, + "logps/rejected": -435.53721788194446, + "loss": 0.0678, + "rewards/chosen": 2.5721304757254466, + "rewards/margins": 9.299436417836992, + "rewards/rejected": -6.7273059421115455, + "step": 960 + }, + { + "epoch": 0.35475981726731576, + "grad_norm": 5.78125, + "kl": 0.0, + "learning_rate": 7.343835033198854e-06, + "logits/chosen": 204308019.2, + "logits/rejected": 216129536.0, + "logps/chosen": -333.587890625, + "logps/rejected": -440.84811580882354, + "loss": 0.0925, + "rewards/chosen": 1.965673065185547, + "rewards/margins": 8.68054194730871, + "rewards/rejected": -6.714868882123162, + "step": 961 + }, + { + "epoch": 0.355128974205159, + "grad_norm": 6.3125, + "kl": 0.0, + "learning_rate": 7.33863476433863e-06, + "logits/chosen": 282324128.0, + "logits/rejected": 209273216.0, + "logps/chosen": -360.3106689453125, + "logps/rejected": -488.2525329589844, + "loss": 0.0856, + "rewards/chosen": 2.083071708679199, + "rewards/margins": 9.390039443969727, + "rewards/rejected": -7.306967735290527, + "step": 962 + }, + { + "epoch": 0.35549813114300216, + "grad_norm": 5.0, + "kl": 0.2313861846923828, + "learning_rate": 7.333431255329653e-06, + "logits/chosen": 233449453.7142857, + "logits/rejected": 246625507.55555555, + "logps/chosen": -340.65042550223217, + "logps/rejected": -431.2430826822917, + "loss": 0.0819, + "rewards/chosen": 2.5850464957101003, + "rewards/margins": 7.671346815805586, + "rewards/rejected": -5.086300320095486, + "step": 963 + }, + { + "epoch": 0.3558672880808454, + "grad_norm": 6.09375, + "kl": 0.7225837707519531, + "learning_rate": 7.3282245133813155e-06, + "logits/chosen": 180485563.73333332, + "logits/rejected": 212709842.82352942, + "logps/chosen": -295.078515625, + "logps/rejected": -464.0376838235294, + "loss": 0.1027, + "rewards/chosen": 1.912786356608073, + "rewards/margins": 8.176871565276501, + "rewards/rejected": -6.264085208668428, + "step": 964 + }, + { + "epoch": 0.35623644501868856, + "grad_norm": 3.90625, + "kl": 0.0, + "learning_rate": 7.323014545707497e-06, + "logits/chosen": 292394154.6666667, + "logits/rejected": 194665709.7142857, + "logps/chosen": -389.8111979166667, + "logps/rejected": -390.02085658482144, + "loss": 0.0591, + "rewards/chosen": 3.0189516279432507, + "rewards/margins": 9.273040983412, + "rewards/rejected": -6.25408935546875, + "step": 965 + }, + { + "epoch": 0.3566056019565318, + "grad_norm": 5.53125, + "kl": 0.0, + "learning_rate": 7.317801359526538e-06, + "logits/chosen": 251254028.19047618, + "logits/rejected": 192310853.8181818, + "logps/chosen": -418.8876488095238, + "logps/rejected": -359.7811168323864, + "loss": 0.0844, + "rewards/chosen": 3.1488534836542037, + "rewards/margins": 8.93456251796706, + "rewards/rejected": -5.785709034312855, + "step": 966 + }, + { + "epoch": 0.35697475889437497, + "grad_norm": 6.15625, + "kl": 1.2166204452514648, + "learning_rate": 7.312584962061243e-06, + "logits/chosen": 222767314.82352942, + "logits/rejected": 238988441.6, + "logps/chosen": -370.6739717371324, + "logps/rejected": -409.4224609375, + "loss": 0.0938, + "rewards/chosen": 2.412777171415441, + "rewards/margins": 8.393355784696691, + "rewards/rejected": -5.98057861328125, + "step": 967 + }, + { + "epoch": 0.3573439158322182, + "grad_norm": 6.53125, + "kl": 0.203338623046875, + "learning_rate": 7.307365360538865e-06, + "logits/chosen": 200109824.0, + "logits/rejected": 273957944.8888889, + "logps/chosen": -342.27786690848217, + "logps/rejected": -449.88433159722223, + "loss": 0.1055, + "rewards/chosen": 2.479891095842634, + "rewards/margins": 7.707979898604135, + "rewards/rejected": -5.228088802761501, + "step": 968 + }, + { + "epoch": 0.35771307277006137, + "grad_norm": 6.5625, + "kl": 0.042795658111572266, + "learning_rate": 7.302142562191092e-06, + "logits/chosen": 212461312.0, + "logits/rejected": 160161715.2, + "logps/chosen": -380.54794034090907, + "logps/rejected": -362.4730224609375, + "loss": 0.1026, + "rewards/chosen": 2.5436689203435723, + "rewards/margins": 8.279366475885563, + "rewards/rejected": -5.735697555541992, + "step": 969 + }, + { + "epoch": 0.3580822297079046, + "grad_norm": 7.4375, + "kl": 1.2956523895263672, + "learning_rate": 7.2969165742540495e-06, + "logits/chosen": 232213290.66666666, + "logits/rejected": 152243538.2857143, + "logps/chosen": -405.7323947482639, + "logps/rejected": -375.17539760044644, + "loss": 0.1734, + "rewards/chosen": 1.8105197482638888, + "rewards/margins": 8.347593701074993, + "rewards/rejected": -6.537073952811105, + "step": 970 + }, + { + "epoch": 0.35845138664574777, + "grad_norm": 5.28125, + "kl": 0.1955556869506836, + "learning_rate": 7.2916874039682765e-06, + "logits/chosen": 335544285.8666667, + "logits/rejected": 285549387.2941176, + "logps/chosen": -428.23759765625, + "logps/rejected": -370.0078125, + "loss": 0.0815, + "rewards/chosen": 2.994460042317708, + "rewards/margins": 8.22168181176279, + "rewards/rejected": -5.227221769445083, + "step": 971 + }, + { + "epoch": 0.358820543583591, + "grad_norm": 7.5625, + "kl": 0.0, + "learning_rate": 7.286455058578719e-06, + "logits/chosen": 266597922.13333333, + "logits/rejected": 237409415.52941176, + "logps/chosen": -383.95478515625, + "logps/rejected": -362.3804285386029, + "loss": 0.1229, + "rewards/chosen": 1.7737857818603515, + "rewards/margins": 6.699929203706629, + "rewards/rejected": -4.926143421846278, + "step": 972 + }, + { + "epoch": 0.3591897005214342, + "grad_norm": 5.90625, + "kl": 0.0, + "learning_rate": 7.281219545334727e-06, + "logits/chosen": 208797440.0, + "logits/rejected": 248744880.0, + "logps/chosen": -380.65753173828125, + "logps/rejected": -373.76031494140625, + "loss": 0.0832, + "rewards/chosen": 2.7320666313171387, + "rewards/margins": 8.397661209106445, + "rewards/rejected": -5.665594577789307, + "step": 973 + }, + { + "epoch": 0.35955885745927735, + "grad_norm": 3.78125, + "kl": 0.0, + "learning_rate": 7.275980871490038e-06, + "logits/chosen": 198538770.2857143, + "logits/rejected": 115500828.44444445, + "logps/chosen": -409.8055943080357, + "logps/rejected": -327.7509765625, + "loss": 0.0645, + "rewards/chosen": 3.3151324135916576, + "rewards/margins": 9.112366418989877, + "rewards/rejected": -5.79723400539822, + "step": 974 + }, + { + "epoch": 0.3599280143971206, + "grad_norm": 4.84375, + "kl": 0.0, + "learning_rate": 7.270739044302767e-06, + "logits/chosen": 316267068.2352941, + "logits/rejected": 230800930.13333333, + "logps/chosen": -368.4132295496324, + "logps/rejected": -525.5564127604167, + "loss": 0.0845, + "rewards/chosen": 2.7752609252929688, + "rewards/margins": 11.140839131673177, + "rewards/rejected": -8.365578206380208, + "step": 975 + }, + { + "epoch": 0.36029717133496375, + "grad_norm": 3.609375, + "kl": 0.0, + "learning_rate": 7.265494071035401e-06, + "logits/chosen": 148842736.0, + "logits/rejected": 226768288.0, + "logps/chosen": -337.74859619140625, + "logps/rejected": -307.6744079589844, + "loss": 0.072, + "rewards/chosen": 3.600320339202881, + "rewards/margins": 9.205461502075195, + "rewards/rejected": -5.6051411628723145, + "step": 976 + }, + { + "epoch": 0.360666328272807, + "grad_norm": 5.40625, + "kl": 0.0, + "learning_rate": 7.260245958954783e-06, + "logits/chosen": 312163749.64705884, + "logits/rejected": 377240746.6666667, + "logps/chosen": -400.31677964154414, + "logps/rejected": -527.7004557291667, + "loss": 0.0697, + "rewards/chosen": 2.5587355669806984, + "rewards/margins": 10.012367158777574, + "rewards/rejected": -7.453631591796875, + "step": 977 + }, + { + "epoch": 0.36103548521065015, + "grad_norm": 4.4375, + "kl": 0.0, + "learning_rate": 7.254994715332102e-06, + "logits/chosen": 245982208.0, + "logits/rejected": 155919760.0, + "logps/chosen": -380.5256042480469, + "logps/rejected": -346.00042724609375, + "loss": 0.061, + "rewards/chosen": 3.401848316192627, + "rewards/margins": 9.049612998962402, + "rewards/rejected": -5.647764682769775, + "step": 978 + }, + { + "epoch": 0.3614046421484934, + "grad_norm": 6.75, + "kl": 0.0, + "learning_rate": 7.249740347442895e-06, + "logits/chosen": 207725218.9090909, + "logits/rejected": 218311168.0, + "logps/chosen": -363.12109375, + "logps/rejected": -378.102587890625, + "loss": 0.1273, + "rewards/chosen": 2.615274429321289, + "rewards/margins": 8.016198348999023, + "rewards/rejected": -5.4009239196777346, + "step": 979 + }, + { + "epoch": 0.36177379908633656, + "grad_norm": 6.90625, + "kl": 0.6884875297546387, + "learning_rate": 7.244482862567018e-06, + "logits/chosen": 293245297.7777778, + "logits/rejected": 310573458.28571427, + "logps/chosen": -431.39105902777777, + "logps/rejected": -409.1774204799107, + "loss": 0.1059, + "rewards/chosen": 1.9875787099202473, + "rewards/margins": 8.007318678356352, + "rewards/rejected": -6.019739968436105, + "step": 980 + }, + { + "epoch": 0.3621429560241798, + "grad_norm": 6.09375, + "kl": 0.0, + "learning_rate": 7.2392222679886506e-06, + "logits/chosen": 307235424.0, + "logits/rejected": 237950112.0, + "logps/chosen": -382.08453369140625, + "logps/rejected": -536.1615600585938, + "loss": 0.0944, + "rewards/chosen": 1.7383782863616943, + "rewards/margins": 9.515122175216675, + "rewards/rejected": -7.7767438888549805, + "step": 981 + }, + { + "epoch": 0.36251211296202296, + "grad_norm": 6.8125, + "kl": 0.4684715270996094, + "learning_rate": 7.23395857099628e-06, + "logits/chosen": 172888644.26666668, + "logits/rejected": 217168384.0, + "logps/chosen": -407.73333333333335, + "logps/rejected": -350.39952895220586, + "loss": 0.0965, + "rewards/chosen": 2.6275919596354167, + "rewards/margins": 7.857098957136566, + "rewards/rejected": -5.229506997501149, + "step": 982 + }, + { + "epoch": 0.3628812698998662, + "grad_norm": 5.875, + "kl": 0.0, + "learning_rate": 7.2286917788826926e-06, + "logits/chosen": 250140689.06666666, + "logits/rejected": 300704075.2941176, + "logps/chosen": -383.56956380208334, + "logps/rejected": -495.39206112132354, + "loss": 0.0854, + "rewards/chosen": 2.7941741943359375, + "rewards/margins": 8.62444843965418, + "rewards/rejected": -5.830274245318244, + "step": 983 + }, + { + "epoch": 0.36325042683770936, + "grad_norm": 6.28125, + "kl": 0.0, + "learning_rate": 7.22342189894496e-06, + "logits/chosen": 252006052.57142857, + "logits/rejected": 236131953.7777778, + "logps/chosen": -366.44105747767856, + "logps/rejected": -353.78868272569446, + "loss": 0.1246, + "rewards/chosen": 2.543064662388393, + "rewards/margins": 7.46530036320762, + "rewards/rejected": -4.922235700819227, + "step": 984 + }, + { + "epoch": 0.3636195837755526, + "grad_norm": 5.28125, + "kl": 0.0, + "learning_rate": 7.218148938484435e-06, + "logits/chosen": 204903495.1111111, + "logits/rejected": 219664036.57142857, + "logps/chosen": -364.42192925347223, + "logps/rejected": -429.77235630580356, + "loss": 0.1168, + "rewards/chosen": 2.3818331824408636, + "rewards/margins": 7.544419939555819, + "rewards/rejected": -5.162586757114956, + "step": 985 + }, + { + "epoch": 0.36398874071339576, + "grad_norm": 5.28125, + "kl": 0.0, + "learning_rate": 7.212872904806736e-06, + "logits/chosen": 248533435.73333332, + "logits/rejected": 337695894.5882353, + "logps/chosen": -380.51764322916665, + "logps/rejected": -415.53644875919116, + "loss": 0.0938, + "rewards/chosen": 2.327862294514974, + "rewards/margins": 7.615238249535654, + "rewards/rejected": -5.28737595502068, + "step": 986 + }, + { + "epoch": 0.364357897651239, + "grad_norm": 4.34375, + "kl": 0.0, + "learning_rate": 7.207593805221742e-06, + "logits/chosen": 185348216.47058824, + "logits/rejected": 221776128.0, + "logps/chosen": -357.76551011029414, + "logps/rejected": -330.5908528645833, + "loss": 0.0496, + "rewards/chosen": 3.4370788125430836, + "rewards/margins": 9.454285640342563, + "rewards/rejected": -6.0172068277994795, + "step": 987 + }, + { + "epoch": 0.36472705458908217, + "grad_norm": 6.0625, + "kl": 0.20684337615966797, + "learning_rate": 7.202311647043579e-06, + "logits/chosen": 159308960.0, + "logits/rejected": 240736176.0, + "logps/chosen": -302.7579650878906, + "logps/rejected": -405.6474609375, + "loss": 0.0825, + "rewards/chosen": 2.466456890106201, + "rewards/margins": 8.153029441833496, + "rewards/rejected": -5.686572551727295, + "step": 988 + }, + { + "epoch": 0.3650962115269254, + "grad_norm": 4.875, + "kl": 0.0, + "learning_rate": 7.197026437590608e-06, + "logits/chosen": 269110546.28571427, + "logits/rejected": 204611953.7777778, + "logps/chosen": -394.74434988839283, + "logps/rejected": -340.31776258680554, + "loss": 0.054, + "rewards/chosen": 3.2902327946254184, + "rewards/margins": 8.708233333769298, + "rewards/rejected": -5.41800053914388, + "step": 989 + }, + { + "epoch": 0.36546536846476857, + "grad_norm": 5.75, + "kl": 0.550687313079834, + "learning_rate": 7.191738184185422e-06, + "logits/chosen": 324890214.4, + "logits/rejected": 219833645.17647058, + "logps/chosen": -380.1625, + "logps/rejected": -386.6565946691176, + "loss": 0.0863, + "rewards/chosen": 2.4951182047526044, + "rewards/margins": 8.584969584147135, + "rewards/rejected": -6.089851379394531, + "step": 990 + }, + { + "epoch": 0.3658345254026118, + "grad_norm": 5.59375, + "kl": 0.0, + "learning_rate": 7.186446894154826e-06, + "logits/chosen": 207344400.0, + "logits/rejected": 199474656.0, + "logps/chosen": -359.9826354980469, + "logps/rejected": -499.041748046875, + "loss": 0.0824, + "rewards/chosen": 2.4720609188079834, + "rewards/margins": 8.007044553756714, + "rewards/rejected": -5.5349836349487305, + "step": 991 + }, + { + "epoch": 0.366203682340455, + "grad_norm": 5.84375, + "kl": 0.0, + "learning_rate": 7.181152574829837e-06, + "logits/chosen": 294210379.2941176, + "logits/rejected": 422514824.53333336, + "logps/chosen": -334.2491096047794, + "logps/rejected": -580.4869791666666, + "loss": 0.103, + "rewards/chosen": 2.3674127915326286, + "rewards/margins": 8.116124949735752, + "rewards/rejected": -5.748712158203125, + "step": 992 + }, + { + "epoch": 0.3665728392782982, + "grad_norm": 4.34375, + "kl": 0.0, + "learning_rate": 7.175855233545669e-06, + "logits/chosen": 291196885.3333333, + "logits/rejected": 177654284.8, + "logps/chosen": -483.9007975260417, + "logps/rejected": -395.936865234375, + "loss": 0.0545, + "rewards/chosen": 2.503406524658203, + "rewards/margins": 9.200653839111329, + "rewards/rejected": -6.697247314453125, + "step": 993 + }, + { + "epoch": 0.3669419962161414, + "grad_norm": 6.5625, + "kl": 0.58538818359375, + "learning_rate": 7.1705548776417165e-06, + "logits/chosen": 251816476.44444445, + "logits/rejected": 207873462.85714287, + "logps/chosen": -427.59703233506946, + "logps/rejected": -450.25606863839283, + "loss": 0.1188, + "rewards/chosen": 2.0079718695746527, + "rewards/margins": 9.109878782242063, + "rewards/rejected": -7.101906912667411, + "step": 994 + }, + { + "epoch": 0.3673111531539846, + "grad_norm": 5.5625, + "kl": 1.503199577331543, + "learning_rate": 7.1652515144615575e-06, + "logits/chosen": 272447649.68421054, + "logits/rejected": 157557710.76923078, + "logps/chosen": -417.9930355674342, + "logps/rejected": -455.71473106971155, + "loss": 0.073, + "rewards/chosen": 3.2867746855083264, + "rewards/margins": 8.313094336011632, + "rewards/rejected": -5.026319650503305, + "step": 995 + }, + { + "epoch": 0.3676803100918278, + "grad_norm": 6.0, + "kl": 0.0, + "learning_rate": 7.1599451513529364e-06, + "logits/chosen": 192352187.73333332, + "logits/rejected": 186157266.82352942, + "logps/chosen": -311.49462890625, + "logps/rejected": -433.7213350183824, + "loss": 0.0743, + "rewards/chosen": 2.473075358072917, + "rewards/margins": 8.902497175628064, + "rewards/rejected": -6.429421817555147, + "step": 996 + }, + { + "epoch": 0.368049467029671, + "grad_norm": 7.625, + "kl": 1.5352163314819336, + "learning_rate": 7.154635795667748e-06, + "logits/chosen": 171877526.5882353, + "logits/rejected": 373924761.6, + "logps/chosen": -416.8756318933824, + "logps/rejected": -496.0944010416667, + "loss": 0.1029, + "rewards/chosen": 2.1749770220588234, + "rewards/margins": 8.375379854090074, + "rewards/rejected": -6.20040283203125, + "step": 997 + }, + { + "epoch": 0.3684186239675142, + "grad_norm": 4.8125, + "kl": 0.0, + "learning_rate": 7.149323454762039e-06, + "logits/chosen": 246324352.0, + "logits/rejected": 260235463.1111111, + "logps/chosen": -343.35543387276783, + "logps/rejected": -404.68071831597223, + "loss": 0.0482, + "rewards/chosen": 3.359252384730748, + "rewards/margins": 8.957631307934959, + "rewards/rejected": -5.59837892320421, + "step": 998 + }, + { + "epoch": 0.3687877809053574, + "grad_norm": 4.59375, + "kl": 0.0, + "learning_rate": 7.144008135995992e-06, + "logits/chosen": 208259793.45454547, + "logits/rejected": 254314276.57142857, + "logps/chosen": -291.5108087713068, + "logps/rejected": -514.6843843005952, + "loss": 0.0706, + "rewards/chosen": 1.9460260217840022, + "rewards/margins": 9.132595962260194, + "rewards/rejected": -7.186569940476191, + "step": 999 + }, + { + "epoch": 0.3691569378432006, + "grad_norm": 5.53125, + "kl": 0.0, + "learning_rate": 7.1386898467339114e-06, + "logits/chosen": 403903330.46153843, + "logits/rejected": 197926642.52631578, + "logps/chosen": -485.35355318509613, + "logps/rejected": -420.27700966282896, + "loss": 0.0739, + "rewards/chosen": 2.2416317279522238, + "rewards/margins": 8.353034540709213, + "rewards/rejected": -6.11140281275699, + "step": 1000 + }, + { + "epoch": 0.3695260947810438, + "grad_norm": 6.03125, + "kl": 0.0, + "learning_rate": 7.13336859434422e-06, + "logits/chosen": 235814987.29411766, + "logits/rejected": 324037529.6, + "logps/chosen": -272.66989315257354, + "logps/rejected": -445.77216796875, + "loss": 0.1131, + "rewards/chosen": 2.968296724207261, + "rewards/margins": 8.598731904871324, + "rewards/rejected": -5.630435180664063, + "step": 1001 + }, + { + "epoch": 0.369895251718887, + "grad_norm": 4.34375, + "kl": 3.2764945030212402, + "learning_rate": 7.128044386199445e-06, + "logits/chosen": 320169391.15789473, + "logits/rejected": 290573312.0, + "logps/chosen": -394.7205746299342, + "logps/rejected": -457.1506535456731, + "loss": 0.0589, + "rewards/chosen": 3.297794542814556, + "rewards/margins": 10.42380397039869, + "rewards/rejected": -7.126009427584135, + "step": 1002 + }, + { + "epoch": 0.3702644086567302, + "grad_norm": 5.96875, + "kl": 0.7743511199951172, + "learning_rate": 7.1227172296762086e-06, + "logits/chosen": 209374483.69230768, + "logits/rejected": 236287541.89473686, + "logps/chosen": -250.0821814903846, + "logps/rejected": -375.78114720394734, + "loss": 0.1324, + "rewards/chosen": 2.0462882702167215, + "rewards/margins": 7.50827581583247, + "rewards/rejected": -5.461987545615749, + "step": 1003 + }, + { + "epoch": 0.3706335655945734, + "grad_norm": 6.0, + "kl": 0.0, + "learning_rate": 7.11738713215522e-06, + "logits/chosen": 192705670.7368421, + "logits/rejected": 205584974.76923078, + "logps/chosen": -377.7074938322368, + "logps/rejected": -343.1805889423077, + "loss": 0.0762, + "rewards/chosen": 2.9750952469675163, + "rewards/margins": 9.195138722778815, + "rewards/rejected": -6.220043475811298, + "step": 1004 + }, + { + "epoch": 0.3710027225324166, + "grad_norm": 6.40625, + "kl": 0.9175558090209961, + "learning_rate": 7.112054101021262e-06, + "logits/chosen": 240836522.66666666, + "logits/rejected": 175024179.2, + "logps/chosen": -475.52099609375, + "logps/rejected": -358.035009765625, + "loss": 0.0792, + "rewards/chosen": 2.5425802866617837, + "rewards/margins": 8.967702356974284, + "rewards/rejected": -6.4251220703125, + "step": 1005 + }, + { + "epoch": 0.3713718794702598, + "grad_norm": 8.5625, + "kl": 0.0, + "learning_rate": 7.106718143663178e-06, + "logits/chosen": 203315724.8, + "logits/rejected": 151299509.33333334, + "logps/chosen": -375.740380859375, + "logps/rejected": -405.451904296875, + "loss": 0.0909, + "rewards/chosen": 3.3642139434814453, + "rewards/margins": 9.479467391967773, + "rewards/rejected": -6.115253448486328, + "step": 1006 + }, + { + "epoch": 0.371741036408103, + "grad_norm": 7.5, + "kl": 0.0, + "learning_rate": 7.101379267473873e-06, + "logits/chosen": 284210403.5555556, + "logits/rejected": 202798701.7142857, + "logps/chosen": -378.34412977430554, + "logps/rejected": -487.31766183035717, + "loss": 0.1129, + "rewards/chosen": 2.504256990220812, + "rewards/margins": 8.814088881961883, + "rewards/rejected": -6.309831891741071, + "step": 1007 + }, + { + "epoch": 0.3721101933459462, + "grad_norm": 6.03125, + "kl": 0.0, + "learning_rate": 7.096037479850292e-06, + "logits/chosen": 147230298.3529412, + "logits/rejected": 249866786.13333333, + "logps/chosen": -267.32740693933823, + "logps/rejected": -416.3598958333333, + "loss": 0.127, + "rewards/chosen": 2.108600840849035, + "rewards/margins": 8.084814423205806, + "rewards/rejected": -5.976213582356771, + "step": 1008 + }, + { + "epoch": 0.3724793502837894, + "grad_norm": 4.84375, + "kl": 0.0, + "learning_rate": 7.090692788193409e-06, + "logits/chosen": 206768759.46666667, + "logits/rejected": 252272850.82352942, + "logps/chosen": -272.39365234375, + "logps/rejected": -410.4867589613971, + "loss": 0.0915, + "rewards/chosen": 2.5204569498697915, + "rewards/margins": 8.099865483302695, + "rewards/rejected": -5.579408533432904, + "step": 1009 + }, + { + "epoch": 0.3728485072216326, + "grad_norm": 4.53125, + "kl": 0.05621814727783203, + "learning_rate": 7.085345199908234e-06, + "logits/chosen": 194549248.0, + "logits/rejected": 224275456.0, + "logps/chosen": -396.3444260817308, + "logps/rejected": -481.9495271381579, + "loss": 0.068, + "rewards/chosen": 2.326700357290415, + "rewards/margins": 9.160365903908424, + "rewards/rejected": -6.83366554661801, + "step": 1010 + }, + { + "epoch": 0.37321766415947577, + "grad_norm": 4.9375, + "kl": 0.0, + "learning_rate": 7.0799947224037765e-06, + "logits/chosen": 224353837.17647058, + "logits/rejected": 164159027.2, + "logps/chosen": -347.3394990808824, + "logps/rejected": -458.375390625, + "loss": 0.0761, + "rewards/chosen": 2.5308617984547332, + "rewards/margins": 9.694123212028952, + "rewards/rejected": -7.163261413574219, + "step": 1011 + }, + { + "epoch": 0.373586821097319, + "grad_norm": 6.28125, + "kl": 0.0, + "learning_rate": 7.074641363093058e-06, + "logits/chosen": 250010416.0, + "logits/rejected": 150475760.0, + "logps/chosen": -343.0478820800781, + "logps/rejected": -386.94293212890625, + "loss": 0.0986, + "rewards/chosen": 2.0532169342041016, + "rewards/margins": 8.250295639038086, + "rewards/rejected": -6.197078704833984, + "step": 1012 + }, + { + "epoch": 0.3739559780351622, + "grad_norm": 4.0, + "kl": 0.0, + "learning_rate": 7.0692851293930885e-06, + "logits/chosen": 197155093.33333334, + "logits/rejected": 227226521.6, + "logps/chosen": -376.658203125, + "logps/rejected": -426.30546875, + "loss": 0.058, + "rewards/chosen": 3.2991367975870767, + "rewards/margins": 9.240544573465982, + "rewards/rejected": -5.941407775878906, + "step": 1013 + }, + { + "epoch": 0.3743251349730054, + "grad_norm": 4.90625, + "kl": 0.0, + "learning_rate": 7.063926028724861e-06, + "logits/chosen": 210542987.63636363, + "logits/rejected": 260333665.52380952, + "logps/chosen": -279.79838423295456, + "logps/rejected": -357.5131603422619, + "loss": 0.0853, + "rewards/chosen": 2.5774213617498223, + "rewards/margins": 7.5722377331225905, + "rewards/rejected": -4.994816371372768, + "step": 1014 + }, + { + "epoch": 0.3746942919108486, + "grad_norm": 6.0, + "kl": 1.4225330352783203, + "learning_rate": 7.058564068513344e-06, + "logits/chosen": 200798352.0, + "logits/rejected": 241072368.0, + "logps/chosen": -282.922119140625, + "logps/rejected": -335.4027099609375, + "loss": 0.0866, + "rewards/chosen": 2.96512770652771, + "rewards/margins": 8.404102087020874, + "rewards/rejected": -5.438974380493164, + "step": 1015 + }, + { + "epoch": 0.3750634488486918, + "grad_norm": 3.5, + "kl": 0.4922051429748535, + "learning_rate": 7.053199256187464e-06, + "logits/chosen": 143406414.76923078, + "logits/rejected": 189202970.9473684, + "logps/chosen": -286.6671799879808, + "logps/rejected": -313.54415090460526, + "loss": 0.0811, + "rewards/chosen": 2.934472891000601, + "rewards/margins": 7.887496299589211, + "rewards/rejected": -4.95302340858861, + "step": 1016 + }, + { + "epoch": 0.375432605786535, + "grad_norm": 6.15625, + "kl": 2.0399045944213867, + "learning_rate": 7.047831599180099e-06, + "logits/chosen": 220985702.4, + "logits/rejected": 153482912.0, + "logps/chosen": -333.277197265625, + "logps/rejected": -393.526611328125, + "loss": 0.1256, + "rewards/chosen": 2.6946287155151367, + "rewards/margins": 7.997757275899251, + "rewards/rejected": -5.303128560384114, + "step": 1017 + }, + { + "epoch": 0.375432605786535, + "eval_kl": 0.3321315050125122, + "eval_logits/chosen": 237900924.32671082, + "eval_logits/rejected": 202945974.16548464, + "eval_logps/chosen": -357.3242618653422, + "eval_logps/rejected": -441.22236997635935, + "eval_loss": 0.08742756396532059, + "eval_rewards/chosen": 2.6221295960954745, + "eval_rewards/margins": 8.91900286460168, + "eval_rewards/rejected": -6.296873268506205, + "eval_runtime": 46.7434, + "eval_samples_per_second": 18.741, + "eval_steps_per_second": 4.685, + "step": 1017 + }, + { + "epoch": 0.3758017627243782, + "grad_norm": 6.21875, + "kl": 0.0, + "learning_rate": 7.042461104928072e-06, + "logits/chosen": 213025649.7777778, + "logits/rejected": 261864905.14285713, + "logps/chosen": -329.1077473958333, + "logps/rejected": -409.9056919642857, + "loss": 0.117, + "rewards/chosen": 2.398416943020291, + "rewards/margins": 8.183313793606228, + "rewards/rejected": -5.7848968505859375, + "step": 1018 + }, + { + "epoch": 0.3761709196622214, + "grad_norm": 5.90625, + "kl": 0.4898395538330078, + "learning_rate": 7.037087780872134e-06, + "logits/chosen": 259462087.1111111, + "logits/rejected": 144803273.14285713, + "logps/chosen": -462.2823893229167, + "logps/rejected": -429.54813058035717, + "loss": 0.0949, + "rewards/chosen": 2.5875606536865234, + "rewards/margins": 9.527287346976145, + "rewards/rejected": -6.939726693289621, + "step": 1019 + }, + { + "epoch": 0.3765400766000646, + "grad_norm": 6.09375, + "kl": 0.0, + "learning_rate": 7.031711634456954e-06, + "logits/chosen": 218230937.6, + "logits/rejected": 234375213.17647058, + "logps/chosen": -343.7388020833333, + "logps/rejected": -383.7456916360294, + "loss": 0.1008, + "rewards/chosen": 2.1074156443277996, + "rewards/margins": 7.9165780086143345, + "rewards/rejected": -5.809162364286535, + "step": 1020 + }, + { + "epoch": 0.3769092335379078, + "grad_norm": 5.9375, + "kl": 0.0, + "learning_rate": 7.02633267313112e-06, + "logits/chosen": 293054016.0, + "logits/rejected": 209220432.0, + "logps/chosen": -381.44195556640625, + "logps/rejected": -469.0828857421875, + "loss": 0.0806, + "rewards/chosen": 2.720479726791382, + "rewards/margins": 9.606562376022339, + "rewards/rejected": -6.886082649230957, + "step": 1021 + }, + { + "epoch": 0.377278390475751, + "grad_norm": 6.875, + "kl": 0.6336736679077148, + "learning_rate": 7.02095090434711e-06, + "logits/chosen": 188292633.6, + "logits/rejected": 177775968.0, + "logps/chosen": -317.303125, + "logps/rejected": -346.2544352213542, + "loss": 0.1235, + "rewards/chosen": 2.4350152969360352, + "rewards/margins": 8.584208106994629, + "rewards/rejected": -6.149192810058594, + "step": 1022 + }, + { + "epoch": 0.3776475474135942, + "grad_norm": 4.375, + "kl": 0.11547136306762695, + "learning_rate": 7.015566335561297e-06, + "logits/chosen": 204560718.76923078, + "logits/rejected": 221331375.15789473, + "logps/chosen": -411.8157301682692, + "logps/rejected": -334.03189247532896, + "loss": 0.0708, + "rewards/chosen": 2.2469987135667067, + "rewards/margins": 7.46131603534405, + "rewards/rejected": -5.214317321777344, + "step": 1023 + }, + { + "epoch": 0.3780167043514374, + "grad_norm": 7.65625, + "kl": 0.6102094650268555, + "learning_rate": 7.010178974233936e-06, + "logits/chosen": 193960905.14285713, + "logits/rejected": 203929486.2222222, + "logps/chosen": -348.11851283482144, + "logps/rejected": -359.3824055989583, + "loss": 0.1058, + "rewards/chosen": 2.054826055254255, + "rewards/margins": 7.858932873559377, + "rewards/rejected": -5.804106818305121, + "step": 1024 + }, + { + "epoch": 0.3783858612892806, + "grad_norm": 4.96875, + "kl": 0.0, + "learning_rate": 7.004788827829143e-06, + "logits/chosen": 159392768.0, + "logits/rejected": 157585334.85714287, + "logps/chosen": -283.7284342447917, + "logps/rejected": -368.59078543526783, + "loss": 0.074, + "rewards/chosen": 3.3255396948920355, + "rewards/margins": 9.697817635914635, + "rewards/rejected": -6.372277941022601, + "step": 1025 + }, + { + "epoch": 0.3787550182271238, + "grad_norm": 5.125, + "kl": 0.2231903076171875, + "learning_rate": 6.9993959038149e-06, + "logits/chosen": 205963794.2857143, + "logits/rejected": 241136753.7777778, + "logps/chosen": -344.8714076450893, + "logps/rejected": -381.88525390625, + "loss": 0.0771, + "rewards/chosen": 3.1023766653878346, + "rewards/margins": 8.977194407629588, + "rewards/rejected": -5.874817742241754, + "step": 1026 + }, + { + "epoch": 0.379124175164967, + "grad_norm": 5.96875, + "kl": 0.0, + "learning_rate": 6.994000209663037e-06, + "logits/chosen": 224779984.0, + "logits/rejected": 266160800.0, + "logps/chosen": -318.8004150390625, + "logps/rejected": -529.7281494140625, + "loss": 0.0926, + "rewards/chosen": 2.080021619796753, + "rewards/margins": 9.243794202804565, + "rewards/rejected": -7.1637725830078125, + "step": 1027 + }, + { + "epoch": 0.3794933321028102, + "grad_norm": 3.75, + "kl": 0.0, + "learning_rate": 6.988601752849213e-06, + "logits/chosen": 196755387.73333332, + "logits/rejected": 180450966.5882353, + "logps/chosen": -394.29970703125, + "logps/rejected": -349.6120174632353, + "loss": 0.0535, + "rewards/chosen": 3.4764422098795573, + "rewards/margins": 9.931724084592332, + "rewards/rejected": -6.455281874712775, + "step": 1028 + }, + { + "epoch": 0.3798624890406534, + "grad_norm": 5.40625, + "kl": 0.0, + "learning_rate": 6.983200540852928e-06, + "logits/chosen": 209053988.57142857, + "logits/rejected": 243844551.1111111, + "logps/chosen": -315.1072474888393, + "logps/rejected": -426.71123589409723, + "loss": 0.071, + "rewards/chosen": 2.6347882407052174, + "rewards/margins": 7.843755237639897, + "rewards/rejected": -5.208966996934679, + "step": 1029 + }, + { + "epoch": 0.3802316459784966, + "grad_norm": 6.09375, + "kl": 0.5711812973022461, + "learning_rate": 6.97779658115749e-06, + "logits/chosen": 228833928.53333333, + "logits/rejected": 196430787.7647059, + "logps/chosen": -396.85520833333334, + "logps/rejected": -394.89527803308823, + "loss": 0.0609, + "rewards/chosen": 3.096612803141276, + "rewards/margins": 9.427958753997205, + "rewards/rejected": -6.331345950855928, + "step": 1030 + }, + { + "epoch": 0.3806008029163398, + "grad_norm": 5.0, + "kl": 0.0, + "learning_rate": 6.972389881250015e-06, + "logits/chosen": 241889792.0, + "logits/rejected": 302831342.93333334, + "logps/chosen": -357.44289981617646, + "logps/rejected": -532.1897135416667, + "loss": 0.0617, + "rewards/chosen": 3.2449053596047794, + "rewards/margins": 9.968447127996706, + "rewards/rejected": -6.723541768391927, + "step": 1031 + }, + { + "epoch": 0.380969959854183, + "grad_norm": 4.46875, + "kl": 0.0, + "learning_rate": 6.9669804486214196e-06, + "logits/chosen": 210344746.66666666, + "logits/rejected": 162112256.0, + "logps/chosen": -331.2161051432292, + "logps/rejected": -369.0423583984375, + "loss": 0.0534, + "rewards/chosen": 2.7081003189086914, + "rewards/margins": 9.03688678741455, + "rewards/rejected": -6.328786468505859, + "step": 1032 + }, + { + "epoch": 0.3813391167920262, + "grad_norm": 5.6875, + "kl": 0.0, + "learning_rate": 6.9615682907664025e-06, + "logits/chosen": 190863288.8888889, + "logits/rejected": 201850788.57142857, + "logps/chosen": -326.94322374131946, + "logps/rejected": -529.8704659598214, + "loss": 0.1272, + "rewards/chosen": 1.9485460917154949, + "rewards/margins": 9.549615950811477, + "rewards/rejected": -7.601069859095982, + "step": 1033 + }, + { + "epoch": 0.38170827372986943, + "grad_norm": 5.84375, + "kl": 0.0, + "learning_rate": 6.95615341518344e-06, + "logits/chosen": 268640647.5294118, + "logits/rejected": 377570747.73333335, + "logps/chosen": -278.23204848345586, + "logps/rejected": -457.37067057291665, + "loss": 0.1018, + "rewards/chosen": 1.8122168148265165, + "rewards/margins": 8.599601865282246, + "rewards/rejected": -6.787385050455729, + "step": 1034 + }, + { + "epoch": 0.3820774306677126, + "grad_norm": 6.53125, + "kl": 0.0, + "learning_rate": 6.950735829374773e-06, + "logits/chosen": 156254310.4, + "logits/rejected": 198146770.82352942, + "logps/chosen": -369.7700520833333, + "logps/rejected": -461.0030158547794, + "loss": 0.0799, + "rewards/chosen": 2.0129932403564452, + "rewards/margins": 9.591619985243854, + "rewards/rejected": -7.5786267448874085, + "step": 1035 + }, + { + "epoch": 0.38244658760555583, + "grad_norm": 6.75, + "kl": 0.0, + "learning_rate": 6.9453155408464005e-06, + "logits/chosen": 276563846.0952381, + "logits/rejected": 229594205.0909091, + "logps/chosen": -318.6088634672619, + "logps/rejected": -492.86075106534093, + "loss": 0.1234, + "rewards/chosen": 2.0867872692289806, + "rewards/margins": 8.917375870081253, + "rewards/rejected": -6.8305886008522725, + "step": 1036 + }, + { + "epoch": 0.382815744543399, + "grad_norm": 5.375, + "kl": 0.0, + "learning_rate": 6.939892557108059e-06, + "logits/chosen": 200502840.8888889, + "logits/rejected": 161519488.0, + "logps/chosen": -299.2921549479167, + "logps/rejected": -412.4236537388393, + "loss": 0.1114, + "rewards/chosen": 2.280755360921224, + "rewards/margins": 8.571082705543155, + "rewards/rejected": -6.2903273446219305, + "step": 1037 + }, + { + "epoch": 0.38318490148124223, + "grad_norm": 5.15625, + "kl": 1.194361686706543, + "learning_rate": 6.9344668856732255e-06, + "logits/chosen": 253131425.68421054, + "logits/rejected": 224510070.15384614, + "logps/chosen": -432.54209498355266, + "logps/rejected": -456.35727163461536, + "loss": 0.0954, + "rewards/chosen": 2.9335214715254936, + "rewards/margins": 8.813580346976215, + "rewards/rejected": -5.880058875450721, + "step": 1038 + }, + { + "epoch": 0.3835540584190854, + "grad_norm": 5.15625, + "kl": 0.39104461669921875, + "learning_rate": 6.9290385340591e-06, + "logits/chosen": 234393142.85714287, + "logits/rejected": 241968896.0, + "logps/chosen": -298.63539341517856, + "logps/rejected": -444.4926486545139, + "loss": 0.0719, + "rewards/chosen": 2.8662008558000838, + "rewards/margins": 9.923505661979554, + "rewards/rejected": -7.05730480617947, + "step": 1039 + }, + { + "epoch": 0.38392321535692864, + "grad_norm": 4.46875, + "kl": 0.0, + "learning_rate": 6.923607509786593e-06, + "logits/chosen": 189873493.33333334, + "logits/rejected": 201802752.0, + "logps/chosen": -345.0933837890625, + "logps/rejected": -478.679638671875, + "loss": 0.0632, + "rewards/chosen": 3.1609373092651367, + "rewards/margins": 8.813966941833495, + "rewards/rejected": -5.6530296325683596, + "step": 1040 + }, + { + "epoch": 0.3842923722947718, + "grad_norm": 6.6875, + "kl": 0.6720867156982422, + "learning_rate": 6.918173820380321e-06, + "logits/chosen": 207911296.0, + "logits/rejected": 245095456.0, + "logps/chosen": -365.1194152832031, + "logps/rejected": -437.57379150390625, + "loss": 0.1188, + "rewards/chosen": 1.9882769584655762, + "rewards/margins": 8.136448383331299, + "rewards/rejected": -6.148171424865723, + "step": 1041 + }, + { + "epoch": 0.38466152923261504, + "grad_norm": 7.03125, + "kl": 0.0, + "learning_rate": 6.91273747336859e-06, + "logits/chosen": 296298973.8666667, + "logits/rejected": 209180144.94117647, + "logps/chosen": -420.16357421875, + "logps/rejected": -432.7063419117647, + "loss": 0.0829, + "rewards/chosen": 2.4013659159342446, + "rewards/margins": 7.916681701061773, + "rewards/rejected": -5.515315785127528, + "step": 1042 + }, + { + "epoch": 0.3850306861704582, + "grad_norm": 4.90625, + "kl": 0.7454595565795898, + "learning_rate": 6.907298476283392e-06, + "logits/chosen": 185321796.26666668, + "logits/rejected": 199567721.4117647, + "logps/chosen": -365.1537760416667, + "logps/rejected": -398.4482421875, + "loss": 0.0541, + "rewards/chosen": 3.752794392903646, + "rewards/margins": 9.173185700061275, + "rewards/rejected": -5.420391307157629, + "step": 1043 + }, + { + "epoch": 0.38539984310830144, + "grad_norm": 5.125, + "kl": 0.0, + "learning_rate": 6.901856836660386e-06, + "logits/chosen": 245929393.23076922, + "logits/rejected": 233799868.63157895, + "logps/chosen": -316.95278695913464, + "logps/rejected": -390.4892578125, + "loss": 0.0792, + "rewards/chosen": 2.1846884213961086, + "rewards/margins": 8.311787709533444, + "rewards/rejected": -6.127099288137336, + "step": 1044 + }, + { + "epoch": 0.3857690000461446, + "grad_norm": 7.84375, + "kl": 0.0, + "learning_rate": 6.896412562038897e-06, + "logits/chosen": 222824686.93333334, + "logits/rejected": 215066669.17647058, + "logps/chosen": -342.8472005208333, + "logps/rejected": -350.21013327205884, + "loss": 0.1212, + "rewards/chosen": 1.6165011088053385, + "rewards/margins": 6.601598657346239, + "rewards/rejected": -4.9850975485409, + "step": 1045 + }, + { + "epoch": 0.3861381569839878, + "grad_norm": 3.546875, + "kl": 0.0, + "learning_rate": 6.890965659961897e-06, + "logits/chosen": 195750041.6, + "logits/rejected": 203213568.0, + "logps/chosen": -286.4607177734375, + "logps/rejected": -447.17533735795456, + "loss": 0.0428, + "rewards/chosen": 2.753383827209473, + "rewards/margins": 10.314148486744273, + "rewards/rejected": -7.560764659534801, + "step": 1046 + }, + { + "epoch": 0.386507313921831, + "grad_norm": 5.0, + "kl": 1.1200839281082153, + "learning_rate": 6.885516137975998e-06, + "logits/chosen": 257045364.36363637, + "logits/rejected": 184640230.4, + "logps/chosen": -338.18943093039775, + "logps/rejected": -664.84580078125, + "loss": 0.1368, + "rewards/chosen": 2.3792386488481, + "rewards/margins": 9.065270753340288, + "rewards/rejected": -6.686032104492187, + "step": 1047 + }, + { + "epoch": 0.3868764708596742, + "grad_norm": 6.25, + "kl": 0.0613250732421875, + "learning_rate": 6.880064003631446e-06, + "logits/chosen": 178009827.55555555, + "logits/rejected": 190777526.85714287, + "logps/chosen": -407.4092068142361, + "logps/rejected": -498.88099888392856, + "loss": 0.0634, + "rewards/chosen": 2.7994475894504123, + "rewards/margins": 10.536919033716595, + "rewards/rejected": -7.737471444266183, + "step": 1048 + }, + { + "epoch": 0.3872456277975174, + "grad_norm": 6.03125, + "kl": 0.0, + "learning_rate": 6.874609264482103e-06, + "logits/chosen": 175917745.23076922, + "logits/rejected": 162915004.63157895, + "logps/chosen": -272.78519381009613, + "logps/rejected": -395.32558079769734, + "loss": 0.1282, + "rewards/chosen": 1.9339945866511419, + "rewards/margins": 7.735625510273675, + "rewards/rejected": -5.8016309236225325, + "step": 1049 + }, + { + "epoch": 0.3876147847353606, + "grad_norm": 5.625, + "kl": 0.10732269287109375, + "learning_rate": 6.8691519280854406e-06, + "logits/chosen": 264075446.85714287, + "logits/rejected": 260840163.55555555, + "logps/chosen": -347.60972377232144, + "logps/rejected": -287.08257378472223, + "loss": 0.1062, + "rewards/chosen": 2.10887268611363, + "rewards/margins": 6.644293421790714, + "rewards/rejected": -4.535420735677083, + "step": 1050 + }, + { + "epoch": 0.3879839416732038, + "grad_norm": 6.75, + "kl": 0.0, + "learning_rate": 6.863692002002529e-06, + "logits/chosen": 195955792.0, + "logits/rejected": 179878448.0, + "logps/chosen": -428.1416320800781, + "logps/rejected": -468.6595153808594, + "loss": 0.0892, + "rewards/chosen": 2.5724523067474365, + "rewards/margins": 10.433073282241821, + "rewards/rejected": -7.860620975494385, + "step": 1051 + }, + { + "epoch": 0.388353098611047, + "grad_norm": 4.4375, + "kl": 0.20705127716064453, + "learning_rate": 6.858229493798026e-06, + "logits/chosen": 264184832.0, + "logits/rejected": 256600081.06666666, + "logps/chosen": -336.4892578125, + "logps/rejected": -378.3646484375, + "loss": 0.0695, + "rewards/chosen": 2.8887540031881893, + "rewards/margins": 8.389474218031939, + "rewards/rejected": -5.50072021484375, + "step": 1052 + }, + { + "epoch": 0.3887222555488902, + "grad_norm": 7.3125, + "kl": 0.0, + "learning_rate": 6.85276441104017e-06, + "logits/chosen": 262709706.10526314, + "logits/rejected": 249608566.15384614, + "logps/chosen": -429.9628392269737, + "logps/rejected": -460.1716120793269, + "loss": 0.1358, + "rewards/chosen": 2.0074731927169, + "rewards/margins": 7.112284054157705, + "rewards/rejected": -5.104810861440805, + "step": 1053 + }, + { + "epoch": 0.3890914124867334, + "grad_norm": 6.09375, + "kl": 1.4703826904296875, + "learning_rate": 6.84729676130076e-06, + "logits/chosen": 257176144.84210527, + "logits/rejected": 283074796.3076923, + "logps/chosen": -411.84300472861844, + "logps/rejected": -496.4621394230769, + "loss": 0.0979, + "rewards/chosen": 2.6938586987947164, + "rewards/margins": 10.327300330405293, + "rewards/rejected": -7.633441631610577, + "step": 1054 + }, + { + "epoch": 0.38946056942457663, + "grad_norm": 6.40625, + "kl": 0.0, + "learning_rate": 6.841826552155158e-06, + "logits/chosen": 229539748.57142857, + "logits/rejected": 165720149.33333334, + "logps/chosen": -463.0619419642857, + "logps/rejected": -476.3508572048611, + "loss": 0.0746, + "rewards/chosen": 3.0144214630126953, + "rewards/margins": 8.943041695488823, + "rewards/rejected": -5.928620232476129, + "step": 1055 + }, + { + "epoch": 0.3898297263624198, + "grad_norm": 5.0, + "kl": 1.2432737350463867, + "learning_rate": 6.836353791182266e-06, + "logits/chosen": 248490989.7142857, + "logits/rejected": 225068174.2222222, + "logps/chosen": -441.60777064732144, + "logps/rejected": -419.98589409722223, + "loss": 0.053, + "rewards/chosen": 3.348166057041713, + "rewards/margins": 9.252353577386765, + "rewards/rejected": -5.904187520345052, + "step": 1056 + }, + { + "epoch": 0.39019888330026303, + "grad_norm": 6.625, + "kl": 0.0, + "learning_rate": 6.830878485964528e-06, + "logits/chosen": 280448554.6666667, + "logits/rejected": 210144435.2, + "logps/chosen": -376.4357096354167, + "logps/rejected": -375.250927734375, + "loss": 0.0912, + "rewards/chosen": 2.148770014444987, + "rewards/margins": 7.9269167582194004, + "rewards/rejected": -5.778146743774414, + "step": 1057 + }, + { + "epoch": 0.3905680402381062, + "grad_norm": 5.0, + "kl": 0.0, + "learning_rate": 6.8254006440879094e-06, + "logits/chosen": 187329314.13333333, + "logits/rejected": 176470091.29411766, + "logps/chosen": -331.60406901041665, + "logps/rejected": -431.9635225183824, + "loss": 0.072, + "rewards/chosen": 2.5311192830403644, + "rewards/margins": 8.896956380208334, + "rewards/rejected": -6.365837097167969, + "step": 1058 + }, + { + "epoch": 0.39093719717594944, + "grad_norm": 5.84375, + "kl": 0.0, + "learning_rate": 6.81992027314189e-06, + "logits/chosen": 190697782.85714287, + "logits/rejected": 262490908.44444445, + "logps/chosen": -321.35630580357144, + "logps/rejected": -482.7877604166667, + "loss": 0.1098, + "rewards/chosen": 1.8921852111816406, + "rewards/margins": 7.873935699462891, + "rewards/rejected": -5.98175048828125, + "step": 1059 + }, + { + "epoch": 0.3913063541137926, + "grad_norm": 6.59375, + "kl": 0.0, + "learning_rate": 6.814437380719453e-06, + "logits/chosen": 204342487.57894737, + "logits/rejected": 164576590.76923078, + "logps/chosen": -335.6049033717105, + "logps/rejected": -277.8210261418269, + "loss": 0.1052, + "rewards/chosen": 2.159988804867393, + "rewards/margins": 8.023997781730374, + "rewards/rejected": -5.864008976862981, + "step": 1060 + }, + { + "epoch": 0.39167551105163584, + "grad_norm": 3.296875, + "kl": 0.5621843338012695, + "learning_rate": 6.808951974417077e-06, + "logits/chosen": 175764920.8888889, + "logits/rejected": 273820233.14285713, + "logps/chosen": -371.4764811197917, + "logps/rejected": -490.61669921875, + "loss": 0.0407, + "rewards/chosen": 3.8927252027723522, + "rewards/margins": 9.73645721919953, + "rewards/rejected": -5.843732016427176, + "step": 1061 + }, + { + "epoch": 0.392044667989479, + "grad_norm": 5.875, + "kl": 0.0, + "learning_rate": 6.803464061834725e-06, + "logits/chosen": 250648722.2857143, + "logits/rejected": 185211377.7777778, + "logps/chosen": -371.12217494419644, + "logps/rejected": -426.5260416666667, + "loss": 0.0781, + "rewards/chosen": 2.38756833757673, + "rewards/margins": 8.852304004487538, + "rewards/rejected": -6.464735666910808, + "step": 1062 + }, + { + "epoch": 0.39241382492732224, + "grad_norm": 4.09375, + "kl": 0.0, + "learning_rate": 6.7979736505758264e-06, + "logits/chosen": 208626432.0, + "logits/rejected": 199715558.4, + "logps/chosen": -215.66766357421875, + "logps/rejected": -544.9439453125, + "loss": 0.0677, + "rewards/chosen": 2.9604574839274087, + "rewards/margins": 9.77754046122233, + "rewards/rejected": -6.8170829772949215, + "step": 1063 + }, + { + "epoch": 0.3927829818651654, + "grad_norm": 6.65625, + "kl": 1.5296931266784668, + "learning_rate": 6.792480748247278e-06, + "logits/chosen": 202622765.17647058, + "logits/rejected": 220957525.33333334, + "logps/chosen": -333.4392520680147, + "logps/rejected": -450.86077473958335, + "loss": 0.1148, + "rewards/chosen": 2.3493071163401886, + "rewards/margins": 8.780896878709981, + "rewards/rejected": -6.431589762369792, + "step": 1064 + }, + { + "epoch": 0.39315213880300864, + "grad_norm": 6.375, + "kl": 0.0, + "learning_rate": 6.786985362459427e-06, + "logits/chosen": 189357216.0, + "logits/rejected": 227813328.0, + "logps/chosen": -368.9334716796875, + "logps/rejected": -347.2249755859375, + "loss": 0.0872, + "rewards/chosen": 2.449962615966797, + "rewards/margins": 8.101953983306885, + "rewards/rejected": -5.651991367340088, + "step": 1065 + }, + { + "epoch": 0.3935212957408518, + "grad_norm": 6.09375, + "kl": 0.29131579399108887, + "learning_rate": 6.78148750082606e-06, + "logits/chosen": 161592210.2857143, + "logits/rejected": 188756679.1111111, + "logps/chosen": -328.75760323660717, + "logps/rejected": -464.04291449652777, + "loss": 0.0824, + "rewards/chosen": 3.1779357365199496, + "rewards/margins": 9.475403286161876, + "rewards/rejected": -6.297467549641927, + "step": 1066 + }, + { + "epoch": 0.39389045267869505, + "grad_norm": 4.0625, + "kl": 0.0, + "learning_rate": 6.7759871709643934e-06, + "logits/chosen": 289162368.0, + "logits/rejected": 220698265.6, + "logps/chosen": -447.1001790364583, + "logps/rejected": -444.59296875, + "loss": 0.0478, + "rewards/chosen": 3.093407313028971, + "rewards/margins": 9.305509821573892, + "rewards/rejected": -6.2121025085449215, + "step": 1067 + }, + { + "epoch": 0.3942596096165382, + "grad_norm": 6.3125, + "kl": 0.0, + "learning_rate": 6.770484380495064e-06, + "logits/chosen": 285074944.0, + "logits/rejected": 218558995.69230768, + "logps/chosen": -341.97884971217104, + "logps/rejected": -505.9655198317308, + "loss": 0.1279, + "rewards/chosen": 1.9814808494166325, + "rewards/margins": 8.436625878337907, + "rewards/rejected": -6.455145028921274, + "step": 1068 + }, + { + "epoch": 0.39462876655438145, + "grad_norm": 5.5, + "kl": 0.0, + "learning_rate": 6.76497913704212e-06, + "logits/chosen": 156163571.80952382, + "logits/rejected": 165647348.36363637, + "logps/chosen": -319.7328404017857, + "logps/rejected": -475.79496626420456, + "loss": 0.0985, + "rewards/chosen": 2.3191842578706288, + "rewards/margins": 9.40162264010607, + "rewards/rejected": -7.08243838223544, + "step": 1069 + }, + { + "epoch": 0.3949979234922246, + "grad_norm": 6.09375, + "kl": 0.9220829010009766, + "learning_rate": 6.759471448233008e-06, + "logits/chosen": 230560432.0, + "logits/rejected": 125446560.0, + "logps/chosen": -368.24322509765625, + "logps/rejected": -414.22979736328125, + "loss": 0.1041, + "rewards/chosen": 1.8503090143203735, + "rewards/margins": 10.638668656349182, + "rewards/rejected": -8.788359642028809, + "step": 1070 + }, + { + "epoch": 0.39536708043006785, + "grad_norm": 5.59375, + "kl": 0.5975522994995117, + "learning_rate": 6.7539613216985555e-06, + "logits/chosen": 237981262.76923078, + "logits/rejected": 193105461.89473686, + "logps/chosen": -431.81216195913464, + "logps/rejected": -327.85770456414474, + "loss": 0.0791, + "rewards/chosen": 2.098897933959961, + "rewards/margins": 7.08659573605186, + "rewards/rejected": -4.987697802091899, + "step": 1071 + }, + { + "epoch": 0.395736237367911, + "grad_norm": 6.34375, + "kl": 0.0, + "learning_rate": 6.748448765072977e-06, + "logits/chosen": 331913088.0, + "logits/rejected": 174039889.45454547, + "logps/chosen": -327.7860107421875, + "logps/rejected": -344.5314275568182, + "loss": 0.0627, + "rewards/chosen": 2.0685966491699217, + "rewards/margins": 8.182600610906427, + "rewards/rejected": -6.114003961736506, + "step": 1072 + }, + { + "epoch": 0.39610539430575425, + "grad_norm": 6.0, + "kl": 0.0, + "learning_rate": 6.742933785993847e-06, + "logits/chosen": 218955414.5882353, + "logits/rejected": 258722952.53333333, + "logps/chosen": -322.3032801011029, + "logps/rejected": -555.78671875, + "loss": 0.0988, + "rewards/chosen": 2.496896407183479, + "rewards/margins": 9.781883793251188, + "rewards/rejected": -7.284987386067709, + "step": 1073 + }, + { + "epoch": 0.39647455124359743, + "grad_norm": 6.4375, + "kl": 0.0, + "learning_rate": 6.737416392102101e-06, + "logits/chosen": 230875346.82352942, + "logits/rejected": 228052462.93333334, + "logps/chosen": -366.1852596507353, + "logps/rejected": -377.6708658854167, + "loss": 0.0866, + "rewards/chosen": 2.2702362958122704, + "rewards/margins": 8.910556120031021, + "rewards/rejected": -6.64031982421875, + "step": 1074 + }, + { + "epoch": 0.39684370818144066, + "grad_norm": 5.5, + "kl": 0.6899542808532715, + "learning_rate": 6.731896591042016e-06, + "logits/chosen": 160628160.0, + "logits/rejected": 221864857.6, + "logps/chosen": -300.344482421875, + "logps/rejected": -459.40166015625, + "loss": 0.1027, + "rewards/chosen": 2.2365795771280923, + "rewards/margins": 8.516134325663248, + "rewards/rejected": -6.279554748535157, + "step": 1075 + }, + { + "epoch": 0.39721286511928383, + "grad_norm": 5.5, + "kl": 0.0, + "learning_rate": 6.72637439046121e-06, + "logits/chosen": 240685193.84615386, + "logits/rejected": 187676442.9473684, + "logps/chosen": -363.5321514423077, + "logps/rejected": -392.1276212993421, + "loss": 0.0613, + "rewards/chosen": 3.430465111365685, + "rewards/margins": 10.060216617970331, + "rewards/rejected": -6.629751506604646, + "step": 1076 + }, + { + "epoch": 0.39758202205712706, + "grad_norm": 6.09375, + "kl": 0.0, + "learning_rate": 6.720849798010618e-06, + "logits/chosen": 264025372.44444445, + "logits/rejected": 189875785.14285713, + "logps/chosen": -365.9698079427083, + "logps/rejected": -450.8152553013393, + "loss": 0.1174, + "rewards/chosen": 2.2456224229600696, + "rewards/margins": 8.167469085208953, + "rewards/rejected": -5.921846662248884, + "step": 1077 + }, + { + "epoch": 0.39795117899497023, + "grad_norm": 4.96875, + "kl": 0.09021949768066406, + "learning_rate": 6.715322821344495e-06, + "logits/chosen": 268931011.7647059, + "logits/rejected": 266378786.13333333, + "logps/chosen": -312.07864200367646, + "logps/rejected": -555.7192057291667, + "loss": 0.0793, + "rewards/chosen": 2.9451551998362824, + "rewards/margins": 10.667418790331073, + "rewards/rejected": -7.722263590494792, + "step": 1078 + }, + { + "epoch": 0.39832033593281346, + "grad_norm": 5.3125, + "kl": 0.0, + "learning_rate": 6.709793468120395e-06, + "logits/chosen": 294728854.5882353, + "logits/rejected": 175888093.86666667, + "logps/chosen": -275.0919979319853, + "logps/rejected": -473.755859375, + "loss": 0.0884, + "rewards/chosen": 2.151873644660501, + "rewards/margins": 9.379850634406594, + "rewards/rejected": -7.227976989746094, + "step": 1079 + }, + { + "epoch": 0.39868949287065664, + "grad_norm": 5.40625, + "kl": 0.0, + "learning_rate": 6.704261745999168e-06, + "logits/chosen": 277641674.1052632, + "logits/rejected": 145545708.30769232, + "logps/chosen": -321.7898591694079, + "logps/rejected": -417.9501953125, + "loss": 0.077, + "rewards/chosen": 2.9245432803505347, + "rewards/margins": 9.320543482236051, + "rewards/rejected": -6.396000201885517, + "step": 1080 + }, + { + "epoch": 0.39905864980849987, + "grad_norm": 4.53125, + "kl": 0.0, + "learning_rate": 6.698727662644944e-06, + "logits/chosen": 297310766.54545456, + "logits/rejected": 204279064.3809524, + "logps/chosen": -326.89528586647725, + "logps/rejected": -458.4046223958333, + "loss": 0.0513, + "rewards/chosen": 3.170569333163175, + "rewards/margins": 9.141528951141225, + "rewards/rejected": -5.970959617978051, + "step": 1081 + }, + { + "epoch": 0.39942780674634304, + "grad_norm": 5.46875, + "kl": 0.0, + "learning_rate": 6.693191225725125e-06, + "logits/chosen": 360225326.54545456, + "logits/rejected": 175879375.23809522, + "logps/chosen": -333.47434303977275, + "logps/rejected": -399.9608909970238, + "loss": 0.0744, + "rewards/chosen": 1.8623844493519177, + "rewards/margins": 8.84106428798659, + "rewards/rejected": -6.9786798386346724, + "step": 1082 + }, + { + "epoch": 0.3997969636841862, + "grad_norm": 6.5625, + "kl": 0.0, + "learning_rate": 6.687652442910375e-06, + "logits/chosen": 329780906.6666667, + "logits/rejected": 216076333.17647058, + "logps/chosen": -463.7544270833333, + "logps/rejected": -384.0846737132353, + "loss": 0.091, + "rewards/chosen": 2.362404632568359, + "rewards/margins": 7.784204684986787, + "rewards/rejected": -5.421800052418428, + "step": 1083 + }, + { + "epoch": 0.40016612062202944, + "grad_norm": 3.953125, + "kl": 0.0, + "learning_rate": 6.682111321874608e-06, + "logits/chosen": 219431978.66666666, + "logits/rejected": 191048358.4, + "logps/chosen": -284.51031494140625, + "logps/rejected": -441.525146484375, + "loss": 0.0694, + "rewards/chosen": 2.1284759839375815, + "rewards/margins": 8.250033219655355, + "rewards/rejected": -6.121557235717773, + "step": 1084 + }, + { + "epoch": 0.4005352775598726, + "grad_norm": 5.21875, + "kl": 0.0, + "learning_rate": 6.6765678702949744e-06, + "logits/chosen": 235926016.0, + "logits/rejected": 244190759.3846154, + "logps/chosen": -374.9321803042763, + "logps/rejected": -507.60167518028845, + "loss": 0.0793, + "rewards/chosen": 2.919980902420847, + "rewards/margins": 8.947069361142301, + "rewards/rejected": -6.027088458721455, + "step": 1085 + }, + { + "epoch": 0.40090443449771584, + "grad_norm": 4.0625, + "kl": 0.0, + "learning_rate": 6.671022095851857e-06, + "logits/chosen": 257572010.66666666, + "logits/rejected": 210316262.4, + "logps/chosen": -314.98195393880206, + "logps/rejected": -424.40205078125, + "loss": 0.0419, + "rewards/chosen": 2.692194620768229, + "rewards/margins": 9.493378702799479, + "rewards/rejected": -6.80118408203125, + "step": 1086 + }, + { + "epoch": 0.401273591435559, + "grad_norm": 6.375, + "kl": 0.47521305084228516, + "learning_rate": 6.6654740062288555e-06, + "logits/chosen": 224270577.7777778, + "logits/rejected": 276851712.0, + "logps/chosen": -418.7084689670139, + "logps/rejected": -481.80203683035717, + "loss": 0.0886, + "rewards/chosen": 2.2186942630343967, + "rewards/margins": 9.599883336869498, + "rewards/rejected": -7.381189073835101, + "step": 1087 + }, + { + "epoch": 0.40164274837340225, + "grad_norm": 7.0, + "kl": 0.5789647102355957, + "learning_rate": 6.65992360911278e-06, + "logits/chosen": 222890197.33333334, + "logits/rejected": 173244379.42857143, + "logps/chosen": -332.312255859375, + "logps/rejected": -359.53341238839283, + "loss": 0.1129, + "rewards/chosen": 2.1848165724012585, + "rewards/margins": 7.299820975651817, + "rewards/rejected": -5.115004403250558, + "step": 1088 + }, + { + "epoch": 0.4020119053112454, + "grad_norm": 4.75, + "kl": 0.1920003890991211, + "learning_rate": 6.654370912193633e-06, + "logits/chosen": 215027488.0, + "logits/rejected": 170172592.0, + "logps/chosen": -327.80474853515625, + "logps/rejected": -334.8535461425781, + "loss": 0.0883, + "rewards/chosen": 2.587928533554077, + "rewards/margins": 7.5340211391448975, + "rewards/rejected": -4.94609260559082, + "step": 1089 + }, + { + "epoch": 0.40238106224908865, + "grad_norm": 5.46875, + "kl": 0.0, + "learning_rate": 6.648815923164604e-06, + "logits/chosen": 170143096.47058824, + "logits/rejected": 297022464.0, + "logps/chosen": -260.73058363970586, + "logps/rejected": -521.1806640625, + "loss": 0.1146, + "rewards/chosen": 2.2253727632410385, + "rewards/margins": 8.972905417049631, + "rewards/rejected": -6.747532653808594, + "step": 1090 + }, + { + "epoch": 0.4027502191869318, + "grad_norm": 4.875, + "kl": 1.0765480995178223, + "learning_rate": 6.6432586497220615e-06, + "logits/chosen": 193493504.0, + "logits/rejected": 160900215.46666667, + "logps/chosen": -303.2505744485294, + "logps/rejected": -288.1173828125, + "loss": 0.0969, + "rewards/chosen": 3.0290666468003216, + "rewards/margins": 8.429966406728706, + "rewards/rejected": -5.400899759928385, + "step": 1091 + }, + { + "epoch": 0.40311937612477505, + "grad_norm": 4.96875, + "kl": 0.0, + "learning_rate": 6.637699099565538e-06, + "logits/chosen": 208414478.2222222, + "logits/rejected": 325765120.0, + "logps/chosen": -362.73773871527777, + "logps/rejected": -361.85164741847825, + "loss": 0.0709, + "rewards/chosen": 2.0636204613579645, + "rewards/margins": 7.963012630812788, + "rewards/rejected": -5.899392169454823, + "step": 1092 + }, + { + "epoch": 0.4034885330626182, + "grad_norm": 6.40625, + "kl": 0.0, + "learning_rate": 6.632137280397719e-06, + "logits/chosen": 237418968.6153846, + "logits/rejected": 147396904.42105263, + "logps/chosen": -296.2749586838942, + "logps/rejected": -362.73655941611844, + "loss": 0.0755, + "rewards/chosen": 2.3526109548715444, + "rewards/margins": 8.368448774824259, + "rewards/rejected": -6.015837819952714, + "step": 1093 + }, + { + "epoch": 0.40385769000046146, + "grad_norm": 5.46875, + "kl": 0.0, + "learning_rate": 6.626573199924433e-06, + "logits/chosen": 260701627.73333332, + "logits/rejected": 236565760.0, + "logps/chosen": -338.01494140625, + "logps/rejected": -567.0035041360294, + "loss": 0.0761, + "rewards/chosen": 2.3536649068196613, + "rewards/margins": 10.238121765735102, + "rewards/rejected": -7.884456858915441, + "step": 1094 + }, + { + "epoch": 0.40422684693830463, + "grad_norm": 5.28125, + "kl": 0.0, + "learning_rate": 6.621006865854645e-06, + "logits/chosen": 291768891.0769231, + "logits/rejected": 266900129.68421054, + "logps/chosen": -302.59166541466345, + "logps/rejected": -421.77806332236844, + "loss": 0.0834, + "rewards/chosen": 1.7362763331486628, + "rewards/margins": 8.480751848413878, + "rewards/rejected": -6.744475515265214, + "step": 1095 + }, + { + "epoch": 0.40459600387614786, + "grad_norm": 4.21875, + "kl": 0.0, + "learning_rate": 6.6154382859004385e-06, + "logits/chosen": 199343177.14285713, + "logits/rejected": 197672903.1111111, + "logps/chosen": -297.5789271763393, + "logps/rejected": -451.35381401909723, + "loss": 0.07, + "rewards/chosen": 3.0808862958635603, + "rewards/margins": 10.050756545293899, + "rewards/rejected": -6.969870249430339, + "step": 1096 + }, + { + "epoch": 0.40496516081399103, + "grad_norm": 5.375, + "kl": 0.0, + "learning_rate": 6.609867467777011e-06, + "logits/chosen": 180902954.66666666, + "logits/rejected": 235522139.42857143, + "logps/chosen": -354.21636284722223, + "logps/rejected": -469.31815011160717, + "loss": 0.0728, + "rewards/chosen": 2.573001437717014, + "rewards/margins": 9.568554045662047, + "rewards/rejected": -6.995552607945034, + "step": 1097 + }, + { + "epoch": 0.40533431775183426, + "grad_norm": 5.03125, + "kl": 0.0, + "learning_rate": 6.60429441920266e-06, + "logits/chosen": 214028567.27272728, + "logits/rejected": 199210556.95238096, + "logps/chosen": -395.24063387784093, + "logps/rejected": -472.1279761904762, + "loss": 0.0668, + "rewards/chosen": 2.31555834683505, + "rewards/margins": 8.98873252373237, + "rewards/rejected": -6.673174176897321, + "step": 1098 + }, + { + "epoch": 0.40570347468967743, + "grad_norm": 5.40625, + "kl": 0.0, + "learning_rate": 6.598719147898773e-06, + "logits/chosen": 197594163.2, + "logits/rejected": 179681099.29411766, + "logps/chosen": -318.03785807291666, + "logps/rejected": -453.34294577205884, + "loss": 0.0794, + "rewards/chosen": 2.4214886983235675, + "rewards/margins": 9.091442437265433, + "rewards/rejected": -6.6699537389418655, + "step": 1099 + }, + { + "epoch": 0.40607263162752066, + "grad_norm": 5.25, + "kl": 0.0, + "learning_rate": 6.593141661589819e-06, + "logits/chosen": 193141760.0, + "logits/rejected": 206744422.4, + "logps/chosen": -255.8233642578125, + "logps/rejected": -456.989013671875, + "loss": 0.085, + "rewards/chosen": 2.5009568532307944, + "rewards/margins": 8.3723450978597, + "rewards/rejected": -5.871388244628906, + "step": 1100 + }, + { + "epoch": 0.40644178856536384, + "grad_norm": 4.625, + "kl": 0.0, + "learning_rate": 6.5875619680033334e-06, + "logits/chosen": 242746912.0, + "logits/rejected": 241208192.0, + "logps/chosen": -378.28076171875, + "logps/rejected": -422.45428466796875, + "loss": 0.0664, + "rewards/chosen": 2.714956521987915, + "rewards/margins": 10.122695684432983, + "rewards/rejected": -7.407739162445068, + "step": 1101 + }, + { + "epoch": 0.40681094550320707, + "grad_norm": 4.8125, + "kl": 0.0, + "learning_rate": 6.581980074869911e-06, + "logits/chosen": 197388342.85714287, + "logits/rejected": 257861034.66666666, + "logps/chosen": -358.06107003348217, + "logps/rejected": -534.3055013020834, + "loss": 0.0583, + "rewards/chosen": 3.2400433676583424, + "rewards/margins": 10.217296327863421, + "rewards/rejected": -6.977252960205078, + "step": 1102 + }, + { + "epoch": 0.40718010244105024, + "grad_norm": 4.0625, + "kl": 0.0, + "learning_rate": 6.576395989923193e-06, + "logits/chosen": 280280941.71428573, + "logits/rejected": 359958727.1111111, + "logps/chosen": -403.6800013950893, + "logps/rejected": -490.7912326388889, + "loss": 0.0741, + "rewards/chosen": 2.239407539367676, + "rewards/margins": 9.369334432813856, + "rewards/rejected": -7.12992689344618, + "step": 1103 + }, + { + "epoch": 0.40754925937889347, + "grad_norm": 3.984375, + "kl": 0.0, + "learning_rate": 6.57080972089986e-06, + "logits/chosen": 202504567.46666667, + "logits/rejected": 201007917.17647058, + "logps/chosen": -260.77459309895835, + "logps/rejected": -430.47472426470586, + "loss": 0.0521, + "rewards/chosen": 3.3091214497884116, + "rewards/margins": 9.540104061949487, + "rewards/rejected": -6.230982612161076, + "step": 1104 + }, + { + "epoch": 0.40791841631673664, + "grad_norm": 6.25, + "kl": 0.0, + "learning_rate": 6.565221275539615e-06, + "logits/chosen": 240852004.57142857, + "logits/rejected": 203826560.0, + "logps/chosen": -357.762451171875, + "logps/rejected": -341.2883572048611, + "loss": 0.1219, + "rewards/chosen": 2.421158654349191, + "rewards/margins": 7.391503621661474, + "rewards/rejected": -4.970344967312283, + "step": 1105 + }, + { + "epoch": 0.40828757325457987, + "grad_norm": 4.78125, + "kl": 0.0, + "learning_rate": 6.559630661585179e-06, + "logits/chosen": 400951369.14285713, + "logits/rejected": 214609322.66666666, + "logps/chosen": -393.84256417410717, + "logps/rejected": -462.02886284722223, + "loss": 0.0661, + "rewards/chosen": 3.3381124223981584, + "rewards/margins": 9.75339393010215, + "rewards/rejected": -6.415281507703993, + "step": 1106 + }, + { + "epoch": 0.40865673019242305, + "grad_norm": 7.21875, + "kl": 2.369879722595215, + "learning_rate": 6.554037886782276e-06, + "logits/chosen": 183517048.47058824, + "logits/rejected": 121161070.93333334, + "logps/chosen": -320.35075827205884, + "logps/rejected": -357.31376953125, + "loss": 0.1447, + "rewards/chosen": 1.9838490205652572, + "rewards/margins": 8.328591859106924, + "rewards/rejected": -6.344742838541666, + "step": 1107 + }, + { + "epoch": 0.4090258871302663, + "grad_norm": 4.78125, + "kl": 0.0, + "learning_rate": 6.548442958879624e-06, + "logits/chosen": 304887808.0, + "logits/rejected": 222936966.0952381, + "logps/chosen": -351.7103826349432, + "logps/rejected": -403.00558035714283, + "loss": 0.0757, + "rewards/chosen": 3.17268891768022, + "rewards/margins": 9.423492547237512, + "rewards/rejected": -6.250803629557292, + "step": 1108 + }, + { + "epoch": 0.40939504406810945, + "grad_norm": 4.78125, + "kl": 0.01049041748046875, + "learning_rate": 6.542845885628926e-06, + "logits/chosen": 184506043.73333332, + "logits/rejected": 224654908.2352941, + "logps/chosen": -298.29775390625, + "logps/rejected": -522.3643152573529, + "loss": 0.0644, + "rewards/chosen": 3.0702898661295572, + "rewards/margins": 9.335040627273859, + "rewards/rejected": -6.264750761144302, + "step": 1109 + }, + { + "epoch": 0.4097642010059527, + "grad_norm": 5.96875, + "kl": 0.5792617797851562, + "learning_rate": 6.537246674784855e-06, + "logits/chosen": 240533913.6, + "logits/rejected": 177899655.52941176, + "logps/chosen": -280.88050130208336, + "logps/rejected": -429.2906135110294, + "loss": 0.1045, + "rewards/chosen": 2.151641591389974, + "rewards/margins": 8.317056738161574, + "rewards/rejected": -6.1654151467716, + "step": 1110 + }, + { + "epoch": 0.41013335794379585, + "grad_norm": 3.71875, + "kl": 0.0, + "learning_rate": 6.531645334105045e-06, + "logits/chosen": 176145392.0, + "logits/rejected": 318348256.0, + "logps/chosen": -332.5653381347656, + "logps/rejected": -663.3853759765625, + "loss": 0.0496, + "rewards/chosen": 3.652228832244873, + "rewards/margins": 10.362563133239746, + "rewards/rejected": -6.710334300994873, + "step": 1111 + }, + { + "epoch": 0.4105025148816391, + "grad_norm": 4.875, + "kl": 0.0, + "learning_rate": 6.526041871350086e-06, + "logits/chosen": 269340525.71428573, + "logits/rejected": 164046250.66666666, + "logps/chosen": -312.64896065848217, + "logps/rejected": -397.7747395833333, + "loss": 0.0655, + "rewards/chosen": 2.43546267918178, + "rewards/margins": 9.085764718434167, + "rewards/rejected": -6.650302039252387, + "step": 1112 + }, + { + "epoch": 0.41087167181948225, + "grad_norm": 5.15625, + "kl": 0.0, + "learning_rate": 6.520436294283503e-06, + "logits/chosen": 267186537.4117647, + "logits/rejected": 172801826.13333333, + "logps/chosen": -349.92652803308823, + "logps/rejected": -331.30091145833336, + "loss": 0.1003, + "rewards/chosen": 2.3769921695484832, + "rewards/margins": 8.016823712517233, + "rewards/rejected": -5.63983154296875, + "step": 1113 + }, + { + "epoch": 0.4112408287573255, + "grad_norm": 5.5, + "kl": 0.0, + "learning_rate": 6.514828610671751e-06, + "logits/chosen": 212689728.0, + "logits/rejected": 210101056.0, + "logps/chosen": -387.16497802734375, + "logps/rejected": -438.450439453125, + "loss": 0.0876, + "rewards/chosen": 2.1458916664123535, + "rewards/margins": 8.560383796691895, + "rewards/rejected": -6.414492130279541, + "step": 1114 + }, + { + "epoch": 0.41160998569516866, + "grad_norm": 7.65625, + "kl": 0.016630172729492188, + "learning_rate": 6.509218828284203e-06, + "logits/chosen": 249944857.6, + "logits/rejected": 244317376.0, + "logps/chosen": -381.77470703125, + "logps/rejected": -467.2809244791667, + "loss": 0.0844, + "rewards/chosen": 2.692351531982422, + "rewards/margins": 8.730352274576823, + "rewards/rejected": -6.038000742594401, + "step": 1115 + }, + { + "epoch": 0.4119791426330119, + "grad_norm": 4.28125, + "kl": 0.0, + "learning_rate": 6.503606954893143e-06, + "logits/chosen": 181685301.89473686, + "logits/rejected": 184821641.84615386, + "logps/chosen": -273.9696751644737, + "logps/rejected": -512.9118464543269, + "loss": 0.071, + "rewards/chosen": 3.0422084206028988, + "rewards/margins": 11.078519056683128, + "rewards/rejected": -8.036310636080229, + "step": 1116 + }, + { + "epoch": 0.41234829957085506, + "grad_norm": 5.65625, + "kl": 0.0, + "learning_rate": 6.497992998273751e-06, + "logits/chosen": 193337042.82352942, + "logits/rejected": 191324330.66666666, + "logps/chosen": -260.8240176930147, + "logps/rejected": -433.3785807291667, + "loss": 0.1107, + "rewards/chosen": 2.2328302720013786, + "rewards/margins": 9.00230048684513, + "rewards/rejected": -6.76947021484375, + "step": 1117 + }, + { + "epoch": 0.41271745650869823, + "grad_norm": 5.1875, + "kl": 0.0, + "learning_rate": 6.492376966204092e-06, + "logits/chosen": 167093888.0, + "logits/rejected": 246398976.0, + "logps/chosen": -362.11309136284723, + "logps/rejected": -307.65272739955356, + "loss": 0.0775, + "rewards/chosen": 3.3917914496527777, + "rewards/margins": 8.625885251968626, + "rewards/rejected": -5.234093802315848, + "step": 1118 + }, + { + "epoch": 0.41308661344654146, + "grad_norm": 6.9375, + "kl": 0.0, + "learning_rate": 6.486758866465106e-06, + "logits/chosen": 255638905.2631579, + "logits/rejected": 217783473.23076922, + "logps/chosen": -351.5440738075658, + "logps/rejected": -411.3232421875, + "loss": 0.124, + "rewards/chosen": 2.0027909529836556, + "rewards/margins": 8.395372012366167, + "rewards/rejected": -6.392581059382512, + "step": 1119 + }, + { + "epoch": 0.41345577038438464, + "grad_norm": 5.6875, + "kl": 0.0, + "learning_rate": 6.4811387068406e-06, + "logits/chosen": 233983453.86666667, + "logits/rejected": 278448218.35294116, + "logps/chosen": -345.569921875, + "logps/rejected": -530.0120634191177, + "loss": 0.0896, + "rewards/chosen": 2.9893142700195314, + "rewards/margins": 9.87850045596852, + "rewards/rejected": -6.889186185948989, + "step": 1120 + }, + { + "epoch": 0.41382492732222786, + "grad_norm": 4.8125, + "kl": 0.0, + "learning_rate": 6.475516495117233e-06, + "logits/chosen": 255472213.33333334, + "logits/rejected": 170691072.0, + "logps/chosen": -399.33753797743054, + "logps/rejected": -454.04813058035717, + "loss": 0.0992, + "rewards/chosen": 2.5724919637044272, + "rewards/margins": 8.985558282761346, + "rewards/rejected": -6.413066319056919, + "step": 1121 + }, + { + "epoch": 0.41419408426007104, + "grad_norm": 6.5625, + "kl": 1.3000054359436035, + "learning_rate": 6.4698922390845085e-06, + "logits/chosen": 164054001.7777778, + "logits/rejected": 192471076.57142857, + "logps/chosen": -358.01752387152777, + "logps/rejected": -271.9027099609375, + "loss": 0.0941, + "rewards/chosen": 2.809358384874132, + "rewards/margins": 7.263302454872737, + "rewards/rejected": -4.453944069998605, + "step": 1122 + }, + { + "epoch": 0.41456324119791427, + "grad_norm": 5.375, + "kl": 0.0, + "learning_rate": 6.464265946534762e-06, + "logits/chosen": 204388128.0, + "logits/rejected": 163446144.0, + "logps/chosen": -284.25054931640625, + "logps/rejected": -343.8726806640625, + "loss": 0.1096, + "rewards/chosen": 2.4411263465881348, + "rewards/margins": 8.13528823852539, + "rewards/rejected": -5.694161891937256, + "step": 1123 + }, + { + "epoch": 0.41493239813575744, + "grad_norm": 6.375, + "kl": 1.0279746055603027, + "learning_rate": 6.4586376252631485e-06, + "logits/chosen": 226218581.33333334, + "logits/rejected": 247044784.0, + "logps/chosen": -407.3721923828125, + "logps/rejected": -426.7769775390625, + "loss": 0.1387, + "rewards/chosen": 2.1329903602600098, + "rewards/margins": 8.585532188415527, + "rewards/rejected": -6.452541828155518, + "step": 1124 + }, + { + "epoch": 0.41530155507360067, + "grad_norm": 5.84375, + "kl": 1.7249093055725098, + "learning_rate": 6.453007283067638e-06, + "logits/chosen": 161767599.15789473, + "logits/rejected": 210007512.6153846, + "logps/chosen": -313.03641550164474, + "logps/rejected": -472.95643028846155, + "loss": 0.0821, + "rewards/chosen": 3.3829715126439144, + "rewards/margins": 10.15133722591014, + "rewards/rejected": -6.768365713266226, + "step": 1125 + }, + { + "epoch": 0.41567071201144384, + "grad_norm": 4.375, + "kl": 0.0, + "learning_rate": 6.447374927748997e-06, + "logits/chosen": 330408576.0, + "logits/rejected": 292501984.0, + "logps/chosen": -312.8070068359375, + "logps/rejected": -476.1676940917969, + "loss": 0.0836, + "rewards/chosen": 2.1178884506225586, + "rewards/margins": 9.296717166900635, + "rewards/rejected": -7.178828716278076, + "step": 1126 + }, + { + "epoch": 0.4160398689492871, + "grad_norm": 5.15625, + "kl": 0.0, + "learning_rate": 6.4417405671107826e-06, + "logits/chosen": 194876514.46153846, + "logits/rejected": 209703504.84210527, + "logps/chosen": -378.56479116586536, + "logps/rejected": -472.13199013157896, + "loss": 0.079, + "rewards/chosen": 2.2055724217341495, + "rewards/margins": 9.82568398757502, + "rewards/rejected": -7.620111565840872, + "step": 1127 + }, + { + "epoch": 0.41640902588713025, + "grad_norm": 6.40625, + "kl": 0.0, + "learning_rate": 6.4361042089593285e-06, + "logits/chosen": 469918720.0, + "logits/rejected": 169647634.2857143, + "logps/chosen": -371.574951171875, + "logps/rejected": -372.05503627232144, + "loss": 0.1184, + "rewards/chosen": 1.6475825839572482, + "rewards/margins": 8.155374375600664, + "rewards/rejected": -6.507791791643415, + "step": 1128 + }, + { + "epoch": 0.4167781828249735, + "grad_norm": 5.34375, + "kl": 0.0, + "learning_rate": 6.43046586110374e-06, + "logits/chosen": 203683271.1111111, + "logits/rejected": 330676187.4285714, + "logps/chosen": -333.31141493055554, + "logps/rejected": -519.4291294642857, + "loss": 0.088, + "rewards/chosen": 2.738742404513889, + "rewards/margins": 9.676016247461712, + "rewards/rejected": -6.937273842947824, + "step": 1129 + }, + { + "epoch": 0.41714733976281665, + "grad_norm": 6.0, + "kl": 0.0, + "learning_rate": 6.4248255313558735e-06, + "logits/chosen": 168654004.70588234, + "logits/rejected": 241913361.06666666, + "logps/chosen": -398.3058651194853, + "logps/rejected": -465.31015625, + "loss": 0.0734, + "rewards/chosen": 2.2277986863080192, + "rewards/margins": 10.085034404081457, + "rewards/rejected": -7.857235717773437, + "step": 1130 + }, + { + "epoch": 0.4175164967006599, + "grad_norm": 4.65625, + "kl": 0.0, + "learning_rate": 6.419183227530336e-06, + "logits/chosen": 242708873.84615386, + "logits/rejected": 255289290.10526314, + "logps/chosen": -421.32895132211536, + "logps/rejected": -533.3855365953947, + "loss": 0.0831, + "rewards/chosen": 3.0714481060321512, + "rewards/margins": 9.879890874329849, + "rewards/rejected": -6.808442768297698, + "step": 1131 + }, + { + "epoch": 0.41788565363850305, + "grad_norm": 8.1875, + "kl": 0.4193301200866699, + "learning_rate": 6.413538957444468e-06, + "logits/chosen": 218584892.95238096, + "logits/rejected": 260838656.0, + "logps/chosen": -381.4593796502976, + "logps/rejected": -462.8942205255682, + "loss": 0.1275, + "rewards/chosen": 2.2322850908551897, + "rewards/margins": 8.78069359915597, + "rewards/rejected": -6.548408508300781, + "step": 1132 + }, + { + "epoch": 0.4182548105763463, + "grad_norm": 6.90625, + "kl": 1.3079099655151367, + "learning_rate": 6.407892728918333e-06, + "logits/chosen": 216278362.3529412, + "logits/rejected": 122903910.4, + "logps/chosen": -317.2608283547794, + "logps/rejected": -435.028515625, + "loss": 0.1202, + "rewards/chosen": 2.800192440257353, + "rewards/margins": 8.860603003408395, + "rewards/rejected": -6.060410563151041, + "step": 1133 + }, + { + "epoch": 0.41862396751418945, + "grad_norm": 6.65625, + "kl": 0.0, + "learning_rate": 6.402244549774707e-06, + "logits/chosen": 161781552.0, + "logits/rejected": 291221312.0, + "logps/chosen": -313.8560791015625, + "logps/rejected": -501.9214172363281, + "loss": 0.1136, + "rewards/chosen": 2.0200986862182617, + "rewards/margins": 8.455296516418457, + "rewards/rejected": -6.435197830200195, + "step": 1134 + }, + { + "epoch": 0.4189931244520327, + "grad_norm": 5.0, + "kl": 0.0, + "learning_rate": 6.396594427839076e-06, + "logits/chosen": 208131644.2352941, + "logits/rejected": 173456418.13333333, + "logps/chosen": -361.9690372242647, + "logps/rejected": -422.83001302083335, + "loss": 0.0645, + "rewards/chosen": 3.2294087129480697, + "rewards/margins": 9.070155274634267, + "rewards/rejected": -5.840746561686198, + "step": 1135 + }, + { + "epoch": 0.41936228138987586, + "grad_norm": 4.46875, + "kl": 0.0, + "learning_rate": 6.3909423709396054e-06, + "logits/chosen": 294574153.14285713, + "logits/rejected": 191824199.1111111, + "logps/chosen": -417.35567801339283, + "logps/rejected": -441.4407552083333, + "loss": 0.0583, + "rewards/chosen": 2.696319035121373, + "rewards/margins": 9.240991683233352, + "rewards/rejected": -6.5446726481119795, + "step": 1136 + }, + { + "epoch": 0.4197314383277191, + "grad_norm": 5.5625, + "kl": 0.0, + "learning_rate": 6.385288386907155e-06, + "logits/chosen": 270342112.0, + "logits/rejected": 228325408.0, + "logps/chosen": -443.8522033691406, + "logps/rejected": -432.0166931152344, + "loss": 0.0725, + "rewards/chosen": 2.5140719413757324, + "rewards/margins": 8.276425838470459, + "rewards/rejected": -5.762353897094727, + "step": 1137 + }, + { + "epoch": 0.42010059526556226, + "grad_norm": 6.71875, + "kl": 0.0, + "learning_rate": 6.379632483575242e-06, + "logits/chosen": 220426240.0, + "logits/rejected": 221386609.7777778, + "logps/chosen": -452.3754185267857, + "logps/rejected": -486.17078993055554, + "loss": 0.0875, + "rewards/chosen": 2.827883311680385, + "rewards/margins": 9.737532630799308, + "rewards/rejected": -6.909649319118923, + "step": 1138 + }, + { + "epoch": 0.4204697522034055, + "grad_norm": 5.4375, + "kl": 0.0, + "learning_rate": 6.373974668780053e-06, + "logits/chosen": 332509805.71428573, + "logits/rejected": 156540714.66666666, + "logps/chosen": -400.905517578125, + "logps/rejected": -409.45057508680554, + "loss": 0.1061, + "rewards/chosen": 1.7724809646606445, + "rewards/margins": 8.26297770606147, + "rewards/rejected": -6.4904967414008246, + "step": 1139 + }, + { + "epoch": 0.42083890914124866, + "grad_norm": 4.1875, + "kl": 0.0, + "learning_rate": 6.368314950360416e-06, + "logits/chosen": 252487706.9473684, + "logits/rejected": 299012745.84615386, + "logps/chosen": -331.96463815789474, + "logps/rejected": -421.5065354567308, + "loss": 0.0935, + "rewards/chosen": 2.472697609349301, + "rewards/margins": 8.61754107764858, + "rewards/rejected": -6.144843468299279, + "step": 1140 + }, + { + "epoch": 0.4212080660790919, + "grad_norm": 4.5, + "kl": 0.0, + "learning_rate": 6.362653336157798e-06, + "logits/chosen": 188181519.05882353, + "logits/rejected": 297107387.73333335, + "logps/chosen": -275.20295266544116, + "logps/rejected": -445.4169596354167, + "loss": 0.0876, + "rewards/chosen": 2.6998259600471046, + "rewards/margins": 8.996063142664292, + "rewards/rejected": -6.296237182617188, + "step": 1141 + }, + { + "epoch": 0.42157722301693507, + "grad_norm": 6.71875, + "kl": 1.828162670135498, + "learning_rate": 6.356989834016296e-06, + "logits/chosen": 305660441.6, + "logits/rejected": 201285205.33333334, + "logps/chosen": -373.831494140625, + "logps/rejected": -402.0146484375, + "loss": 0.1343, + "rewards/chosen": 1.97908935546875, + "rewards/margins": 7.514822896321615, + "rewards/rejected": -5.535733540852864, + "step": 1142 + }, + { + "epoch": 0.4219463799547783, + "grad_norm": 6.78125, + "kl": 0.32216835021972656, + "learning_rate": 6.35132445178262e-06, + "logits/chosen": 219663658.66666666, + "logits/rejected": 270446939.4285714, + "logps/chosen": -429.65619574652777, + "logps/rejected": -552.6942313058036, + "loss": 0.0932, + "rewards/chosen": 2.2438125610351562, + "rewards/margins": 9.554304940359934, + "rewards/rejected": -7.310492379324777, + "step": 1143 + }, + { + "epoch": 0.42231553689262147, + "grad_norm": 4.1875, + "kl": 0.0, + "learning_rate": 6.3456571973060835e-06, + "logits/chosen": 189432636.2352941, + "logits/rejected": 209556309.33333334, + "logps/chosen": -402.12132352941177, + "logps/rejected": -523.41015625, + "loss": 0.0538, + "rewards/chosen": 3.0103203268612133, + "rewards/margins": 10.60567351696538, + "rewards/rejected": -7.595353190104166, + "step": 1144 + }, + { + "epoch": 0.4226846938304647, + "grad_norm": 5.8125, + "kl": 1.3504180908203125, + "learning_rate": 6.339988078438597e-06, + "logits/chosen": 251477940.70588234, + "logits/rejected": 184972782.93333334, + "logps/chosen": -390.0186982996324, + "logps/rejected": -377.66920572916666, + "loss": 0.1163, + "rewards/chosen": 2.7363772672765396, + "rewards/margins": 8.770052906111175, + "rewards/rejected": -6.0336756388346355, + "step": 1145 + }, + { + "epoch": 0.42305385076830787, + "grad_norm": 5.71875, + "kl": 0.0, + "learning_rate": 6.3343171030346525e-06, + "logits/chosen": 266098736.0, + "logits/rejected": 201592912.0, + "logps/chosen": -353.0704650878906, + "logps/rejected": -427.646728515625, + "loss": 0.102, + "rewards/chosen": 1.795764446258545, + "rewards/margins": 8.176724910736084, + "rewards/rejected": -6.380960464477539, + "step": 1146 + }, + { + "epoch": 0.4234230077061511, + "grad_norm": 5.21875, + "kl": 0.0, + "learning_rate": 6.3286442789513135e-06, + "logits/chosen": 266159990.15384614, + "logits/rejected": 229427092.21052632, + "logps/chosen": -279.67226938100964, + "logps/rejected": -487.9091282894737, + "loss": 0.0894, + "rewards/chosen": 2.5459873492901144, + "rewards/margins": 9.15653245748296, + "rewards/rejected": -6.610545108192845, + "step": 1147 + }, + { + "epoch": 0.4237921646439943, + "grad_norm": 6.625, + "kl": 0.0, + "learning_rate": 6.322969614048207e-06, + "logits/chosen": 196001792.0, + "logits/rejected": 177217424.0, + "logps/chosen": -360.83636474609375, + "logps/rejected": -438.98297119140625, + "loss": 0.1157, + "rewards/chosen": 2.0165131092071533, + "rewards/margins": 7.2884156703948975, + "rewards/rejected": -5.271902561187744, + "step": 1148 + }, + { + "epoch": 0.4241613215818375, + "grad_norm": 3.46875, + "kl": 0.0, + "learning_rate": 6.317293116187508e-06, + "logits/chosen": 190322116.92307693, + "logits/rejected": 209873542.7368421, + "logps/chosen": -271.4104191706731, + "logps/rejected": -447.24635074013156, + "loss": 0.0391, + "rewards/chosen": 3.150075472318209, + "rewards/margins": 10.245261404678407, + "rewards/rejected": -7.095185932360198, + "step": 1149 + }, + { + "epoch": 0.4245304785196807, + "grad_norm": 4.375, + "kl": 0.0, + "learning_rate": 6.311614793233932e-06, + "logits/chosen": 219190272.0, + "logits/rejected": 254813997.17647058, + "logps/chosen": -303.84599609375, + "logps/rejected": -434.3762637867647, + "loss": 0.0897, + "rewards/chosen": 2.7938255310058593, + "rewards/margins": 8.266265465231502, + "rewards/rejected": -5.472439934225643, + "step": 1150 + }, + { + "epoch": 0.4248996354575239, + "grad_norm": 6.0, + "kl": 0.0, + "learning_rate": 6.3059346530547245e-06, + "logits/chosen": 215813809.23076922, + "logits/rejected": 248724641.68421054, + "logps/chosen": -297.82157076322113, + "logps/rejected": -496.8762849506579, + "loss": 0.0811, + "rewards/chosen": 2.7820205688476562, + "rewards/margins": 9.700011002390008, + "rewards/rejected": -6.917990433542352, + "step": 1151 + }, + { + "epoch": 0.4252687923953671, + "grad_norm": 6.21875, + "kl": 0.0, + "learning_rate": 6.300252703519647e-06, + "logits/chosen": 177877872.0, + "logits/rejected": 183525808.0, + "logps/chosen": -301.79168701171875, + "logps/rejected": -415.43682861328125, + "loss": 0.1155, + "rewards/chosen": 2.4128944873809814, + "rewards/margins": 8.640072584152222, + "rewards/rejected": -6.22717809677124, + "step": 1152 + }, + { + "epoch": 0.4256379493332103, + "grad_norm": 3.859375, + "kl": 0.0, + "learning_rate": 6.294568952500968e-06, + "logits/chosen": 277887829.3333333, + "logits/rejected": 234855705.6, + "logps/chosen": -330.9969889322917, + "logps/rejected": -388.059521484375, + "loss": 0.0627, + "rewards/chosen": 2.603086471557617, + "rewards/margins": 9.006351089477539, + "rewards/rejected": -6.4032646179199215, + "step": 1153 + }, + { + "epoch": 0.4260071062710535, + "grad_norm": 6.15625, + "kl": 1.2792978286743164, + "learning_rate": 6.288883407873452e-06, + "logits/chosen": 194443956.70588234, + "logits/rejected": 268436445.8666667, + "logps/chosen": -400.40515854779414, + "logps/rejected": -421.9687825520833, + "loss": 0.1245, + "rewards/chosen": 2.6348796171300553, + "rewards/margins": 7.591022296980316, + "rewards/rejected": -4.95614267985026, + "step": 1154 + }, + { + "epoch": 0.42637626320889666, + "grad_norm": 5.1875, + "kl": 0.0, + "learning_rate": 6.283196077514351e-06, + "logits/chosen": 205054798.76923078, + "logits/rejected": 215307344.84210527, + "logps/chosen": -389.0979567307692, + "logps/rejected": -427.21055201480266, + "loss": 0.0821, + "rewards/chosen": 2.882581270658053, + "rewards/margins": 9.376056532145512, + "rewards/rejected": -6.4934752614874585, + "step": 1155 + }, + { + "epoch": 0.4267454201467399, + "grad_norm": 5.46875, + "kl": 0.0, + "learning_rate": 6.277506969303387e-06, + "logits/chosen": 245427421.86666667, + "logits/rejected": 242913566.11764705, + "logps/chosen": -335.7366536458333, + "logps/rejected": -481.11072495404414, + "loss": 0.0807, + "rewards/chosen": 2.6946339925130207, + "rewards/margins": 9.171166872510723, + "rewards/rejected": -6.476532879997702, + "step": 1156 + }, + { + "epoch": 0.42711457708458306, + "grad_norm": 6.84375, + "kl": 0.0, + "learning_rate": 6.271816091122748e-06, + "logits/chosen": 142978094.54545453, + "logits/rejected": 206511104.0, + "logps/chosen": -381.6993963068182, + "logps/rejected": -377.5033017113095, + "loss": 0.0966, + "rewards/chosen": 2.6726802479137075, + "rewards/margins": 7.716492590966164, + "rewards/rejected": -5.043812343052456, + "step": 1157 + }, + { + "epoch": 0.4274837340224263, + "grad_norm": 5.09375, + "kl": 0.0, + "learning_rate": 6.266123450857071e-06, + "logits/chosen": 241228515.55555555, + "logits/rejected": 172856230.95652175, + "logps/chosen": -304.1264919704861, + "logps/rejected": -471.2736073369565, + "loss": 0.0828, + "rewards/chosen": 1.8595466613769531, + "rewards/margins": 7.994691765826682, + "rewards/rejected": -6.1351451044497285, + "step": 1158 + }, + { + "epoch": 0.42785289096026946, + "grad_norm": 3.953125, + "kl": 0.0, + "learning_rate": 6.26042905639344e-06, + "logits/chosen": 245803904.0, + "logits/rejected": 175642016.0, + "logps/chosen": -436.6184387207031, + "logps/rejected": -443.5486145019531, + "loss": 0.0456, + "rewards/chosen": 3.589250087738037, + "rewards/margins": 10.692718505859375, + "rewards/rejected": -7.103468418121338, + "step": 1159 + }, + { + "epoch": 0.4282220478981127, + "grad_norm": 4.28125, + "kl": 0.0, + "learning_rate": 6.254732915621365e-06, + "logits/chosen": 192898002.82352942, + "logits/rejected": 210349294.93333334, + "logps/chosen": -371.15105124080884, + "logps/rejected": -414.57158203125, + "loss": 0.0713, + "rewards/chosen": 2.9843882392434513, + "rewards/margins": 8.550355544744754, + "rewards/rejected": -5.565967305501302, + "step": 1160 + }, + { + "epoch": 0.42859120483595586, + "grad_norm": 5.53125, + "kl": 0.0, + "learning_rate": 6.249035036432776e-06, + "logits/chosen": 208423014.4, + "logits/rejected": 187769344.0, + "logps/chosen": -324.8227213541667, + "logps/rejected": -456.4622587316176, + "loss": 0.0738, + "rewards/chosen": 2.516859181722005, + "rewards/margins": 8.9306344574573, + "rewards/rejected": -6.413775275735294, + "step": 1161 + }, + { + "epoch": 0.4289603617737991, + "grad_norm": 6.3125, + "kl": 0.0, + "learning_rate": 6.243335426722014e-06, + "logits/chosen": 308140259.5555556, + "logits/rejected": 201217590.85714287, + "logps/chosen": -401.49647352430554, + "logps/rejected": -386.2593470982143, + "loss": 0.1071, + "rewards/chosen": 1.892488267686632, + "rewards/margins": 7.321068960522848, + "rewards/rejected": -5.428580692836216, + "step": 1162 + }, + { + "epoch": 0.42932951871164227, + "grad_norm": 4.53125, + "kl": 0.0, + "learning_rate": 6.237634094385814e-06, + "logits/chosen": 244723757.17647058, + "logits/rejected": 255606425.6, + "logps/chosen": -319.4209846047794, + "logps/rejected": -489.70667317708336, + "loss": 0.0674, + "rewards/chosen": 3.028535955092486, + "rewards/margins": 9.806833379408893, + "rewards/rejected": -6.778297424316406, + "step": 1163 + }, + { + "epoch": 0.4296986756494855, + "grad_norm": 4.53125, + "kl": 0.5488295555114746, + "learning_rate": 6.2319310473233e-06, + "logits/chosen": 261217765.0526316, + "logits/rejected": 151014025.84615386, + "logps/chosen": -370.8421052631579, + "logps/rejected": -338.6317608173077, + "loss": 0.0828, + "rewards/chosen": 3.056101548044305, + "rewards/margins": 8.724710580308429, + "rewards/rejected": -5.668609032264123, + "step": 1164 + }, + { + "epoch": 0.43006783258732867, + "grad_norm": 4.25, + "kl": 0.0, + "learning_rate": 6.226226293435973e-06, + "logits/chosen": 158897600.0, + "logits/rejected": 277640960.0, + "logps/chosen": -338.4738362630208, + "logps/rejected": -462.373681640625, + "loss": 0.0558, + "rewards/chosen": 3.5029271443684897, + "rewards/margins": 10.941753133138022, + "rewards/rejected": -7.438825988769532, + "step": 1165 + }, + { + "epoch": 0.4304369895251719, + "grad_norm": 5.9375, + "kl": 0.0, + "learning_rate": 6.2205198406276946e-06, + "logits/chosen": 210766738.2857143, + "logits/rejected": 208812074.66666666, + "logps/chosen": -399.11983816964283, + "logps/rejected": -545.6346571180555, + "loss": 0.0681, + "rewards/chosen": 2.2717665263584683, + "rewards/margins": 9.776755302671402, + "rewards/rejected": -7.504988776312934, + "step": 1166 + }, + { + "epoch": 0.43080614646301507, + "grad_norm": 6.5625, + "kl": 0.0, + "learning_rate": 6.214811696804682e-06, + "logits/chosen": 273132784.9411765, + "logits/rejected": 133618474.66666667, + "logps/chosen": -458.34015969669116, + "logps/rejected": -353.24912109375, + "loss": 0.0759, + "rewards/chosen": 2.841523338766659, + "rewards/margins": 9.584070946188534, + "rewards/rejected": -6.742547607421875, + "step": 1167 + }, + { + "epoch": 0.4311753034008583, + "grad_norm": 5.6875, + "kl": 0.0, + "learning_rate": 6.2091018698755e-06, + "logits/chosen": 249316668.2352941, + "logits/rejected": 167371571.2, + "logps/chosen": -342.5020392922794, + "logps/rejected": -349.34088541666665, + "loss": 0.0974, + "rewards/chosen": 2.130272584802964, + "rewards/margins": 7.193490765141505, + "rewards/rejected": -5.063218180338541, + "step": 1168 + }, + { + "epoch": 0.4315444603387015, + "grad_norm": 4.375, + "kl": 0.005348682403564453, + "learning_rate": 6.203390367751038e-06, + "logits/chosen": 214656903.52941176, + "logits/rejected": 178272768.0, + "logps/chosen": -350.59866153492646, + "logps/rejected": -451.428125, + "loss": 0.0855, + "rewards/chosen": 2.372753816492417, + "rewards/margins": 8.526968802657782, + "rewards/rejected": -6.154214986165365, + "step": 1169 + }, + { + "epoch": 0.4319136172765447, + "grad_norm": 6.03125, + "kl": 0.0, + "learning_rate": 6.197677198344508e-06, + "logits/chosen": 243711089.7777778, + "logits/rejected": 200333385.14285713, + "logps/chosen": -362.9586588541667, + "logps/rejected": -498.82686941964283, + "loss": 0.1198, + "rewards/chosen": 2.3088995615641275, + "rewards/margins": 9.006856191725959, + "rewards/rejected": -6.697956630161831, + "step": 1170 + }, + { + "epoch": 0.4322827742143879, + "grad_norm": 4.5, + "kl": 0.0, + "learning_rate": 6.191962369571439e-06, + "logits/chosen": 298944109.71428573, + "logits/rejected": 171661283.55555555, + "logps/chosen": -312.04213169642856, + "logps/rejected": -413.5293782552083, + "loss": 0.0558, + "rewards/chosen": 2.4063638959612166, + "rewards/margins": 9.090550195603143, + "rewards/rejected": -6.684186299641927, + "step": 1171 + }, + { + "epoch": 0.4326519311522311, + "grad_norm": 6.0625, + "kl": 1.289773941040039, + "learning_rate": 6.18624588934965e-06, + "logits/chosen": 293116050.28571427, + "logits/rejected": 174805731.55555555, + "logps/chosen": -333.28212193080356, + "logps/rejected": -390.64784071180554, + "loss": 0.0924, + "rewards/chosen": 2.522503989083426, + "rewards/margins": 8.287040528796968, + "rewards/rejected": -5.764536539713542, + "step": 1172 + }, + { + "epoch": 0.4330210880900743, + "grad_norm": 5.96875, + "kl": 0.8630952835083008, + "learning_rate": 6.1805277655992514e-06, + "logits/chosen": 251846682.9473684, + "logits/rejected": 201850368.0, + "logps/chosen": -318.7327816611842, + "logps/rejected": -474.0827824519231, + "loss": 0.1032, + "rewards/chosen": 2.508672413073088, + "rewards/margins": 10.789943540627174, + "rewards/rejected": -8.281271127554087, + "step": 1173 + }, + { + "epoch": 0.4333902450279175, + "grad_norm": 6.25, + "kl": 0.0, + "learning_rate": 6.1748080062426345e-06, + "logits/chosen": 241089911.46666667, + "logits/rejected": 189412080.94117647, + "logps/chosen": -420.6056315104167, + "logps/rejected": -350.74072265625, + "loss": 0.071, + "rewards/chosen": 2.655975850423177, + "rewards/margins": 8.87900100408816, + "rewards/rejected": -6.223025153664982, + "step": 1174 + }, + { + "epoch": 0.4337594019657607, + "grad_norm": 4.15625, + "kl": 0.6449737548828125, + "learning_rate": 6.169086619204447e-06, + "logits/chosen": 199316495.05882353, + "logits/rejected": 268790493.8666667, + "logps/chosen": -401.31198299632354, + "logps/rejected": -526.8650065104167, + "loss": 0.0419, + "rewards/chosen": 4.045277315027573, + "rewards/margins": 10.936525890873927, + "rewards/rejected": -6.891248575846354, + "step": 1175 + }, + { + "epoch": 0.4341285589036039, + "grad_norm": 4.0625, + "kl": 0.48915863037109375, + "learning_rate": 6.1633636124116045e-06, + "logits/chosen": 139129187.55555555, + "logits/rejected": 156134747.42857143, + "logps/chosen": -333.964111328125, + "logps/rejected": -326.27284458705356, + "loss": 0.0596, + "rewards/chosen": 3.734167310926649, + "rewards/margins": 9.655743129669673, + "rewards/rejected": -5.921575818743024, + "step": 1176 + }, + { + "epoch": 0.4344977158414471, + "grad_norm": 4.8125, + "kl": 0.0, + "learning_rate": 6.157638993793257e-06, + "logits/chosen": 195198916.92307693, + "logits/rejected": 316505896.42105263, + "logps/chosen": -312.7058856670673, + "logps/rejected": -417.47142269736844, + "loss": 0.0726, + "rewards/chosen": 2.3723675654484677, + "rewards/margins": 8.383831734599372, + "rewards/rejected": -6.011464169150905, + "step": 1177 + }, + { + "epoch": 0.4348668727792903, + "grad_norm": 6.03125, + "kl": 0.0, + "learning_rate": 6.15191277128079e-06, + "logits/chosen": 231229252.26666668, + "logits/rejected": 226654433.88235295, + "logps/chosen": -406.63291015625, + "logps/rejected": -459.13137637867646, + "loss": 0.0889, + "rewards/chosen": 2.4239702860514325, + "rewards/margins": 9.00591402240828, + "rewards/rejected": -6.581943736356847, + "step": 1178 + }, + { + "epoch": 0.4352360297171335, + "grad_norm": 4.40625, + "kl": 0.0, + "learning_rate": 6.146184952807815e-06, + "logits/chosen": 144125493.33333334, + "logits/rejected": 192733632.0, + "logps/chosen": -270.904296875, + "logps/rejected": -434.036474609375, + "loss": 0.0771, + "rewards/chosen": 2.433493137359619, + "rewards/margins": 8.810337543487549, + "rewards/rejected": -6.37684440612793, + "step": 1179 + }, + { + "epoch": 0.4356051866549767, + "grad_norm": 6.0, + "kl": 0.03809833526611328, + "learning_rate": 6.140455546310149e-06, + "logits/chosen": 229664498.52631578, + "logits/rejected": 120597129.84615384, + "logps/chosen": -399.7744911595395, + "logps/rejected": -399.29833984375, + "loss": 0.0728, + "rewards/chosen": 2.410933444374486, + "rewards/margins": 10.664934768367875, + "rewards/rejected": -8.25400132399339, + "step": 1180 + }, + { + "epoch": 0.4359743435928199, + "grad_norm": 4.53125, + "kl": 0.0, + "learning_rate": 6.134724559725812e-06, + "logits/chosen": 225153297.06666666, + "logits/rejected": 148199875.7647059, + "logps/chosen": -429.241015625, + "logps/rejected": -414.1749482996324, + "loss": 0.0454, + "rewards/chosen": 3.1665440877278646, + "rewards/margins": 9.30303664861941, + "rewards/rejected": -6.136492560891544, + "step": 1181 + }, + { + "epoch": 0.4363435005306631, + "grad_norm": 4.75, + "kl": 0.0, + "learning_rate": 6.128992000995015e-06, + "logits/chosen": 303993344.0, + "logits/rejected": 274649207.46666664, + "logps/chosen": -361.51496438419116, + "logps/rejected": -480.71751302083334, + "loss": 0.0515, + "rewards/chosen": 3.193499845616958, + "rewards/margins": 9.653730863683364, + "rewards/rejected": -6.460231018066406, + "step": 1182 + }, + { + "epoch": 0.4367126574685063, + "grad_norm": 4.6875, + "kl": 1.3669404983520508, + "learning_rate": 6.123257878060146e-06, + "logits/chosen": 246776806.4, + "logits/rejected": 222714240.0, + "logps/chosen": -338.73828125, + "logps/rejected": -450.6812337239583, + "loss": 0.0822, + "rewards/chosen": 2.8015607833862304, + "rewards/margins": 8.729040845235188, + "rewards/rejected": -5.927480061848958, + "step": 1183 + }, + { + "epoch": 0.4370818144063495, + "grad_norm": 4.71875, + "kl": 0.0, + "learning_rate": 6.1175221988657555e-06, + "logits/chosen": 244463802.1818182, + "logits/rejected": 278725606.4, + "logps/chosen": -319.476806640625, + "logps/rejected": -523.57861328125, + "loss": 0.0964, + "rewards/chosen": 2.6791283000599253, + "rewards/margins": 9.434938083995473, + "rewards/rejected": -6.755809783935547, + "step": 1184 + }, + { + "epoch": 0.4374509713441927, + "grad_norm": 5.21875, + "kl": 0.0, + "learning_rate": 6.111784971358556e-06, + "logits/chosen": 231513301.33333334, + "logits/rejected": 195120230.4, + "logps/chosen": -388.6943359375, + "logps/rejected": -519.486279296875, + "loss": 0.0629, + "rewards/chosen": 2.417099952697754, + "rewards/margins": 9.123018074035645, + "rewards/rejected": -6.7059181213378904, + "step": 1185 + }, + { + "epoch": 0.4378201282820359, + "grad_norm": 4.96875, + "kl": 0.0, + "learning_rate": 6.106046203487406e-06, + "logits/chosen": 270910924.8, + "logits/rejected": 165178548.70588234, + "logps/chosen": -355.40875651041665, + "logps/rejected": -458.0936063878676, + "loss": 0.055, + "rewards/chosen": 3.3018369038899738, + "rewards/margins": 11.209143350638595, + "rewards/rejected": -7.907306446748621, + "step": 1186 + }, + { + "epoch": 0.4381892852198791, + "grad_norm": 7.21875, + "kl": 0.5268621444702148, + "learning_rate": 6.100305903203292e-06, + "logits/chosen": 140663700.21052632, + "logits/rejected": 189865432.6153846, + "logps/chosen": -339.20448704769734, + "logps/rejected": -386.7316331129808, + "loss": 0.1204, + "rewards/chosen": 2.662191892925062, + "rewards/margins": 8.495307690701505, + "rewards/rejected": -5.8331157977764425, + "step": 1187 + }, + { + "epoch": 0.43855844215772233, + "grad_norm": 4.40625, + "kl": 0.0, + "learning_rate": 6.094564078459329e-06, + "logits/chosen": 181181147.42857143, + "logits/rejected": 188613589.33333334, + "logps/chosen": -404.08963448660717, + "logps/rejected": -432.88026258680554, + "loss": 0.0635, + "rewards/chosen": 2.9884510040283203, + "rewards/margins": 9.510334226820204, + "rewards/rejected": -6.5218832227918835, + "step": 1188 + }, + { + "epoch": 0.4389275990955655, + "grad_norm": 5.6875, + "kl": 0.31156253814697266, + "learning_rate": 6.08882073721074e-06, + "logits/chosen": 243350920.53333333, + "logits/rejected": 209004182.5882353, + "logps/chosen": -407.0466796875, + "logps/rejected": -480.77424172794116, + "loss": 0.078, + "rewards/chosen": 2.8092547098795575, + "rewards/margins": 9.063779389624502, + "rewards/rejected": -6.254524679744945, + "step": 1189 + }, + { + "epoch": 0.4392967560334087, + "grad_norm": 5.59375, + "kl": 0.0, + "learning_rate": 6.083075887414854e-06, + "logits/chosen": 224407488.0, + "logits/rejected": 315675744.0, + "logps/chosen": -335.5931091308594, + "logps/rejected": -376.7174987792969, + "loss": 0.0952, + "rewards/chosen": 2.163275718688965, + "rewards/margins": 8.829992771148682, + "rewards/rejected": -6.666717052459717, + "step": 1190 + }, + { + "epoch": 0.4396659129712519, + "grad_norm": 5.9375, + "kl": 1.0468735694885254, + "learning_rate": 6.077329537031087e-06, + "logits/chosen": 228898867.2, + "logits/rejected": 153466986.66666666, + "logps/chosen": -351.6412109375, + "logps/rejected": -371.1853841145833, + "loss": 0.1234, + "rewards/chosen": 2.4509239196777344, + "rewards/margins": 7.888140360514323, + "rewards/rejected": -5.437216440836589, + "step": 1191 + }, + { + "epoch": 0.4400350699090951, + "grad_norm": 4.59375, + "kl": 0.0, + "learning_rate": 6.071581694020933e-06, + "logits/chosen": 158698997.33333334, + "logits/rejected": 197872294.4, + "logps/chosen": -341.2270914713542, + "logps/rejected": -453.041015625, + "loss": 0.0737, + "rewards/chosen": 2.547694683074951, + "rewards/margins": 8.443089962005615, + "rewards/rejected": -5.895395278930664, + "step": 1192 + }, + { + "epoch": 0.4404042268469383, + "grad_norm": 6.34375, + "kl": 0.4232006072998047, + "learning_rate": 6.0658323663479555e-06, + "logits/chosen": 225595557.6470588, + "logits/rejected": 126345369.6, + "logps/chosen": -300.44241153492646, + "logps/rejected": -317.38684895833336, + "loss": 0.1155, + "rewards/chosen": 2.5308620228486904, + "rewards/margins": 8.479580793193742, + "rewards/rejected": -5.948718770345052, + "step": 1193 + }, + { + "epoch": 0.4407733837847815, + "grad_norm": 5.375, + "kl": 0.5719594955444336, + "learning_rate": 6.060081561977778e-06, + "logits/chosen": 168469970.82352942, + "logits/rejected": 152212616.53333333, + "logps/chosen": -381.87712545955884, + "logps/rejected": -436.3756510416667, + "loss": 0.0667, + "rewards/chosen": 3.013052323285271, + "rewards/margins": 10.804836991254021, + "rewards/rejected": -7.79178466796875, + "step": 1194 + }, + { + "epoch": 0.4411425407226247, + "grad_norm": 3.703125, + "kl": 0.0, + "learning_rate": 6.054329288878062e-06, + "logits/chosen": 189122839.27272728, + "logits/rejected": 267722483.80952382, + "logps/chosen": -237.2239435369318, + "logps/rejected": -516.820079985119, + "loss": 0.0454, + "rewards/chosen": 2.5823808149857954, + "rewards/margins": 10.04003331568334, + "rewards/rejected": -7.457652500697544, + "step": 1195 + }, + { + "epoch": 0.4415116976604679, + "grad_norm": 3.109375, + "kl": 0.2542295455932617, + "learning_rate": 6.048575555018512e-06, + "logits/chosen": 169244041.84615386, + "logits/rejected": 234586866.52631578, + "logps/chosen": -347.42149939903845, + "logps/rejected": -398.7607421875, + "loss": 0.057, + "rewards/chosen": 3.05949959388146, + "rewards/margins": 8.853106803739601, + "rewards/rejected": -5.793607209858141, + "step": 1196 + }, + { + "epoch": 0.4418808545983111, + "grad_norm": 5.0, + "kl": 0.0, + "learning_rate": 6.042820368370854e-06, + "logits/chosen": 206187929.6, + "logits/rejected": 222302177.88235295, + "logps/chosen": -368.0984700520833, + "logps/rejected": -443.20237821691177, + "loss": 0.0716, + "rewards/chosen": 2.5228037516276043, + "rewards/margins": 8.827611078000537, + "rewards/rejected": -6.304807326372932, + "step": 1197 + }, + { + "epoch": 0.4422500115361543, + "grad_norm": 4.84375, + "kl": 0.0, + "learning_rate": 6.037063736908822e-06, + "logits/chosen": 193309105.23076922, + "logits/rejected": 229923543.57894737, + "logps/chosen": -328.67934945913464, + "logps/rejected": -399.11024876644734, + "loss": 0.0685, + "rewards/chosen": 2.9450167142427883, + "rewards/margins": 9.048342407473669, + "rewards/rejected": -6.10332569323088, + "step": 1198 + }, + { + "epoch": 0.4426191684739975, + "grad_norm": 6.0625, + "kl": 0.0, + "learning_rate": 6.03130566860816e-06, + "logits/chosen": 196655854.93333334, + "logits/rejected": 137334482.82352942, + "logps/chosen": -352.569921875, + "logps/rejected": -372.50143612132354, + "loss": 0.1034, + "rewards/chosen": 2.3007848103841146, + "rewards/margins": 8.21691936418122, + "rewards/rejected": -5.916134553797105, + "step": 1199 + }, + { + "epoch": 0.4429883254118407, + "grad_norm": 5.0, + "kl": 0.0, + "learning_rate": 6.025546171446599e-06, + "logits/chosen": 140727068.44444445, + "logits/rejected": 187156736.0, + "logps/chosen": -273.271728515625, + "logps/rejected": -546.6842215401786, + "loss": 0.0766, + "rewards/chosen": 3.256980684068468, + "rewards/margins": 11.10652193947444, + "rewards/rejected": -7.849541255405971, + "step": 1200 + }, + { + "epoch": 0.4433574823496839, + "grad_norm": 6.84375, + "kl": 1.3710215091705322, + "learning_rate": 6.019785253403843e-06, + "logits/chosen": 247955267.36842105, + "logits/rejected": 210854518.15384614, + "logps/chosen": -357.7436009457237, + "logps/rejected": -522.2315579927885, + "loss": 0.1522, + "rewards/chosen": 1.797452826248972, + "rewards/margins": 8.048589606034128, + "rewards/rejected": -6.251136779785156, + "step": 1201 + }, + { + "epoch": 0.4437266392875271, + "grad_norm": 5.78125, + "kl": 0.4578104019165039, + "learning_rate": 6.0140229224615765e-06, + "logits/chosen": 174494560.0, + "logits/rejected": 151024752.0, + "logps/chosen": -317.0697021484375, + "logps/rejected": -441.3297119140625, + "loss": 0.0866, + "rewards/chosen": 2.793764114379883, + "rewards/margins": 8.350658893585205, + "rewards/rejected": -5.556894779205322, + "step": 1202 + }, + { + "epoch": 0.4440957962253703, + "grad_norm": 6.5, + "kl": 0.0, + "learning_rate": 6.008259186603434e-06, + "logits/chosen": 196812515.55555555, + "logits/rejected": 148412379.42857143, + "logps/chosen": -388.6372884114583, + "logps/rejected": -414.77566964285717, + "loss": 0.0991, + "rewards/chosen": 2.374408721923828, + "rewards/margins": 8.70845195225307, + "rewards/rejected": -6.334043230329241, + "step": 1203 + }, + { + "epoch": 0.4444649531632135, + "grad_norm": 6.78125, + "kl": 0.0, + "learning_rate": 6.0024940538149965e-06, + "logits/chosen": 160282654.11764705, + "logits/rejected": 200783377.06666666, + "logps/chosen": -288.7698759191176, + "logps/rejected": -487.17646484375, + "loss": 0.0974, + "rewards/chosen": 3.163381913129021, + "rewards/margins": 10.613751175824333, + "rewards/rejected": -7.450369262695313, + "step": 1204 + }, + { + "epoch": 0.4448341101010567, + "grad_norm": 3.671875, + "kl": 0.0, + "learning_rate": 5.996727532083786e-06, + "logits/chosen": 206740114.2857143, + "logits/rejected": 183864021.33333334, + "logps/chosen": -371.0096958705357, + "logps/rejected": -485.4587673611111, + "loss": 0.0425, + "rewards/chosen": 3.4318850381033763, + "rewards/margins": 10.452988791087318, + "rewards/rejected": -7.021103752983941, + "step": 1205 + }, + { + "epoch": 0.4452032670388999, + "grad_norm": 7.875, + "kl": 1.5038070678710938, + "learning_rate": 5.990959629399242e-06, + "logits/chosen": 205616308.70588234, + "logits/rejected": 177008145.06666666, + "logps/chosen": -333.1383272058824, + "logps/rejected": -332.67145182291665, + "loss": 0.164, + "rewards/chosen": 1.7028985864975874, + "rewards/margins": 7.428507412181181, + "rewards/rejected": -5.725608825683594, + "step": 1206 + }, + { + "epoch": 0.4455724239767431, + "grad_norm": 6.46875, + "kl": 0.3257017135620117, + "learning_rate": 5.9851903537527225e-06, + "logits/chosen": 183066880.0, + "logits/rejected": 242776448.0, + "logps/chosen": -278.1645812988281, + "logps/rejected": -443.14892578125, + "loss": 0.1387, + "rewards/chosen": 1.659299373626709, + "rewards/margins": 7.638757705688477, + "rewards/rejected": -5.979458332061768, + "step": 1207 + }, + { + "epoch": 0.4459415809145863, + "grad_norm": 7.125, + "kl": 2.083446979522705, + "learning_rate": 5.979419713137484e-06, + "logits/chosen": 259647897.6, + "logits/rejected": 274615061.3333333, + "logps/chosen": -308.99208984375, + "logps/rejected": -396.7795003255208, + "loss": 0.146, + "rewards/chosen": 3.235888671875, + "rewards/margins": 8.859840265909831, + "rewards/rejected": -5.623951594034831, + "step": 1208 + }, + { + "epoch": 0.44631073785242953, + "grad_norm": 5.5, + "kl": 0.0, + "learning_rate": 5.973647715548676e-06, + "logits/chosen": 246890334.31578946, + "logits/rejected": 171942656.0, + "logps/chosen": -364.6328381990132, + "logps/rejected": -420.40718900240387, + "loss": 0.1019, + "rewards/chosen": 2.7346084996273645, + "rewards/margins": 9.585374608213602, + "rewards/rejected": -6.850766108586238, + "step": 1209 + }, + { + "epoch": 0.4466798947902727, + "grad_norm": 5.25, + "kl": 0.0, + "learning_rate": 5.9678743689833284e-06, + "logits/chosen": 220823944.53333333, + "logits/rejected": 188622441.4117647, + "logps/chosen": -294.79029947916666, + "logps/rejected": -398.3384363511029, + "loss": 0.0865, + "rewards/chosen": 2.579123942057292, + "rewards/margins": 8.518428967045804, + "rewards/rejected": -5.939305024988511, + "step": 1210 + }, + { + "epoch": 0.44704905172811593, + "grad_norm": 4.0, + "kl": 0.0, + "learning_rate": 5.962099681440341e-06, + "logits/chosen": 173995827.2, + "logits/rejected": 174129212.2352941, + "logps/chosen": -316.3947265625, + "logps/rejected": -495.8938993566176, + "loss": 0.0637, + "rewards/chosen": 3.1120063781738283, + "rewards/margins": 10.576458336325253, + "rewards/rejected": -7.464451958151424, + "step": 1211 + }, + { + "epoch": 0.4474182086659591, + "grad_norm": 6.09375, + "kl": 0.0, + "learning_rate": 5.9563236609204655e-06, + "logits/chosen": 154562752.0, + "logits/rejected": 194604864.0, + "logps/chosen": -380.8797607421875, + "logps/rejected": -382.0147216796875, + "loss": 0.0702, + "rewards/chosen": 3.4456443786621094, + "rewards/margins": 9.372895431518554, + "rewards/rejected": -5.9272510528564455, + "step": 1212 + }, + { + "epoch": 0.44778736560380233, + "grad_norm": 6.375, + "kl": 0.0, + "learning_rate": 5.950546315426309e-06, + "logits/chosen": 198495778.13333333, + "logits/rejected": 242598023.52941176, + "logps/chosen": -319.8947265625, + "logps/rejected": -499.4370978860294, + "loss": 0.106, + "rewards/chosen": 2.00447514851888, + "rewards/margins": 9.16204413619696, + "rewards/rejected": -7.157568987678079, + "step": 1213 + }, + { + "epoch": 0.4481565225416455, + "grad_norm": 4.09375, + "kl": 0.0, + "learning_rate": 5.944767652962309e-06, + "logits/chosen": 251335089.23076922, + "logits/rejected": 222025054.31578946, + "logps/chosen": -384.1488506610577, + "logps/rejected": -493.9413548519737, + "loss": 0.0594, + "rewards/chosen": 3.607341766357422, + "rewards/margins": 9.837350744950143, + "rewards/rejected": -6.230008978592722, + "step": 1214 + }, + { + "epoch": 0.44852567947948874, + "grad_norm": 5.09375, + "kl": 2.5117359161376953, + "learning_rate": 5.938987681534729e-06, + "logits/chosen": 205533107.2, + "logits/rejected": 331221418.6666667, + "logps/chosen": -399.45439453125, + "logps/rejected": -475.8469645182292, + "loss": 0.0654, + "rewards/chosen": 3.513682556152344, + "rewards/margins": 10.051121266682943, + "rewards/rejected": -6.537438710530599, + "step": 1215 + }, + { + "epoch": 0.4488948364173319, + "grad_norm": 5.78125, + "kl": 0.9845080375671387, + "learning_rate": 5.933206409151646e-06, + "logits/chosen": 208328192.0, + "logits/rejected": 210629120.0, + "logps/chosen": -335.68043277138156, + "logps/rejected": -470.65467247596155, + "loss": 0.1145, + "rewards/chosen": 2.6270300212659334, + "rewards/margins": 8.956210225217255, + "rewards/rejected": -6.3291802039513225, + "step": 1216 + }, + { + "epoch": 0.44926399335517514, + "grad_norm": 3.34375, + "kl": 0.0, + "learning_rate": 5.92742384382294e-06, + "logits/chosen": 237494306.13333333, + "logits/rejected": 182092769.88235295, + "logps/chosen": -327.9189778645833, + "logps/rejected": -464.7008272058824, + "loss": 0.036, + "rewards/chosen": 3.4505584716796873, + "rewards/margins": 10.81361981560202, + "rewards/rejected": -7.363061343922334, + "step": 1217 + }, + { + "epoch": 0.4496331502930183, + "grad_norm": 4.59375, + "kl": 0.0, + "learning_rate": 5.92163999356028e-06, + "logits/chosen": 207168352.0, + "logits/rejected": 205059280.0, + "logps/chosen": -315.043212890625, + "logps/rejected": -397.8348083496094, + "loss": 0.0736, + "rewards/chosen": 2.5557057857513428, + "rewards/margins": 8.704808473587036, + "rewards/rejected": -6.149102687835693, + "step": 1218 + }, + { + "epoch": 0.45000230723086154, + "grad_norm": 5.15625, + "kl": 0.0, + "learning_rate": 5.91585486637712e-06, + "logits/chosen": 183031744.0, + "logits/rejected": 199056224.0, + "logps/chosen": -406.3924255371094, + "logps/rejected": -379.8668212890625, + "loss": 0.0671, + "rewards/chosen": 3.0440127849578857, + "rewards/margins": 9.261277437210083, + "rewards/rejected": -6.217264652252197, + "step": 1219 + }, + { + "epoch": 0.4503714641687047, + "grad_norm": 4.875, + "kl": 0.0, + "learning_rate": 5.910068470288677e-06, + "logits/chosen": 326985924.9230769, + "logits/rejected": 187400811.78947368, + "logps/chosen": -378.9082782451923, + "logps/rejected": -372.1810238486842, + "loss": 0.0834, + "rewards/chosen": 2.361889912531926, + "rewards/margins": 7.404592838364574, + "rewards/rejected": -5.042702925832648, + "step": 1220 + }, + { + "epoch": 0.45074062110654795, + "grad_norm": 5.53125, + "kl": 0.47302889823913574, + "learning_rate": 5.90428081331193e-06, + "logits/chosen": 220867730.2857143, + "logits/rejected": 180232785.45454547, + "logps/chosen": -407.2914806547619, + "logps/rejected": -489.0673828125, + "loss": 0.1116, + "rewards/chosen": 2.6637700398763022, + "rewards/margins": 9.67397539543383, + "rewards/rejected": -7.010205355557528, + "step": 1221 + }, + { + "epoch": 0.4511097780443911, + "grad_norm": 4.78125, + "kl": 1.1283798217773438, + "learning_rate": 5.898491903465607e-06, + "logits/chosen": 194416878.93333334, + "logits/rejected": 207604163.7647059, + "logps/chosen": -332.7265299479167, + "logps/rejected": -407.1070772058824, + "loss": 0.0682, + "rewards/chosen": 3.141701507568359, + "rewards/margins": 9.476484276266659, + "rewards/rejected": -6.334782768698299, + "step": 1222 + }, + { + "epoch": 0.45147893498223435, + "grad_norm": 4.15625, + "kl": 0.0, + "learning_rate": 5.892701748770165e-06, + "logits/chosen": 162300785.7777778, + "logits/rejected": 183186944.0, + "logps/chosen": -270.0886501736111, + "logps/rejected": -504.79593331473217, + "loss": 0.071, + "rewards/chosen": 3.0360844930013022, + "rewards/margins": 10.297741117931547, + "rewards/rejected": -7.261656624930246, + "step": 1223 + }, + { + "epoch": 0.4518480919200775, + "grad_norm": 6.90625, + "kl": 2.767086982727051, + "learning_rate": 5.886910357247792e-06, + "logits/chosen": 256858866.52631578, + "logits/rejected": 201852928.0, + "logps/chosen": -483.8391755756579, + "logps/rejected": -393.2539813701923, + "loss": 0.104, + "rewards/chosen": 2.9034596493369653, + "rewards/margins": 8.956222781285582, + "rewards/rejected": -6.052763131948618, + "step": 1224 + }, + { + "epoch": 0.45221724885792075, + "grad_norm": 5.15625, + "kl": 0.0, + "learning_rate": 5.8811177369223895e-06, + "logits/chosen": 163277952.0, + "logits/rejected": 194605988.57142857, + "logps/chosen": -315.50341796875, + "logps/rejected": -403.28690011160717, + "loss": 0.0944, + "rewards/chosen": 3.220728980170356, + "rewards/margins": 8.340480501689608, + "rewards/rejected": -5.119751521519253, + "step": 1225 + }, + { + "epoch": 0.4525864057957639, + "grad_norm": 5.03125, + "kl": 0.2932853698730469, + "learning_rate": 5.875323895819554e-06, + "logits/chosen": 203954500.26666668, + "logits/rejected": 272514138.35294116, + "logps/chosen": -320.1343098958333, + "logps/rejected": -397.06959443933823, + "loss": 0.0776, + "rewards/chosen": 2.465409851074219, + "rewards/margins": 8.043796045639935, + "rewards/rejected": -5.5783861945657165, + "step": 1226 + }, + { + "epoch": 0.4529555627336071, + "grad_norm": 3.546875, + "kl": 0.0, + "learning_rate": 5.869528841966583e-06, + "logits/chosen": 237482026.66666666, + "logits/rejected": 184064109.7142857, + "logps/chosen": -358.8581271701389, + "logps/rejected": -385.47645786830356, + "loss": 0.0478, + "rewards/chosen": 3.358695136176215, + "rewards/margins": 9.99384059603252, + "rewards/rejected": -6.6351454598563055, + "step": 1227 + }, + { + "epoch": 0.4533247196714503, + "grad_norm": 5.0625, + "kl": 0.0, + "learning_rate": 5.8637325833924494e-06, + "logits/chosen": 256096603.42857143, + "logits/rejected": 165011100.44444445, + "logps/chosen": -278.6639404296875, + "logps/rejected": -486.75623914930554, + "loss": 0.0867, + "rewards/chosen": 2.2913246154785156, + "rewards/margins": 10.06012683444553, + "rewards/rejected": -7.768802218967014, + "step": 1228 + }, + { + "epoch": 0.4536938766092935, + "grad_norm": 4.09375, + "kl": 0.0, + "learning_rate": 5.857935128127793e-06, + "logits/chosen": 218574560.0, + "logits/rejected": 232279216.0, + "logps/chosen": -311.3438720703125, + "logps/rejected": -522.0889892578125, + "loss": 0.0606, + "rewards/chosen": 3.2264444828033447, + "rewards/margins": 10.289335012435913, + "rewards/rejected": -7.062890529632568, + "step": 1229 + }, + { + "epoch": 0.45406303354713673, + "grad_norm": 4.96875, + "kl": 0.0, + "learning_rate": 5.852136484204918e-06, + "logits/chosen": 321868860.2352941, + "logits/rejected": 214329275.73333332, + "logps/chosen": -289.17578125, + "logps/rejected": -448.4111002604167, + "loss": 0.0727, + "rewards/chosen": 2.4729769089642692, + "rewards/margins": 9.621255717558018, + "rewards/rejected": -7.14827880859375, + "step": 1230 + }, + { + "epoch": 0.4544321904849799, + "grad_norm": 4.0, + "kl": 0.0, + "learning_rate": 5.8463366596577706e-06, + "logits/chosen": 164100480.0, + "logits/rejected": 178600690.52631578, + "logps/chosen": -300.8161808894231, + "logps/rejected": -451.0731907894737, + "loss": 0.0469, + "rewards/chosen": 3.513208242563101, + "rewards/margins": 9.704624531240116, + "rewards/rejected": -6.191416288677015, + "step": 1231 + }, + { + "epoch": 0.45480134742282313, + "grad_norm": 6.40625, + "kl": 0.0, + "learning_rate": 5.8405356625219335e-06, + "logits/chosen": 216077342.11764705, + "logits/rejected": 183622929.06666666, + "logps/chosen": -387.82933134191177, + "logps/rejected": -423.28899739583335, + "loss": 0.0834, + "rewards/chosen": 2.443554597742417, + "rewards/margins": 7.921492423263251, + "rewards/rejected": -5.477937825520834, + "step": 1232 + }, + { + "epoch": 0.4551705043606663, + "grad_norm": 5.375, + "kl": 0.0, + "learning_rate": 5.834733500834615e-06, + "logits/chosen": 274617107.6923077, + "logits/rejected": 246885483.78947368, + "logps/chosen": -349.72960486778845, + "logps/rejected": -450.25894325657896, + "loss": 0.0925, + "rewards/chosen": 1.9352485950176532, + "rewards/margins": 8.375996307805481, + "rewards/rejected": -6.440747712787829, + "step": 1233 + }, + { + "epoch": 0.45553966129850954, + "grad_norm": 4.59375, + "kl": 0.6103115081787109, + "learning_rate": 5.8289301826346375e-06, + "logits/chosen": 237357410.46153846, + "logits/rejected": 190892368.84210527, + "logps/chosen": -299.2267878605769, + "logps/rejected": -421.9144222861842, + "loss": 0.0831, + "rewards/chosen": 2.4488052955040565, + "rewards/margins": 9.020334483158251, + "rewards/rejected": -6.571529187654194, + "step": 1234 + }, + { + "epoch": 0.4559088182363527, + "grad_norm": 4.6875, + "kl": 0.48291015625, + "learning_rate": 5.823125715962421e-06, + "logits/chosen": 186745821.86666667, + "logits/rejected": 201615781.6470588, + "logps/chosen": -262.72587890625, + "logps/rejected": -423.97366153492646, + "loss": 0.0687, + "rewards/chosen": 3.161589813232422, + "rewards/margins": 10.084409511790557, + "rewards/rejected": -6.9228196985581345, + "step": 1235 + }, + { + "epoch": 0.45627797517419594, + "grad_norm": 4.4375, + "kl": 0.0, + "learning_rate": 5.817320108859984e-06, + "logits/chosen": 250886613.33333334, + "logits/rejected": 331684608.0, + "logps/chosen": -311.81630452473956, + "logps/rejected": -411.21845703125, + "loss": 0.0759, + "rewards/chosen": 2.106440703074137, + "rewards/margins": 8.323891417185465, + "rewards/rejected": -6.217450714111328, + "step": 1236 + }, + { + "epoch": 0.4566471321120391, + "grad_norm": 6.0625, + "kl": 1.3746919631958008, + "learning_rate": 5.811513369370921e-06, + "logits/chosen": 244964454.4, + "logits/rejected": 263745024.0, + "logps/chosen": -364.713671875, + "logps/rejected": -447.0391438802083, + "loss": 0.1101, + "rewards/chosen": 2.510615348815918, + "rewards/margins": 8.76806100209554, + "rewards/rejected": -6.257445653279622, + "step": 1237 + }, + { + "epoch": 0.45701628904988234, + "grad_norm": 5.3125, + "kl": 0.22956085205078125, + "learning_rate": 5.805705505540392e-06, + "logits/chosen": 160609810.2857143, + "logits/rejected": 123552199.1111111, + "logps/chosen": -404.8291015625, + "logps/rejected": -334.6553005642361, + "loss": 0.0627, + "rewards/chosen": 3.2363665444510326, + "rewards/margins": 9.257717405046735, + "rewards/rejected": -6.021350860595703, + "step": 1238 + }, + { + "epoch": 0.4573854459877255, + "grad_norm": 6.0, + "kl": 0.0, + "learning_rate": 5.799896525415124e-06, + "logits/chosen": 180081042.2857143, + "logits/rejected": 219481031.1111111, + "logps/chosen": -442.509521484375, + "logps/rejected": -462.7243923611111, + "loss": 0.0919, + "rewards/chosen": 2.2355878012520924, + "rewards/margins": 8.412924600025963, + "rewards/rejected": -6.177336798773871, + "step": 1239 + }, + { + "epoch": 0.45775460292556874, + "grad_norm": 4.125, + "kl": 0.14220809936523438, + "learning_rate": 5.7940864370433825e-06, + "logits/chosen": 249344170.66666666, + "logits/rejected": 255349778.2857143, + "logps/chosen": -353.5138346354167, + "logps/rejected": -473.68697684151783, + "loss": 0.0553, + "rewards/chosen": 3.462142096625434, + "rewards/margins": 10.426275767977275, + "rewards/rejected": -6.964133671351841, + "step": 1240 + }, + { + "epoch": 0.4581237598634119, + "grad_norm": 5.21875, + "kl": 0.41724586486816406, + "learning_rate": 5.78827524847497e-06, + "logits/chosen": 152935040.0, + "logits/rejected": 173903061.33333334, + "logps/chosen": -289.81484375, + "logps/rejected": -587.8487955729166, + "loss": 0.1157, + "rewards/chosen": 2.593410873413086, + "rewards/margins": 10.129857762654622, + "rewards/rejected": -7.536446889241536, + "step": 1241 + }, + { + "epoch": 0.45849291680125515, + "grad_norm": 5.34375, + "kl": 0.0, + "learning_rate": 5.782462967761217e-06, + "logits/chosen": 189566444.30769232, + "logits/rejected": 224447434.10526314, + "logps/chosen": -338.6279296875, + "logps/rejected": -449.58511513157896, + "loss": 0.0675, + "rewards/chosen": 3.1417127755972056, + "rewards/margins": 10.506103098633801, + "rewards/rejected": -7.364390323036595, + "step": 1242 + }, + { + "epoch": 0.4588620737390983, + "grad_norm": 4.25, + "kl": 0.2800483703613281, + "learning_rate": 5.776649602954963e-06, + "logits/chosen": 208137185.88235295, + "logits/rejected": 141916774.4, + "logps/chosen": -309.18640854779414, + "logps/rejected": -334.142578125, + "loss": 0.0959, + "rewards/chosen": 3.0752417620490577, + "rewards/margins": 8.512807374842026, + "rewards/rejected": -5.437565612792969, + "step": 1243 + }, + { + "epoch": 0.45923123067694155, + "grad_norm": 3.234375, + "kl": 0.0, + "learning_rate": 5.770835162110551e-06, + "logits/chosen": 245616932.57142857, + "logits/rejected": 138437560.8888889, + "logps/chosen": -284.7531040736607, + "logps/rejected": -430.57872178819446, + "loss": 0.0558, + "rewards/chosen": 2.6966124943324496, + "rewards/margins": 10.260286573379759, + "rewards/rejected": -7.563674079047309, + "step": 1244 + }, + { + "epoch": 0.4596003876147847, + "grad_norm": 5.65625, + "kl": 0.0, + "learning_rate": 5.765019653283814e-06, + "logits/chosen": 233215813.8181818, + "logits/rejected": 180152490.66666666, + "logps/chosen": -418.11079545454544, + "logps/rejected": -483.95977492559524, + "loss": 0.0686, + "rewards/chosen": 1.9618807705965908, + "rewards/margins": 9.06084782323796, + "rewards/rejected": -7.098967052641369, + "step": 1245 + }, + { + "epoch": 0.45996954455262795, + "grad_norm": 6.84375, + "kl": 0.0, + "learning_rate": 5.759203084532068e-06, + "logits/chosen": 260487649.88235295, + "logits/rejected": 161111620.26666668, + "logps/chosen": -343.2968175551471, + "logps/rejected": -399.85748697916665, + "loss": 0.1089, + "rewards/chosen": 2.072814043830423, + "rewards/margins": 8.938165402879903, + "rewards/rejected": -6.8653513590494795, + "step": 1246 + }, + { + "epoch": 0.4603387014904711, + "grad_norm": 6.21875, + "kl": 0.0, + "learning_rate": 5.753385463914094e-06, + "logits/chosen": 257019392.0, + "logits/rejected": 261747939.55555555, + "logps/chosen": -374.23046875, + "logps/rejected": -299.26318359375, + "loss": 0.0706, + "rewards/chosen": 2.5737293788364957, + "rewards/margins": 8.535251799083891, + "rewards/rejected": -5.9615224202473955, + "step": 1247 + }, + { + "epoch": 0.46070785842831435, + "grad_norm": 3.671875, + "kl": 0.0, + "learning_rate": 5.7475667994901316e-06, + "logits/chosen": 225576594.2857143, + "logits/rejected": 159807075.55555555, + "logps/chosen": -260.12130301339283, + "logps/rejected": -471.0791015625, + "loss": 0.0622, + "rewards/chosen": 2.7926052638462613, + "rewards/margins": 10.15409954010494, + "rewards/rejected": -7.36149427625868, + "step": 1248 + }, + { + "epoch": 0.4610770153661575, + "grad_norm": 4.5625, + "kl": 0.8596096038818359, + "learning_rate": 5.741747099321866e-06, + "logits/chosen": 256238996.21052632, + "logits/rejected": 322016315.0769231, + "logps/chosen": -347.19189453125, + "logps/rejected": -504.45316256009613, + "loss": 0.0527, + "rewards/chosen": 3.26181371588456, + "rewards/margins": 11.193112778760161, + "rewards/rejected": -7.931299062875601, + "step": 1249 + }, + { + "epoch": 0.46144617230400076, + "grad_norm": 5.4375, + "kl": 0.32234621047973633, + "learning_rate": 5.735926371472418e-06, + "logits/chosen": 208544984.6153846, + "logits/rejected": 177942137.2631579, + "logps/chosen": -314.48694786658655, + "logps/rejected": -327.21767064144734, + "loss": 0.0833, + "rewards/chosen": 2.604426457331731, + "rewards/margins": 8.401948349678564, + "rewards/rejected": -5.7975218923468335, + "step": 1250 + }, + { + "epoch": 0.46181532924184393, + "grad_norm": 5.96875, + "kl": 0.13190269470214844, + "learning_rate": 5.730104624006333e-06, + "logits/chosen": 220553161.14285713, + "logits/rejected": 252693048.8888889, + "logps/chosen": -432.2550571986607, + "logps/rejected": -467.33311631944446, + "loss": 0.0645, + "rewards/chosen": 3.7767535618373325, + "rewards/margins": 10.58206794375465, + "rewards/rejected": -6.805314381917317, + "step": 1251 + }, + { + "epoch": 0.46218448617968716, + "grad_norm": 8.4375, + "kl": 0.0, + "learning_rate": 5.724281864989567e-06, + "logits/chosen": 274263153.7777778, + "logits/rejected": 202948498.2857143, + "logps/chosen": -509.3173828125, + "logps/rejected": -426.86104910714283, + "loss": 0.1415, + "rewards/chosen": 2.083794911702474, + "rewards/margins": 7.219352903820219, + "rewards/rejected": -5.135557992117746, + "step": 1252 + }, + { + "epoch": 0.46255364311753033, + "grad_norm": 5.96875, + "kl": 1.568807601928711, + "learning_rate": 5.718458102489479e-06, + "logits/chosen": 208165522.2857143, + "logits/rejected": 156860648.72727272, + "logps/chosen": -322.7779250372024, + "logps/rejected": -379.2332208806818, + "loss": 0.1186, + "rewards/chosen": 2.3213646298363093, + "rewards/margins": 9.10813553715165, + "rewards/rejected": -6.786770907315341, + "step": 1253 + }, + { + "epoch": 0.46292280005537356, + "grad_norm": 5.84375, + "kl": 0.0, + "learning_rate": 5.712633344574816e-06, + "logits/chosen": 249874048.0, + "logits/rejected": 186357280.0, + "logps/chosen": -427.33636474609375, + "logps/rejected": -419.06097412109375, + "loss": 0.0651, + "rewards/chosen": 3.122812271118164, + "rewards/margins": 8.654112339019775, + "rewards/rejected": -5.531300067901611, + "step": 1254 + }, + { + "epoch": 0.46329195699321674, + "grad_norm": 3.921875, + "kl": 0.0, + "learning_rate": 5.70680759931571e-06, + "logits/chosen": 180850538.66666666, + "logits/rejected": 213875763.2, + "logps/chosen": -345.5939127604167, + "logps/rejected": -661.75576171875, + "loss": 0.06, + "rewards/chosen": 2.589045524597168, + "rewards/margins": 11.096788597106933, + "rewards/rejected": -8.507743072509765, + "step": 1255 + }, + { + "epoch": 0.46366111393105996, + "grad_norm": 6.25, + "kl": 0.8158950805664062, + "learning_rate": 5.7009808747836546e-06, + "logits/chosen": 261274336.0, + "logits/rejected": 195314960.0, + "logps/chosen": -439.26385498046875, + "logps/rejected": -302.48382568359375, + "loss": 0.1004, + "rewards/chosen": 2.0871083736419678, + "rewards/margins": 7.744224786758423, + "rewards/rejected": -5.657116413116455, + "step": 1256 + }, + { + "epoch": 0.46403027086890314, + "grad_norm": 5.15625, + "kl": 0.0, + "learning_rate": 5.6951531790515045e-06, + "logits/chosen": 187286637.7142857, + "logits/rejected": 170129920.0, + "logps/chosen": -323.5991908482143, + "logps/rejected": -409.4473470052083, + "loss": 0.1249, + "rewards/chosen": 1.9381529944283622, + "rewards/margins": 8.819877881852408, + "rewards/rejected": -6.8817248874240455, + "step": 1257 + }, + { + "epoch": 0.46439942780674637, + "grad_norm": 7.65625, + "kl": 0.0, + "learning_rate": 5.689324520193455e-06, + "logits/chosen": 196319047.1111111, + "logits/rejected": 252319341.7142857, + "logps/chosen": -307.3476833767361, + "logps/rejected": -453.90237862723217, + "loss": 0.141, + "rewards/chosen": 1.797366460164388, + "rewards/margins": 8.020318076724099, + "rewards/rejected": -6.22295161655971, + "step": 1258 + }, + { + "epoch": 0.46476858474458954, + "grad_norm": 6.125, + "kl": 0.17361187934875488, + "learning_rate": 5.68349490628504e-06, + "logits/chosen": 179355818.66666666, + "logits/rejected": 188703879.52941176, + "logps/chosen": -347.0073567708333, + "logps/rejected": -506.1032284007353, + "loss": 0.1033, + "rewards/chosen": 2.0667054494222006, + "rewards/margins": 8.845312537398993, + "rewards/rejected": -6.778607087976792, + "step": 1259 + }, + { + "epoch": 0.46513774168243277, + "grad_norm": 4.125, + "kl": 0.0, + "learning_rate": 5.677664345403118e-06, + "logits/chosen": 157794560.0, + "logits/rejected": 267182528.0, + "logps/chosen": -327.93896484375, + "logps/rejected": -506.62518310546875, + "loss": 0.0564, + "rewards/chosen": 3.237515687942505, + "rewards/margins": 10.914199113845825, + "rewards/rejected": -7.67668342590332, + "step": 1260 + }, + { + "epoch": 0.46550689862027594, + "grad_norm": 5.40625, + "kl": 0.27829694747924805, + "learning_rate": 5.671832845625853e-06, + "logits/chosen": 159446030.2222222, + "logits/rejected": 235936969.14285713, + "logps/chosen": -325.97406684027777, + "logps/rejected": -452.22035435267856, + "loss": 0.0891, + "rewards/chosen": 2.582556406656901, + "rewards/margins": 8.780360993884859, + "rewards/rejected": -6.1978045872279575, + "step": 1261 + }, + { + "epoch": 0.4658760555581192, + "grad_norm": 5.375, + "kl": 1.5105524063110352, + "learning_rate": 5.6660004150327175e-06, + "logits/chosen": 279445134.2222222, + "logits/rejected": 170028050.2857143, + "logps/chosen": -399.08094618055554, + "logps/rejected": -455.6284877232143, + "loss": 0.0962, + "rewards/chosen": 2.6685013241238065, + "rewards/margins": 8.852125591701931, + "rewards/rejected": -6.183624267578125, + "step": 1262 + }, + { + "epoch": 0.46624521249596235, + "grad_norm": 5.0, + "kl": 0.23043155670166016, + "learning_rate": 5.660167061704467e-06, + "logits/chosen": 208466523.42857143, + "logits/rejected": 222217272.8888889, + "logps/chosen": -263.7559116908482, + "logps/rejected": -316.28447808159723, + "loss": 0.1109, + "rewards/chosen": 1.823603902544294, + "rewards/margins": 8.271878726898677, + "rewards/rejected": -6.4482748243543835, + "step": 1263 + }, + { + "epoch": 0.4666143694338055, + "grad_norm": 5.875, + "kl": 0.1804056167602539, + "learning_rate": 5.654332793723141e-06, + "logits/chosen": 178693707.29411766, + "logits/rejected": 202826308.26666668, + "logps/chosen": -354.1507927389706, + "logps/rejected": -455.89361979166665, + "loss": 0.0753, + "rewards/chosen": 2.9437018001780793, + "rewards/margins": 9.422912208706725, + "rewards/rejected": -6.479210408528646, + "step": 1264 + }, + { + "epoch": 0.46698352637164875, + "grad_norm": 7.75, + "kl": 1.2193059921264648, + "learning_rate": 5.648497619172042e-06, + "logits/chosen": 262642752.0, + "logits/rejected": 265970688.0, + "logps/chosen": -392.4170735677083, + "logps/rejected": -297.9325866699219, + "loss": 0.1323, + "rewards/chosen": 2.6583776473999023, + "rewards/margins": 7.405609607696533, + "rewards/rejected": -4.747231960296631, + "step": 1265 + }, + { + "epoch": 0.4673526833094919, + "grad_norm": 5.71875, + "kl": 0.0, + "learning_rate": 5.6426615461357305e-06, + "logits/chosen": 219168016.0, + "logits/rejected": 202634912.0, + "logps/chosen": -290.4700927734375, + "logps/rejected": -514.543701171875, + "loss": 0.1013, + "rewards/chosen": 2.0645620822906494, + "rewards/margins": 9.07791256904602, + "rewards/rejected": -7.013350486755371, + "step": 1266 + }, + { + "epoch": 0.46772184024733515, + "grad_norm": 5.875, + "kl": 0.42235565185546875, + "learning_rate": 5.636824582700012e-06, + "logits/chosen": 251427900.2352941, + "logits/rejected": 179884987.73333332, + "logps/chosen": -450.25735294117646, + "logps/rejected": -376.8301106770833, + "loss": 0.0998, + "rewards/chosen": 2.726621291216682, + "rewards/margins": 9.1597313376034, + "rewards/rejected": -6.433110046386719, + "step": 1267 + }, + { + "epoch": 0.4680909971851783, + "grad_norm": 4.5625, + "kl": 0.0, + "learning_rate": 5.630986736951925e-06, + "logits/chosen": 382052019.2, + "logits/rejected": 210979816.72727272, + "logps/chosen": -472.01123046875, + "logps/rejected": -430.8916015625, + "loss": 0.0573, + "rewards/chosen": 1.918398666381836, + "rewards/margins": 8.432117011330345, + "rewards/rejected": -6.513718344948509, + "step": 1268 + }, + { + "epoch": 0.46846015412302155, + "grad_norm": 5.3125, + "kl": 0.008028507232666016, + "learning_rate": 5.625148016979731e-06, + "logits/chosen": 170905709.7142857, + "logits/rejected": 201225914.1818182, + "logps/chosen": -295.0370396205357, + "logps/rejected": -477.05149147727275, + "loss": 0.0928, + "rewards/chosen": 2.411216554187593, + "rewards/margins": 9.348299018232337, + "rewards/rejected": -6.937082464044744, + "step": 1269 + }, + { + "epoch": 0.46882931106086473, + "grad_norm": 6.125, + "kl": 2.1024532318115234, + "learning_rate": 5.619308430872902e-06, + "logits/chosen": 225740185.6, + "logits/rejected": 238099776.0, + "logps/chosen": -323.00517578125, + "logps/rejected": -387.8243815104167, + "loss": 0.1263, + "rewards/chosen": 2.4715639114379884, + "rewards/margins": 8.472287432352703, + "rewards/rejected": -6.000723520914714, + "step": 1270 + }, + { + "epoch": 0.46919846799870796, + "grad_norm": 6.25, + "kl": 0.21569108963012695, + "learning_rate": 5.613467986722109e-06, + "logits/chosen": 172040764.2352941, + "logits/rejected": 165815876.26666668, + "logps/chosen": -401.37267348345586, + "logps/rejected": -384.12389322916664, + "loss": 0.0821, + "rewards/chosen": 2.7529498829561123, + "rewards/margins": 9.294931389303768, + "rewards/rejected": -6.541981506347656, + "step": 1271 + }, + { + "epoch": 0.46956762493655113, + "grad_norm": 4.28125, + "kl": 0.0, + "learning_rate": 5.607626692619216e-06, + "logits/chosen": 301821440.0, + "logits/rejected": 196558400.0, + "logps/chosen": -364.2340087890625, + "logps/rejected": -536.6359252929688, + "loss": 0.0583, + "rewards/chosen": 2.3936047554016113, + "rewards/margins": 9.287053108215332, + "rewards/rejected": -6.893448352813721, + "step": 1272 + }, + { + "epoch": 0.46993678187439436, + "grad_norm": 7.625, + "kl": 1.3913662433624268, + "learning_rate": 5.601784556657259e-06, + "logits/chosen": 187381786.9473684, + "logits/rejected": 302402481.2307692, + "logps/chosen": -337.4148591694079, + "logps/rejected": -426.8018329326923, + "loss": 0.1373, + "rewards/chosen": 2.3809131823087992, + "rewards/margins": 7.4484720577595205, + "rewards/rejected": -5.067558875450721, + "step": 1273 + }, + { + "epoch": 0.47030593881223753, + "grad_norm": 6.625, + "kl": 0.0, + "learning_rate": 5.5959415869304445e-06, + "logits/chosen": 215220412.63157895, + "logits/rejected": 135727389.53846154, + "logps/chosen": -324.39622738486844, + "logps/rejected": -348.2399714543269, + "loss": 0.1012, + "rewards/chosen": 2.7027889050935445, + "rewards/margins": 8.522610776337535, + "rewards/rejected": -5.81982187124399, + "step": 1274 + }, + { + "epoch": 0.47067509575008076, + "grad_norm": 5.09375, + "kl": 0.0, + "learning_rate": 5.590097791534132e-06, + "logits/chosen": 200504395.29411766, + "logits/rejected": 276761651.2, + "logps/chosen": -331.68615004595586, + "logps/rejected": -583.6888671875, + "loss": 0.0781, + "rewards/chosen": 2.6768085255342373, + "rewards/margins": 9.341009880514706, + "rewards/rejected": -6.664201354980468, + "step": 1275 + }, + { + "epoch": 0.47104425268792394, + "grad_norm": 7.875, + "kl": 0.2568354606628418, + "learning_rate": 5.584253178564829e-06, + "logits/chosen": 162282093.7142857, + "logits/rejected": 196762424.8888889, + "logps/chosen": -328.79317801339283, + "logps/rejected": -406.283935546875, + "loss": 0.0844, + "rewards/chosen": 2.159219333103725, + "rewards/margins": 8.541894927857413, + "rewards/rejected": -6.382675594753689, + "step": 1276 + }, + { + "epoch": 0.47141340962576717, + "grad_norm": 4.5625, + "kl": 0.0, + "learning_rate": 5.578407756120167e-06, + "logits/chosen": 245454774.85714287, + "logits/rejected": 180957397.33333334, + "logps/chosen": -380.93031529017856, + "logps/rejected": -396.1161838107639, + "loss": 0.0755, + "rewards/chosen": 2.3301944732666016, + "rewards/margins": 8.243118498060438, + "rewards/rejected": -5.912924024793837, + "step": 1277 + }, + { + "epoch": 0.47178256656361034, + "grad_norm": 7.3125, + "kl": 0.0, + "learning_rate": 5.57256153229891e-06, + "logits/chosen": 126297742.22222222, + "logits/rejected": 196211622.95652175, + "logps/chosen": -390.83021375868054, + "logps/rejected": -351.2519106657609, + "loss": 0.0998, + "rewards/chosen": 2.486887402004666, + "rewards/margins": 7.596034183594339, + "rewards/rejected": -5.109146781589674, + "step": 1278 + }, + { + "epoch": 0.47215172350145357, + "grad_norm": 5.40625, + "kl": 0.0, + "learning_rate": 5.566714515200924e-06, + "logits/chosen": 316438374.4, + "logits/rejected": 165031261.0909091, + "logps/chosen": -400.984912109375, + "logps/rejected": -398.1016956676136, + "loss": 0.0623, + "rewards/chosen": 2.9813732147216796, + "rewards/margins": 8.371711072054776, + "rewards/rejected": -5.390337857333097, + "step": 1279 + }, + { + "epoch": 0.47252088043929674, + "grad_norm": 4.25, + "kl": 0.0, + "learning_rate": 5.560866712927176e-06, + "logits/chosen": 183336305.7777778, + "logits/rejected": 209606433.39130434, + "logps/chosen": -349.77332899305554, + "logps/rejected": -424.16376528532606, + "loss": 0.0557, + "rewards/chosen": 2.841385735405816, + "rewards/margins": 9.513512173712542, + "rewards/rejected": -6.672126438306726, + "step": 1280 + }, + { + "epoch": 0.47289003737713997, + "grad_norm": 4.21875, + "kl": 3.7070083618164062, + "learning_rate": 5.555018133579723e-06, + "logits/chosen": 197688805.0526316, + "logits/rejected": 208920064.0, + "logps/chosen": -383.9103361430921, + "logps/rejected": -388.3387920673077, + "loss": 0.0953, + "rewards/chosen": 3.508932013260691, + "rewards/margins": 8.809707394495668, + "rewards/rejected": -5.300775381234976, + "step": 1281 + }, + { + "epoch": 0.47325919431498314, + "grad_norm": 6.21875, + "kl": 0.0, + "learning_rate": 5.549168785261698e-06, + "logits/chosen": 164810321.45454547, + "logits/rejected": 190773089.52380952, + "logps/chosen": -324.18539151278407, + "logps/rejected": -571.4298270089286, + "loss": 0.077, + "rewards/chosen": 2.557520259510387, + "rewards/margins": 11.035131256301682, + "rewards/rejected": -8.477610996791295, + "step": 1282 + }, + { + "epoch": 0.4736283512528264, + "grad_norm": 3.3125, + "kl": 0.0, + "learning_rate": 5.543318676077297e-06, + "logits/chosen": 223296000.0, + "logits/rejected": 263106350.54545453, + "logps/chosen": -395.633544921875, + "logps/rejected": -421.69429154829544, + "loss": 0.033, + "rewards/chosen": 4.023113250732422, + "rewards/margins": 9.492212122136896, + "rewards/rejected": -5.469098871404475, + "step": 1283 + }, + { + "epoch": 0.47399750819066955, + "grad_norm": 6.0625, + "kl": 0.2798619270324707, + "learning_rate": 5.537467814131774e-06, + "logits/chosen": 216714825.14285713, + "logits/rejected": 190108060.44444445, + "logps/chosen": -418.04879324776783, + "logps/rejected": -378.64301215277777, + "loss": 0.1074, + "rewards/chosen": 2.45407349722726, + "rewards/margins": 7.958184045458596, + "rewards/rejected": -5.504110548231337, + "step": 1284 + }, + { + "epoch": 0.4743666651285128, + "grad_norm": 5.15625, + "kl": 0.36394214630126953, + "learning_rate": 5.531616207531423e-06, + "logits/chosen": 236541713.06666666, + "logits/rejected": 204975841.88235295, + "logps/chosen": -428.04124348958334, + "logps/rejected": -465.5245576746324, + "loss": 0.0803, + "rewards/chosen": 2.559741719563802, + "rewards/margins": 8.437161583993948, + "rewards/rejected": -5.877419864430147, + "step": 1285 + }, + { + "epoch": 0.47473582206635595, + "grad_norm": 6.5, + "kl": 1.012237548828125, + "learning_rate": 5.525763864383571e-06, + "logits/chosen": 158869794.13333333, + "logits/rejected": 185442846.11764705, + "logps/chosen": -397.08587239583335, + "logps/rejected": -407.71375229779414, + "loss": 0.0882, + "rewards/chosen": 2.9097874959309897, + "rewards/margins": 8.661666570925245, + "rewards/rejected": -5.751879074994256, + "step": 1286 + }, + { + "epoch": 0.4751049790041992, + "grad_norm": 5.0625, + "kl": 0.8482975959777832, + "learning_rate": 5.519910792796565e-06, + "logits/chosen": 199707008.0, + "logits/rejected": 170726328.8888889, + "logps/chosen": -331.591796875, + "logps/rejected": -435.9225260416667, + "loss": 0.0744, + "rewards/chosen": 2.9902210235595703, + "rewards/margins": 8.696532567342121, + "rewards/rejected": -5.706311543782552, + "step": 1287 + }, + { + "epoch": 0.47547413594204235, + "grad_norm": 9.75, + "kl": 0.0, + "learning_rate": 5.514057000879759e-06, + "logits/chosen": 203805774.76923078, + "logits/rejected": 171175019.78947368, + "logps/chosen": -387.07211538461536, + "logps/rejected": -378.0253392269737, + "loss": 0.1245, + "rewards/chosen": 1.4812531104454627, + "rewards/margins": 7.2824270734902825, + "rewards/rejected": -5.801173963044819, + "step": 1288 + }, + { + "epoch": 0.4758432928798856, + "grad_norm": 3.75, + "kl": 0.0, + "learning_rate": 5.508202496743511e-06, + "logits/chosen": 222042180.26666668, + "logits/rejected": 238442752.0, + "logps/chosen": -370.9294921875, + "logps/rejected": -517.8901079963235, + "loss": 0.0736, + "rewards/chosen": 3.0676607767740887, + "rewards/margins": 10.317241608862783, + "rewards/rejected": -7.249580832088695, + "step": 1289 + }, + { + "epoch": 0.47621244981772876, + "grad_norm": 5.1875, + "kl": 0.0, + "learning_rate": 5.50234728849916e-06, + "logits/chosen": 225583762.2857143, + "logits/rejected": 180051768.8888889, + "logps/chosen": -324.2652064732143, + "logps/rejected": -474.59559461805554, + "loss": 0.0849, + "rewards/chosen": 2.2506607600620816, + "rewards/margins": 8.841848978920588, + "rewards/rejected": -6.591188218858507, + "step": 1290 + }, + { + "epoch": 0.476581606755572, + "grad_norm": 7.59375, + "kl": 0.35928988456726074, + "learning_rate": 5.496491384259022e-06, + "logits/chosen": 188849109.33333334, + "logits/rejected": 195128192.0, + "logps/chosen": -310.59526909722223, + "logps/rejected": -542.1033761160714, + "loss": 0.1218, + "rewards/chosen": 1.9890189700656467, + "rewards/margins": 9.393738731505378, + "rewards/rejected": -7.404719761439732, + "step": 1291 + }, + { + "epoch": 0.47695076369341516, + "grad_norm": 5.84375, + "kl": 1.728139877319336, + "learning_rate": 5.49063479213638e-06, + "logits/chosen": 298135009.88235295, + "logits/rejected": 174359517.86666667, + "logps/chosen": -353.0707433363971, + "logps/rejected": -358.17555338541666, + "loss": 0.1084, + "rewards/chosen": 2.268212037927964, + "rewards/margins": 8.330118979659735, + "rewards/rejected": -6.061906941731771, + "step": 1292 + }, + { + "epoch": 0.4773199206312584, + "grad_norm": 5.03125, + "kl": 0.0, + "learning_rate": 5.484777520245467e-06, + "logits/chosen": 149465285.8181818, + "logits/rejected": 245119439.23809522, + "logps/chosen": -379.06001420454544, + "logps/rejected": -484.43145461309524, + "loss": 0.0576, + "rewards/chosen": 3.0722198486328125, + "rewards/margins": 8.922560192289806, + "rewards/rejected": -5.850340343656994, + "step": 1293 + }, + { + "epoch": 0.47768907756910156, + "grad_norm": 4.5, + "kl": 0.6464662551879883, + "learning_rate": 5.478919576701459e-06, + "logits/chosen": 222153892.57142857, + "logits/rejected": 181941219.55555555, + "logps/chosen": -346.37667410714283, + "logps/rejected": -388.07025824652777, + "loss": 0.057, + "rewards/chosen": 4.031380244663784, + "rewards/margins": 9.9230589488196, + "rewards/rejected": -5.891678704155816, + "step": 1294 + }, + { + "epoch": 0.4780582345069448, + "grad_norm": 5.46875, + "kl": 0.0, + "learning_rate": 5.473060969620462e-06, + "logits/chosen": 245040928.0, + "logits/rejected": 171003120.0, + "logps/chosen": -338.7032775878906, + "logps/rejected": -375.3601989746094, + "loss": 0.0889, + "rewards/chosen": 2.2006072998046875, + "rewards/margins": 7.837211608886719, + "rewards/rejected": -5.636604309082031, + "step": 1295 + }, + { + "epoch": 0.47842739144478796, + "grad_norm": 5.875, + "kl": 0.8520078659057617, + "learning_rate": 5.467201707119501e-06, + "logits/chosen": 186435931.42857143, + "logits/rejected": 244285070.2222222, + "logps/chosen": -353.50184849330356, + "logps/rejected": -469.3614908854167, + "loss": 0.0787, + "rewards/chosen": 2.481628962925502, + "rewards/margins": 8.136440822056361, + "rewards/rejected": -5.654811859130859, + "step": 1296 + }, + { + "epoch": 0.4787965483826312, + "grad_norm": 6.09375, + "kl": 0.3754606246948242, + "learning_rate": 5.46134179731651e-06, + "logits/chosen": 213929614.2222222, + "logits/rejected": 226014610.2857143, + "logps/chosen": -396.2440592447917, + "logps/rejected": -522.3163364955357, + "loss": 0.0864, + "rewards/chosen": 2.6633374955919056, + "rewards/margins": 9.754101677546426, + "rewards/rejected": -7.09076418195452, + "step": 1297 + }, + { + "epoch": 0.47916570532047437, + "grad_norm": 4.15625, + "kl": 0.0, + "learning_rate": 5.455481248330322e-06, + "logits/chosen": 212440182.15384614, + "logits/rejected": 227270656.0, + "logps/chosen": -349.5615234375, + "logps/rejected": -430.6519839638158, + "loss": 0.0743, + "rewards/chosen": 2.7128216670109677, + "rewards/margins": 9.156252918938394, + "rewards/rejected": -6.443431251927426, + "step": 1298 + }, + { + "epoch": 0.47953486225831754, + "grad_norm": 4.78125, + "kl": 0.9882087707519531, + "learning_rate": 5.44962006828065e-06, + "logits/chosen": 219149141.33333334, + "logits/rejected": 211246727.52941176, + "logps/chosen": -304.0873046875, + "logps/rejected": -443.90087890625, + "loss": 0.0771, + "rewards/chosen": 2.8261990865071613, + "rewards/margins": 9.646045744652842, + "rewards/rejected": -6.81984665814568, + "step": 1299 + }, + { + "epoch": 0.47990401919616077, + "grad_norm": 5.59375, + "kl": 0.0, + "learning_rate": 5.443758265288086e-06, + "logits/chosen": 169767632.0, + "logits/rejected": 207917280.0, + "logps/chosen": -335.9193115234375, + "logps/rejected": -267.9986572265625, + "loss": 0.0619, + "rewards/chosen": 3.272312879562378, + "rewards/margins": 9.481057405471802, + "rewards/rejected": -6.208744525909424, + "step": 1300 + }, + { + "epoch": 0.48027317613400394, + "grad_norm": 5.25, + "kl": 0.0, + "learning_rate": 5.4378958474740826e-06, + "logits/chosen": 194281813.33333334, + "logits/rejected": 273517824.0, + "logps/chosen": -411.6641031901042, + "logps/rejected": -449.13330078125, + "loss": 0.0723, + "rewards/chosen": 3.2165629069010415, + "rewards/margins": 9.071945444742838, + "rewards/rejected": -5.8553825378417965, + "step": 1301 + }, + { + "epoch": 0.48064233307184717, + "grad_norm": 5.625, + "kl": 3.1386213302612305, + "learning_rate": 5.4320328229609475e-06, + "logits/chosen": 174997955.7647059, + "logits/rejected": 215787332.26666668, + "logps/chosen": -291.92141544117646, + "logps/rejected": -443.3771158854167, + "loss": 0.0824, + "rewards/chosen": 3.2540031881893383, + "rewards/margins": 10.506421197629443, + "rewards/rejected": -7.252418009440104, + "step": 1302 + }, + { + "epoch": 0.48101149000969035, + "grad_norm": 6.71875, + "kl": 0.0, + "learning_rate": 5.426169199871824e-06, + "logits/chosen": 254587823.15789473, + "logits/rejected": 262536704.0, + "logps/chosen": -374.6968030427632, + "logps/rejected": -578.6143704927885, + "loss": 0.1077, + "rewards/chosen": 2.122119903564453, + "rewards/margins": 9.734496189997746, + "rewards/rejected": -7.612376286433293, + "step": 1303 + }, + { + "epoch": 0.4813806469475336, + "grad_norm": 6.34375, + "kl": 0.0, + "learning_rate": 5.42030498633069e-06, + "logits/chosen": 241596958.11764705, + "logits/rejected": 283786956.8, + "logps/chosen": -318.4401424632353, + "logps/rejected": -622.2921875, + "loss": 0.1118, + "rewards/chosen": 1.787734536563649, + "rewards/margins": 8.973994505639169, + "rewards/rejected": -7.186259969075521, + "step": 1304 + }, + { + "epoch": 0.48174980388537675, + "grad_norm": 6.5625, + "kl": 0.8882207870483398, + "learning_rate": 5.414440190462336e-06, + "logits/chosen": 242968387.36842105, + "logits/rejected": 333875672.61538464, + "logps/chosen": -323.0542506167763, + "logps/rejected": -491.6143329326923, + "loss": 0.1357, + "rewards/chosen": 1.8558771233809621, + "rewards/margins": 8.997889499432645, + "rewards/rejected": -7.1420123760516825, + "step": 1305 + }, + { + "epoch": 0.48211896082322, + "grad_norm": 6.8125, + "kl": 0.0, + "learning_rate": 5.408574820392364e-06, + "logits/chosen": 222416416.0, + "logits/rejected": 132633488.0, + "logps/chosen": -334.00274658203125, + "logps/rejected": -440.76214599609375, + "loss": 0.0986, + "rewards/chosen": 2.2015328407287598, + "rewards/margins": 8.995605945587158, + "rewards/rejected": -6.794073104858398, + "step": 1306 + }, + { + "epoch": 0.48248811776106315, + "grad_norm": 4.625, + "kl": 1.71002197265625, + "learning_rate": 5.402708884247169e-06, + "logits/chosen": 261156256.0, + "logits/rejected": 156230112.0, + "logps/chosen": -342.4004821777344, + "logps/rejected": -374.00341796875, + "loss": 0.0819, + "rewards/chosen": 3.0572617053985596, + "rewards/margins": 8.025655031204224, + "rewards/rejected": -4.968393325805664, + "step": 1307 + }, + { + "epoch": 0.4828572746989064, + "grad_norm": 4.6875, + "kl": 0.0, + "learning_rate": 5.39684239015393e-06, + "logits/chosen": 231846382.93333334, + "logits/rejected": 260719856.94117647, + "logps/chosen": -253.778271484375, + "logps/rejected": -389.4993106617647, + "loss": 0.0834, + "rewards/chosen": 2.938848622639974, + "rewards/margins": 9.108840957342409, + "rewards/rejected": -6.169992334702435, + "step": 1308 + }, + { + "epoch": 0.48322643163674955, + "grad_norm": 4.28125, + "kl": 0.0, + "learning_rate": 5.390975346240602e-06, + "logits/chosen": 193583346.52631578, + "logits/rejected": 222342636.30769232, + "logps/chosen": -337.50059107730266, + "logps/rejected": -395.6141826923077, + "loss": 0.0673, + "rewards/chosen": 3.3497316460860405, + "rewards/margins": 8.85949727108604, + "rewards/rejected": -5.509765625, + "step": 1309 + }, + { + "epoch": 0.4835955885745928, + "grad_norm": 6.375, + "kl": 0.0, + "learning_rate": 5.3851077606359e-06, + "logits/chosen": 200736492.30769232, + "logits/rejected": 210541109.89473686, + "logps/chosen": -355.43997896634613, + "logps/rejected": -470.59981496710526, + "loss": 0.1054, + "rewards/chosen": 2.136002173790565, + "rewards/margins": 8.485197553750474, + "rewards/rejected": -6.34919537995991, + "step": 1310 + }, + { + "epoch": 0.48396474551243596, + "grad_norm": 4.625, + "kl": 0.0, + "learning_rate": 5.3792396414692895e-06, + "logits/chosen": 187523644.2352941, + "logits/rejected": 211143731.2, + "logps/chosen": -358.5231502757353, + "logps/rejected": -424.19443359375, + "loss": 0.0869, + "rewards/chosen": 2.843328588149127, + "rewards/margins": 9.602250910740272, + "rewards/rejected": -6.758922322591146, + "step": 1311 + }, + { + "epoch": 0.4843339024502792, + "grad_norm": 4.125, + "kl": 0.0, + "learning_rate": 5.373370996870972e-06, + "logits/chosen": 184706048.0, + "logits/rejected": 167446771.2, + "logps/chosen": -337.25022379557294, + "logps/rejected": -399.3164306640625, + "loss": 0.0892, + "rewards/chosen": 2.2623252868652344, + "rewards/margins": 9.075574493408203, + "rewards/rejected": -6.813249206542968, + "step": 1312 + }, + { + "epoch": 0.48470305938812236, + "grad_norm": 3.8125, + "kl": 0.0, + "learning_rate": 5.367501834971882e-06, + "logits/chosen": 129958952.0, + "logits/rejected": 215550528.0, + "logps/chosen": -318.4737854003906, + "logps/rejected": -407.7665201822917, + "loss": 0.0568, + "rewards/chosen": 5.401557445526123, + "rewards/margins": 10.985916932423908, + "rewards/rejected": -5.584359486897786, + "step": 1313 + }, + { + "epoch": 0.4850722163259656, + "grad_norm": 3.484375, + "kl": 0.0, + "learning_rate": 5.3616321639036685e-06, + "logits/chosen": 208238933.33333334, + "logits/rejected": 236415131.82608697, + "logps/chosen": -299.673583984375, + "logps/rejected": -475.8907948369565, + "loss": 0.0459, + "rewards/chosen": 3.445320553249783, + "rewards/margins": 9.555488346855421, + "rewards/rejected": -6.110167793605639, + "step": 1314 + }, + { + "epoch": 0.48544137326380876, + "grad_norm": 3.65625, + "kl": 0.0, + "learning_rate": 5.355761991798688e-06, + "logits/chosen": 155719082.66666666, + "logits/rejected": 191924553.14285713, + "logps/chosen": -327.28309461805554, + "logps/rejected": -425.44140625, + "loss": 0.0538, + "rewards/chosen": 2.993289099799262, + "rewards/margins": 9.426229446653336, + "rewards/rejected": -6.432940346854074, + "step": 1315 + }, + { + "epoch": 0.485810530201652, + "grad_norm": 5.09375, + "kl": 0.0, + "learning_rate": 5.3498913267899864e-06, + "logits/chosen": 201907561.4117647, + "logits/rejected": 203703483.73333332, + "logps/chosen": -302.16676240808823, + "logps/rejected": -398.6220703125, + "loss": 0.0863, + "rewards/chosen": 2.533665152157054, + "rewards/margins": 9.369932750627108, + "rewards/rejected": -6.8362675984700525, + "step": 1316 + }, + { + "epoch": 0.48617968713949516, + "grad_norm": 4.0, + "kl": 0.0, + "learning_rate": 5.344020177011297e-06, + "logits/chosen": 249557504.0, + "logits/rejected": 187751720.42105263, + "logps/chosen": -288.69786658653845, + "logps/rejected": -445.8209292763158, + "loss": 0.0583, + "rewards/chosen": 2.746991671048678, + "rewards/margins": 8.67850420252997, + "rewards/rejected": -5.9315125314812915, + "step": 1317 + }, + { + "epoch": 0.4865488440773384, + "grad_norm": 5.875, + "kl": 0.0, + "learning_rate": 5.3381485505970235e-06, + "logits/chosen": 199169437.53846154, + "logits/rejected": 172665290.10526314, + "logps/chosen": -393.82666015625, + "logps/rejected": -388.46109169407896, + "loss": 0.0825, + "rewards/chosen": 2.26698244535006, + "rewards/margins": 8.465050361417083, + "rewards/rejected": -6.198067916067023, + "step": 1318 + }, + { + "epoch": 0.48691800101518157, + "grad_norm": 4.71875, + "kl": 0.36202526092529297, + "learning_rate": 5.3322764556822296e-06, + "logits/chosen": 276610128.84210527, + "logits/rejected": 279787579.0769231, + "logps/chosen": -436.6947985197368, + "logps/rejected": -449.76070462740387, + "loss": 0.0932, + "rewards/chosen": 2.276329241300884, + "rewards/margins": 9.837757604807495, + "rewards/rejected": -7.561428363506611, + "step": 1319 + }, + { + "epoch": 0.4872871579530248, + "grad_norm": 5.90625, + "kl": 0.0, + "learning_rate": 5.326403900402627e-06, + "logits/chosen": 206744934.4, + "logits/rejected": 186120673.88235295, + "logps/chosen": -328.7402018229167, + "logps/rejected": -461.3832146139706, + "loss": 0.0943, + "rewards/chosen": 2.136397298177083, + "rewards/margins": 9.143820250268076, + "rewards/rejected": -7.007422952090993, + "step": 1320 + }, + { + "epoch": 0.48765631489086797, + "grad_norm": 5.09375, + "kl": 0.32390356063842773, + "learning_rate": 5.3205308928945676e-06, + "logits/chosen": 194469948.2352941, + "logits/rejected": 206564744.53333333, + "logps/chosen": -381.53142233455884, + "logps/rejected": -349.44602864583334, + "loss": 0.082, + "rewards/chosen": 2.8852350571576286, + "rewards/margins": 8.345687836291743, + "rewards/rejected": -5.460452779134115, + "step": 1321 + }, + { + "epoch": 0.4880254718287112, + "grad_norm": 4.9375, + "kl": 0.0, + "learning_rate": 5.314657441295028e-06, + "logits/chosen": 200973836.8, + "logits/rejected": 296885376.0, + "logps/chosen": -350.9165283203125, + "logps/rejected": -486.275634765625, + "loss": 0.073, + "rewards/chosen": 2.5418632507324217, + "rewards/margins": 10.049670664469401, + "rewards/rejected": -7.5078074137369795, + "step": 1322 + }, + { + "epoch": 0.4883946287665544, + "grad_norm": 5.0625, + "kl": 0.0, + "learning_rate": 5.308783553741602e-06, + "logits/chosen": 185206032.0, + "logits/rejected": 246473696.0, + "logps/chosen": -360.5896911621094, + "logps/rejected": -362.03668212890625, + "loss": 0.0811, + "rewards/chosen": 2.8805437088012695, + "rewards/margins": 8.814156532287598, + "rewards/rejected": -5.933612823486328, + "step": 1323 + }, + { + "epoch": 0.4887637857043976, + "grad_norm": 4.59375, + "kl": 0.0, + "learning_rate": 5.302909238372485e-06, + "logits/chosen": 211378005.33333334, + "logits/rejected": 202055183.05882353, + "logps/chosen": -319.92109375, + "logps/rejected": -430.6858915441176, + "loss": 0.0775, + "rewards/chosen": 2.659412384033203, + "rewards/margins": 10.051050298354205, + "rewards/rejected": -7.391637914321002, + "step": 1324 + }, + { + "epoch": 0.4891329426422408, + "grad_norm": 4.3125, + "kl": 1.398263931274414, + "learning_rate": 5.297034503326466e-06, + "logits/chosen": 179737139.2, + "logits/rejected": 211563968.0, + "logps/chosen": -344.0910400390625, + "logps/rejected": -580.70166015625, + "loss": 0.0761, + "rewards/chosen": 3.5110248565673827, + "rewards/margins": 10.694475555419922, + "rewards/rejected": -7.183450698852539, + "step": 1325 + }, + { + "epoch": 0.489502099580084, + "grad_norm": 4.28125, + "kl": 0.44229912757873535, + "learning_rate": 5.291159356742918e-06, + "logits/chosen": 144642677.33333334, + "logits/rejected": 159002521.6, + "logps/chosen": -279.1611328125, + "logps/rejected": -393.719873046875, + "loss": 0.0748, + "rewards/chosen": 2.8953189849853516, + "rewards/margins": 9.119862747192382, + "rewards/rejected": -6.224543762207031, + "step": 1326 + }, + { + "epoch": 0.4898712565179272, + "grad_norm": 7.0625, + "kl": 0.0, + "learning_rate": 5.285283806761778e-06, + "logits/chosen": 193474804.36363637, + "logits/rejected": 180399299.04761904, + "logps/chosen": -314.21000532670456, + "logps/rejected": -403.7817615327381, + "loss": 0.0796, + "rewards/chosen": 2.3371727683327417, + "rewards/margins": 8.423726432767266, + "rewards/rejected": -6.086553664434524, + "step": 1327 + }, + { + "epoch": 0.4902404134557704, + "grad_norm": 3.515625, + "kl": 0.0, + "learning_rate": 5.27940786152355e-06, + "logits/chosen": 247029942.85714287, + "logits/rejected": 156447246.2222222, + "logps/chosen": -384.7394321986607, + "logps/rejected": -376.72002495659723, + "loss": 0.0421, + "rewards/chosen": 3.013225555419922, + "rewards/margins": 9.239714728461372, + "rewards/rejected": -6.2264891730414496, + "step": 1328 + }, + { + "epoch": 0.4906095703936136, + "grad_norm": 5.1875, + "kl": 0.0, + "learning_rate": 5.27353152916928e-06, + "logits/chosen": 156534640.0, + "logits/rejected": 254486368.0, + "logps/chosen": -295.7970886230469, + "logps/rejected": -419.3265075683594, + "loss": 0.0837, + "rewards/chosen": 3.0297648906707764, + "rewards/margins": 9.145085096359253, + "rewards/rejected": -6.115320205688477, + "step": 1329 + }, + { + "epoch": 0.4909787273314568, + "grad_norm": 4.59375, + "kl": 0.0, + "learning_rate": 5.267654817840552e-06, + "logits/chosen": 158684324.57142857, + "logits/rejected": 285003576.8888889, + "logps/chosen": -240.12015206473214, + "logps/rejected": -412.53526475694446, + "loss": 0.0781, + "rewards/chosen": 3.010879244123186, + "rewards/margins": 9.130796371944367, + "rewards/rejected": -6.11991712782118, + "step": 1330 + }, + { + "epoch": 0.4913478842693, + "grad_norm": 6.125, + "kl": 1.9608173370361328, + "learning_rate": 5.261777735679472e-06, + "logits/chosen": 237765774.2222222, + "logits/rejected": 146716196.57142857, + "logps/chosen": -365.2514377170139, + "logps/rejected": -344.42919921875, + "loss": 0.1201, + "rewards/chosen": 2.2987812889946833, + "rewards/margins": 7.7993028126065695, + "rewards/rejected": -5.500521523611886, + "step": 1331 + }, + { + "epoch": 0.4917170412071432, + "grad_norm": 5.03125, + "kl": 1.386897087097168, + "learning_rate": 5.255900290828666e-06, + "logits/chosen": 163112288.0, + "logits/rejected": 286611744.0, + "logps/chosen": -282.5770263671875, + "logps/rejected": -289.42694091796875, + "loss": 0.112, + "rewards/chosen": 2.636589765548706, + "rewards/margins": 7.698504686355591, + "rewards/rejected": -5.061914920806885, + "step": 1332 + }, + { + "epoch": 0.4920861981449864, + "grad_norm": 5.53125, + "kl": 0.0, + "learning_rate": 5.250022491431259e-06, + "logits/chosen": 220355609.6, + "logits/rejected": 167010421.33333334, + "logps/chosen": -378.0765625, + "logps/rejected": -565.9065755208334, + "loss": 0.0899, + "rewards/chosen": 2.370681953430176, + "rewards/margins": 11.179320208231609, + "rewards/rejected": -8.808638254801432, + "step": 1333 + }, + { + "epoch": 0.4924553550828296, + "grad_norm": 5.4375, + "kl": 0.0, + "learning_rate": 5.2441443456308665e-06, + "logits/chosen": 171280164.57142857, + "logits/rejected": 216480583.1111111, + "logps/chosen": -328.55555943080356, + "logps/rejected": -485.5959201388889, + "loss": 0.0645, + "rewards/chosen": 3.1375094822474887, + "rewards/margins": 10.624628521147228, + "rewards/rejected": -7.487119038899739, + "step": 1334 + }, + { + "epoch": 0.4928245120206728, + "grad_norm": 4.9375, + "kl": 0.0, + "learning_rate": 5.238265861571585e-06, + "logits/chosen": 261006728.53333333, + "logits/rejected": 266388058.3529412, + "logps/chosen": -281.65491536458336, + "logps/rejected": -366.4839441636029, + "loss": 0.1198, + "rewards/chosen": 2.6093488057454426, + "rewards/margins": 8.110295748243145, + "rewards/rejected": -5.500946942497702, + "step": 1335 + }, + { + "epoch": 0.49319366895851596, + "grad_norm": 3.8125, + "kl": 0.0, + "learning_rate": 5.232387047397979e-06, + "logits/chosen": 247319625.14285713, + "logits/rejected": 156500551.1111111, + "logps/chosen": -297.42606026785717, + "logps/rejected": -432.55943467881946, + "loss": 0.0687, + "rewards/chosen": 2.541843686785017, + "rewards/margins": 9.143304037669349, + "rewards/rejected": -6.601460350884332, + "step": 1336 + }, + { + "epoch": 0.4935628258963592, + "grad_norm": 5.71875, + "kl": 0.1953134536743164, + "learning_rate": 5.226507911255071e-06, + "logits/chosen": 232882445.47368422, + "logits/rejected": 179844765.53846154, + "logps/chosen": -307.35904091282896, + "logps/rejected": -358.0269305889423, + "loss": 0.0829, + "rewards/chosen": 2.533077641537315, + "rewards/margins": 8.570316716244346, + "rewards/rejected": -6.037239074707031, + "step": 1337 + }, + { + "epoch": 0.49393198283420237, + "grad_norm": 6.0625, + "kl": 0.709071159362793, + "learning_rate": 5.22062846128833e-06, + "logits/chosen": 153273563.42857143, + "logits/rejected": 165392477.0909091, + "logps/chosen": -327.8559802827381, + "logps/rejected": -467.4290216619318, + "loss": 0.1111, + "rewards/chosen": 3.115337916782924, + "rewards/margins": 9.035452607390168, + "rewards/rejected": -5.920114690607244, + "step": 1338 + }, + { + "epoch": 0.4943011397720456, + "grad_norm": 4.40625, + "kl": 0.0, + "learning_rate": 5.214748705643659e-06, + "logits/chosen": 213813668.57142857, + "logits/rejected": 199385969.7777778, + "logps/chosen": -228.62527901785714, + "logps/rejected": -462.38633897569446, + "loss": 0.0601, + "rewards/chosen": 2.8425467354910716, + "rewards/margins": 9.360076964847625, + "rewards/rejected": -6.517530229356554, + "step": 1339 + }, + { + "epoch": 0.49467029670988877, + "grad_norm": 5.78125, + "kl": 0.765955924987793, + "learning_rate": 5.208868652467385e-06, + "logits/chosen": 212440912.84210527, + "logits/rejected": 151673984.0, + "logps/chosen": -365.5853721217105, + "logps/rejected": -432.3176457331731, + "loss": 0.1212, + "rewards/chosen": 2.326905702289782, + "rewards/margins": 10.258082694852883, + "rewards/rejected": -7.931176992563101, + "step": 1340 + }, + { + "epoch": 0.495039453647732, + "grad_norm": 6.28125, + "kl": 0.0, + "learning_rate": 5.202988309906246e-06, + "logits/chosen": 252274010.3529412, + "logits/rejected": 138403310.93333334, + "logps/chosen": -434.3408203125, + "logps/rejected": -308.7478841145833, + "loss": 0.0703, + "rewards/chosen": 2.899468590231503, + "rewards/margins": 8.255510053447649, + "rewards/rejected": -5.356041463216146, + "step": 1341 + }, + { + "epoch": 0.49540861058557517, + "grad_norm": 4.0625, + "kl": 0.0, + "learning_rate": 5.1971076861073825e-06, + "logits/chosen": 256498152.72727272, + "logits/rejected": 188074642.2857143, + "logps/chosen": -427.97021484375, + "logps/rejected": -453.9062965029762, + "loss": 0.0461, + "rewards/chosen": 2.8574215282093394, + "rewards/margins": 9.095064088895723, + "rewards/rejected": -6.237642560686384, + "step": 1342 + }, + { + "epoch": 0.4957777675234184, + "grad_norm": 6.0, + "kl": 0.5062065124511719, + "learning_rate": 5.1912267892183245e-06, + "logits/chosen": 278073920.0, + "logits/rejected": 220258176.0, + "logps/chosen": -408.8844299316406, + "logps/rejected": -334.48443603515625, + "loss": 0.0919, + "rewards/chosen": 2.9891347885131836, + "rewards/margins": 8.696258068084717, + "rewards/rejected": -5.707123279571533, + "step": 1343 + }, + { + "epoch": 0.4961469244612616, + "grad_norm": 4.78125, + "kl": 0.0, + "learning_rate": 5.1853456273869794e-06, + "logits/chosen": 182738272.0, + "logits/rejected": 220193328.0, + "logps/chosen": -283.1431884765625, + "logps/rejected": -372.85076904296875, + "loss": 0.0822, + "rewards/chosen": 2.4466800689697266, + "rewards/margins": 10.274794101715088, + "rewards/rejected": -7.828114032745361, + "step": 1344 + }, + { + "epoch": 0.4965160813991048, + "grad_norm": 5.1875, + "kl": 0.0, + "learning_rate": 5.179464208761622e-06, + "logits/chosen": 364829632.0, + "logits/rejected": 184685744.0, + "logps/chosen": -478.6460876464844, + "logps/rejected": -418.5723876953125, + "loss": 0.0618, + "rewards/chosen": 3.1400792598724365, + "rewards/margins": 9.501708745956421, + "rewards/rejected": -6.361629486083984, + "step": 1345 + }, + { + "epoch": 0.496885238336948, + "grad_norm": 5.5, + "kl": 0.0, + "learning_rate": 5.173582541490886e-06, + "logits/chosen": 215591244.8, + "logits/rejected": 242503912.72727272, + "logps/chosen": -463.44677734375, + "logps/rejected": -417.2298473011364, + "loss": 0.083, + "rewards/chosen": 2.3735612869262694, + "rewards/margins": 8.020937607505104, + "rewards/rejected": -5.647376320578835, + "step": 1346 + }, + { + "epoch": 0.4972543952747912, + "grad_norm": 4.65625, + "kl": 0.0, + "learning_rate": 5.167700633723742e-06, + "logits/chosen": 236303650.13333333, + "logits/rejected": 143400688.94117647, + "logps/chosen": -362.97034505208336, + "logps/rejected": -415.39728860294116, + "loss": 0.0704, + "rewards/chosen": 2.862133026123047, + "rewards/margins": 9.832286834716797, + "rewards/rejected": -6.97015380859375, + "step": 1347 + }, + { + "epoch": 0.4976235522126344, + "grad_norm": 4.375, + "kl": 0.0, + "learning_rate": 5.1618184936095e-06, + "logits/chosen": 223917346.13333333, + "logits/rejected": 180843670.5882353, + "logps/chosen": -324.2706705729167, + "logps/rejected": -439.72144990808823, + "loss": 0.0782, + "rewards/chosen": 2.5901758829752604, + "rewards/margins": 9.45827756395527, + "rewards/rejected": -6.8681016809800095, + "step": 1348 + }, + { + "epoch": 0.4979927091504776, + "grad_norm": 5.78125, + "kl": 0.0, + "learning_rate": 5.1559361292977915e-06, + "logits/chosen": 277719068.4444444, + "logits/rejected": 177395584.0, + "logps/chosen": -374.7177734375, + "logps/rejected": -336.6943359375, + "loss": 0.1218, + "rewards/chosen": 2.107276280721029, + "rewards/margins": 8.544350124540783, + "rewards/rejected": -6.437073843819754, + "step": 1349 + }, + { + "epoch": 0.4983618660883208, + "grad_norm": 5.15625, + "kl": 0.0, + "learning_rate": 5.150053548938557e-06, + "logits/chosen": 261128320.0, + "logits/rejected": 268685084.4444444, + "logps/chosen": -463.5860072544643, + "logps/rejected": -374.0697970920139, + "loss": 0.0619, + "rewards/chosen": 2.5521518162318637, + "rewards/margins": 8.47548572600834, + "rewards/rejected": -5.923333909776476, + "step": 1350 + }, + { + "epoch": 0.498731023026164, + "grad_norm": 4.1875, + "kl": 0.0938577651977539, + "learning_rate": 5.1441707606820365e-06, + "logits/chosen": 200640493.7142857, + "logits/rejected": 221565440.0, + "logps/chosen": -332.23814174107144, + "logps/rejected": -536.7719184027778, + "loss": 0.0659, + "rewards/chosen": 2.3003008706229076, + "rewards/margins": 9.593212824019174, + "rewards/rejected": -7.292911953396267, + "step": 1351 + }, + { + "epoch": 0.4991001799640072, + "grad_norm": 4.46875, + "kl": 0.0, + "learning_rate": 5.138287772678759e-06, + "logits/chosen": 190981461.33333334, + "logits/rejected": 217275494.4, + "logps/chosen": -299.9886474609375, + "logps/rejected": -578.377978515625, + "loss": 0.0485, + "rewards/chosen": 2.9966376622517905, + "rewards/margins": 10.50960610707601, + "rewards/rejected": -7.512968444824219, + "step": 1352 + }, + { + "epoch": 0.4994693369018504, + "grad_norm": 5.09375, + "kl": 0.0, + "learning_rate": 5.132404593079531e-06, + "logits/chosen": 175870957.7142857, + "logits/rejected": 178075420.44444445, + "logps/chosen": -377.84511021205356, + "logps/rejected": -370.86626519097223, + "loss": 0.0678, + "rewards/chosen": 3.0757435389927457, + "rewards/margins": 8.342764294336714, + "rewards/rejected": -5.267020755343967, + "step": 1353 + }, + { + "epoch": 0.4998384938396936, + "grad_norm": 3.90625, + "kl": 0.0, + "learning_rate": 5.1265212300354205e-06, + "logits/chosen": 190721114.3529412, + "logits/rejected": 210342058.66666666, + "logps/chosen": -263.68810317095586, + "logps/rejected": -503.66022135416665, + "loss": 0.066, + "rewards/chosen": 2.79994358735926, + "rewards/margins": 10.031190535601448, + "rewards/rejected": -7.2312469482421875, + "step": 1354 + }, + { + "epoch": 0.5002076507775368, + "grad_norm": 4.5625, + "kl": 0.0, + "learning_rate": 5.120637691697753e-06, + "logits/chosen": 203584433.23076922, + "logits/rejected": 211981999.15789473, + "logps/chosen": -325.9372746394231, + "logps/rejected": -445.6390316611842, + "loss": 0.0743, + "rewards/chosen": 2.9209779592660756, + "rewards/margins": 9.330566020147037, + "rewards/rejected": -6.409588060880962, + "step": 1355 + }, + { + "epoch": 0.50057680771538, + "grad_norm": 5.0, + "kl": 0.0, + "learning_rate": 5.114753986218095e-06, + "logits/chosen": 278906277.64705884, + "logits/rejected": 207164347.73333332, + "logps/chosen": -299.9242302389706, + "logps/rejected": -450.18763020833336, + "loss": 0.1028, + "rewards/chosen": 2.2245308371151196, + "rewards/margins": 8.945831598020067, + "rewards/rejected": -6.721300760904948, + "step": 1356 + }, + { + "epoch": 0.50057680771538, + "eval_kl": 0.35627973079681396, + "eval_logits/chosen": 227420716.0794702, + "eval_logits/rejected": 193641682.60992908, + "eval_logps/chosen": -356.1331401766004, + "eval_logps/rejected": -443.82816193853427, + "eval_loss": 0.08317455649375916, + "eval_rewards/chosen": 2.7412392319432946, + "eval_rewards/margins": 9.298692504697431, + "eval_rewards/rejected": -6.557453272754137, + "eval_runtime": 53.7251, + "eval_samples_per_second": 16.305, + "eval_steps_per_second": 4.076, + "step": 1356 + }, + { + "epoch": 0.5009459646532232, + "grad_norm": 6.53125, + "kl": 0.0, + "learning_rate": 5.108870121748248e-06, + "logits/chosen": 203598105.6, + "logits/rejected": 178629386.66666666, + "logps/chosen": -333.530859375, + "logps/rejected": -401.3287353515625, + "loss": 0.0904, + "rewards/chosen": 2.557644844055176, + "rewards/margins": 9.213731447855633, + "rewards/rejected": -6.656086603800456, + "step": 1357 + }, + { + "epoch": 0.5013151215910664, + "grad_norm": 5.03125, + "kl": 0.0, + "learning_rate": 5.102986106440232e-06, + "logits/chosen": 218723726.2222222, + "logits/rejected": 198444672.0, + "logps/chosen": -340.47173394097223, + "logps/rejected": -340.19301060267856, + "loss": 0.0881, + "rewards/chosen": 2.782148996988932, + "rewards/margins": 8.502230871291388, + "rewards/rejected": -5.720081874302456, + "step": 1358 + }, + { + "epoch": 0.5016842785289096, + "grad_norm": 6.59375, + "kl": 1.097461223602295, + "learning_rate": 5.097101948446272e-06, + "logits/chosen": 264667545.6, + "logits/rejected": 223822101.33333334, + "logps/chosen": -378.438330078125, + "logps/rejected": -361.420654296875, + "loss": 0.1238, + "rewards/chosen": 2.5479150772094727, + "rewards/margins": 7.752857462565105, + "rewards/rejected": -5.204942385355632, + "step": 1359 + }, + { + "epoch": 0.5020534354667529, + "grad_norm": 6.40625, + "kl": 1.377685546875, + "learning_rate": 5.091217655918797e-06, + "logits/chosen": 301931429.64705884, + "logits/rejected": 254111914.66666666, + "logps/chosen": -354.92198988970586, + "logps/rejected": -392.5115234375, + "loss": 0.1231, + "rewards/chosen": 2.0258295395795036, + "rewards/margins": 8.641863475126378, + "rewards/rejected": -6.616033935546875, + "step": 1360 + }, + { + "epoch": 0.502422592404596, + "grad_norm": 5.8125, + "kl": 0.0, + "learning_rate": 5.085333237010418e-06, + "logits/chosen": 152028690.2857143, + "logits/rejected": 246574961.7777778, + "logps/chosen": -325.70884486607144, + "logps/rejected": -417.3498263888889, + "loss": 0.0748, + "rewards/chosen": 3.7063189915248325, + "rewards/margins": 9.935305943564764, + "rewards/rejected": -6.22898695203993, + "step": 1361 + }, + { + "epoch": 0.5027917493424392, + "grad_norm": 3.890625, + "kl": 0.004987239837646484, + "learning_rate": 5.0794486998739235e-06, + "logits/chosen": 188514666.66666666, + "logits/rejected": 264012441.6, + "logps/chosen": -388.8651123046875, + "logps/rejected": -511.99443359375, + "loss": 0.0387, + "rewards/chosen": 3.5053981145222983, + "rewards/margins": 9.885348065694174, + "rewards/rejected": -6.379949951171875, + "step": 1362 + }, + { + "epoch": 0.5031609062802824, + "grad_norm": 4.625, + "kl": 0.0, + "learning_rate": 5.073564052662265e-06, + "logits/chosen": 178101632.0, + "logits/rejected": 255458368.0, + "logps/chosen": -315.2209777832031, + "logps/rejected": -507.9679870605469, + "loss": 0.0648, + "rewards/chosen": 2.6630501747131348, + "rewards/margins": 10.645615100860596, + "rewards/rejected": -7.982564926147461, + "step": 1363 + }, + { + "epoch": 0.5035300632181257, + "grad_norm": 6.1875, + "kl": 2.0618507862091064, + "learning_rate": 5.067679303528546e-06, + "logits/chosen": 156031533.17647058, + "logits/rejected": 312887193.6, + "logps/chosen": -375.3972598805147, + "logps/rejected": -453.76673177083336, + "loss": 0.1069, + "rewards/chosen": 2.2923579496495865, + "rewards/margins": 8.956285663679534, + "rewards/rejected": -6.663927714029948, + "step": 1364 + }, + { + "epoch": 0.5038992201559688, + "grad_norm": 4.625, + "kl": 0.06595993041992188, + "learning_rate": 5.061794460626012e-06, + "logits/chosen": 227202528.0, + "logits/rejected": 182315392.0, + "logps/chosen": -421.8013610839844, + "logps/rejected": -472.6565856933594, + "loss": 0.0601, + "rewards/chosen": 3.1202404499053955, + "rewards/margins": 10.999737977981567, + "rewards/rejected": -7.879497528076172, + "step": 1365 + }, + { + "epoch": 0.504268377093812, + "grad_norm": 5.53125, + "kl": 0.39707326889038086, + "learning_rate": 5.055909532108038e-06, + "logits/chosen": 191348147.2, + "logits/rejected": 178348096.0, + "logps/chosen": -333.9248291015625, + "logps/rejected": -384.883544921875, + "loss": 0.0983, + "rewards/chosen": 2.8280031204223635, + "rewards/margins": 8.554032198588054, + "rewards/rejected": -5.72602907816569, + "step": 1366 + }, + { + "epoch": 0.5046375340316552, + "grad_norm": 7.09375, + "kl": 1.568817138671875, + "learning_rate": 5.050024526128118e-06, + "logits/chosen": 263525522.2857143, + "logits/rejected": 243162507.63636363, + "logps/chosen": -373.2213541666667, + "logps/rejected": -386.720703125, + "loss": 0.1219, + "rewards/chosen": 2.7512183416457403, + "rewards/margins": 10.138908105495172, + "rewards/rejected": -7.387689763849432, + "step": 1367 + }, + { + "epoch": 0.5050066909694985, + "grad_norm": 6.28125, + "kl": 0.0, + "learning_rate": 5.044139450839851e-06, + "logits/chosen": 168174829.7142857, + "logits/rejected": 190120448.0, + "logps/chosen": -329.01112583705356, + "logps/rejected": -495.41954210069446, + "loss": 0.0838, + "rewards/chosen": 2.0186963762555803, + "rewards/margins": 8.976248544359963, + "rewards/rejected": -6.9575521681043835, + "step": 1368 + }, + { + "epoch": 0.5053758479073416, + "grad_norm": 6.6875, + "kl": 0.050354957580566406, + "learning_rate": 5.038254314396936e-06, + "logits/chosen": 216341196.8, + "logits/rejected": 221865813.33333334, + "logps/chosen": -394.3534912109375, + "logps/rejected": -488.8938395182292, + "loss": 0.1085, + "rewards/chosen": 2.3782938003540037, + "rewards/margins": 10.858763821919759, + "rewards/rejected": -8.480470021565756, + "step": 1369 + }, + { + "epoch": 0.5057450048451848, + "grad_norm": 3.671875, + "kl": 0.0, + "learning_rate": 5.032369124953156e-06, + "logits/chosen": 298484666.1818182, + "logits/rejected": 241471073.52380952, + "logps/chosen": -426.41592684659093, + "logps/rejected": -462.5751953125, + "loss": 0.0416, + "rewards/chosen": 3.317350214177912, + "rewards/margins": 9.643327357965115, + "rewards/rejected": -6.3259771437872026, + "step": 1370 + }, + { + "epoch": 0.506114161783028, + "grad_norm": 4.53125, + "kl": 0.0, + "learning_rate": 5.026483890662363e-06, + "logits/chosen": 208740710.4, + "logits/rejected": 207369336.47058824, + "logps/chosen": -369.6121419270833, + "logps/rejected": -449.47291475183823, + "loss": 0.0615, + "rewards/chosen": 2.938982391357422, + "rewards/margins": 9.670619785084444, + "rewards/rejected": -6.731637393727022, + "step": 1371 + }, + { + "epoch": 0.5064833187208712, + "grad_norm": 4.3125, + "kl": 0.8232498168945312, + "learning_rate": 5.020598619678478e-06, + "logits/chosen": 209501440.0, + "logits/rejected": 173830580.70588234, + "logps/chosen": -394.9218424479167, + "logps/rejected": -386.65894990808823, + "loss": 0.0624, + "rewards/chosen": 3.425559488932292, + "rewards/margins": 9.15514367795458, + "rewards/rejected": -5.7295841890222885, + "step": 1372 + }, + { + "epoch": 0.5068524756587144, + "grad_norm": 4.875, + "kl": 0.0, + "learning_rate": 5.014713320155464e-06, + "logits/chosen": 184353396.36363637, + "logits/rejected": 255137587.2, + "logps/chosen": -295.489501953125, + "logps/rejected": -582.8923828125, + "loss": 0.0686, + "rewards/chosen": 3.634876251220703, + "rewards/margins": 9.941403198242188, + "rewards/rejected": -6.306526947021484, + "step": 1373 + }, + { + "epoch": 0.5072216325965576, + "grad_norm": 4.9375, + "kl": 0.0, + "learning_rate": 5.008828000247335e-06, + "logits/chosen": 181333120.0, + "logits/rejected": 302206776.8888889, + "logps/chosen": -316.32114955357144, + "logps/rejected": -439.33270941840277, + "loss": 0.0676, + "rewards/chosen": 3.071406500680106, + "rewards/margins": 8.980904079618908, + "rewards/rejected": -5.909497578938802, + "step": 1374 + }, + { + "epoch": 0.5075907895344008, + "grad_norm": 4.96875, + "kl": 0.0, + "learning_rate": 5.002942668108121e-06, + "logits/chosen": 247319808.0, + "logits/rejected": 225670197.89473686, + "logps/chosen": -342.1579402043269, + "logps/rejected": -497.2348375822368, + "loss": 0.0708, + "rewards/chosen": 2.3402445866511417, + "rewards/margins": 9.619295714837818, + "rewards/rejected": -7.279051128186677, + "step": 1375 + }, + { + "epoch": 0.507959946472244, + "grad_norm": 6.59375, + "kl": 1.0437264442443848, + "learning_rate": 4.99705733189188e-06, + "logits/chosen": 215142784.0, + "logits/rejected": 221164708.57142857, + "logps/chosen": -338.4690755208333, + "logps/rejected": -540.6833147321429, + "loss": 0.1167, + "rewards/chosen": 2.402571784125434, + "rewards/margins": 9.3701048956977, + "rewards/rejected": -6.967533111572266, + "step": 1376 + }, + { + "epoch": 0.5083291034100872, + "grad_norm": 5.65625, + "kl": 0.4487733840942383, + "learning_rate": 4.991171999752668e-06, + "logits/chosen": 225341906.82352942, + "logits/rejected": 210392593.06666666, + "logps/chosen": -334.7643612132353, + "logps/rejected": -398.15283203125, + "loss": 0.1039, + "rewards/chosen": 2.640654171214384, + "rewards/margins": 8.804104793772979, + "rewards/rejected": -6.163450622558594, + "step": 1377 + }, + { + "epoch": 0.5086982603479304, + "grad_norm": 5.75, + "kl": 0.6094112396240234, + "learning_rate": 4.985286679844537e-06, + "logits/chosen": 399796845.71428573, + "logits/rejected": 351688135.1111111, + "logps/chosen": -449.46798270089283, + "logps/rejected": -615.4810112847222, + "loss": 0.0677, + "rewards/chosen": 2.3597706386021207, + "rewards/margins": 10.460748218354723, + "rewards/rejected": -8.100977579752604, + "step": 1378 + }, + { + "epoch": 0.5090674172857736, + "grad_norm": 4.5625, + "kl": 0.0, + "learning_rate": 4.979401380321525e-06, + "logits/chosen": 182215332.57142857, + "logits/rejected": 181739221.33333334, + "logps/chosen": -366.96693638392856, + "logps/rejected": -430.62977430555554, + "loss": 0.0679, + "rewards/chosen": 2.428300585065569, + "rewards/margins": 8.940100715273902, + "rewards/rejected": -6.511800130208333, + "step": 1379 + }, + { + "epoch": 0.5094365742236168, + "grad_norm": 5.96875, + "kl": 0.0, + "learning_rate": 4.973516109337639e-06, + "logits/chosen": 190093387.29411766, + "logits/rejected": 185183095.46666667, + "logps/chosen": -347.3824103860294, + "logps/rejected": -529.5743489583333, + "loss": 0.0836, + "rewards/chosen": 2.510632683249081, + "rewards/margins": 10.306085055482153, + "rewards/rejected": -7.795452372233073, + "step": 1380 + }, + { + "epoch": 0.50980573116146, + "grad_norm": 4.46875, + "kl": 1.0095338821411133, + "learning_rate": 4.967630875046845e-06, + "logits/chosen": 176362393.6, + "logits/rejected": 196646058.66666666, + "logps/chosen": -308.062744140625, + "logps/rejected": -381.9999593098958, + "loss": 0.0787, + "rewards/chosen": 2.9594215393066405, + "rewards/margins": 8.466460927327473, + "rewards/rejected": -5.507039388020833, + "step": 1381 + }, + { + "epoch": 0.5101748880993032, + "grad_norm": 6.875, + "kl": 1.6023073196411133, + "learning_rate": 4.961745685603065e-06, + "logits/chosen": 261525317.8181818, + "logits/rejected": 325521945.6, + "logps/chosen": -300.0753728693182, + "logps/rejected": -361.1354736328125, + "loss": 0.1639, + "rewards/chosen": 1.971467451615767, + "rewards/margins": 8.091594175858932, + "rewards/rejected": -6.120126724243164, + "step": 1382 + }, + { + "epoch": 0.5105440450371465, + "grad_norm": 4.21875, + "kl": 0.0, + "learning_rate": 4.95586054916015e-06, + "logits/chosen": 222629193.14285713, + "logits/rejected": 240285240.8888889, + "logps/chosen": -330.85414341517856, + "logps/rejected": -499.51133897569446, + "loss": 0.0519, + "rewards/chosen": 2.8307416098458424, + "rewards/margins": 10.511811725677006, + "rewards/rejected": -7.681070115831163, + "step": 1383 + }, + { + "epoch": 0.5109132019749896, + "grad_norm": 4.40625, + "kl": 0.8240547180175781, + "learning_rate": 4.9499754738718835e-06, + "logits/chosen": 181044872.53333333, + "logits/rejected": 206531493.6470588, + "logps/chosen": -359.4293619791667, + "logps/rejected": -430.3114659926471, + "loss": 0.0748, + "rewards/chosen": 3.0689404805501304, + "rewards/margins": 10.815126680860333, + "rewards/rejected": -7.746186200310202, + "step": 1384 + }, + { + "epoch": 0.5112823589128328, + "grad_norm": 5.0, + "kl": 0.0, + "learning_rate": 4.944090467891963e-06, + "logits/chosen": 174490936.8888889, + "logits/rejected": 332540489.14285713, + "logps/chosen": -348.0022786458333, + "logps/rejected": -404.51632254464283, + "loss": 0.0934, + "rewards/chosen": 3.4139946831597223, + "rewards/margins": 9.473602052718874, + "rewards/rejected": -6.059607369559152, + "step": 1385 + }, + { + "epoch": 0.511651515850676, + "grad_norm": 4.65625, + "kl": 0.0, + "learning_rate": 4.938205539373989e-06, + "logits/chosen": 208092433.06666666, + "logits/rejected": 252965586.82352942, + "logps/chosen": -344.98805338541666, + "logps/rejected": -478.8070427389706, + "loss": 0.0669, + "rewards/chosen": 3.038161214192708, + "rewards/margins": 9.959530879001992, + "rewards/rejected": -6.9213696648092835, + "step": 1386 + }, + { + "epoch": 0.5120206727885193, + "grad_norm": 6.4375, + "kl": 0.35227513313293457, + "learning_rate": 4.932320696471455e-06, + "logits/chosen": 178867441.7777778, + "logits/rejected": 131382875.42857143, + "logps/chosen": -376.2277560763889, + "logps/rejected": -280.66102818080356, + "loss": 0.1259, + "rewards/chosen": 2.6259161631266275, + "rewards/margins": 7.747838792346773, + "rewards/rejected": -5.121922629220145, + "step": 1387 + }, + { + "epoch": 0.5123898297263624, + "grad_norm": 4.3125, + "kl": 0.0, + "learning_rate": 4.926435947337736e-06, + "logits/chosen": 234279116.8, + "logits/rejected": 321506846.11764705, + "logps/chosen": -330.7978841145833, + "logps/rejected": -545.8468520220588, + "loss": 0.0957, + "rewards/chosen": 2.8141830444335936, + "rewards/margins": 10.056419462316176, + "rewards/rejected": -7.242236417882583, + "step": 1388 + }, + { + "epoch": 0.5127589866642056, + "grad_norm": 4.46875, + "kl": 0.0, + "learning_rate": 4.920551300126077e-06, + "logits/chosen": 223691648.0, + "logits/rejected": 154891878.4, + "logps/chosen": -406.3864339192708, + "logps/rejected": -444.840771484375, + "loss": 0.0668, + "rewards/chosen": 2.7489938735961914, + "rewards/margins": 9.619949531555175, + "rewards/rejected": -6.8709556579589846, + "step": 1389 + }, + { + "epoch": 0.5131281436020488, + "grad_norm": 5.125, + "kl": 1.5219526290893555, + "learning_rate": 4.9146667629895836e-06, + "logits/chosen": 243147641.2631579, + "logits/rejected": 139042520.6153846, + "logps/chosen": -411.0696443256579, + "logps/rejected": -411.05904447115387, + "loss": 0.0818, + "rewards/chosen": 2.577381736353824, + "rewards/margins": 8.643553235752861, + "rewards/rejected": -6.066171499399038, + "step": 1390 + }, + { + "epoch": 0.5134973005398921, + "grad_norm": 3.90625, + "kl": 0.5582895278930664, + "learning_rate": 4.908782344081204e-06, + "logits/chosen": 236474880.0, + "logits/rejected": 292150091.2941176, + "logps/chosen": -267.13172200520836, + "logps/rejected": -469.78294462316177, + "loss": 0.068, + "rewards/chosen": 3.295152791341146, + "rewards/margins": 9.523283356311275, + "rewards/rejected": -6.228130564970129, + "step": 1391 + }, + { + "epoch": 0.5138664574777352, + "grad_norm": 5.15625, + "kl": 0.11983251571655273, + "learning_rate": 4.902898051553729e-06, + "logits/chosen": 202030165.33333334, + "logits/rejected": 180361274.1818182, + "logps/chosen": -365.56201171875, + "logps/rejected": -339.01708984375, + "loss": 0.0777, + "rewards/chosen": 3.1498300461542037, + "rewards/margins": 9.268173052635028, + "rewards/rejected": -6.118343006480824, + "step": 1392 + }, + { + "epoch": 0.5142356144155784, + "grad_norm": 6.65625, + "kl": 0.0, + "learning_rate": 4.897013893559771e-06, + "logits/chosen": 244669713.06666666, + "logits/rejected": 173639936.0, + "logps/chosen": -386.51783854166666, + "logps/rejected": -441.68118106617646, + "loss": 0.1059, + "rewards/chosen": 1.9578104654947917, + "rewards/margins": 8.550985119389553, + "rewards/rejected": -6.593174653894761, + "step": 1393 + }, + { + "epoch": 0.5146047713534216, + "grad_norm": 7.40625, + "kl": 1.2831873893737793, + "learning_rate": 4.891129878251754e-06, + "logits/chosen": 192047646.11764705, + "logits/rejected": 239456699.73333332, + "logps/chosen": -352.3636259191176, + "logps/rejected": -355.01145833333334, + "loss": 0.1181, + "rewards/chosen": 2.4777870178222656, + "rewards/margins": 8.133113352457682, + "rewards/rejected": -5.655326334635417, + "step": 1394 + }, + { + "epoch": 0.5149739282912649, + "grad_norm": 4.84375, + "kl": 0.7432661056518555, + "learning_rate": 4.8852460137819065e-06, + "logits/chosen": 187761408.0, + "logits/rejected": 242166363.42857143, + "logps/chosen": -314.46544053819446, + "logps/rejected": -396.3698032924107, + "loss": 0.0794, + "rewards/chosen": 3.1082644992404513, + "rewards/margins": 8.817806364997985, + "rewards/rejected": -5.709541865757534, + "step": 1395 + }, + { + "epoch": 0.515343085229108, + "grad_norm": 5.96875, + "kl": 0.21212482452392578, + "learning_rate": 4.87936230830225e-06, + "logits/chosen": 302802077.53846157, + "logits/rejected": 248109352.42105263, + "logps/chosen": -358.9646183894231, + "logps/rejected": -425.5939298930921, + "loss": 0.0951, + "rewards/chosen": 1.872918935922476, + "rewards/margins": 8.848002136477575, + "rewards/rejected": -6.9750832005550985, + "step": 1396 + }, + { + "epoch": 0.5157122421669512, + "grad_norm": 3.65625, + "kl": 0.0, + "learning_rate": 4.873478769964583e-06, + "logits/chosen": 411299693.71428573, + "logits/rejected": 169097856.0, + "logps/chosen": -412.78731863839283, + "logps/rejected": -390.065673828125, + "loss": 0.0459, + "rewards/chosen": 3.099510737827846, + "rewards/margins": 9.671939970955016, + "rewards/rejected": -6.5724292331271705, + "step": 1397 + }, + { + "epoch": 0.5160813991047944, + "grad_norm": 5.75, + "kl": 0.0004749298095703125, + "learning_rate": 4.867595406920471e-06, + "logits/chosen": 261917803.78947368, + "logits/rejected": 219701011.69230768, + "logps/chosen": -358.9085115131579, + "logps/rejected": -543.4847130408654, + "loss": 0.0815, + "rewards/chosen": 2.586830741480777, + "rewards/margins": 10.652104319831139, + "rewards/rejected": -8.06527357835036, + "step": 1398 + }, + { + "epoch": 0.5164505560426377, + "grad_norm": 6.8125, + "kl": 1.0157699584960938, + "learning_rate": 4.8617122273212414e-06, + "logits/chosen": 207001584.0, + "logits/rejected": 277233472.0, + "logps/chosen": -443.41339111328125, + "logps/rejected": -514.6201782226562, + "loss": 0.0814, + "rewards/chosen": 3.022653341293335, + "rewards/margins": 9.924786806106567, + "rewards/rejected": -6.902133464813232, + "step": 1399 + }, + { + "epoch": 0.5168197129804808, + "grad_norm": 6.71875, + "kl": 0.6697254180908203, + "learning_rate": 4.855829239317964e-06, + "logits/chosen": 278217269.8947368, + "logits/rejected": 241296502.15384614, + "logps/chosen": -353.84619140625, + "logps/rejected": -426.80697866586536, + "loss": 0.1072, + "rewards/chosen": 2.71430105912058, + "rewards/margins": 9.327677383113969, + "rewards/rejected": -6.613376323993389, + "step": 1400 + }, + { + "epoch": 0.517188869918324, + "grad_norm": 5.25, + "kl": 0.29302072525024414, + "learning_rate": 4.849946451061444e-06, + "logits/chosen": 194975051.29411766, + "logits/rejected": 142849612.8, + "logps/chosen": -400.40418198529414, + "logps/rejected": -394.76669921875, + "loss": 0.0791, + "rewards/chosen": 3.323132683249081, + "rewards/margins": 9.002125908346738, + "rewards/rejected": -5.678993225097656, + "step": 1401 + }, + { + "epoch": 0.5175580268561673, + "grad_norm": 4.4375, + "kl": 0.3162250518798828, + "learning_rate": 4.84406387070221e-06, + "logits/chosen": 374309546.6666667, + "logits/rejected": 220953584.94117647, + "logps/chosen": -392.43873697916666, + "logps/rejected": -507.66228170955884, + "loss": 0.052, + "rewards/chosen": 2.756882985432943, + "rewards/margins": 10.005590027453852, + "rewards/rejected": -7.24870704202091, + "step": 1402 + }, + { + "epoch": 0.5179271837940105, + "grad_norm": 5.59375, + "kl": 0.6485347747802734, + "learning_rate": 4.838181506390501e-06, + "logits/chosen": 257777420.19047618, + "logits/rejected": 213297850.1818182, + "logps/chosen": -264.18531436011904, + "logps/rejected": -434.376953125, + "loss": 0.1212, + "rewards/chosen": 2.812830243791853, + "rewards/margins": 10.003397953974737, + "rewards/rejected": -7.190567710182884, + "step": 1403 + }, + { + "epoch": 0.5182963407318536, + "grad_norm": 4.9375, + "kl": 0.3320465087890625, + "learning_rate": 4.83229936627626e-06, + "logits/chosen": 199050300.95238096, + "logits/rejected": 183748887.27272728, + "logps/chosen": -340.9506370907738, + "logps/rejected": -441.96186967329544, + "loss": 0.0914, + "rewards/chosen": 2.759419395810082, + "rewards/margins": 8.918897323278122, + "rewards/rejected": -6.15947792746804, + "step": 1404 + }, + { + "epoch": 0.5186654976696968, + "grad_norm": 4.9375, + "kl": 1.0783801078796387, + "learning_rate": 4.826417458509116e-06, + "logits/chosen": 295590272.0, + "logits/rejected": 205960448.0, + "logps/chosen": -409.997509765625, + "logps/rejected": -573.2190755208334, + "loss": 0.0925, + "rewards/chosen": 2.900394248962402, + "rewards/margins": 9.361683336893718, + "rewards/rejected": -6.461289087931315, + "step": 1405 + }, + { + "epoch": 0.5190346546075401, + "grad_norm": 4.75, + "kl": 1.077897548675537, + "learning_rate": 4.8205357912383785e-06, + "logits/chosen": 248956304.0, + "logits/rejected": 291213632.0, + "logps/chosen": -357.945556640625, + "logps/rejected": -370.4856872558594, + "loss": 0.099, + "rewards/chosen": 2.116985559463501, + "rewards/margins": 7.845202684402466, + "rewards/rejected": -5.728217124938965, + "step": 1406 + }, + { + "epoch": 0.5194038115453832, + "grad_norm": 7.0625, + "kl": 0.08640003204345703, + "learning_rate": 4.8146543726130205e-06, + "logits/chosen": 225080953.2631579, + "logits/rejected": 235909553.23076922, + "logps/chosen": -321.9380653782895, + "logps/rejected": -409.21382962740387, + "loss": 0.1394, + "rewards/chosen": 1.8921988637823808, + "rewards/margins": 7.759786026680517, + "rewards/rejected": -5.867587162898137, + "step": 1407 + }, + { + "epoch": 0.5197729684832264, + "grad_norm": 5.625, + "kl": 0.0, + "learning_rate": 4.8087732107816755e-06, + "logits/chosen": 244637639.1111111, + "logits/rejected": 173309549.7142857, + "logps/chosen": -374.53423394097223, + "logps/rejected": -446.16650390625, + "loss": 0.0966, + "rewards/chosen": 3.0319913228352866, + "rewards/margins": 9.635491144089471, + "rewards/rejected": -6.603499821254185, + "step": 1408 + }, + { + "epoch": 0.5201421254210696, + "grad_norm": 5.375, + "kl": 0.0, + "learning_rate": 4.802892313892618e-06, + "logits/chosen": 213041036.8, + "logits/rejected": 242629236.36363637, + "logps/chosen": -306.570654296875, + "logps/rejected": -454.10844282670456, + "loss": 0.068, + "rewards/chosen": 2.840657615661621, + "rewards/margins": 9.00551287911155, + "rewards/rejected": -6.164855263449929, + "step": 1409 + }, + { + "epoch": 0.5205112823589129, + "grad_norm": 5.46875, + "kl": 0.0, + "learning_rate": 4.797011690093756e-06, + "logits/chosen": 294295013.05263156, + "logits/rejected": 200927251.69230768, + "logps/chosen": -299.56517269736844, + "logps/rejected": -464.41019381009613, + "loss": 0.114, + "rewards/chosen": 1.8418723658511513, + "rewards/margins": 8.44368604899418, + "rewards/rejected": -6.601813683143029, + "step": 1410 + }, + { + "epoch": 0.520880439296756, + "grad_norm": 4.5625, + "kl": 0.0, + "learning_rate": 4.791131347532619e-06, + "logits/chosen": 287971635.2, + "logits/rejected": 190156800.0, + "logps/chosen": -326.4712646484375, + "logps/rejected": -496.1604410807292, + "loss": 0.0745, + "rewards/chosen": 2.928312301635742, + "rewards/margins": 9.51511001586914, + "rewards/rejected": -6.586797714233398, + "step": 1411 + }, + { + "epoch": 0.5212495962345992, + "grad_norm": 5.6875, + "kl": 3.076977252960205, + "learning_rate": 4.785251294356343e-06, + "logits/chosen": 243428608.0, + "logits/rejected": 149586688.0, + "logps/chosen": -408.54340277777777, + "logps/rejected": -371.43896484375, + "loss": 0.1413, + "rewards/chosen": 2.814528571234809, + "rewards/margins": 7.999230218312097, + "rewards/rejected": -5.184701647077288, + "step": 1412 + }, + { + "epoch": 0.5216187531724424, + "grad_norm": 5.53125, + "kl": 0.9520225524902344, + "learning_rate": 4.779371538711672e-06, + "logits/chosen": 241599118.2222222, + "logits/rejected": 215095387.42857143, + "logps/chosen": -357.83428276909723, + "logps/rejected": -355.11265345982144, + "loss": 0.0885, + "rewards/chosen": 2.5405366685655384, + "rewards/margins": 7.240611000666543, + "rewards/rejected": -4.700074332101004, + "step": 1413 + }, + { + "epoch": 0.5219879101102857, + "grad_norm": 3.140625, + "kl": 0.0, + "learning_rate": 4.773492088744932e-06, + "logits/chosen": 251931923.69230768, + "logits/rejected": 203833384.42105263, + "logps/chosen": -470.5350811298077, + "logps/rejected": -504.0726768092105, + "loss": 0.0326, + "rewards/chosen": 3.4901709923377404, + "rewards/margins": 10.697136388616522, + "rewards/rejected": -7.2069653962787825, + "step": 1414 + }, + { + "epoch": 0.5223570670481288, + "grad_norm": 5.0625, + "kl": 0.0, + "learning_rate": 4.767612952602023e-06, + "logits/chosen": 205618608.0, + "logits/rejected": 203911504.0, + "logps/chosen": -325.8654479980469, + "logps/rejected": -305.3065185546875, + "loss": 0.0985, + "rewards/chosen": 2.4697818756103516, + "rewards/margins": 7.821037769317627, + "rewards/rejected": -5.351255893707275, + "step": 1415 + }, + { + "epoch": 0.522726223985972, + "grad_norm": 4.46875, + "kl": 0.0, + "learning_rate": 4.761734138428417e-06, + "logits/chosen": 202368689.23076922, + "logits/rejected": 274063413.8947368, + "logps/chosen": -291.31473482572113, + "logps/rejected": -407.49488589638156, + "loss": 0.0998, + "rewards/chosen": 2.3607283372145433, + "rewards/margins": 8.198713727325563, + "rewards/rejected": -5.83798539011102, + "step": 1416 + }, + { + "epoch": 0.5230953809238152, + "grad_norm": 4.6875, + "kl": 0.07467937469482422, + "learning_rate": 4.755855654369136e-06, + "logits/chosen": 214673920.0, + "logits/rejected": 198451114.66666666, + "logps/chosen": -312.90513392857144, + "logps/rejected": -365.06231011284723, + "loss": 0.0898, + "rewards/chosen": 2.5822960989815846, + "rewards/margins": 8.641986362517827, + "rewards/rejected": -6.0596902635362415, + "step": 1417 + }, + { + "epoch": 0.5234645378616585, + "grad_norm": 5.375, + "kl": 0.0, + "learning_rate": 4.749977508568742e-06, + "logits/chosen": 241495567.05882353, + "logits/rejected": 165197858.13333333, + "logps/chosen": -367.15759995404414, + "logps/rejected": -363.45481770833334, + "loss": 0.0717, + "rewards/chosen": 2.5687592450310204, + "rewards/margins": 7.324475696040135, + "rewards/rejected": -4.755716451009115, + "step": 1418 + }, + { + "epoch": 0.5238336947995016, + "grad_norm": 6.625, + "kl": 0.0, + "learning_rate": 4.744099709171335e-06, + "logits/chosen": 215525532.44444445, + "logits/rejected": 207793920.0, + "logps/chosen": -284.14008246527777, + "logps/rejected": -431.67208426339283, + "loss": 0.1238, + "rewards/chosen": 2.1486723158094616, + "rewards/margins": 8.112894270155165, + "rewards/rejected": -5.964221954345703, + "step": 1419 + }, + { + "epoch": 0.5242028517373448, + "grad_norm": 6.21875, + "kl": 0.0, + "learning_rate": 4.738222264320529e-06, + "logits/chosen": 305179562.6666667, + "logits/rejected": 389101202.28571427, + "logps/chosen": -387.48280164930554, + "logps/rejected": -744.9566127232143, + "loss": 0.1006, + "rewards/chosen": 2.083546108669705, + "rewards/margins": 11.24951922704303, + "rewards/rejected": -9.165973118373326, + "step": 1420 + }, + { + "epoch": 0.524572008675188, + "grad_norm": 3.25, + "kl": 0.0, + "learning_rate": 4.73234518215945e-06, + "logits/chosen": 219322094.93333334, + "logits/rejected": 182105419.29411766, + "logps/chosen": -344.22783203125, + "logps/rejected": -484.3467371323529, + "loss": 0.0841, + "rewards/chosen": 3.009532419840495, + "rewards/margins": 10.473221438538793, + "rewards/rejected": -7.463689018698299, + "step": 1421 + }, + { + "epoch": 0.5249411656130313, + "grad_norm": 6.5625, + "kl": 0.0, + "learning_rate": 4.726468470830721e-06, + "logits/chosen": 244924320.0, + "logits/rejected": 314229632.0, + "logps/chosen": -425.580322265625, + "logps/rejected": -534.7153930664062, + "loss": 0.1155, + "rewards/chosen": 2.1940503120422363, + "rewards/margins": 9.48559284210205, + "rewards/rejected": -7.2915425300598145, + "step": 1422 + }, + { + "epoch": 0.5253103225508744, + "grad_norm": 4.375, + "kl": 0.0, + "learning_rate": 4.720592138476451e-06, + "logits/chosen": 156274828.8, + "logits/rejected": 163095889.45454547, + "logps/chosen": -343.3959716796875, + "logps/rejected": -407.56325461647725, + "loss": 0.0521, + "rewards/chosen": 3.0792274475097656, + "rewards/margins": 9.731223019686613, + "rewards/rejected": -6.651995572176847, + "step": 1423 + }, + { + "epoch": 0.5256794794887176, + "grad_norm": 5.90625, + "kl": 0.0, + "learning_rate": 4.714716193238221e-06, + "logits/chosen": 148876970.66666666, + "logits/rejected": 125489133.71428572, + "logps/chosen": -291.9039713541667, + "logps/rejected": -357.7239467075893, + "loss": 0.0989, + "rewards/chosen": 2.6909567515055337, + "rewards/margins": 7.899149849301292, + "rewards/rejected": -5.208193097795759, + "step": 1424 + }, + { + "epoch": 0.5260486364265609, + "grad_norm": 5.96875, + "kl": 0.0, + "learning_rate": 4.708840643257083e-06, + "logits/chosen": 171675702.85714287, + "logits/rejected": 271405056.0, + "logps/chosen": -417.3226841517857, + "logps/rejected": -429.27625868055554, + "loss": 0.0745, + "rewards/chosen": 2.7484193529401506, + "rewards/margins": 8.899833164517842, + "rewards/rejected": -6.151413811577691, + "step": 1425 + }, + { + "epoch": 0.5264177933644041, + "grad_norm": 5.625, + "kl": 0.6887550354003906, + "learning_rate": 4.702965496673534e-06, + "logits/chosen": 247300266.66666666, + "logits/rejected": 182342295.27272728, + "logps/chosen": -386.5907273065476, + "logps/rejected": -397.2086292613636, + "loss": 0.0966, + "rewards/chosen": 2.7673087347121466, + "rewards/margins": 9.094380878266835, + "rewards/rejected": -6.3270721435546875, + "step": 1426 + }, + { + "epoch": 0.5267869503022472, + "grad_norm": 4.625, + "kl": 0.0, + "learning_rate": 4.697090761627515e-06, + "logits/chosen": 176015856.0, + "logits/rejected": 210495584.0, + "logps/chosen": -212.92672729492188, + "logps/rejected": -361.8096923828125, + "loss": 0.0982, + "rewards/chosen": 2.3496363162994385, + "rewards/margins": 7.700561761856079, + "rewards/rejected": -5.350925445556641, + "step": 1427 + }, + { + "epoch": 0.5271561072400904, + "grad_norm": 4.96875, + "kl": 0.2600440979003906, + "learning_rate": 4.691216446258401e-06, + "logits/chosen": 271974112.0, + "logits/rejected": 324800608.0, + "logps/chosen": -433.9719543457031, + "logps/rejected": -501.2889099121094, + "loss": 0.0626, + "rewards/chosen": 2.6660170555114746, + "rewards/margins": 9.50899076461792, + "rewards/rejected": -6.842973709106445, + "step": 1428 + }, + { + "epoch": 0.5275252641779337, + "grad_norm": 7.28125, + "kl": 0.8669424057006836, + "learning_rate": 4.685342558704973e-06, + "logits/chosen": 258415872.0, + "logits/rejected": 204334688.0, + "logps/chosen": -334.923828125, + "logps/rejected": -446.90478515625, + "loss": 0.1462, + "rewards/chosen": 1.9069517453511555, + "rewards/margins": 7.529533704121907, + "rewards/rejected": -5.622581958770752, + "step": 1429 + }, + { + "epoch": 0.5278944211157769, + "grad_norm": 6.28125, + "kl": 0.0, + "learning_rate": 4.679469107105435e-06, + "logits/chosen": 215098231.46666667, + "logits/rejected": 281689600.0, + "logps/chosen": -308.0406901041667, + "logps/rejected": -346.74353745404414, + "loss": 0.1249, + "rewards/chosen": 1.7547809600830078, + "rewards/margins": 7.804376063627355, + "rewards/rejected": -6.049595103544347, + "step": 1430 + }, + { + "epoch": 0.52826357805362, + "grad_norm": 4.8125, + "kl": 0.2780132293701172, + "learning_rate": 4.673596099597376e-06, + "logits/chosen": 278474455.57894737, + "logits/rejected": 284140740.9230769, + "logps/chosen": -347.2709703947368, + "logps/rejected": -603.1207557091346, + "loss": 0.0847, + "rewards/chosen": 2.551538768567537, + "rewards/margins": 11.936756288474388, + "rewards/rejected": -9.38521751990685, + "step": 1431 + }, + { + "epoch": 0.5286327349914632, + "grad_norm": 5.75, + "kl": 0.6205463409423828, + "learning_rate": 4.667723544317773e-06, + "logits/chosen": 198285238.85714287, + "logits/rejected": 184184647.1111111, + "logps/chosen": -397.0338657924107, + "logps/rejected": -482.84852430555554, + "loss": 0.0822, + "rewards/chosen": 2.8342699323381697, + "rewards/margins": 8.29972657703218, + "rewards/rejected": -5.465456644694011, + "step": 1432 + }, + { + "epoch": 0.5290018919293065, + "grad_norm": 4.875, + "kl": 0.0, + "learning_rate": 4.661851449402978e-06, + "logits/chosen": 337714337.68421054, + "logits/rejected": 254032777.84615386, + "logps/chosen": -424.0825966282895, + "logps/rejected": -303.41714242788464, + "loss": 0.0848, + "rewards/chosen": 2.807460584138569, + "rewards/margins": 7.588529239299326, + "rewards/rejected": -4.781068655160757, + "step": 1433 + }, + { + "epoch": 0.5293710488671497, + "grad_norm": 5.65625, + "kl": 0.0, + "learning_rate": 4.655979822988705e-06, + "logits/chosen": 252669312.0, + "logits/rejected": 213597696.0, + "logps/chosen": -327.3056335449219, + "logps/rejected": -493.9178466796875, + "loss": 0.0747, + "rewards/chosen": 2.5419564247131348, + "rewards/margins": 9.980780601501465, + "rewards/rejected": -7.43882417678833, + "step": 1434 + }, + { + "epoch": 0.5297402058049928, + "grad_norm": 5.96875, + "kl": 4.248737335205078, + "learning_rate": 4.650108673210014e-06, + "logits/chosen": 276995669.3333333, + "logits/rejected": 216771547.42857143, + "logps/chosen": -376.3618977864583, + "logps/rejected": -435.841796875, + "loss": 0.1338, + "rewards/chosen": 2.876005172729492, + "rewards/margins": 9.628010613577707, + "rewards/rejected": -6.752005440848214, + "step": 1435 + }, + { + "epoch": 0.530109362742836, + "grad_norm": 5.5, + "kl": 0.0, + "learning_rate": 4.644238008201314e-06, + "logits/chosen": 238136923.42857143, + "logits/rejected": 184467072.0, + "logps/chosen": -332.87325613839283, + "logps/rejected": -417.1315646701389, + "loss": 0.1035, + "rewards/chosen": 1.9962009702410017, + "rewards/margins": 8.09691067347451, + "rewards/rejected": -6.100709703233507, + "step": 1436 + }, + { + "epoch": 0.5304785196806793, + "grad_norm": 4.53125, + "kl": 0.0, + "learning_rate": 4.638367836096332e-06, + "logits/chosen": 293300439.57894737, + "logits/rejected": 179712019.69230768, + "logps/chosen": -373.17552425986844, + "logps/rejected": -423.49222506009613, + "loss": 0.0844, + "rewards/chosen": 2.468379773591694, + "rewards/margins": 8.036949250379555, + "rewards/rejected": -5.568569476787861, + "step": 1437 + }, + { + "epoch": 0.5308476766185225, + "grad_norm": 4.84375, + "kl": 0.0, + "learning_rate": 4.632498165028119e-06, + "logits/chosen": 206784275.69230768, + "logits/rejected": 214436459.78947368, + "logps/chosen": -387.24515474759613, + "logps/rejected": -488.7125308388158, + "loss": 0.0651, + "rewards/chosen": 2.6690198458158054, + "rewards/margins": 9.867634669006595, + "rewards/rejected": -7.198614823190789, + "step": 1438 + }, + { + "epoch": 0.5312168335563656, + "grad_norm": 5.78125, + "kl": 0.0, + "learning_rate": 4.6266290031290295e-06, + "logits/chosen": 198120810.66666666, + "logits/rejected": 240394880.0, + "logps/chosen": -370.9593098958333, + "logps/rejected": -441.782958984375, + "loss": 0.0691, + "rewards/chosen": 2.28105894724528, + "rewards/margins": 9.080454699198405, + "rewards/rejected": -6.799395751953125, + "step": 1439 + }, + { + "epoch": 0.5315859904942088, + "grad_norm": 4.84375, + "kl": 0.0, + "learning_rate": 4.620760358530713e-06, + "logits/chosen": 221697115.42857143, + "logits/rejected": 271945130.6666667, + "logps/chosen": -322.88623046875, + "logps/rejected": -360.47398546006946, + "loss": 0.0797, + "rewards/chosen": 2.359747750418527, + "rewards/margins": 7.998114025782026, + "rewards/rejected": -5.638366275363499, + "step": 1440 + }, + { + "epoch": 0.5319551474320521, + "grad_norm": 4.59375, + "kl": 0.0, + "learning_rate": 4.6148922393641e-06, + "logits/chosen": 210581056.0, + "logits/rejected": 234879120.0, + "logps/chosen": -239.3573760986328, + "logps/rejected": -440.73529052734375, + "loss": 0.0937, + "rewards/chosen": 3.08773136138916, + "rewards/margins": 9.282076358795166, + "rewards/rejected": -6.194344997406006, + "step": 1441 + }, + { + "epoch": 0.5323243043698953, + "grad_norm": 6.6875, + "kl": 1.3176417350769043, + "learning_rate": 4.609024653759398e-06, + "logits/chosen": 267303736.8888889, + "logits/rejected": 261320100.57142857, + "logps/chosen": -360.8122829861111, + "logps/rejected": -385.0026157924107, + "loss": 0.0836, + "rewards/chosen": 2.868011898464627, + "rewards/margins": 8.561510782393198, + "rewards/rejected": -5.693498883928571, + "step": 1442 + }, + { + "epoch": 0.5326934613077384, + "grad_norm": 5.625, + "kl": 0.0, + "learning_rate": 4.60315760984607e-06, + "logits/chosen": 212955511.46666667, + "logits/rejected": 247927567.05882353, + "logps/chosen": -365.1115234375, + "logps/rejected": -321.1416590073529, + "loss": 0.0904, + "rewards/chosen": 2.4348576863606772, + "rewards/margins": 7.268965926824832, + "rewards/rejected": -4.834108240464154, + "step": 1443 + }, + { + "epoch": 0.5330626182455817, + "grad_norm": 5.28125, + "kl": 0.0, + "learning_rate": 4.597291115752832e-06, + "logits/chosen": 254754205.53846154, + "logits/rejected": 284361323.7894737, + "logps/chosen": -419.4499699519231, + "logps/rejected": -428.93184621710526, + "loss": 0.0653, + "rewards/chosen": 2.520205571101262, + "rewards/margins": 10.259563770371411, + "rewards/rejected": -7.739358199270148, + "step": 1444 + }, + { + "epoch": 0.5334317751834249, + "grad_norm": 5.21875, + "kl": 0.0, + "learning_rate": 4.591425179607639e-06, + "logits/chosen": 276257962.6666667, + "logits/rejected": 201595520.0, + "logps/chosen": -404.37584092881946, + "logps/rejected": -465.50013950892856, + "loss": 0.0663, + "rewards/chosen": 3.640893724229601, + "rewards/margins": 10.954827505444722, + "rewards/rejected": -7.313933781215122, + "step": 1445 + }, + { + "epoch": 0.533800932121268, + "grad_norm": 6.03125, + "kl": 0.03161430358886719, + "learning_rate": 4.585559809537666e-06, + "logits/chosen": 190140397.7142857, + "logits/rejected": 193178225.7777778, + "logps/chosen": -409.596435546875, + "logps/rejected": -396.19121636284723, + "loss": 0.0686, + "rewards/chosen": 2.285852704729353, + "rewards/margins": 9.100347367544023, + "rewards/rejected": -6.8144946628146705, + "step": 1446 + }, + { + "epoch": 0.5341700890591112, + "grad_norm": 5.21875, + "kl": 2.280282497406006, + "learning_rate": 4.579695013669313e-06, + "logits/chosen": 177717684.70588234, + "logits/rejected": 145174801.06666666, + "logps/chosen": -363.4357479319853, + "logps/rejected": -431.1626953125, + "loss": 0.101, + "rewards/chosen": 2.923639185288373, + "rewards/margins": 8.895487736720664, + "rewards/rejected": -5.971848551432291, + "step": 1447 + }, + { + "epoch": 0.5345392459969545, + "grad_norm": 3.625, + "kl": 0.0, + "learning_rate": 4.573830800128178e-06, + "logits/chosen": 296984044.3076923, + "logits/rejected": 246950076.63157895, + "logps/chosen": -368.64024939903845, + "logps/rejected": -453.5678453947368, + "loss": 0.0502, + "rewards/chosen": 2.8429539020244894, + "rewards/margins": 9.004618362859194, + "rewards/rejected": -6.161664460834704, + "step": 1448 + }, + { + "epoch": 0.5349084029347977, + "grad_norm": 4.71875, + "kl": 0.0, + "learning_rate": 4.567967177039054e-06, + "logits/chosen": 227843737.6, + "logits/rejected": 193426386.82352942, + "logps/chosen": -347.02721354166664, + "logps/rejected": -448.3244198069853, + "loss": 0.0949, + "rewards/chosen": 2.7115211486816406, + "rewards/margins": 8.954563589657054, + "rewards/rejected": -6.2430424409754135, + "step": 1449 + }, + { + "epoch": 0.5352775598726408, + "grad_norm": 5.8125, + "kl": 0.5325899124145508, + "learning_rate": 4.562104152525918e-06, + "logits/chosen": 209246634.66666666, + "logits/rejected": 218320548.57142857, + "logps/chosen": -346.39808485243054, + "logps/rejected": -441.2928989955357, + "loss": 0.0913, + "rewards/chosen": 2.7152913411458335, + "rewards/margins": 9.744078136625744, + "rewards/rejected": -7.028786795479911, + "step": 1450 + }, + { + "epoch": 0.535646716810484, + "grad_norm": 5.4375, + "kl": 0.0, + "learning_rate": 4.556241734711916e-06, + "logits/chosen": 263074577.06666666, + "logits/rejected": 209607890.82352942, + "logps/chosen": -356.95250651041664, + "logps/rejected": -472.34986787683823, + "loss": 0.1093, + "rewards/chosen": 2.044544474283854, + "rewards/margins": 8.73237523097618, + "rewards/rejected": -6.687830756692326, + "step": 1451 + }, + { + "epoch": 0.5360158737483273, + "grad_norm": 5.90625, + "kl": 0.0, + "learning_rate": 4.550379931719351e-06, + "logits/chosen": 269825024.0, + "logits/rejected": 288103697.06666666, + "logps/chosen": -379.9230526194853, + "logps/rejected": -465.5, + "loss": 0.089, + "rewards/chosen": 2.7342255536247704, + "rewards/margins": 9.485158882889094, + "rewards/rejected": -6.750933329264323, + "step": 1452 + }, + { + "epoch": 0.5363850306861705, + "grad_norm": 5.8125, + "kl": 3.1881303787231445, + "learning_rate": 4.54451875166968e-06, + "logits/chosen": 222355584.0, + "logits/rejected": 467751722.6666667, + "logps/chosen": -340.4443359375, + "logps/rejected": -455.5039469401042, + "loss": 0.1191, + "rewards/chosen": 2.8822498321533203, + "rewards/margins": 8.218861897786457, + "rewards/rejected": -5.336612065633138, + "step": 1453 + }, + { + "epoch": 0.5367541876240136, + "grad_norm": 5.65625, + "kl": 1.0933151245117188, + "learning_rate": 4.53865820268349e-06, + "logits/chosen": 208834022.4, + "logits/rejected": 191779285.33333334, + "logps/chosen": -315.9447509765625, + "logps/rejected": -430.3892415364583, + "loss": 0.0929, + "rewards/chosen": 2.9771820068359376, + "rewards/margins": 9.375662740071615, + "rewards/rejected": -6.398480733235677, + "step": 1454 + }, + { + "epoch": 0.5371233445618568, + "grad_norm": 5.1875, + "kl": 0.0, + "learning_rate": 4.532798292880499e-06, + "logits/chosen": 204920422.4, + "logits/rejected": 206163064.47058824, + "logps/chosen": -393.087109375, + "logps/rejected": -465.3317440257353, + "loss": 0.0798, + "rewards/chosen": 2.429126485188802, + "rewards/margins": 9.400459259631587, + "rewards/rejected": -6.971332774442785, + "step": 1455 + }, + { + "epoch": 0.5374925014997001, + "grad_norm": 5.5625, + "kl": 0.0, + "learning_rate": 4.5269390303795395e-06, + "logits/chosen": 218456968.53333333, + "logits/rejected": 379569332.7058824, + "logps/chosen": -361.63017578125, + "logps/rejected": -501.84742647058823, + "loss": 0.0694, + "rewards/chosen": 2.6632044474283854, + "rewards/margins": 8.979535749846814, + "rewards/rejected": -6.316331302418428, + "step": 1456 + }, + { + "epoch": 0.5378616584375433, + "grad_norm": 5.90625, + "kl": 1.482109546661377, + "learning_rate": 4.521080423298543e-06, + "logits/chosen": 224888000.0, + "logits/rejected": 274778496.0, + "logps/chosen": -336.7158508300781, + "logps/rejected": -403.68218994140625, + "loss": 0.0984, + "rewards/chosen": 2.803514242172241, + "rewards/margins": 9.076210260391235, + "rewards/rejected": -6.272696018218994, + "step": 1457 + }, + { + "epoch": 0.5382308153753864, + "grad_norm": 5.125, + "kl": 0.0, + "learning_rate": 4.515222479754534e-06, + "logits/chosen": 186246817.68421054, + "logits/rejected": 218567049.84615386, + "logps/chosen": -346.5993009868421, + "logps/rejected": -597.5696364182693, + "loss": 0.0848, + "rewards/chosen": 2.663245351690995, + "rewards/margins": 11.241856872311487, + "rewards/rejected": -8.578611520620493, + "step": 1458 + }, + { + "epoch": 0.5385999723132296, + "grad_norm": 8.0, + "kl": 0.0, + "learning_rate": 4.5093652078636205e-06, + "logits/chosen": 192931653.8181818, + "logits/rejected": 241779046.4, + "logps/chosen": -324.19473544034093, + "logps/rejected": -447.32822265625, + "loss": 0.1457, + "rewards/chosen": 1.9817869013006038, + "rewards/margins": 7.796361715143377, + "rewards/rejected": -5.814574813842773, + "step": 1459 + }, + { + "epoch": 0.5389691292510729, + "grad_norm": 5.125, + "kl": 0.0, + "learning_rate": 4.503508615740978e-06, + "logits/chosen": 301933116.2352941, + "logits/rejected": 150241314.13333333, + "logps/chosen": -391.6200310202206, + "logps/rejected": -320.54140625, + "loss": 0.0829, + "rewards/chosen": 2.7280607784495636, + "rewards/margins": 8.022975024055032, + "rewards/rejected": -5.294914245605469, + "step": 1460 + }, + { + "epoch": 0.5393382861889161, + "grad_norm": 2.984375, + "kl": 0.0, + "learning_rate": 4.497652711500841e-06, + "logits/chosen": 214027501.7142857, + "logits/rejected": 183810232.8888889, + "logps/chosen": -231.14167131696428, + "logps/rejected": -400.4912923177083, + "loss": 0.037, + "rewards/chosen": 3.7695606776646207, + "rewards/margins": 11.467194209023127, + "rewards/rejected": -7.697633531358507, + "step": 1461 + }, + { + "epoch": 0.5397074431267592, + "grad_norm": 4.78125, + "kl": 0.0, + "learning_rate": 4.491797503256492e-06, + "logits/chosen": 203819625.4117647, + "logits/rejected": 202314615.46666667, + "logps/chosen": -322.7743566176471, + "logps/rejected": -428.5663736979167, + "loss": 0.0925, + "rewards/chosen": 2.7575681349810433, + "rewards/margins": 8.313516010957606, + "rewards/rejected": -5.555947875976562, + "step": 1462 + }, + { + "epoch": 0.5400766000646025, + "grad_norm": 6.15625, + "kl": 0.0, + "learning_rate": 4.485942999120243e-06, + "logits/chosen": 226572174.2222222, + "logits/rejected": 265541120.0, + "logps/chosen": -388.0940755208333, + "logps/rejected": -316.506591796875, + "loss": 0.0971, + "rewards/chosen": 2.294867833455404, + "rewards/margins": 7.758261453537713, + "rewards/rejected": -5.46339362008231, + "step": 1463 + }, + { + "epoch": 0.5404457570024457, + "grad_norm": 6.03125, + "kl": 0.0, + "learning_rate": 4.480089207203438e-06, + "logits/chosen": 198773760.0, + "logits/rejected": 198691997.53846154, + "logps/chosen": -358.9019325657895, + "logps/rejected": -479.1549729567308, + "loss": 0.1132, + "rewards/chosen": 2.0790591992829976, + "rewards/margins": 9.036746577212686, + "rewards/rejected": -6.9576873779296875, + "step": 1464 + }, + { + "epoch": 0.5408149139402889, + "grad_norm": 5.875, + "kl": 0.0, + "learning_rate": 4.474236135616431e-06, + "logits/chosen": 212950002.52631578, + "logits/rejected": 191013454.76923078, + "logps/chosen": -295.5189658717105, + "logps/rejected": -476.57632211538464, + "loss": 0.1052, + "rewards/chosen": 2.3784607335140833, + "rewards/margins": 8.868231476077185, + "rewards/rejected": -6.489770742563101, + "step": 1465 + }, + { + "epoch": 0.541184070878132, + "grad_norm": 5.25, + "kl": 0.0, + "learning_rate": 4.468383792468578e-06, + "logits/chosen": 254454400.0, + "logits/rejected": 207981248.0, + "logps/chosen": -339.10223388671875, + "logps/rejected": -420.1325378417969, + "loss": 0.0814, + "rewards/chosen": 2.766878366470337, + "rewards/margins": 9.457070589065552, + "rewards/rejected": -6.690192222595215, + "step": 1466 + }, + { + "epoch": 0.5415532278159753, + "grad_norm": 5.0, + "kl": 0.24053382873535156, + "learning_rate": 4.462532185868228e-06, + "logits/chosen": 202690466.9090909, + "logits/rejected": 200430457.9047619, + "logps/chosen": -383.64692826704544, + "logps/rejected": -421.3265904017857, + "loss": 0.0606, + "rewards/chosen": 2.5374128168279473, + "rewards/margins": 9.154651179458156, + "rewards/rejected": -6.617238362630208, + "step": 1467 + }, + { + "epoch": 0.5419223847538185, + "grad_norm": 6.28125, + "kl": 1.298469066619873, + "learning_rate": 4.4566813239227045e-06, + "logits/chosen": 210884445.0909091, + "logits/rejected": 200004070.4, + "logps/chosen": -336.59818892045456, + "logps/rejected": -424.021240234375, + "loss": 0.1178, + "rewards/chosen": 3.0353164672851562, + "rewards/margins": 7.4692230224609375, + "rewards/rejected": -4.433906555175781, + "step": 1468 + }, + { + "epoch": 0.5422915416916617, + "grad_norm": 5.625, + "kl": 0.0, + "learning_rate": 4.450831214738303e-06, + "logits/chosen": 221572626.2857143, + "logits/rejected": 225538474.66666666, + "logps/chosen": -384.89644949776783, + "logps/rejected": -415.1995442708333, + "loss": 0.0893, + "rewards/chosen": 2.1882554462977817, + "rewards/margins": 9.225038876609197, + "rewards/rejected": -7.036783430311415, + "step": 1469 + }, + { + "epoch": 0.5426606986295048, + "grad_norm": 6.0, + "kl": 0.9471406936645508, + "learning_rate": 4.444981866420278e-06, + "logits/chosen": 246866647.57894737, + "logits/rejected": 157119783.3846154, + "logps/chosen": -419.5625256990132, + "logps/rejected": -422.46822415865387, + "loss": 0.0789, + "rewards/chosen": 2.738193712736431, + "rewards/margins": 8.819267180284507, + "rewards/rejected": -6.081073467548077, + "step": 1470 + }, + { + "epoch": 0.5430298555673481, + "grad_norm": 4.625, + "kl": 0.1600198745727539, + "learning_rate": 4.439133287072826e-06, + "logits/chosen": 271695948.8, + "logits/rejected": 211773056.0, + "logps/chosen": -369.089794921875, + "logps/rejected": -512.5862223307291, + "loss": 0.0599, + "rewards/chosen": 2.9339284896850586, + "rewards/margins": 9.373627662658691, + "rewards/rejected": -6.439699172973633, + "step": 1471 + }, + { + "epoch": 0.5433990125051913, + "grad_norm": 5.875, + "kl": 0.45174741744995117, + "learning_rate": 4.433285484799077e-06, + "logits/chosen": 160023364.26666668, + "logits/rejected": 259740340.70588234, + "logps/chosen": -313.5903645833333, + "logps/rejected": -436.7254997702206, + "loss": 0.1231, + "rewards/chosen": 1.4081128438313801, + "rewards/margins": 7.947230544744753, + "rewards/rejected": -6.539117700913373, + "step": 1472 + }, + { + "epoch": 0.5437681694430345, + "grad_norm": 5.75, + "kl": 0.0, + "learning_rate": 4.427438467701091e-06, + "logits/chosen": 248251587.04761904, + "logits/rejected": 196016081.45454547, + "logps/chosen": -363.36376953125, + "logps/rejected": -529.783203125, + "loss": 0.108, + "rewards/chosen": 2.3634581792922247, + "rewards/margins": 9.40468049152589, + "rewards/rejected": -7.041222312233665, + "step": 1473 + }, + { + "epoch": 0.5441373263808776, + "grad_norm": 4.0625, + "kl": 0.0, + "learning_rate": 4.4215922438798335e-06, + "logits/chosen": 236101065.14285713, + "logits/rejected": 183720035.55555555, + "logps/chosen": -400.9345005580357, + "logps/rejected": -446.0129665798611, + "loss": 0.0717, + "rewards/chosen": 2.965878350394113, + "rewards/margins": 8.515927602374365, + "rewards/rejected": -5.550049251980251, + "step": 1474 + }, + { + "epoch": 0.5445064833187209, + "grad_norm": 4.34375, + "kl": 0.0, + "learning_rate": 4.415746821435172e-06, + "logits/chosen": 282189372.2352941, + "logits/rejected": 179314790.4, + "logps/chosen": -290.3276941636029, + "logps/rejected": -427.08802083333336, + "loss": 0.0824, + "rewards/chosen": 2.3411833819221046, + "rewards/margins": 8.782050488041897, + "rewards/rejected": -6.440867106119792, + "step": 1475 + }, + { + "epoch": 0.5448756402565641, + "grad_norm": 4.625, + "kl": 0.0, + "learning_rate": 4.409902208465867e-06, + "logits/chosen": 203464658.82352942, + "logits/rejected": 201206135.46666667, + "logps/chosen": -310.3978630514706, + "logps/rejected": -458.1689453125, + "loss": 0.0633, + "rewards/chosen": 2.8485280205221737, + "rewards/margins": 9.348088567397173, + "rewards/rejected": -6.499560546875, + "step": 1476 + }, + { + "epoch": 0.5452447971944073, + "grad_norm": 4.625, + "kl": 0.25643157958984375, + "learning_rate": 4.404058413069556e-06, + "logits/chosen": 217486813.86666667, + "logits/rejected": 256881633.88235295, + "logps/chosen": -296.83404947916665, + "logps/rejected": -406.68738511029414, + "loss": 0.0716, + "rewards/chosen": 2.664673360188802, + "rewards/margins": 9.385446944891237, + "rewards/rejected": -6.720773584702435, + "step": 1477 + }, + { + "epoch": 0.5456139541322504, + "grad_norm": 7.25, + "kl": 0.9866962432861328, + "learning_rate": 4.398215443342741e-06, + "logits/chosen": 245724359.1111111, + "logits/rejected": 170814921.14285713, + "logps/chosen": -349.8797200520833, + "logps/rejected": -438.17041015625, + "loss": 0.1295, + "rewards/chosen": 2.677445305718316, + "rewards/margins": 8.261866736033607, + "rewards/rejected": -5.58442143031529, + "step": 1478 + }, + { + "epoch": 0.5459831110700937, + "grad_norm": 3.71875, + "kl": 1.376004695892334, + "learning_rate": 4.3923733073807865e-06, + "logits/chosen": 281947687.38461536, + "logits/rejected": 154422352.84210527, + "logps/chosen": -349.3121995192308, + "logps/rejected": -377.20230263157896, + "loss": 0.0463, + "rewards/chosen": 2.8781107389009914, + "rewards/margins": 9.352643819955679, + "rewards/rejected": -6.4745330810546875, + "step": 1479 + }, + { + "epoch": 0.5463522680079369, + "grad_norm": 4.875, + "kl": 0.0, + "learning_rate": 4.386532013277892e-06, + "logits/chosen": 216672073.14285713, + "logits/rejected": 200647708.44444445, + "logps/chosen": -303.78159877232144, + "logps/rejected": -336.35633680555554, + "loss": 0.0853, + "rewards/chosen": 2.181567464556013, + "rewards/margins": 8.84677903614347, + "rewards/rejected": -6.665211571587457, + "step": 1480 + }, + { + "epoch": 0.54672142494578, + "grad_norm": 4.875, + "kl": 0.0, + "learning_rate": 4.3806915691271e-06, + "logits/chosen": 145488554.66666666, + "logits/rejected": 171974836.70588234, + "logps/chosen": -258.91912434895835, + "logps/rejected": -479.7265050551471, + "loss": 0.0883, + "rewards/chosen": 2.4664683024088543, + "rewards/margins": 9.44439422009038, + "rewards/rejected": -6.977925917681525, + "step": 1481 + }, + { + "epoch": 0.5470905818836233, + "grad_norm": 4.40625, + "kl": 0.2992730140686035, + "learning_rate": 4.374851983020271e-06, + "logits/chosen": 233812599.46666667, + "logits/rejected": 270782795.2941176, + "logps/chosen": -395.50169270833334, + "logps/rejected": -419.5132697610294, + "loss": 0.0627, + "rewards/chosen": 2.693742116292318, + "rewards/margins": 7.8391494002996716, + "rewards/rejected": -5.145407284007353, + "step": 1482 + }, + { + "epoch": 0.5474597388214665, + "grad_norm": 5.59375, + "kl": 1.120060920715332, + "learning_rate": 4.369013263048075e-06, + "logits/chosen": 251365857.88235295, + "logits/rejected": 242079453.86666667, + "logps/chosen": -386.8124138327206, + "logps/rejected": -446.98212890625, + "loss": 0.0731, + "rewards/chosen": 2.899518181295956, + "rewards/margins": 10.17267593682981, + "rewards/rejected": -7.2731577555338545, + "step": 1483 + }, + { + "epoch": 0.5478288957593097, + "grad_norm": 5.78125, + "kl": 0.0, + "learning_rate": 4.363175417299989e-06, + "logits/chosen": 219305585.7777778, + "logits/rejected": 173110436.57142857, + "logps/chosen": -323.2955729166667, + "logps/rejected": -451.73500279017856, + "loss": 0.1186, + "rewards/chosen": 2.3387173546685114, + "rewards/margins": 8.678940939524818, + "rewards/rejected": -6.3402235848563055, + "step": 1484 + }, + { + "epoch": 0.5481980526971528, + "grad_norm": 4.5625, + "kl": 0.0, + "learning_rate": 4.357338453864271e-06, + "logits/chosen": 314038162.28571427, + "logits/rejected": 228119395.55555555, + "logps/chosen": -362.6402064732143, + "logps/rejected": -372.29302300347223, + "loss": 0.059, + "rewards/chosen": 2.7914611271449496, + "rewards/margins": 8.708148986574203, + "rewards/rejected": -5.916687859429254, + "step": 1485 + }, + { + "epoch": 0.5485672096349961, + "grad_norm": 4.125, + "kl": 0.0, + "learning_rate": 4.351502380827959e-06, + "logits/chosen": 222608170.66666666, + "logits/rejected": 232809182.60869566, + "logps/chosen": -321.43646918402777, + "logps/rejected": -371.2238026494565, + "loss": 0.054, + "rewards/chosen": 2.065670649210612, + "rewards/margins": 7.607609762661699, + "rewards/rejected": -5.541939113451087, + "step": 1486 + }, + { + "epoch": 0.5489363665728393, + "grad_norm": 5.625, + "kl": 0.0, + "learning_rate": 4.345667206276861e-06, + "logits/chosen": 263458628.26666668, + "logits/rejected": 306242560.0, + "logps/chosen": -341.5208984375, + "logps/rejected": -462.7442842371324, + "loss": 0.092, + "rewards/chosen": 2.4069508870442706, + "rewards/margins": 9.19538059608609, + "rewards/rejected": -6.78842970904182, + "step": 1487 + }, + { + "epoch": 0.5493055235106825, + "grad_norm": 5.09375, + "kl": 0.22046279907226562, + "learning_rate": 4.339832938295534e-06, + "logits/chosen": 220473497.6, + "logits/rejected": 116887274.66666667, + "logps/chosen": -358.68134765625, + "logps/rejected": -295.1658528645833, + "loss": 0.075, + "rewards/chosen": 3.1323291778564455, + "rewards/margins": 8.845809173583984, + "rewards/rejected": -5.713479995727539, + "step": 1488 + }, + { + "epoch": 0.5496746804485256, + "grad_norm": 6.0, + "kl": 1.4560661315917969, + "learning_rate": 4.333999584967284e-06, + "logits/chosen": 257346413.7142857, + "logits/rejected": 292013824.0, + "logps/chosen": -288.85377139136904, + "logps/rejected": -513.7785866477273, + "loss": 0.1531, + "rewards/chosen": 2.1896754673549106, + "rewards/margins": 9.418744570249086, + "rewards/rejected": -7.229069102894176, + "step": 1489 + }, + { + "epoch": 0.5500438373863689, + "grad_norm": 4.6875, + "kl": 0.13152122497558594, + "learning_rate": 4.3281671543741476e-06, + "logits/chosen": 221943627.29411766, + "logits/rejected": 307622400.0, + "logps/chosen": -284.47199563419116, + "logps/rejected": -412.36005859375, + "loss": 0.0687, + "rewards/chosen": 3.1765572043026196, + "rewards/margins": 9.48849313773361, + "rewards/rejected": -6.3119359334309895, + "step": 1490 + }, + { + "epoch": 0.5504129943242121, + "grad_norm": 5.375, + "kl": 0.6262340545654297, + "learning_rate": 4.322335654596884e-06, + "logits/chosen": 224794081.88235295, + "logits/rejected": 218047590.4, + "logps/chosen": -300.68390969669116, + "logps/rejected": -475.35315755208336, + "loss": 0.093, + "rewards/chosen": 2.6388747271369484, + "rewards/margins": 9.099826468673406, + "rewards/rejected": -6.460951741536459, + "step": 1491 + }, + { + "epoch": 0.5507821512620553, + "grad_norm": 5.15625, + "kl": 0.0, + "learning_rate": 4.31650509371496e-06, + "logits/chosen": 284664704.0, + "logits/rejected": 232588224.0, + "logps/chosen": -378.54620361328125, + "logps/rejected": -429.8978271484375, + "loss": 0.0674, + "rewards/chosen": 2.667086601257324, + "rewards/margins": 10.326001167297363, + "rewards/rejected": -7.658914566040039, + "step": 1492 + }, + { + "epoch": 0.5511513081998984, + "grad_norm": 5.90625, + "kl": 0.0, + "learning_rate": 4.310675479806546e-06, + "logits/chosen": 269903616.0, + "logits/rejected": 208936992.0, + "logps/chosen": -340.29559326171875, + "logps/rejected": -443.4493713378906, + "loss": 0.0872, + "rewards/chosen": 2.4981114864349365, + "rewards/margins": 8.769144773483276, + "rewards/rejected": -6.27103328704834, + "step": 1493 + }, + { + "epoch": 0.5515204651377417, + "grad_norm": 5.90625, + "kl": 0.7086896896362305, + "learning_rate": 4.304846820948497e-06, + "logits/chosen": 152084028.2352941, + "logits/rejected": 163696605.86666667, + "logps/chosen": -322.72673483455884, + "logps/rejected": -302.85556640625, + "loss": 0.1234, + "rewards/chosen": 2.2814010171329273, + "rewards/margins": 7.445798193239698, + "rewards/rejected": -5.164397176106771, + "step": 1494 + }, + { + "epoch": 0.5518896220755849, + "grad_norm": 4.71875, + "kl": 0.0, + "learning_rate": 4.2990191252163446e-06, + "logits/chosen": 179822435.55555555, + "logits/rejected": 181157485.7142857, + "logps/chosen": -331.55023871527777, + "logps/rejected": -437.10030691964283, + "loss": 0.0725, + "rewards/chosen": 3.3293435838487415, + "rewards/margins": 9.755594586569167, + "rewards/rejected": -6.4262510027204245, + "step": 1495 + }, + { + "epoch": 0.5522587790134281, + "grad_norm": 3.640625, + "kl": 0.07443809509277344, + "learning_rate": 4.293192400684289e-06, + "logits/chosen": 266890816.0, + "logits/rejected": 198297040.0, + "logps/chosen": -367.36102294921875, + "logps/rejected": -432.30938720703125, + "loss": 0.0519, + "rewards/chosen": 3.4921414852142334, + "rewards/margins": 9.611327886581421, + "rewards/rejected": -6.1191864013671875, + "step": 1496 + }, + { + "epoch": 0.5526279359512712, + "grad_norm": 5.625, + "kl": 0.0, + "learning_rate": 4.287366655425185e-06, + "logits/chosen": 175039859.2, + "logits/rejected": 250803968.0, + "logps/chosen": -295.84296875, + "logps/rejected": -464.6875813802083, + "loss": 0.0816, + "rewards/chosen": 2.7839279174804688, + "rewards/margins": 8.130224863688152, + "rewards/rejected": -5.346296946207683, + "step": 1497 + }, + { + "epoch": 0.5529970928891145, + "grad_norm": 4.375, + "kl": 0.0, + "learning_rate": 4.281541897510524e-06, + "logits/chosen": 218163946.66666666, + "logits/rejected": 220046361.6, + "logps/chosen": -325.2704264322917, + "logps/rejected": -412.075341796875, + "loss": 0.0616, + "rewards/chosen": 2.6222685178120932, + "rewards/margins": 8.631207879384359, + "rewards/rejected": -6.008939361572265, + "step": 1498 + }, + { + "epoch": 0.5533662498269577, + "grad_norm": 4.5, + "kl": 0.0, + "learning_rate": 4.275718135010435e-06, + "logits/chosen": 159597725.53846154, + "logits/rejected": 201482415.15789473, + "logps/chosen": -262.6930588942308, + "logps/rejected": -470.8343441611842, + "loss": 0.0798, + "rewards/chosen": 3.1580878037672777, + "rewards/margins": 9.778155199429285, + "rewards/rejected": -6.620067395662007, + "step": 1499 + }, + { + "epoch": 0.5537354067648009, + "grad_norm": 6.8125, + "kl": 2.6003284454345703, + "learning_rate": 4.269895375993668e-06, + "logits/chosen": 224215668.36363637, + "logits/rejected": 176873856.0, + "logps/chosen": -481.65189985795456, + "logps/rejected": -359.6925048828125, + "loss": 0.1184, + "rewards/chosen": 2.9425312389026987, + "rewards/margins": 8.65153201710094, + "rewards/rejected": -5.709000778198242, + "step": 1500 + }, + { + "epoch": 0.554104563702644, + "grad_norm": 4.4375, + "kl": 0.0, + "learning_rate": 4.264073628527583e-06, + "logits/chosen": 257032222.11764705, + "logits/rejected": 152940066.13333333, + "logps/chosen": -433.33269186580884, + "logps/rejected": -393.5248697916667, + "loss": 0.0754, + "rewards/chosen": 3.1357049381031707, + "rewards/margins": 9.787336611280253, + "rewards/rejected": -6.651631673177083, + "step": 1501 + }, + { + "epoch": 0.5544737206404873, + "grad_norm": 5.5, + "kl": 0.0, + "learning_rate": 4.258252900678136e-06, + "logits/chosen": 195097048.6153846, + "logits/rejected": 173327386.9473684, + "logps/chosen": -340.56186147836536, + "logps/rejected": -353.74619654605266, + "loss": 0.0892, + "rewards/chosen": 1.980166508601262, + "rewards/margins": 8.654294990817544, + "rewards/rejected": -6.6741284822162825, + "step": 1502 + }, + { + "epoch": 0.5548428775783305, + "grad_norm": 3.921875, + "kl": 0.0, + "learning_rate": 4.252433200509869e-06, + "logits/chosen": 251873216.0, + "logits/rejected": 189901184.0, + "logps/chosen": -429.8963623046875, + "logps/rejected": -433.5455627441406, + "loss": 0.0507, + "rewards/chosen": 3.1961684226989746, + "rewards/margins": 9.659348011016846, + "rewards/rejected": -6.463179588317871, + "step": 1503 + }, + { + "epoch": 0.5552120345161737, + "grad_norm": 5.15625, + "kl": 2.019108772277832, + "learning_rate": 4.2466145360859064e-06, + "logits/chosen": 285438976.0, + "logits/rejected": 247402511.05882353, + "logps/chosen": -363.0815104166667, + "logps/rejected": -389.46780215992646, + "loss": 0.0978, + "rewards/chosen": 2.7062840779622395, + "rewards/margins": 9.115403627881816, + "rewards/rejected": -6.409119549919577, + "step": 1504 + }, + { + "epoch": 0.5555811914540169, + "grad_norm": 4.1875, + "kl": 0.6569375991821289, + "learning_rate": 4.240796915467933e-06, + "logits/chosen": 175386069.33333334, + "logits/rejected": 180077619.2, + "logps/chosen": -425.0688883463542, + "logps/rejected": -463.788916015625, + "loss": 0.0481, + "rewards/chosen": 3.683018366495768, + "rewards/margins": 10.180916468302408, + "rewards/rejected": -6.497898101806641, + "step": 1505 + }, + { + "epoch": 0.5559503483918601, + "grad_norm": 6.625, + "kl": 1.267707347869873, + "learning_rate": 4.2349803467161864e-06, + "logits/chosen": 297655210.6666667, + "logits/rejected": 282195821.71428573, + "logps/chosen": -442.879638671875, + "logps/rejected": -377.82826450892856, + "loss": 0.1038, + "rewards/chosen": 2.1746338738335504, + "rewards/margins": 7.5573804340665305, + "rewards/rejected": -5.38274656023298, + "step": 1506 + }, + { + "epoch": 0.5563195053297033, + "grad_norm": 6.78125, + "kl": 0.0, + "learning_rate": 4.229164837889451e-06, + "logits/chosen": 243251109.6470588, + "logits/rejected": 186623214.93333334, + "logps/chosen": -351.32956112132354, + "logps/rejected": -417.32233072916665, + "loss": 0.0969, + "rewards/chosen": 2.3262243831858918, + "rewards/margins": 8.548014951219745, + "rewards/rejected": -6.221790568033854, + "step": 1507 + }, + { + "epoch": 0.5566886622675465, + "grad_norm": 5.0, + "kl": 1.7531604766845703, + "learning_rate": 4.22335039704504e-06, + "logits/chosen": 185562742.15384614, + "logits/rejected": 266546122.10526314, + "logps/chosen": -299.2083082932692, + "logps/rejected": -503.4261924342105, + "loss": 0.0636, + "rewards/chosen": 2.910188528207632, + "rewards/margins": 10.299233841992582, + "rewards/rejected": -7.38904531378495, + "step": 1508 + }, + { + "epoch": 0.5570578192053897, + "grad_norm": 4.59375, + "kl": 0.9543180465698242, + "learning_rate": 4.217537032238784e-06, + "logits/chosen": 174166348.8, + "logits/rejected": 198719189.33333334, + "logps/chosen": -334.0251708984375, + "logps/rejected": -398.2229410807292, + "loss": 0.0912, + "rewards/chosen": 2.7255781173706053, + "rewards/margins": 9.011403210957845, + "rewards/rejected": -6.285825093587239, + "step": 1509 + }, + { + "epoch": 0.5574269761432329, + "grad_norm": 5.15625, + "kl": 0.0, + "learning_rate": 4.21172475152503e-06, + "logits/chosen": 265448093.53846154, + "logits/rejected": 203955200.0, + "logps/chosen": -411.5685847355769, + "logps/rejected": -460.23458059210526, + "loss": 0.0689, + "rewards/chosen": 2.7176008958082933, + "rewards/margins": 9.73804606696372, + "rewards/rejected": -7.020445171155427, + "step": 1510 + }, + { + "epoch": 0.5577961330810761, + "grad_norm": 4.09375, + "kl": 0.0, + "learning_rate": 4.205913562956619e-06, + "logits/chosen": 193377744.0, + "logits/rejected": 166006496.0, + "logps/chosen": -381.6564636230469, + "logps/rejected": -553.666015625, + "loss": 0.048, + "rewards/chosen": 3.334188461303711, + "rewards/margins": 10.462809562683105, + "rewards/rejected": -7.1286211013793945, + "step": 1511 + }, + { + "epoch": 0.5581652900189193, + "grad_norm": 5.9375, + "kl": 0.41790294647216797, + "learning_rate": 4.200103474584877e-06, + "logits/chosen": 211243358.31578946, + "logits/rejected": 182207251.69230768, + "logps/chosen": -328.3409488075658, + "logps/rejected": -363.93445763221155, + "loss": 0.1068, + "rewards/chosen": 2.687887693706312, + "rewards/margins": 9.50953240336677, + "rewards/rejected": -6.821644709660457, + "step": 1512 + }, + { + "epoch": 0.5585344469567625, + "grad_norm": 4.03125, + "kl": 0.0, + "learning_rate": 4.194294494459607e-06, + "logits/chosen": 193408676.57142857, + "logits/rejected": 145648853.33333334, + "logps/chosen": -378.70064871651783, + "logps/rejected": -498.3503146701389, + "loss": 0.0626, + "rewards/chosen": 3.188846860613142, + "rewards/margins": 10.835610980079288, + "rewards/rejected": -7.6467641194661455, + "step": 1513 + }, + { + "epoch": 0.5589036038946057, + "grad_norm": 3.0, + "kl": 0.0, + "learning_rate": 4.188486630629082e-06, + "logits/chosen": 179990994.82352942, + "logits/rejected": 138505489.06666666, + "logps/chosen": -353.8872931985294, + "logps/rejected": -316.301171875, + "loss": 0.0372, + "rewards/chosen": 3.8285437191233918, + "rewards/margins": 9.750524513394225, + "rewards/rejected": -5.921980794270834, + "step": 1514 + }, + { + "epoch": 0.5592727608324489, + "grad_norm": 5.84375, + "kl": 0.45151519775390625, + "learning_rate": 4.1826798911400186e-06, + "logits/chosen": 181545110.5882353, + "logits/rejected": 189596398.93333334, + "logps/chosen": -296.6160098805147, + "logps/rejected": -423.42027994791664, + "loss": 0.1033, + "rewards/chosen": 2.4565979452694164, + "rewards/margins": 9.064323978798063, + "rewards/rejected": -6.607726033528646, + "step": 1515 + }, + { + "epoch": 0.559641917770292, + "grad_norm": 5.4375, + "kl": 0.0, + "learning_rate": 4.176874284037581e-06, + "logits/chosen": 137973461.33333334, + "logits/rejected": 168081459.2, + "logps/chosen": -273.0970865885417, + "logps/rejected": -407.8760498046875, + "loss": 0.1155, + "rewards/chosen": 1.7323150634765625, + "rewards/margins": 7.981529998779297, + "rewards/rejected": -6.249214935302734, + "step": 1516 + }, + { + "epoch": 0.5600110747081353, + "grad_norm": 6.5, + "kl": 1.504082202911377, + "learning_rate": 4.171069817365365e-06, + "logits/chosen": 240794868.86956522, + "logits/rejected": 170964821.33333334, + "logps/chosen": -385.48369565217394, + "logps/rejected": -372.1870388454861, + "loss": 0.1202, + "rewards/chosen": 2.9383322674295176, + "rewards/margins": 9.488416275540413, + "rewards/rejected": -6.550084008110894, + "step": 1517 + }, + { + "epoch": 0.5603802316459785, + "grad_norm": 6.5, + "kl": 0.0727548599243164, + "learning_rate": 4.165266499165387e-06, + "logits/chosen": 224699527.52941176, + "logits/rejected": 294620398.93333334, + "logps/chosen": -389.3216911764706, + "logps/rejected": -518.61591796875, + "loss": 0.1082, + "rewards/chosen": 1.8280000125660616, + "rewards/margins": 9.110771313835594, + "rewards/rejected": -7.282771301269531, + "step": 1518 + }, + { + "epoch": 0.5607493885838217, + "grad_norm": 5.9375, + "kl": 0.0, + "learning_rate": 4.159464337478068e-06, + "logits/chosen": 158582595.36842105, + "logits/rejected": 140550390.15384614, + "logps/chosen": -272.96767064144734, + "logps/rejected": -404.56527944711536, + "loss": 0.1034, + "rewards/chosen": 2.581893117804276, + "rewards/margins": 9.20789022098186, + "rewards/rejected": -6.625997103177584, + "step": 1519 + }, + { + "epoch": 0.5611185455216648, + "grad_norm": 6.21875, + "kl": 0.25307655334472656, + "learning_rate": 4.15366334034223e-06, + "logits/chosen": 321793077.8947368, + "logits/rejected": 247442313.84615386, + "logps/chosen": -443.87129934210526, + "logps/rejected": -501.7038386418269, + "loss": 0.1118, + "rewards/chosen": 2.628263975444593, + "rewards/margins": 9.696750115769113, + "rewards/rejected": -7.068486140324519, + "step": 1520 + }, + { + "epoch": 0.5614877024595081, + "grad_norm": 6.96875, + "kl": 0.0, + "learning_rate": 4.147863515795083e-06, + "logits/chosen": 221278624.0, + "logits/rejected": 194695392.0, + "logps/chosen": -368.2498474121094, + "logps/rejected": -477.5912170410156, + "loss": 0.1107, + "rewards/chosen": 2.005645513534546, + "rewards/margins": 8.87225890159607, + "rewards/rejected": -6.866613388061523, + "step": 1521 + }, + { + "epoch": 0.5618568593973513, + "grad_norm": 3.625, + "kl": 0.0, + "learning_rate": 4.142064871872208e-06, + "logits/chosen": 317427360.0, + "logits/rejected": 271119520.0, + "logps/chosen": -267.46234130859375, + "logps/rejected": -469.29473876953125, + "loss": 0.0743, + "rewards/chosen": 2.8426923751831055, + "rewards/margins": 9.458678722381592, + "rewards/rejected": -6.615986347198486, + "step": 1522 + }, + { + "epoch": 0.5622260163351945, + "grad_norm": 5.09375, + "kl": 0.0, + "learning_rate": 4.136267416607552e-06, + "logits/chosen": 230076046.2222222, + "logits/rejected": 254833481.14285713, + "logps/chosen": -319.57421875, + "logps/rejected": -400.7632533482143, + "loss": 0.0973, + "rewards/chosen": 3.2139225006103516, + "rewards/margins": 7.90843391418457, + "rewards/rejected": -4.694511413574219, + "step": 1523 + }, + { + "epoch": 0.5625951732730377, + "grad_norm": 7.0, + "kl": 1.1226701736450195, + "learning_rate": 4.130471158033418e-06, + "logits/chosen": 188319860.36363637, + "logits/rejected": 367174502.4, + "logps/chosen": -324.0292302911932, + "logps/rejected": -356.393505859375, + "loss": 0.1544, + "rewards/chosen": 2.8501815795898438, + "rewards/margins": 8.780629730224609, + "rewards/rejected": -5.930448150634765, + "step": 1524 + }, + { + "epoch": 0.5629643302108809, + "grad_norm": 5.75, + "kl": 0.9994111061096191, + "learning_rate": 4.124676104180447e-06, + "logits/chosen": 180846665.14285713, + "logits/rejected": 234062545.45454547, + "logps/chosen": -325.04966517857144, + "logps/rejected": -480.52934126420456, + "loss": 0.1136, + "rewards/chosen": 2.285872141520182, + "rewards/margins": 9.838743845621744, + "rewards/rejected": -7.5528717041015625, + "step": 1525 + }, + { + "epoch": 0.5633334871487241, + "grad_norm": 4.28125, + "kl": 0.0, + "learning_rate": 4.118882263077613e-06, + "logits/chosen": 182230363.42857143, + "logits/rejected": 231083064.8888889, + "logps/chosen": -319.72021484375, + "logps/rejected": -538.2501085069445, + "loss": 0.0913, + "rewards/chosen": 2.4514389038085938, + "rewards/margins": 11.00912136501736, + "rewards/rejected": -8.557682461208767, + "step": 1526 + }, + { + "epoch": 0.5637026440865673, + "grad_norm": 6.0625, + "kl": 0.0, + "learning_rate": 4.113089642752208e-06, + "logits/chosen": 197837549.7142857, + "logits/rejected": 213936839.1111111, + "logps/chosen": -339.10543387276783, + "logps/rejected": -441.7361111111111, + "loss": 0.0661, + "rewards/chosen": 2.523150852748326, + "rewards/margins": 9.627607860262431, + "rewards/rejected": -7.104457007514106, + "step": 1527 + }, + { + "epoch": 0.5640718010244105, + "grad_norm": 5.78125, + "kl": 2.4430975914001465, + "learning_rate": 4.107298251229837e-06, + "logits/chosen": 165391859.2, + "logits/rejected": 169625280.0, + "logps/chosen": -255.622265625, + "logps/rejected": -434.7124837239583, + "loss": 0.1157, + "rewards/chosen": 2.875337028503418, + "rewards/margins": 10.380867067972819, + "rewards/rejected": -7.505530039469401, + "step": 1528 + }, + { + "epoch": 0.5644409579622537, + "grad_norm": 2.3125, + "kl": 0.0, + "learning_rate": 4.101508096534394e-06, + "logits/chosen": 232644096.0, + "logits/rejected": 190869392.6956522, + "logps/chosen": -437.95665147569446, + "logps/rejected": -400.08755095108694, + "loss": 0.0363, + "rewards/chosen": 3.54972775777181, + "rewards/margins": 9.244557947352313, + "rewards/rejected": -5.694830189580503, + "step": 1529 + }, + { + "epoch": 0.5648101149000969, + "grad_norm": 4.0, + "kl": 0.0, + "learning_rate": 4.095719186688071e-06, + "logits/chosen": 175379145.14285713, + "logits/rejected": 182261703.1111111, + "logps/chosen": -423.67421177455356, + "logps/rejected": -329.1457248263889, + "loss": 0.0628, + "rewards/chosen": 3.0941565377371654, + "rewards/margins": 7.776234581356958, + "rewards/rejected": -4.682078043619792, + "step": 1530 + }, + { + "epoch": 0.5651792718379401, + "grad_norm": 6.875, + "kl": 0.0, + "learning_rate": 4.0899315297113255e-06, + "logits/chosen": 224364077.17647058, + "logits/rejected": 232492049.06666666, + "logps/chosen": -278.8450137867647, + "logps/rejected": -523.0978190104166, + "loss": 0.1186, + "rewards/chosen": 1.6740522945628447, + "rewards/margins": 10.511359016568052, + "rewards/rejected": -8.837306722005208, + "step": 1531 + }, + { + "epoch": 0.5655484287757833, + "grad_norm": 4.125, + "kl": 0.0, + "learning_rate": 4.084145133622883e-06, + "logits/chosen": 202160870.4, + "logits/rejected": 152170048.0, + "logps/chosen": -342.036181640625, + "logps/rejected": -371.5902506510417, + "loss": 0.0722, + "rewards/chosen": 3.720493698120117, + "rewards/margins": 8.986845715840657, + "rewards/rejected": -5.26635201772054, + "step": 1532 + }, + { + "epoch": 0.5659175857136265, + "grad_norm": 6.25, + "kl": 0.0, + "learning_rate": 4.0783600064397225e-06, + "logits/chosen": 247465779.2, + "logits/rejected": 174863826.82352942, + "logps/chosen": -307.28828125, + "logps/rejected": -419.41116153492646, + "loss": 0.1113, + "rewards/chosen": 1.7600765228271484, + "rewards/margins": 8.279635597677792, + "rewards/rejected": -6.519559074850643, + "step": 1533 + }, + { + "epoch": 0.5662867426514697, + "grad_norm": 6.90625, + "kl": 0.0, + "learning_rate": 4.072576156177062e-06, + "logits/chosen": 228073685.33333334, + "logits/rejected": 183742829.7142857, + "logps/chosen": -394.03729926215277, + "logps/rejected": -366.10128348214283, + "loss": 0.1074, + "rewards/chosen": 2.4557427300347223, + "rewards/margins": 8.075171637156654, + "rewards/rejected": -5.6194289071219305, + "step": 1534 + }, + { + "epoch": 0.566655899589313, + "grad_norm": 5.125, + "kl": 0.0, + "learning_rate": 4.066793590848356e-06, + "logits/chosen": 182538048.0, + "logits/rejected": 164641456.0, + "logps/chosen": -379.342529296875, + "logps/rejected": -507.9759216308594, + "loss": 0.0632, + "rewards/chosen": 2.8885910511016846, + "rewards/margins": 9.998371839523315, + "rewards/rejected": -7.109780788421631, + "step": 1535 + }, + { + "epoch": 0.5670250565271561, + "grad_norm": 3.875, + "kl": 2.3864870071411133, + "learning_rate": 4.061012318465272e-06, + "logits/chosen": 152548816.0, + "logits/rejected": 328293760.0, + "logps/chosen": -348.43707275390625, + "logps/rejected": -348.7934875488281, + "loss": 0.058, + "rewards/chosen": 4.380751609802246, + "rewards/margins": 9.275296688079834, + "rewards/rejected": -4.894545078277588, + "step": 1536 + }, + { + "epoch": 0.5673942134649993, + "grad_norm": 4.03125, + "kl": 0.0, + "learning_rate": 4.0552323470376916e-06, + "logits/chosen": 206107704.8888889, + "logits/rejected": 165284937.14285713, + "logps/chosen": -386.6941189236111, + "logps/rejected": -405.8294154575893, + "loss": 0.0748, + "rewards/chosen": 2.641738255818685, + "rewards/margins": 10.13346799214681, + "rewards/rejected": -7.491729736328125, + "step": 1537 + }, + { + "epoch": 0.5677633704028425, + "grad_norm": 5.375, + "kl": 0.0, + "learning_rate": 4.049453684573693e-06, + "logits/chosen": 204932736.0, + "logits/rejected": 188509141.33333334, + "logps/chosen": -374.5387486049107, + "logps/rejected": -464.1678059895833, + "loss": 0.0824, + "rewards/chosen": 2.183180672781808, + "rewards/margins": 9.102358681815012, + "rewards/rejected": -6.919178009033203, + "step": 1538 + }, + { + "epoch": 0.5681325273406858, + "grad_norm": 5.03125, + "kl": 1.0822219848632812, + "learning_rate": 4.043676339079536e-06, + "logits/chosen": 315000183.46666664, + "logits/rejected": 233636547.7647059, + "logps/chosen": -326.75478515625, + "logps/rejected": -448.7416130514706, + "loss": 0.0689, + "rewards/chosen": 2.902715555826823, + "rewards/margins": 9.586787713742723, + "rewards/rejected": -6.6840721579159, + "step": 1539 + }, + { + "epoch": 0.5685016842785289, + "grad_norm": 3.703125, + "kl": 0.0, + "learning_rate": 4.037900318559661e-06, + "logits/chosen": 186158819.55555555, + "logits/rejected": 249547885.7142857, + "logps/chosen": -285.73912217881946, + "logps/rejected": -380.09109933035717, + "loss": 0.058, + "rewards/chosen": 3.533460405137804, + "rewards/margins": 9.573864588661799, + "rewards/rejected": -6.040404183523996, + "step": 1540 + }, + { + "epoch": 0.5688708412163721, + "grad_norm": 4.46875, + "kl": 0.0, + "learning_rate": 4.032125631016672e-06, + "logits/chosen": 223202645.33333334, + "logits/rejected": 132336844.8, + "logps/chosen": -476.942138671875, + "logps/rejected": -389.8569580078125, + "loss": 0.0834, + "rewards/chosen": 2.31734832127889, + "rewards/margins": 8.595123132069906, + "rewards/rejected": -6.277774810791016, + "step": 1541 + }, + { + "epoch": 0.5692399981542153, + "grad_norm": 5.5, + "kl": 0.013073921203613281, + "learning_rate": 4.026352284451326e-06, + "logits/chosen": 161518125.17647058, + "logits/rejected": 160668518.4, + "logps/chosen": -341.9349724264706, + "logps/rejected": -432.0074869791667, + "loss": 0.0761, + "rewards/chosen": 2.527029598460478, + "rewards/margins": 9.421452511058135, + "rewards/rejected": -6.894422912597657, + "step": 1542 + }, + { + "epoch": 0.5696091550920586, + "grad_norm": 5.1875, + "kl": 0.19856834411621094, + "learning_rate": 4.020580286862517e-06, + "logits/chosen": 206424945.7777778, + "logits/rejected": 309102043.4285714, + "logps/chosen": -395.67469618055554, + "logps/rejected": -517.1408342633929, + "loss": 0.0666, + "rewards/chosen": 3.030164294772678, + "rewards/margins": 11.678409969995892, + "rewards/rejected": -8.648245675223214, + "step": 1543 + }, + { + "epoch": 0.5699783120299017, + "grad_norm": 6.9375, + "kl": 4.4345855712890625, + "learning_rate": 4.014809646247278e-06, + "logits/chosen": 313404385.88235295, + "logits/rejected": 243470848.0, + "logps/chosen": -462.5446346507353, + "logps/rejected": -497.15514322916664, + "loss": 0.121, + "rewards/chosen": 2.7935108857996322, + "rewards/margins": 9.232208700741038, + "rewards/rejected": -6.438697814941406, + "step": 1544 + }, + { + "epoch": 0.5703474689677449, + "grad_norm": 4.65625, + "kl": 0.0, + "learning_rate": 4.009040370600759e-06, + "logits/chosen": 236605617.23076922, + "logits/rejected": 185910622.31578946, + "logps/chosen": -314.9286358173077, + "logps/rejected": -536.2390522203947, + "loss": 0.0674, + "rewards/chosen": 2.138529557448167, + "rewards/margins": 10.776258599902938, + "rewards/rejected": -8.63772904245477, + "step": 1545 + }, + { + "epoch": 0.5707166259055881, + "grad_norm": 5.1875, + "kl": 0.0, + "learning_rate": 4.003272467916214e-06, + "logits/chosen": 195333315.7647059, + "logits/rejected": 229102592.0, + "logps/chosen": -370.8232996323529, + "logps/rejected": -456.3264973958333, + "loss": 0.0549, + "rewards/chosen": 3.23719383688534, + "rewards/margins": 11.098401926078049, + "rewards/rejected": -7.861208089192709, + "step": 1546 + }, + { + "epoch": 0.5710857828434314, + "grad_norm": 4.6875, + "kl": 0.43814802169799805, + "learning_rate": 3.9975059461850035e-06, + "logits/chosen": 166969992.53333333, + "logits/rejected": 227902991.05882353, + "logps/chosen": -336.18056640625, + "logps/rejected": -355.1887637867647, + "loss": 0.0813, + "rewards/chosen": 3.0980010986328126, + "rewards/margins": 8.130207016888788, + "rewards/rejected": -5.032205918255975, + "step": 1547 + }, + { + "epoch": 0.5714549397812745, + "grad_norm": 6.25, + "kl": 0.0, + "learning_rate": 3.991740813396568e-06, + "logits/chosen": 206328024.6153846, + "logits/rejected": 213007252.21052632, + "logps/chosen": -363.04995492788464, + "logps/rejected": -354.27724095394734, + "loss": 0.126, + "rewards/chosen": 2.5132683974045973, + "rewards/margins": 8.008623300776307, + "rewards/rejected": -5.495354903371711, + "step": 1548 + }, + { + "epoch": 0.5718240967191177, + "grad_norm": 4.65625, + "kl": 2.9089603424072266, + "learning_rate": 3.985977077538426e-06, + "logits/chosen": 222652024.47058824, + "logits/rejected": 224493380.26666668, + "logps/chosen": -443.4969841452206, + "logps/rejected": -590.026171875, + "loss": 0.0719, + "rewards/chosen": 3.604330399457146, + "rewards/margins": 11.827062943402458, + "rewards/rejected": -8.222732543945312, + "step": 1549 + }, + { + "epoch": 0.5721932536569609, + "grad_norm": 4.21875, + "kl": 0.0, + "learning_rate": 3.980214746596159e-06, + "logits/chosen": 162415030.85714287, + "logits/rejected": 143629667.55555555, + "logps/chosen": -306.0222865513393, + "logps/rejected": -419.01068793402777, + "loss": 0.0531, + "rewards/chosen": 2.8996374947684154, + "rewards/margins": 10.009394812205482, + "rewards/rejected": -7.109757317437066, + "step": 1550 + }, + { + "epoch": 0.5725624105948041, + "grad_norm": 7.21875, + "kl": 0.0, + "learning_rate": 3.974453828553404e-06, + "logits/chosen": 221513508.57142857, + "logits/rejected": 231800049.7777778, + "logps/chosen": -352.589599609375, + "logps/rejected": -491.67637803819446, + "loss": 0.0853, + "rewards/chosen": 2.2156531470162526, + "rewards/margins": 10.041228233821808, + "rewards/rejected": -7.825575086805555, + "step": 1551 + }, + { + "epoch": 0.5729315675326473, + "grad_norm": 6.0, + "kl": 0.0, + "learning_rate": 3.9686943313918405e-06, + "logits/chosen": 211493356.30769232, + "logits/rejected": 206795237.0526316, + "logps/chosen": -285.66898287259613, + "logps/rejected": -378.78091591282896, + "loss": 0.0879, + "rewards/chosen": 2.229715347290039, + "rewards/margins": 7.305144761738024, + "rewards/rejected": -5.075429414447985, + "step": 1552 + }, + { + "epoch": 0.5733007244704905, + "grad_norm": 4.71875, + "kl": 0.0, + "learning_rate": 3.962936263091179e-06, + "logits/chosen": 190909008.0, + "logits/rejected": 204501424.0, + "logps/chosen": -290.7789611816406, + "logps/rejected": -555.0100708007812, + "loss": 0.0592, + "rewards/chosen": 2.8254802227020264, + "rewards/margins": 11.314440488815308, + "rewards/rejected": -8.488960266113281, + "step": 1553 + }, + { + "epoch": 0.5736698814083337, + "grad_norm": 4.59375, + "kl": 0.0, + "learning_rate": 3.957179631629148e-06, + "logits/chosen": 167153248.0, + "logits/rejected": 235019673.6, + "logps/chosen": -365.2660725911458, + "logps/rejected": -423.481103515625, + "loss": 0.0621, + "rewards/chosen": 3.2236302693684897, + "rewards/margins": 9.901995595296224, + "rewards/rejected": -6.678365325927734, + "step": 1554 + }, + { + "epoch": 0.5740390383461769, + "grad_norm": 4.71875, + "kl": 0.0, + "learning_rate": 3.9514244449814886e-06, + "logits/chosen": 329534976.0, + "logits/rejected": 206815047.1111111, + "logps/chosen": -343.92508370535717, + "logps/rejected": -418.05072699652777, + "loss": 0.076, + "rewards/chosen": 2.7444539751325334, + "rewards/margins": 9.156194172208272, + "rewards/rejected": -6.411740197075738, + "step": 1555 + }, + { + "epoch": 0.5744081952840201, + "grad_norm": 6.0, + "kl": 0.016396522521972656, + "learning_rate": 3.945670711121939e-06, + "logits/chosen": 260280550.4, + "logits/rejected": 231159637.33333334, + "logps/chosen": -341.314111328125, + "logps/rejected": -571.7940266927084, + "loss": 0.0836, + "rewards/chosen": 2.6730009078979493, + "rewards/margins": 9.707773145039877, + "rewards/rejected": -7.034772237141927, + "step": 1556 + }, + { + "epoch": 0.5747773522218633, + "grad_norm": 3.609375, + "kl": 0.0, + "learning_rate": 3.939918438022224e-06, + "logits/chosen": 161952098.46153846, + "logits/rejected": 240813541.0526316, + "logps/chosen": -313.1041729266827, + "logps/rejected": -432.7877261513158, + "loss": 0.0546, + "rewards/chosen": 3.055501791147085, + "rewards/margins": 9.41203303472233, + "rewards/rejected": -6.356531243575247, + "step": 1557 + }, + { + "epoch": 0.5751465091597066, + "grad_norm": 4.09375, + "kl": 0.0, + "learning_rate": 3.934167633652045e-06, + "logits/chosen": 282832969.14285713, + "logits/rejected": 203627690.66666666, + "logps/chosen": -300.46909877232144, + "logps/rejected": -332.5647786458333, + "loss": 0.0836, + "rewards/chosen": 2.789027895246233, + "rewards/margins": 8.62213022746737, + "rewards/rejected": -5.833102332221137, + "step": 1558 + }, + { + "epoch": 0.5755156660975497, + "grad_norm": 4.875, + "kl": 0.0, + "learning_rate": 3.928418305979069e-06, + "logits/chosen": 218787174.4, + "logits/rejected": 272777749.3333333, + "logps/chosen": -327.86171875, + "logps/rejected": -398.8180338541667, + "loss": 0.0862, + "rewards/chosen": 3.0647464752197267, + "rewards/margins": 9.261930084228515, + "rewards/rejected": -6.197183609008789, + "step": 1559 + }, + { + "epoch": 0.5758848230353929, + "grad_norm": 4.25, + "kl": 0.0, + "learning_rate": 3.922670462968914e-06, + "logits/chosen": 227049747.69230768, + "logits/rejected": 232306714.9473684, + "logps/chosen": -463.89806189903845, + "logps/rejected": -455.7479954769737, + "loss": 0.0503, + "rewards/chosen": 2.8257452157827525, + "rewards/margins": 8.797568657137605, + "rewards/rejected": -5.971823441354852, + "step": 1560 + }, + { + "epoch": 0.5762539799732361, + "grad_norm": 3.390625, + "kl": 0.0, + "learning_rate": 3.916924112585146e-06, + "logits/chosen": 168994257.45454547, + "logits/rejected": 232869717.33333334, + "logps/chosen": -372.13583096590907, + "logps/rejected": -469.3876953125, + "loss": 0.0335, + "rewards/chosen": 3.186624700372869, + "rewards/margins": 10.831300545564462, + "rewards/rejected": -7.644675845191593, + "step": 1561 + }, + { + "epoch": 0.5766231369110794, + "grad_norm": 3.3125, + "kl": 0.0, + "learning_rate": 3.9111792627892605e-06, + "logits/chosen": 148423850.66666666, + "logits/rejected": 304302200.4705882, + "logps/chosen": -365.35393880208335, + "logps/rejected": -419.3508731617647, + "loss": 0.0475, + "rewards/chosen": 3.2329477945963543, + "rewards/margins": 9.657549629959405, + "rewards/rejected": -6.424601835363052, + "step": 1562 + }, + { + "epoch": 0.5769922938489225, + "grad_norm": 4.84375, + "kl": 0.9795241355895996, + "learning_rate": 3.905435921540672e-06, + "logits/chosen": 221549104.0, + "logits/rejected": 151484800.0, + "logps/chosen": -379.9283142089844, + "logps/rejected": -405.81072998046875, + "loss": 0.0752, + "rewards/chosen": 2.625338077545166, + "rewards/margins": 9.858580589294434, + "rewards/rejected": -7.233242511749268, + "step": 1563 + }, + { + "epoch": 0.5773614507867657, + "grad_norm": 6.34375, + "kl": 0.0, + "learning_rate": 3.899694096796709e-06, + "logits/chosen": 177991031.46666667, + "logits/rejected": 151414467.7647059, + "logps/chosen": -347.35901692708336, + "logps/rejected": -280.7923368566176, + "loss": 0.0757, + "rewards/chosen": 2.6398302714029946, + "rewards/margins": 8.178509057736864, + "rewards/rejected": -5.538678786333869, + "step": 1564 + }, + { + "epoch": 0.5777306077246089, + "grad_norm": 3.3125, + "kl": 0.0, + "learning_rate": 3.893953796512596e-06, + "logits/chosen": 182502272.0, + "logits/rejected": 158015347.2, + "logps/chosen": -269.10398356119794, + "logps/rejected": -291.4302490234375, + "loss": 0.0676, + "rewards/chosen": 3.93927796681722, + "rewards/margins": 9.23396447499593, + "rewards/rejected": -5.294686508178711, + "step": 1565 + }, + { + "epoch": 0.5780997646624522, + "grad_norm": 4.6875, + "kl": 0.0, + "learning_rate": 3.8882150286414455e-06, + "logits/chosen": 203562160.0, + "logits/rejected": 246960800.0, + "logps/chosen": -362.80078125, + "logps/rejected": -532.0850219726562, + "loss": 0.0708, + "rewards/chosen": 2.233140230178833, + "rewards/margins": 10.564944505691528, + "rewards/rejected": -8.331804275512695, + "step": 1566 + }, + { + "epoch": 0.5784689216002953, + "grad_norm": 5.15625, + "kl": 0.0, + "learning_rate": 3.882477801134247e-06, + "logits/chosen": 216620008.72727272, + "logits/rejected": 188669476.57142857, + "logps/chosen": -312.70687588778407, + "logps/rejected": -354.10037667410717, + "loss": 0.0847, + "rewards/chosen": 2.054068825461648, + "rewards/margins": 7.2758469684815505, + "rewards/rejected": -5.221778143019903, + "step": 1567 + }, + { + "epoch": 0.5788380785381385, + "grad_norm": 4.28125, + "kl": 0.0, + "learning_rate": 3.876742121939857e-06, + "logits/chosen": 241496371.2, + "logits/rejected": 220181970.82352942, + "logps/chosen": -400.82197265625, + "logps/rejected": -448.3999885110294, + "loss": 0.056, + "rewards/chosen": 2.8618975321451825, + "rewards/margins": 10.10947452619964, + "rewards/rejected": -7.247576994054458, + "step": 1568 + }, + { + "epoch": 0.5792072354759817, + "grad_norm": 5.84375, + "kl": 0.5881967544555664, + "learning_rate": 3.871007999004986e-06, + "logits/chosen": 238058571.29411766, + "logits/rejected": 159373090.13333333, + "logps/chosen": -418.88694852941177, + "logps/rejected": -337.63776041666665, + "loss": 0.0705, + "rewards/chosen": 3.5816603267894074, + "rewards/margins": 8.449831001431335, + "rewards/rejected": -4.868170674641927, + "step": 1569 + }, + { + "epoch": 0.579576392413825, + "grad_norm": 5.96875, + "kl": 0.0, + "learning_rate": 3.8652754402741896e-06, + "logits/chosen": 289431068.4444444, + "logits/rejected": 180182052.57142857, + "logps/chosen": -367.40228949652777, + "logps/rejected": -407.9868861607143, + "loss": 0.1081, + "rewards/chosen": 2.56443108452691, + "rewards/margins": 9.080787961445157, + "rewards/rejected": -6.516356876918247, + "step": 1570 + }, + { + "epoch": 0.5799455493516681, + "grad_norm": 5.0, + "kl": 0.0, + "learning_rate": 3.859544453689853e-06, + "logits/chosen": 196111495.52941176, + "logits/rejected": 253071872.0, + "logps/chosen": -274.30419921875, + "logps/rejected": -570.7041015625, + "loss": 0.1084, + "rewards/chosen": 2.7917193244485294, + "rewards/margins": 11.222496301987592, + "rewards/rejected": -8.430776977539063, + "step": 1571 + }, + { + "epoch": 0.5803147062895113, + "grad_norm": 4.84375, + "kl": 0.0, + "learning_rate": 3.853815047192188e-06, + "logits/chosen": 226282522.9473684, + "logits/rejected": 290649875.6923077, + "logps/chosen": -339.84511204769734, + "logps/rejected": -715.4322415865385, + "loss": 0.0837, + "rewards/chosen": 2.442916468570107, + "rewards/margins": 11.693081967743785, + "rewards/rejected": -9.250165499173677, + "step": 1572 + }, + { + "epoch": 0.5806838632273545, + "grad_norm": 5.375, + "kl": 0.0, + "learning_rate": 3.848087228719212e-06, + "logits/chosen": 195887024.0, + "logits/rejected": 324025440.0, + "logps/chosen": -354.58782958984375, + "logps/rejected": -396.267333984375, + "loss": 0.0828, + "rewards/chosen": 2.4007620811462402, + "rewards/margins": 8.316563606262207, + "rewards/rejected": -5.915801525115967, + "step": 1573 + }, + { + "epoch": 0.5810530201651978, + "grad_norm": 5.78125, + "kl": 0.0, + "learning_rate": 3.842361006206745e-06, + "logits/chosen": 180785898.66666666, + "logits/rejected": 273987379.2, + "logps/chosen": -311.2270914713542, + "logps/rejected": -364.974072265625, + "loss": 0.0977, + "rewards/chosen": 2.081823190053304, + "rewards/margins": 8.718485482533772, + "rewards/rejected": -6.636662292480469, + "step": 1574 + }, + { + "epoch": 0.5814221771030409, + "grad_norm": 4.84375, + "kl": 0.0, + "learning_rate": 3.836636387588397e-06, + "logits/chosen": 307978524.4444444, + "logits/rejected": 244824740.57142857, + "logps/chosen": -302.93296983506946, + "logps/rejected": -520.63232421875, + "loss": 0.0906, + "rewards/chosen": 2.7021916707356772, + "rewards/margins": 9.209661392938523, + "rewards/rejected": -6.507469722202846, + "step": 1575 + }, + { + "epoch": 0.5817913340408841, + "grad_norm": 5.125, + "kl": 0.8740835189819336, + "learning_rate": 3.830913380795554e-06, + "logits/chosen": 235275170.9090909, + "logits/rejected": 243641241.6, + "logps/chosen": -368.03085049715907, + "logps/rejected": -481.97421875, + "loss": 0.0865, + "rewards/chosen": 2.9232032082297583, + "rewards/margins": 9.594347312233664, + "rewards/rejected": -6.671144104003906, + "step": 1576 + }, + { + "epoch": 0.5821604909787274, + "grad_norm": 5.75, + "kl": 0.15717363357543945, + "learning_rate": 3.825191993757368e-06, + "logits/chosen": 229484617.14285713, + "logits/rejected": 170396439.27272728, + "logps/chosen": -304.46547154017856, + "logps/rejected": -439.12819602272725, + "loss": 0.1277, + "rewards/chosen": 2.4844567435128346, + "rewards/margins": 9.203742188292663, + "rewards/rejected": -6.719285444779829, + "step": 1577 + }, + { + "epoch": 0.5825296479165706, + "grad_norm": 3.71875, + "kl": 0.0, + "learning_rate": 3.819472234400749e-06, + "logits/chosen": 263081941.33333334, + "logits/rejected": 141370112.0, + "logps/chosen": -350.3445231119792, + "logps/rejected": -407.372216796875, + "loss": 0.0486, + "rewards/chosen": 2.7543280919392905, + "rewards/margins": 9.498518308003744, + "rewards/rejected": -6.744190216064453, + "step": 1578 + }, + { + "epoch": 0.5828988048544137, + "grad_norm": 5.4375, + "kl": 1.0548067092895508, + "learning_rate": 3.813754110650352e-06, + "logits/chosen": 213181440.0, + "logits/rejected": 164749621.89473686, + "logps/chosen": -333.4220628004808, + "logps/rejected": -470.15347450657896, + "loss": 0.0715, + "rewards/chosen": 3.1855999873234677, + "rewards/margins": 10.252529051622398, + "rewards/rejected": -7.066929064298931, + "step": 1579 + }, + { + "epoch": 0.5832679617922569, + "grad_norm": 4.03125, + "kl": 0.0, + "learning_rate": 3.8080376304285615e-06, + "logits/chosen": 164522422.85714287, + "logits/rejected": 190184704.0, + "logps/chosen": -306.2776576450893, + "logps/rejected": -466.9621310763889, + "loss": 0.066, + "rewards/chosen": 3.2703865596226285, + "rewards/margins": 9.860701636662558, + "rewards/rejected": -6.59031507703993, + "step": 1580 + }, + { + "epoch": 0.5836371187301002, + "grad_norm": 6.21875, + "kl": 1.0180091857910156, + "learning_rate": 3.8023228016554913e-06, + "logits/chosen": 187133884.63157895, + "logits/rejected": 124712241.23076923, + "logps/chosen": -374.14967105263156, + "logps/rejected": -352.8668870192308, + "loss": 0.0982, + "rewards/chosen": 2.6707825911672494, + "rewards/margins": 9.001892445058475, + "rewards/rejected": -6.331109853891226, + "step": 1581 + }, + { + "epoch": 0.5840062756679434, + "grad_norm": 4.5, + "kl": 0.0, + "learning_rate": 3.7966096322489637e-06, + "logits/chosen": 290774812.4444444, + "logits/rejected": 305840329.14285713, + "logps/chosen": -359.8854709201389, + "logps/rejected": -445.50013950892856, + "loss": 0.0677, + "rewards/chosen": 2.851699617173937, + "rewards/margins": 9.50487442622109, + "rewards/rejected": -6.653174809047154, + "step": 1582 + }, + { + "epoch": 0.5843754326057865, + "grad_norm": 4.03125, + "kl": 0.0, + "learning_rate": 3.790898130124503e-06, + "logits/chosen": 192999338.66666666, + "logits/rejected": 261971546.3529412, + "logps/chosen": -292.5908203125, + "logps/rejected": -520.9181985294117, + "loss": 0.0687, + "rewards/chosen": 2.6218238830566407, + "rewards/margins": 10.021340717988856, + "rewards/rejected": -7.399516834932215, + "step": 1583 + }, + { + "epoch": 0.5847445895436297, + "grad_norm": 5.53125, + "kl": 0.0, + "learning_rate": 3.7851883031953197e-06, + "logits/chosen": 187241392.0, + "logits/rejected": 186575584.0, + "logps/chosen": -342.6145324707031, + "logps/rejected": -554.2030029296875, + "loss": 0.076, + "rewards/chosen": 2.419079065322876, + "rewards/margins": 10.49667763710022, + "rewards/rejected": -8.077598571777344, + "step": 1584 + }, + { + "epoch": 0.585113746481473, + "grad_norm": 3.53125, + "kl": 0.0, + "learning_rate": 3.7794801593723075e-06, + "logits/chosen": 210814907.73333332, + "logits/rejected": 203905280.0, + "logps/chosen": -335.0934244791667, + "logps/rejected": -429.3349034926471, + "loss": 0.0648, + "rewards/chosen": 2.9984153747558593, + "rewards/margins": 10.133269859762752, + "rewards/rejected": -7.134854485006893, + "step": 1585 + }, + { + "epoch": 0.5854829034193162, + "grad_norm": 5.5625, + "kl": 0.0, + "learning_rate": 3.773773706564029e-06, + "logits/chosen": 151047610.1818182, + "logits/rejected": 180713753.6, + "logps/chosen": -292.6541859019886, + "logps/rejected": -317.701123046875, + "loss": 0.1183, + "rewards/chosen": 2.5030300833962182, + "rewards/margins": 7.56165587685325, + "rewards/rejected": -5.058625793457031, + "step": 1586 + }, + { + "epoch": 0.5858520603571593, + "grad_norm": 4.90625, + "kl": 0.0, + "learning_rate": 3.768068952676701e-06, + "logits/chosen": 279556027.73333335, + "logits/rejected": 245236193.88235295, + "logps/chosen": -345.52239583333335, + "logps/rejected": -424.09754136029414, + "loss": 0.0607, + "rewards/chosen": 3.3209215799967446, + "rewards/margins": 10.406576792399088, + "rewards/rejected": -7.085655212402344, + "step": 1587 + }, + { + "epoch": 0.5862212172950025, + "grad_norm": 5.40625, + "kl": 0.47977352142333984, + "learning_rate": 3.762365905614187e-06, + "logits/chosen": 149265724.2352941, + "logits/rejected": 163057442.13333333, + "logps/chosen": -312.7196691176471, + "logps/rejected": -425.7224934895833, + "loss": 0.0967, + "rewards/chosen": 3.337706397561466, + "rewards/margins": 9.804179786233341, + "rewards/rejected": -6.466473388671875, + "step": 1588 + }, + { + "epoch": 0.5865903742328458, + "grad_norm": 3.90625, + "kl": 0.0, + "learning_rate": 3.756664573277987e-06, + "logits/chosen": 263936768.0, + "logits/rejected": 192657408.0, + "logps/chosen": -283.1241048177083, + "logps/rejected": -441.47587890625, + "loss": 0.0654, + "rewards/chosen": 2.7297592163085938, + "rewards/margins": 9.580414581298829, + "rewards/rejected": -6.850655364990234, + "step": 1589 + }, + { + "epoch": 0.5869595311706889, + "grad_norm": 3.515625, + "kl": 0.0, + "learning_rate": 3.750964963567225e-06, + "logits/chosen": 166866790.4, + "logits/rejected": 313638038.5882353, + "logps/chosen": -329.52561848958334, + "logps/rejected": -572.0335477941177, + "loss": 0.0387, + "rewards/chosen": 3.559645589192708, + "rewards/margins": 12.386056219362745, + "rewards/rejected": -8.826410630170036, + "step": 1590 + }, + { + "epoch": 0.5873286881085321, + "grad_norm": 8.0, + "kl": 2.80645751953125, + "learning_rate": 3.745267084378636e-06, + "logits/chosen": 181257028.26666668, + "logits/rejected": 264642048.0, + "logps/chosen": -381.15546875, + "logps/rejected": -485.38528262867646, + "loss": 0.1435, + "rewards/chosen": 1.990085983276367, + "rewards/margins": 8.007851701624253, + "rewards/rejected": -6.017765718347886, + "step": 1591 + }, + { + "epoch": 0.5876978450463753, + "grad_norm": 6.375, + "kl": 2.8225955963134766, + "learning_rate": 3.7395709436065615e-06, + "logits/chosen": 200431193.04347825, + "logits/rejected": 161241315.55555555, + "logps/chosen": -326.4715098505435, + "logps/rejected": -500.48448350694446, + "loss": 0.1507, + "rewards/chosen": 2.363901718803074, + "rewards/margins": 9.504888691188057, + "rewards/rejected": -7.140986972384983, + "step": 1592 + }, + { + "epoch": 0.5880670019842186, + "grad_norm": 6.28125, + "kl": 0.0, + "learning_rate": 3.7338765491429308e-06, + "logits/chosen": 240717536.0, + "logits/rejected": 182711328.0, + "logps/chosen": -297.491943359375, + "logps/rejected": -561.3634643554688, + "loss": 0.1073, + "rewards/chosen": 1.8871102333068848, + "rewards/margins": 10.711385250091553, + "rewards/rejected": -8.824275016784668, + "step": 1593 + }, + { + "epoch": 0.5884361589220617, + "grad_norm": 4.71875, + "kl": 0.0, + "learning_rate": 3.728183908877254e-06, + "logits/chosen": 277274597.05263156, + "logits/rejected": 206893292.30769232, + "logps/chosen": -362.3946083470395, + "logps/rejected": -333.55093149038464, + "loss": 0.082, + "rewards/chosen": 2.870431398090563, + "rewards/margins": 8.545656289166285, + "rewards/rejected": -5.675224891075721, + "step": 1594 + }, + { + "epoch": 0.5888053158599049, + "grad_norm": 4.59375, + "kl": 0.0, + "learning_rate": 3.7224930306966146e-06, + "logits/chosen": 297713250.46153843, + "logits/rejected": 183224239.15789473, + "logps/chosen": -410.73719200721155, + "logps/rejected": -365.10911800986844, + "loss": 0.0586, + "rewards/chosen": 2.882161360520583, + "rewards/margins": 9.409119625323214, + "rewards/rejected": -6.526958264802632, + "step": 1595 + }, + { + "epoch": 0.5891744727977481, + "grad_norm": 5.1875, + "kl": 0.25931453704833984, + "learning_rate": 3.7168039224856508e-06, + "logits/chosen": 237324066.13333333, + "logits/rejected": 154713735.52941176, + "logps/chosen": -404.7148763020833, + "logps/rejected": -391.64680032169116, + "loss": 0.0665, + "rewards/chosen": 2.2550928751627604, + "rewards/margins": 8.452818058986288, + "rewards/rejected": -6.197725183823529, + "step": 1596 + }, + { + "epoch": 0.5895436297355914, + "grad_norm": 4.78125, + "kl": 0.0, + "learning_rate": 3.711116592126548e-06, + "logits/chosen": 148416843.29411766, + "logits/rejected": 169500910.93333334, + "logps/chosen": -362.5255342371324, + "logps/rejected": -419.273046875, + "loss": 0.0572, + "rewards/chosen": 3.2722690806669346, + "rewards/margins": 9.923448585061466, + "rewards/rejected": -6.651179504394531, + "step": 1597 + }, + { + "epoch": 0.5899127866734345, + "grad_norm": 5.46875, + "kl": 0.42885303497314453, + "learning_rate": 3.705431047499033e-06, + "logits/chosen": 182738221.17647058, + "logits/rejected": 248975035.73333332, + "logps/chosen": -295.2233455882353, + "logps/rejected": -383.6401692708333, + "loss": 0.1058, + "rewards/chosen": 3.033616907456342, + "rewards/margins": 8.736906193751915, + "rewards/rejected": -5.703289286295573, + "step": 1598 + }, + { + "epoch": 0.5902819436112777, + "grad_norm": 6.4375, + "kl": 1.0114810466766357, + "learning_rate": 3.6997472964803545e-06, + "logits/chosen": 215096832.0, + "logits/rejected": 221271680.0, + "logps/chosen": -350.9801940917969, + "logps/rejected": -364.3854064941406, + "loss": 0.1345, + "rewards/chosen": 1.9298264980316162, + "rewards/margins": 7.972697973251343, + "rewards/rejected": -6.042871475219727, + "step": 1599 + }, + { + "epoch": 0.590651100549121, + "grad_norm": 5.53125, + "kl": 0.6737480163574219, + "learning_rate": 3.694065346945278e-06, + "logits/chosen": 251747726.2222222, + "logits/rejected": 156332854.85714287, + "logps/chosen": -319.93047417534723, + "logps/rejected": -375.22171456473217, + "loss": 0.0981, + "rewards/chosen": 2.591781828138563, + "rewards/margins": 8.499626371595594, + "rewards/rejected": -5.907844543457031, + "step": 1600 + }, + { + "epoch": 0.5910202574869642, + "grad_norm": 6.65625, + "kl": 0.8555989265441895, + "learning_rate": 3.6883852067660698e-06, + "logits/chosen": 245369586.52631578, + "logits/rejected": 138573883.07692307, + "logps/chosen": -442.29502467105266, + "logps/rejected": -361.4602238581731, + "loss": 0.1185, + "rewards/chosen": 2.5861378719932153, + "rewards/margins": 7.754913577184021, + "rewards/rejected": -5.168775705190805, + "step": 1601 + }, + { + "epoch": 0.5913894144248073, + "grad_norm": 5.5, + "kl": 0.0, + "learning_rate": 3.682706883812494e-06, + "logits/chosen": 258735414.85714287, + "logits/rejected": 208119182.2222222, + "logps/chosen": -316.0078125, + "logps/rejected": -525.13525390625, + "loss": 0.0894, + "rewards/chosen": 2.410480499267578, + "rewards/margins": 9.686210208468967, + "rewards/rejected": -7.275729709201389, + "step": 1602 + }, + { + "epoch": 0.5917585713626505, + "grad_norm": 6.21875, + "kl": 0.0, + "learning_rate": 3.6770303859517954e-06, + "logits/chosen": 129068689.06666666, + "logits/rejected": 193292197.6470588, + "logps/chosen": -261.5488606770833, + "logps/rejected": -435.0868566176471, + "loss": 0.1205, + "rewards/chosen": 1.7747515360514323, + "rewards/margins": 8.919053904215495, + "rewards/rejected": -7.1443023681640625, + "step": 1603 + }, + { + "epoch": 0.5921277283004938, + "grad_norm": 5.46875, + "kl": 0.0, + "learning_rate": 3.6713557210486874e-06, + "logits/chosen": 228346439.1111111, + "logits/rejected": 192743533.7142857, + "logps/chosen": -358.64935980902777, + "logps/rejected": -475.4862583705357, + "loss": 0.0742, + "rewards/chosen": 2.8136253356933594, + "rewards/margins": 9.96622085571289, + "rewards/rejected": -7.152595520019531, + "step": 1604 + }, + { + "epoch": 0.592496885238337, + "grad_norm": 4.9375, + "kl": 0.0, + "learning_rate": 3.665682896965349e-06, + "logits/chosen": 158314880.0, + "logits/rejected": 182776177.7777778, + "logps/chosen": -312.53261021205356, + "logps/rejected": -406.78789605034723, + "loss": 0.0763, + "rewards/chosen": 2.6778022221156528, + "rewards/margins": 9.01321159847199, + "rewards/rejected": -6.335409376356337, + "step": 1605 + }, + { + "epoch": 0.5928660421761801, + "grad_norm": 4.71875, + "kl": 0.09131026268005371, + "learning_rate": 3.660011921561405e-06, + "logits/chosen": 210586331.42857143, + "logits/rejected": 341367665.7777778, + "logps/chosen": -414.62534877232144, + "logps/rejected": -598.9058159722222, + "loss": 0.0736, + "rewards/chosen": 2.167006083897182, + "rewards/margins": 9.569001424880256, + "rewards/rejected": -7.401995340983073, + "step": 1606 + }, + { + "epoch": 0.5932351991140233, + "grad_norm": 6.4375, + "kl": 0.0, + "learning_rate": 3.654342802693918e-06, + "logits/chosen": 164839662.93333334, + "logits/rejected": 176424222.11764705, + "logps/chosen": -296.13365885416664, + "logps/rejected": -437.4610811121324, + "loss": 0.106, + "rewards/chosen": 2.090783182779948, + "rewards/margins": 7.427544657389323, + "rewards/rejected": -5.336761474609375, + "step": 1607 + }, + { + "epoch": 0.5936043560518666, + "grad_norm": 6.375, + "kl": 0.7118239402770996, + "learning_rate": 3.6486755482173814e-06, + "logits/chosen": 203221105.7777778, + "logits/rejected": 268827520.0, + "logps/chosen": -388.33924696180554, + "logps/rejected": -490.8937290736607, + "loss": 0.1148, + "rewards/chosen": 2.3524453904893665, + "rewards/margins": 9.343678126259455, + "rewards/rejected": -6.991232735770089, + "step": 1608 + }, + { + "epoch": 0.5939735129897098, + "grad_norm": 3.6875, + "kl": 0.0, + "learning_rate": 3.643010165983705e-06, + "logits/chosen": 196907739.42857143, + "logits/rejected": 176773902.2222222, + "logps/chosen": -374.43345424107144, + "logps/rejected": -412.51996527777777, + "loss": 0.0439, + "rewards/chosen": 3.164928436279297, + "rewards/margins": 8.995528327094185, + "rewards/rejected": -5.830599890814887, + "step": 1609 + }, + { + "epoch": 0.5943426699275529, + "grad_norm": 5.28125, + "kl": 0.2009744644165039, + "learning_rate": 3.637346663842204e-06, + "logits/chosen": 201248034.13333333, + "logits/rejected": 198773579.29411766, + "logps/chosen": -352.06354166666665, + "logps/rejected": -419.75973690257354, + "loss": 0.0785, + "rewards/chosen": 2.6765380859375, + "rewards/margins": 8.16433599135455, + "rewards/rejected": -5.487797905417049, + "step": 1610 + }, + { + "epoch": 0.5947118268653961, + "grad_norm": 4.90625, + "kl": 0.0, + "learning_rate": 3.6316850496395863e-06, + "logits/chosen": 213862656.0, + "logits/rejected": 184412706.13333333, + "logps/chosen": -288.1920955882353, + "logps/rejected": -359.94915364583335, + "loss": 0.0959, + "rewards/chosen": 2.7282434351303997, + "rewards/margins": 7.6722614811915975, + "rewards/rejected": -4.944018046061198, + "step": 1611 + }, + { + "epoch": 0.5950809838032394, + "grad_norm": 6.9375, + "kl": 0.567863941192627, + "learning_rate": 3.626025331219949e-06, + "logits/chosen": 249717338.3529412, + "logits/rejected": 142580548.26666668, + "logps/chosen": -321.7359260110294, + "logps/rejected": -318.30029296875, + "loss": 0.1171, + "rewards/chosen": 2.216459835276884, + "rewards/margins": 7.718352942373238, + "rewards/rejected": -5.501893107096354, + "step": 1612 + }, + { + "epoch": 0.5954501407410826, + "grad_norm": 5.0, + "kl": 0.0, + "learning_rate": 3.6203675164247586e-06, + "logits/chosen": 149872674.13333333, + "logits/rejected": 213107169.88235295, + "logps/chosen": -281.21051432291665, + "logps/rejected": -432.3833869485294, + "loss": 0.064, + "rewards/chosen": 2.907190958658854, + "rewards/margins": 8.742208892223882, + "rewards/rejected": -5.835017933565028, + "step": 1613 + }, + { + "epoch": 0.5958192976789257, + "grad_norm": 3.75, + "kl": 0.0, + "learning_rate": 3.6147116130928462e-06, + "logits/chosen": 148514032.94117647, + "logits/rejected": 244583953.06666666, + "logps/chosen": -319.65490004595586, + "logps/rejected": -577.102734375, + "loss": 0.0495, + "rewards/chosen": 3.599873486687155, + "rewards/margins": 11.815484245150698, + "rewards/rejected": -8.215610758463542, + "step": 1614 + }, + { + "epoch": 0.596188454616769, + "grad_norm": 5.8125, + "kl": 0.2957429885864258, + "learning_rate": 3.609057629060394e-06, + "logits/chosen": 262027477.33333334, + "logits/rejected": 257876121.6, + "logps/chosen": -408.680908203125, + "logps/rejected": -474.179345703125, + "loss": 0.1039, + "rewards/chosen": 2.0120107332865396, + "rewards/margins": 8.449322287241618, + "rewards/rejected": -6.4373115539550785, + "step": 1615 + }, + { + "epoch": 0.5965576115546122, + "grad_norm": 4.40625, + "kl": 0.0, + "learning_rate": 3.6034055721609256e-06, + "logits/chosen": 305930154.6666667, + "logits/rejected": 154960731.42857143, + "logps/chosen": -289.06287977430554, + "logps/rejected": -460.27591378348217, + "loss": 0.0676, + "rewards/chosen": 2.9704632229275174, + "rewards/margins": 10.30761979118226, + "rewards/rejected": -7.337156568254743, + "step": 1616 + }, + { + "epoch": 0.5969267684924554, + "grad_norm": 4.84375, + "kl": 0.0, + "learning_rate": 3.5977554502252943e-06, + "logits/chosen": 169367956.21052632, + "logits/rejected": 177500081.23076922, + "logps/chosen": -264.99825246710526, + "logps/rejected": -432.75619741586536, + "loss": 0.0911, + "rewards/chosen": 3.0070557845266244, + "rewards/margins": 8.975735946222837, + "rewards/rejected": -5.968680161696214, + "step": 1617 + }, + { + "epoch": 0.5972959254302985, + "grad_norm": 3.46875, + "kl": 0.0672612190246582, + "learning_rate": 3.5921072710816697e-06, + "logits/chosen": 134405391.05882353, + "logits/rejected": 227441629.86666667, + "logps/chosen": -308.3805721507353, + "logps/rejected": -522.1137369791667, + "loss": 0.0719, + "rewards/chosen": 3.343503615435432, + "rewards/margins": 9.813828831092984, + "rewards/rejected": -6.470325215657552, + "step": 1618 + }, + { + "epoch": 0.5976650823681418, + "grad_norm": 6.34375, + "kl": 0.4939708709716797, + "learning_rate": 3.586461042555535e-06, + "logits/chosen": 228999544.47058824, + "logits/rejected": 263052270.93333334, + "logps/chosen": -383.8249080882353, + "logps/rejected": -512.9112630208333, + "loss": 0.0918, + "rewards/chosen": 2.722546970143038, + "rewards/margins": 10.177749708587049, + "rewards/rejected": -7.455202738444011, + "step": 1619 + }, + { + "epoch": 0.598034239305985, + "grad_norm": 7.09375, + "kl": 0.0, + "learning_rate": 3.5808167724696657e-06, + "logits/chosen": 182367680.0, + "logits/rejected": 243872512.0, + "logps/chosen": -405.2329406738281, + "logps/rejected": -373.0892639160156, + "loss": 0.0702, + "rewards/chosen": 2.7083380222320557, + "rewards/margins": 8.972395658493042, + "rewards/rejected": -6.264057636260986, + "step": 1620 + }, + { + "epoch": 0.5984033962438282, + "grad_norm": 5.53125, + "kl": 0.0, + "learning_rate": 3.5751744686441277e-06, + "logits/chosen": 370108245.3333333, + "logits/rejected": 194952161.88235295, + "logps/chosen": -473.70436197916666, + "logps/rejected": -293.91311465992646, + "loss": 0.0652, + "rewards/chosen": 2.3183878580729167, + "rewards/margins": 8.137406502518, + "rewards/rejected": -5.819018644445083, + "step": 1621 + }, + { + "epoch": 0.5987725531816713, + "grad_norm": 6.1875, + "kl": 0.0, + "learning_rate": 3.569534138896262e-06, + "logits/chosen": 175519488.0, + "logits/rejected": 159930128.0, + "logps/chosen": -377.1714172363281, + "logps/rejected": -412.52197265625, + "loss": 0.0794, + "rewards/chosen": 2.7066919803619385, + "rewards/margins": 8.710565328598022, + "rewards/rejected": -6.003873348236084, + "step": 1622 + }, + { + "epoch": 0.5991417101195146, + "grad_norm": 4.125, + "kl": 0.35694217681884766, + "learning_rate": 3.5638957910406724e-06, + "logits/chosen": 220071253.33333334, + "logits/rejected": 309741754.1818182, + "logps/chosen": -388.4454055059524, + "logps/rejected": -388.33158735795456, + "loss": 0.0516, + "rewards/chosen": 3.8263368152436756, + "rewards/margins": 9.822774581578903, + "rewards/rejected": -5.9964377663352275, + "step": 1623 + }, + { + "epoch": 0.5995108670573578, + "grad_norm": 7.0, + "kl": 0.0, + "learning_rate": 3.5582594328892183e-06, + "logits/chosen": 264720896.0, + "logits/rejected": 217807195.42857143, + "logps/chosen": -442.5166015625, + "logps/rejected": -386.18990652901783, + "loss": 0.0973, + "rewards/chosen": 2.268438551161024, + "rewards/margins": 8.168292938716828, + "rewards/rejected": -5.899854387555804, + "step": 1624 + }, + { + "epoch": 0.5998800239952009, + "grad_norm": 3.546875, + "kl": 0.0, + "learning_rate": 3.5526250722510042e-06, + "logits/chosen": 241377713.23076922, + "logits/rejected": 236489404.63157895, + "logps/chosen": -339.5123948317308, + "logps/rejected": -519.963712993421, + "loss": 0.0693, + "rewards/chosen": 2.679790790264423, + "rewards/margins": 9.082368240665328, + "rewards/rejected": -6.402577450400905, + "step": 1625 + }, + { + "epoch": 0.6002491809330441, + "grad_norm": 3.640625, + "kl": 0.4611787796020508, + "learning_rate": 3.546992716932364e-06, + "logits/chosen": 219108647.3846154, + "logits/rejected": 242660540.63157895, + "logps/chosen": -346.88961087740387, + "logps/rejected": -473.3247327302632, + "loss": 0.048, + "rewards/chosen": 3.6446990966796875, + "rewards/margins": 10.776119031404194, + "rewards/rejected": -7.131419934724507, + "step": 1626 + }, + { + "epoch": 0.6006183378708874, + "grad_norm": 5.03125, + "kl": 0.36748790740966797, + "learning_rate": 3.541362374736852e-06, + "logits/chosen": 182943381.33333334, + "logits/rejected": 197104166.4, + "logps/chosen": -311.3664143880208, + "logps/rejected": -394.198681640625, + "loss": 0.0725, + "rewards/chosen": 2.2777175903320312, + "rewards/margins": 8.75364990234375, + "rewards/rejected": -6.475932312011719, + "step": 1627 + }, + { + "epoch": 0.6009874948087306, + "grad_norm": 5.40625, + "kl": 0.024155616760253906, + "learning_rate": 3.5357340534652397e-06, + "logits/chosen": 262669248.0, + "logits/rejected": 193803872.0, + "logps/chosen": -413.7292175292969, + "logps/rejected": -433.9677734375, + "loss": 0.0468, + "rewards/chosen": 3.382657766342163, + "rewards/margins": 9.753710508346558, + "rewards/rejected": -6.3710527420043945, + "step": 1628 + }, + { + "epoch": 0.6013566517465737, + "grad_norm": 4.0, + "kl": 0.0, + "learning_rate": 3.530107760915493e-06, + "logits/chosen": 227475344.0, + "logits/rejected": 270151424.0, + "logps/chosen": -301.8738098144531, + "logps/rejected": -494.39093017578125, + "loss": 0.0605, + "rewards/chosen": 3.0267951488494873, + "rewards/margins": 9.038216829299927, + "rewards/rejected": -6.0114216804504395, + "step": 1629 + }, + { + "epoch": 0.6017258086844169, + "grad_norm": 5.84375, + "kl": 0.35849952697753906, + "learning_rate": 3.5244835048827686e-06, + "logits/chosen": 191478272.0, + "logits/rejected": 166253408.0, + "logps/chosen": -363.4026123046875, + "logps/rejected": -428.019287109375, + "loss": 0.0887, + "rewards/chosen": 2.632578468322754, + "rewards/margins": 8.71590353647868, + "rewards/rejected": -6.083325068155925, + "step": 1630 + }, + { + "epoch": 0.6020949656222602, + "grad_norm": 3.828125, + "kl": 0.0, + "learning_rate": 3.5188612931594014e-06, + "logits/chosen": 194736213.33333334, + "logits/rejected": 282656102.4, + "logps/chosen": -333.37518310546875, + "logps/rejected": -447.90185546875, + "loss": 0.0587, + "rewards/chosen": 3.1627012888590493, + "rewards/margins": 10.555825678507487, + "rewards/rejected": -7.393124389648437, + "step": 1631 + }, + { + "epoch": 0.6024641225601034, + "grad_norm": 5.15625, + "kl": 0.0, + "learning_rate": 3.5132411335348946e-06, + "logits/chosen": 156648628.70588234, + "logits/rejected": 253298056.53333333, + "logps/chosen": -280.1159237132353, + "logps/rejected": -570.1373046875, + "loss": 0.1179, + "rewards/chosen": 2.5571365356445312, + "rewards/margins": 10.012803649902343, + "rewards/rejected": -7.455667114257812, + "step": 1632 + }, + { + "epoch": 0.6028332794979465, + "grad_norm": 4.25, + "kl": 0.0, + "learning_rate": 3.5076230337959095e-06, + "logits/chosen": 295209069.71428573, + "logits/rejected": 171354979.55555555, + "logps/chosen": -379.48904854910717, + "logps/rejected": -462.3079427083333, + "loss": 0.0625, + "rewards/chosen": 3.666439873831613, + "rewards/margins": 9.877600745549277, + "rewards/rejected": -6.211160871717665, + "step": 1633 + }, + { + "epoch": 0.6032024364357897, + "grad_norm": 5.5625, + "kl": 0.0, + "learning_rate": 3.5020070017262515e-06, + "logits/chosen": 184762337.88235295, + "logits/rejected": 245202602.66666666, + "logps/chosen": -298.8612706801471, + "logps/rejected": -517.2347981770833, + "loss": 0.0995, + "rewards/chosen": 2.6411274180692783, + "rewards/margins": 10.258569739846623, + "rewards/rejected": -7.617442321777344, + "step": 1634 + }, + { + "epoch": 0.603571593373633, + "grad_norm": 4.90625, + "kl": 0.16420674324035645, + "learning_rate": 3.4963930451068585e-06, + "logits/chosen": 208881948.44444445, + "logits/rejected": 184267428.57142857, + "logps/chosen": -301.69194878472223, + "logps/rejected": -333.265625, + "loss": 0.1232, + "rewards/chosen": 2.5373255411783853, + "rewards/margins": 7.9423504783993675, + "rewards/rejected": -5.405024937220982, + "step": 1635 + }, + { + "epoch": 0.6039407503114762, + "grad_norm": 5.59375, + "kl": 0.23453521728515625, + "learning_rate": 3.4907811717157993e-06, + "logits/chosen": 200084619.63636363, + "logits/rejected": 200779366.4, + "logps/chosen": -399.416015625, + "logps/rejected": -313.04609375, + "loss": 0.1057, + "rewards/chosen": 2.354829268022017, + "rewards/margins": 8.17458978132768, + "rewards/rejected": -5.819760513305664, + "step": 1636 + }, + { + "epoch": 0.6043099072493193, + "grad_norm": 8.6875, + "kl": 0.8423633575439453, + "learning_rate": 3.4851713893282523e-06, + "logits/chosen": 310914432.0, + "logits/rejected": 248528256.0, + "logps/chosen": -295.8192138671875, + "logps/rejected": -299.6892496744792, + "loss": 0.1548, + "rewards/chosen": 2.03299617767334, + "rewards/margins": 7.5129899978637695, + "rewards/rejected": -5.47999382019043, + "step": 1637 + }, + { + "epoch": 0.6046790641871626, + "grad_norm": 5.09375, + "kl": 1.0267353057861328, + "learning_rate": 3.479563705716499e-06, + "logits/chosen": 242716160.0, + "logits/rejected": 232868571.42857143, + "logps/chosen": -324.2498372395833, + "logps/rejected": -471.74937220982144, + "loss": 0.0852, + "rewards/chosen": 2.923011144002279, + "rewards/margins": 8.367971056983585, + "rewards/rejected": -5.4449599129813055, + "step": 1638 + }, + { + "epoch": 0.6050482211250058, + "grad_norm": 7.40625, + "kl": 1.6011466979980469, + "learning_rate": 3.4739581286499147e-06, + "logits/chosen": 286424366.54545456, + "logits/rejected": 233301964.8, + "logps/chosen": -330.84701260653407, + "logps/rejected": -472.373681640625, + "loss": 0.135, + "rewards/chosen": 2.3285817232998935, + "rewards/margins": 8.710185727206143, + "rewards/rejected": -6.38160400390625, + "step": 1639 + }, + { + "epoch": 0.605417378062849, + "grad_norm": 7.3125, + "kl": 1.1066274642944336, + "learning_rate": 3.468354665894955e-06, + "logits/chosen": 190859533.47368422, + "logits/rejected": 180620819.69230768, + "logps/chosen": -321.4481907894737, + "logps/rejected": -401.9354717548077, + "loss": 0.1387, + "rewards/chosen": 2.392152284321032, + "rewards/margins": 8.479368001343268, + "rewards/rejected": -6.087215717022236, + "step": 1640 + }, + { + "epoch": 0.6057865350006921, + "grad_norm": 3.625, + "kl": 0.5357807278633118, + "learning_rate": 3.4627533252151465e-06, + "logits/chosen": 294769543.5294118, + "logits/rejected": 259962897.06666666, + "logps/chosen": -260.1576286764706, + "logps/rejected": -413.7093098958333, + "loss": 0.0885, + "rewards/chosen": 2.5069723690257355, + "rewards/margins": 8.891671363980162, + "rewards/rejected": -6.3846989949544275, + "step": 1641 + }, + { + "epoch": 0.6061556919385354, + "grad_norm": 5.96875, + "kl": 0.7686929702758789, + "learning_rate": 3.4571541143710757e-06, + "logits/chosen": 259652750.2222222, + "logits/rejected": 196488978.2857143, + "logps/chosen": -338.0151638454861, + "logps/rejected": -436.71888950892856, + "loss": 0.1098, + "rewards/chosen": 2.1250156826443143, + "rewards/margins": 8.259252699594649, + "rewards/rejected": -6.134237016950335, + "step": 1642 + }, + { + "epoch": 0.6065248488763786, + "grad_norm": 4.46875, + "kl": 2.0260696411132812, + "learning_rate": 3.4515570411203782e-06, + "logits/chosen": 251887525.6470588, + "logits/rejected": 300821333.3333333, + "logps/chosen": -345.45071231617646, + "logps/rejected": -469.7045572916667, + "loss": 0.0927, + "rewards/chosen": 3.2917888865751377, + "rewards/margins": 10.4864574656767, + "rewards/rejected": -7.194668579101562, + "step": 1643 + }, + { + "epoch": 0.6068940058142218, + "grad_norm": 6.28125, + "kl": 0.5298519134521484, + "learning_rate": 3.445962113217726e-06, + "logits/chosen": 262305203.2, + "logits/rejected": 200356181.33333334, + "logps/chosen": -333.8925048828125, + "logps/rejected": -345.8238118489583, + "loss": 0.0933, + "rewards/chosen": 2.794539451599121, + "rewards/margins": 8.81660493214925, + "rewards/rejected": -6.02206548055013, + "step": 1644 + }, + { + "epoch": 0.6072631627520649, + "grad_norm": 7.0625, + "kl": 0.0, + "learning_rate": 3.440369338414823e-06, + "logits/chosen": 192516437.33333334, + "logits/rejected": 181226193.45454547, + "logps/chosen": -329.03696986607144, + "logps/rejected": -554.6960671164773, + "loss": 0.1417, + "rewards/chosen": 2.2106265113467263, + "rewards/margins": 8.16698221314005, + "rewards/rejected": -5.956355701793324, + "step": 1645 + }, + { + "epoch": 0.6076323196899082, + "grad_norm": 6.875, + "kl": 2.3208298683166504, + "learning_rate": 3.434778724460387e-06, + "logits/chosen": 168602009.6, + "logits/rejected": 202572992.0, + "logps/chosen": -383.933740234375, + "logps/rejected": -427.8286946614583, + "loss": 0.1259, + "rewards/chosen": 3.3026351928710938, + "rewards/margins": 8.89517593383789, + "rewards/rejected": -5.592540740966797, + "step": 1646 + }, + { + "epoch": 0.6080014766277514, + "grad_norm": 4.34375, + "kl": 0.0, + "learning_rate": 3.4291902791001406e-06, + "logits/chosen": 241510912.0, + "logits/rejected": 209324388.17391303, + "logps/chosen": -305.7297634548611, + "logps/rejected": -390.0945567255435, + "loss": 0.0381, + "rewards/chosen": 2.971496158175998, + "rewards/margins": 9.889768757106026, + "rewards/rejected": -6.918272598930027, + "step": 1647 + }, + { + "epoch": 0.6083706335655946, + "grad_norm": 5.71875, + "kl": 0.0, + "learning_rate": 3.4236040100768077e-06, + "logits/chosen": 253212359.1111111, + "logits/rejected": 255579172.57142857, + "logps/chosen": -356.05433485243054, + "logps/rejected": -466.62939453125, + "loss": 0.0908, + "rewards/chosen": 2.725046157836914, + "rewards/margins": 9.628175190516881, + "rewards/rejected": -6.903129032679966, + "step": 1648 + }, + { + "epoch": 0.6087397905034377, + "grad_norm": 5.1875, + "kl": 0.0, + "learning_rate": 3.4180199251300898e-06, + "logits/chosen": 209352329.84615386, + "logits/rejected": 194760326.7368421, + "logps/chosen": -296.5037184495192, + "logps/rejected": -379.2723838404605, + "loss": 0.0645, + "rewards/chosen": 2.8440167353703427, + "rewards/margins": 8.017633646605951, + "rewards/rejected": -5.173616911235609, + "step": 1649 + }, + { + "epoch": 0.609108947441281, + "grad_norm": 6.59375, + "kl": 0.0, + "learning_rate": 3.4124380319966665e-06, + "logits/chosen": 268377472.0, + "logits/rejected": 175755072.0, + "logps/chosen": -321.65216064453125, + "logps/rejected": -323.9752502441406, + "loss": 0.1283, + "rewards/chosen": 1.9356796741485596, + "rewards/margins": 8.048082113265991, + "rewards/rejected": -6.112402439117432, + "step": 1650 + }, + { + "epoch": 0.6094781043791242, + "grad_norm": 4.46875, + "kl": 0.2957887649536133, + "learning_rate": 3.406858338410181e-06, + "logits/chosen": 210180592.94117647, + "logits/rejected": 185571635.2, + "logps/chosen": -354.64148667279414, + "logps/rejected": -363.6052734375, + "loss": 0.067, + "rewards/chosen": 3.011439828311696, + "rewards/margins": 9.541657825544767, + "rewards/rejected": -6.5302179972330725, + "step": 1651 + }, + { + "epoch": 0.6098472613169674, + "grad_norm": 4.0625, + "kl": 0.0, + "learning_rate": 3.401280852101228e-06, + "logits/chosen": 147433321.4117647, + "logits/rejected": 225238562.13333333, + "logps/chosen": -271.57493681066177, + "logps/rejected": -425.7997721354167, + "loss": 0.0673, + "rewards/chosen": 3.448152654311236, + "rewards/margins": 10.290521225274778, + "rewards/rejected": -6.842368570963542, + "step": 1652 + }, + { + "epoch": 0.6102164182548105, + "grad_norm": 4.4375, + "kl": 1.979532241821289, + "learning_rate": 3.3957055807973416e-06, + "logits/chosen": 223563535.05882353, + "logits/rejected": 171092343.46666667, + "logps/chosen": -412.8622185202206, + "logps/rejected": -387.42470703125, + "loss": 0.0868, + "rewards/chosen": 3.1143870634191178, + "rewards/margins": 9.762358153100108, + "rewards/rejected": -6.64797108968099, + "step": 1653 + }, + { + "epoch": 0.6105855751926538, + "grad_norm": 4.03125, + "kl": 0.0, + "learning_rate": 3.390132532222991e-06, + "logits/chosen": 256563248.0, + "logits/rejected": 233508000.0, + "logps/chosen": -378.3759765625, + "logps/rejected": -441.9617919921875, + "loss": 0.065, + "rewards/chosen": 2.8812055587768555, + "rewards/margins": 8.559681415557861, + "rewards/rejected": -5.678475856781006, + "step": 1654 + }, + { + "epoch": 0.610954732130497, + "grad_norm": 4.40625, + "kl": 0.8543100357055664, + "learning_rate": 3.3845617140995628e-06, + "logits/chosen": 203085363.2, + "logits/rejected": 220176564.70588234, + "logps/chosen": -357.11432291666665, + "logps/rejected": -357.93640854779414, + "loss": 0.0607, + "rewards/chosen": 3.619695536295573, + "rewards/margins": 9.72570821724686, + "rewards/rejected": -6.106012680951287, + "step": 1655 + }, + { + "epoch": 0.6113238890683402, + "grad_norm": 4.78125, + "kl": 0.06501388549804688, + "learning_rate": 3.3789931341453564e-06, + "logits/chosen": 215229499.07692307, + "logits/rejected": 272333177.2631579, + "logps/chosen": -333.43306790865387, + "logps/rejected": -509.39237253289474, + "loss": 0.0761, + "rewards/chosen": 2.539858891413762, + "rewards/margins": 9.280575122910474, + "rewards/rejected": -6.740716231496711, + "step": 1656 + }, + { + "epoch": 0.6116930460061833, + "grad_norm": 6.90625, + "kl": 1.8463134765625, + "learning_rate": 3.373426800075569e-06, + "logits/chosen": 206179961.9047619, + "logits/rejected": 186627514.1818182, + "logps/chosen": -298.1513671875, + "logps/rejected": -449.0743963068182, + "loss": 0.1549, + "rewards/chosen": 2.040491739908854, + "rewards/margins": 7.615943330706973, + "rewards/rejected": -5.575451590798118, + "step": 1657 + }, + { + "epoch": 0.6120622029440266, + "grad_norm": 6.46875, + "kl": 0.0, + "learning_rate": 3.3678627196022827e-06, + "logits/chosen": 226495846.4, + "logits/rejected": 328249002.6666667, + "logps/chosen": -338.456689453125, + "logps/rejected": -558.9448649088541, + "loss": 0.1169, + "rewards/chosen": 2.3144767761230467, + "rewards/margins": 9.043907292683919, + "rewards/rejected": -6.729430516560872, + "step": 1658 + }, + { + "epoch": 0.6124313598818698, + "grad_norm": 4.28125, + "kl": 0.0, + "learning_rate": 3.362300900434463e-06, + "logits/chosen": 315680393.84615386, + "logits/rejected": 276667688.42105263, + "logps/chosen": -338.2663386418269, + "logps/rejected": -532.8635896381579, + "loss": 0.071, + "rewards/chosen": 1.9870210794302134, + "rewards/margins": 9.857259534148552, + "rewards/rejected": -7.870238454718339, + "step": 1659 + }, + { + "epoch": 0.6128005168197129, + "grad_norm": 4.71875, + "kl": 0.0, + "learning_rate": 3.35674135027794e-06, + "logits/chosen": 187842653.0909091, + "logits/rejected": 155587474.2857143, + "logps/chosen": -324.42484907670456, + "logps/rejected": -385.7144484747024, + "loss": 0.0761, + "rewards/chosen": 2.521716898137873, + "rewards/margins": 8.717453011186608, + "rewards/rejected": -6.195736113048735, + "step": 1660 + }, + { + "epoch": 0.6131696737575562, + "grad_norm": 3.796875, + "kl": 0.0, + "learning_rate": 3.3511840768353977e-06, + "logits/chosen": 282821102.93333334, + "logits/rejected": 226826029.17647058, + "logps/chosen": -335.9609700520833, + "logps/rejected": -439.50749655330884, + "loss": 0.0637, + "rewards/chosen": 2.773309580485026, + "rewards/margins": 10.221829118915633, + "rewards/rejected": -7.448519538430607, + "step": 1661 + }, + { + "epoch": 0.6135388306953994, + "grad_norm": 4.96875, + "kl": 1.634124755859375, + "learning_rate": 3.345629087806369e-06, + "logits/chosen": 353563520.0, + "logits/rejected": 232522880.0, + "logps/chosen": -433.3789876302083, + "logps/rejected": -448.209521484375, + "loss": 0.0573, + "rewards/chosen": 2.759746233622233, + "rewards/margins": 9.001675860087076, + "rewards/rejected": -6.241929626464843, + "step": 1662 + }, + { + "epoch": 0.6139079876332426, + "grad_norm": 6.25, + "kl": 0.8676877021789551, + "learning_rate": 3.3400763908872214e-06, + "logits/chosen": 194312055.46666667, + "logits/rejected": 180125982.11764705, + "logps/chosen": -373.9943359375, + "logps/rejected": -416.0182100183824, + "loss": 0.0891, + "rewards/chosen": 2.947461954752604, + "rewards/margins": 8.929947108848422, + "rewards/rejected": -5.982485154095818, + "step": 1663 + }, + { + "epoch": 0.6142771445710857, + "grad_norm": 4.25, + "kl": 0.0, + "learning_rate": 3.3345259937711436e-06, + "logits/chosen": 170338176.0, + "logits/rejected": 321371420.4444444, + "logps/chosen": -318.05643136160717, + "logps/rejected": -535.2668728298611, + "loss": 0.0556, + "rewards/chosen": 2.8648193904331754, + "rewards/margins": 9.969247999645415, + "rewards/rejected": -7.104428609212239, + "step": 1664 + }, + { + "epoch": 0.614646301508929, + "grad_norm": 4.53125, + "kl": 0.0, + "learning_rate": 3.328977904148143e-06, + "logits/chosen": 233338168.8888889, + "logits/rejected": 123816521.14285715, + "logps/chosen": -318.4082302517361, + "logps/rejected": -400.1705845424107, + "loss": 0.0984, + "rewards/chosen": 2.9427195654975042, + "rewards/margins": 10.122714421105762, + "rewards/rejected": -7.179994855608259, + "step": 1665 + }, + { + "epoch": 0.6150154584467722, + "grad_norm": 5.65625, + "kl": 1.332643747329712, + "learning_rate": 3.3234321297050264e-06, + "logits/chosen": 198718754.13333333, + "logits/rejected": 180192316.2352941, + "logps/chosen": -362.26005859375, + "logps/rejected": -451.55589384191177, + "loss": 0.1035, + "rewards/chosen": 2.551810709635417, + "rewards/margins": 9.341632558785232, + "rewards/rejected": -6.789821849149816, + "step": 1666 + }, + { + "epoch": 0.6153846153846154, + "grad_norm": 5.53125, + "kl": 0.0, + "learning_rate": 3.317888678125392e-06, + "logits/chosen": 248434995.2, + "logits/rejected": 194471296.0, + "logps/chosen": -339.220263671875, + "logps/rejected": -330.09136962890625, + "loss": 0.0856, + "rewards/chosen": 2.6849552154541017, + "rewards/margins": 7.476619656880697, + "rewards/rejected": -4.791664441426595, + "step": 1667 + }, + { + "epoch": 0.6157537723224585, + "grad_norm": 5.53125, + "kl": 1.0505385398864746, + "learning_rate": 3.3123475570896246e-06, + "logits/chosen": 182469213.0909091, + "logits/rejected": 210787212.8, + "logps/chosen": -309.79252485795456, + "logps/rejected": -353.7570068359375, + "loss": 0.0922, + "rewards/chosen": 2.9945838234641333, + "rewards/margins": 8.411393668434837, + "rewards/rejected": -5.416809844970703, + "step": 1668 + }, + { + "epoch": 0.6161229292603018, + "grad_norm": 5.03125, + "kl": 0.13963842391967773, + "learning_rate": 3.3068087742748763e-06, + "logits/chosen": 213051204.26666668, + "logits/rejected": 224569133.17647058, + "logps/chosen": -336.53590494791666, + "logps/rejected": -513.9895450367648, + "loss": 0.0798, + "rewards/chosen": 2.5613731384277343, + "rewards/margins": 9.782771884693819, + "rewards/rejected": -7.221398746266084, + "step": 1669 + }, + { + "epoch": 0.616492086198145, + "grad_norm": 5.6875, + "kl": 0.30010509490966797, + "learning_rate": 3.301272337355058e-06, + "logits/chosen": 182050141.0909091, + "logits/rejected": 404952857.6, + "logps/chosen": -342.27450284090907, + "logps/rejected": -404.5845458984375, + "loss": 0.0988, + "rewards/chosen": 3.1693111766468394, + "rewards/margins": 7.9052658427845355, + "rewards/rejected": -4.735954666137696, + "step": 1670 + }, + { + "epoch": 0.6168612431359882, + "grad_norm": 6.03125, + "kl": 0.0, + "learning_rate": 3.295738254000835e-06, + "logits/chosen": 227957111.46666667, + "logits/rejected": 195614102.5882353, + "logps/chosen": -320.1849609375, + "logps/rejected": -437.5427676930147, + "loss": 0.0913, + "rewards/chosen": 2.230022176106771, + "rewards/margins": 8.576737766639859, + "rewards/rejected": -6.346715590533088, + "step": 1671 + }, + { + "epoch": 0.6172304000738313, + "grad_norm": 4.6875, + "kl": 0.014677047729492188, + "learning_rate": 3.2902065318796072e-06, + "logits/chosen": 315429262.2222222, + "logits/rejected": 164600064.0, + "logps/chosen": -314.1724446614583, + "logps/rejected": -292.58705357142856, + "loss": 0.075, + "rewards/chosen": 2.6172235276963978, + "rewards/margins": 8.689374711778429, + "rewards/rejected": -6.072151184082031, + "step": 1672 + }, + { + "epoch": 0.6175995570116746, + "grad_norm": 5.4375, + "kl": 0.0, + "learning_rate": 3.2846771786555075e-06, + "logits/chosen": 264118137.2631579, + "logits/rejected": 241239315.69230768, + "logps/chosen": -409.21114309210526, + "logps/rejected": -509.7318584735577, + "loss": 0.0824, + "rewards/chosen": 2.7018773932206, + "rewards/margins": 9.240886487458882, + "rewards/rejected": -6.539009094238281, + "step": 1673 + }, + { + "epoch": 0.6179687139495178, + "grad_norm": 4.53125, + "kl": 0.0, + "learning_rate": 3.279150201989384e-06, + "logits/chosen": 177674211.55555555, + "logits/rejected": 219591012.17391303, + "logps/chosen": -366.9159342447917, + "logps/rejected": -404.22702955163044, + "loss": 0.048, + "rewards/chosen": 2.987739986843533, + "rewards/margins": 10.541267155449171, + "rewards/rejected": -7.553527168605639, + "step": 1674 + }, + { + "epoch": 0.618337870887361, + "grad_norm": 5.1875, + "kl": 2.6850404739379883, + "learning_rate": 3.2736256095387912e-06, + "logits/chosen": 214238624.0, + "logits/rejected": 183838736.0, + "logps/chosen": -362.3451843261719, + "logps/rejected": -386.79241943359375, + "loss": 0.0912, + "rewards/chosen": 3.1997296810150146, + "rewards/margins": 9.647626638412476, + "rewards/rejected": -6.447896957397461, + "step": 1675 + }, + { + "epoch": 0.6187070278252041, + "grad_norm": 8.5, + "kl": 0.0, + "learning_rate": 3.2681034089579843e-06, + "logits/chosen": 182752544.0, + "logits/rejected": 257935872.0, + "logps/chosen": -445.558837890625, + "logps/rejected": -511.276611328125, + "loss": 0.075, + "rewards/chosen": 3.7275657653808594, + "rewards/margins": 11.5919189453125, + "rewards/rejected": -7.864353179931641, + "step": 1676 + }, + { + "epoch": 0.6190761847630474, + "grad_norm": 5.28125, + "kl": 1.00445556640625, + "learning_rate": 3.2625836078979013e-06, + "logits/chosen": 160931719.52941176, + "logits/rejected": 147231436.8, + "logps/chosen": -312.01795151654414, + "logps/rejected": -370.09430338541665, + "loss": 0.066, + "rewards/chosen": 3.6567456862505745, + "rewards/margins": 10.09441556743547, + "rewards/rejected": -6.437669881184896, + "step": 1677 + }, + { + "epoch": 0.6194453417008906, + "grad_norm": 3.890625, + "kl": 0.19110393524169922, + "learning_rate": 3.2570662140061543e-06, + "logits/chosen": 296389632.0, + "logits/rejected": 273299131.73333335, + "logps/chosen": -361.21165556066177, + "logps/rejected": -475.40498046875, + "loss": 0.0604, + "rewards/chosen": 3.405132966883042, + "rewards/margins": 10.562890535242417, + "rewards/rejected": -7.157757568359375, + "step": 1678 + }, + { + "epoch": 0.6198144986387338, + "grad_norm": 4.125, + "kl": 0.0, + "learning_rate": 3.251551234927025e-06, + "logits/chosen": 191973401.6, + "logits/rejected": 244529941.33333334, + "logps/chosen": -353.90849609375, + "logps/rejected": -410.3572591145833, + "loss": 0.0712, + "rewards/chosen": 3.2171802520751953, + "rewards/margins": 9.710159937540691, + "rewards/rejected": -6.492979685465495, + "step": 1679 + }, + { + "epoch": 0.620183655576577, + "grad_norm": 5.15625, + "kl": 0.0, + "learning_rate": 3.2460386783014466e-06, + "logits/chosen": 268376484.57142857, + "logits/rejected": 244703687.1111111, + "logps/chosen": -305.63065011160717, + "logps/rejected": -407.08827039930554, + "loss": 0.1033, + "rewards/chosen": 2.3135741097586497, + "rewards/margins": 8.362862965417287, + "rewards/rejected": -6.049288855658637, + "step": 1680 + }, + { + "epoch": 0.6205528125144202, + "grad_norm": 4.9375, + "kl": 0.0, + "learning_rate": 3.240528551766994e-06, + "logits/chosen": 161030912.0, + "logits/rejected": 235509917.53846154, + "logps/chosen": -329.92948190789474, + "logps/rejected": -526.2522536057693, + "loss": 0.0787, + "rewards/chosen": 2.803119659423828, + "rewards/margins": 10.016961024357723, + "rewards/rejected": -7.213841364933894, + "step": 1681 + }, + { + "epoch": 0.6209219694522634, + "grad_norm": 8.0, + "kl": 0.0, + "learning_rate": 3.2350208629578795e-06, + "logits/chosen": 227341264.0, + "logits/rejected": 171162816.0, + "logps/chosen": -458.0523681640625, + "logps/rejected": -440.31573486328125, + "loss": 0.0907, + "rewards/chosen": 2.291940450668335, + "rewards/margins": 8.540462732315063, + "rewards/rejected": -6.2485222816467285, + "step": 1682 + }, + { + "epoch": 0.6212911263901066, + "grad_norm": 7.5, + "kl": 1.5476152896881104, + "learning_rate": 3.229515619504936e-06, + "logits/chosen": 145153008.0, + "logits/rejected": 282058880.0, + "logps/chosen": -361.7148132324219, + "logps/rejected": -489.4687805175781, + "loss": 0.1291, + "rewards/chosen": 2.308596134185791, + "rewards/margins": 10.311841487884521, + "rewards/rejected": -8.00324535369873, + "step": 1683 + }, + { + "epoch": 0.6216602833279498, + "grad_norm": 4.40625, + "kl": 0.5626134872436523, + "learning_rate": 3.224012829035607e-06, + "logits/chosen": 181547872.0, + "logits/rejected": 148361568.0, + "logps/chosen": -320.01495361328125, + "logps/rejected": -447.0802307128906, + "loss": 0.0771, + "rewards/chosen": 2.8532819747924805, + "rewards/margins": 11.002297401428223, + "rewards/rejected": -8.149015426635742, + "step": 1684 + }, + { + "epoch": 0.622029440265793, + "grad_norm": 3.875, + "kl": 0.0, + "learning_rate": 3.2185124991739406e-06, + "logits/chosen": 180383129.6, + "logits/rejected": 214389317.8181818, + "logps/chosen": -284.7353515625, + "logps/rejected": -439.11288174715907, + "loss": 0.0641, + "rewards/chosen": 2.1686895370483397, + "rewards/margins": 9.178497470508923, + "rewards/rejected": -7.009807933460582, + "step": 1685 + }, + { + "epoch": 0.6223985972036362, + "grad_norm": 4.65625, + "kl": 0.0, + "learning_rate": 3.2130146375405748e-06, + "logits/chosen": 133129270.85714285, + "logits/rejected": 169631971.55555555, + "logps/chosen": -229.18411690848214, + "logps/rejected": -373.22930230034723, + "loss": 0.1015, + "rewards/chosen": 2.330439976283482, + "rewards/margins": 8.148046160501147, + "rewards/rejected": -5.817606184217665, + "step": 1686 + }, + { + "epoch": 0.6227677541414794, + "grad_norm": 4.875, + "kl": 0.0, + "learning_rate": 3.2075192517527233e-06, + "logits/chosen": 234361480.53333333, + "logits/rejected": 278777042.8235294, + "logps/chosen": -318.84290364583336, + "logps/rejected": -433.6752068014706, + "loss": 0.0659, + "rewards/chosen": 2.7934580485026044, + "rewards/margins": 9.024483744303385, + "rewards/rejected": -6.231025695800781, + "step": 1687 + }, + { + "epoch": 0.6231369110793226, + "grad_norm": 3.546875, + "kl": 0.0, + "learning_rate": 3.2020263494241757e-06, + "logits/chosen": 101572829.0909091, + "logits/rejected": 196116748.19047618, + "logps/chosen": -293.87351296164775, + "logps/rejected": -568.6834542410714, + "loss": 0.0481, + "rewards/chosen": 3.318952040238814, + "rewards/margins": 10.612379280519692, + "rewards/rejected": -7.293427240280878, + "step": 1688 + }, + { + "epoch": 0.6235060680171658, + "grad_norm": 8.5625, + "kl": 0.6080255508422852, + "learning_rate": 3.196535938165277e-06, + "logits/chosen": 316544391.5294118, + "logits/rejected": 203978905.6, + "logps/chosen": -424.3502987132353, + "logps/rejected": -534.6986002604167, + "loss": 0.1411, + "rewards/chosen": 1.5647807401769303, + "rewards/margins": 9.124091653262868, + "rewards/rejected": -7.5593109130859375, + "step": 1689 + }, + { + "epoch": 0.623875224955009, + "grad_norm": 5.625, + "kl": 0.0, + "learning_rate": 3.1910480255829235e-06, + "logits/chosen": 213815808.0, + "logits/rejected": 230186736.0, + "logps/chosen": -374.3551940917969, + "logps/rejected": -412.953369140625, + "loss": 0.0681, + "rewards/chosen": 3.1513030529022217, + "rewards/margins": 9.647020101547241, + "rewards/rejected": -6.4957170486450195, + "step": 1690 + }, + { + "epoch": 0.6242443818928523, + "grad_norm": 5.0, + "kl": 0.13101768493652344, + "learning_rate": 3.185562619280549e-06, + "logits/chosen": 170183734.85714287, + "logits/rejected": 197645340.44444445, + "logps/chosen": -352.5523158482143, + "logps/rejected": -463.42686631944446, + "loss": 0.0827, + "rewards/chosen": 2.2882799421037947, + "rewards/margins": 10.345795404343379, + "rewards/rejected": -8.057515462239584, + "step": 1691 + }, + { + "epoch": 0.6246135388306954, + "grad_norm": 4.96875, + "kl": 0.0, + "learning_rate": 3.1800797268581115e-06, + "logits/chosen": 304879680.0, + "logits/rejected": 254355686.4, + "logps/chosen": -372.9073079427083, + "logps/rejected": -577.935888671875, + "loss": 0.0608, + "rewards/chosen": 2.8466622034708657, + "rewards/margins": 11.059265200297038, + "rewards/rejected": -8.212602996826172, + "step": 1692 + }, + { + "epoch": 0.6249826957685386, + "grad_norm": 7.84375, + "kl": 1.583846092224121, + "learning_rate": 3.174599355912092e-06, + "logits/chosen": 170026208.0, + "logits/rejected": 162069312.0, + "logps/chosen": -299.4337158203125, + "logps/rejected": -438.9234313964844, + "loss": 0.1035, + "rewards/chosen": 2.7685322761535645, + "rewards/margins": 9.500402450561523, + "rewards/rejected": -6.731870174407959, + "step": 1693 + }, + { + "epoch": 0.6253518527063818, + "grad_norm": 4.125, + "kl": 0.607607364654541, + "learning_rate": 3.169121514035473e-06, + "logits/chosen": 224488576.0, + "logits/rejected": 126124224.0, + "logps/chosen": -361.4172119140625, + "logps/rejected": -268.07729085286456, + "loss": 0.0841, + "rewards/chosen": 3.10894832611084, + "rewards/margins": 8.283429718017578, + "rewards/rejected": -5.174481391906738, + "step": 1694 + }, + { + "epoch": 0.625721009644225, + "grad_norm": 5.125, + "kl": 0.0, + "learning_rate": 3.1636462088177345e-06, + "logits/chosen": 144011913.14285713, + "logits/rejected": 186899811.55555555, + "logps/chosen": -296.069580078125, + "logps/rejected": -429.34331597222223, + "loss": 0.0724, + "rewards/chosen": 3.196262632097517, + "rewards/margins": 9.156552905128116, + "rewards/rejected": -5.960290273030599, + "step": 1695 + }, + { + "epoch": 0.625721009644225, + "eval_kl": 0.34610968828201294, + "eval_logits/chosen": 229151108.803532, + "eval_logits/rejected": 195237069.7683215, + "eval_logps/chosen": -355.56815673289185, + "eval_logps/rejected": -443.9371675531915, + "eval_loss": 0.08047980815172195, + "eval_rewards/chosen": 2.7977378457850444, + "eval_rewards/margins": 9.366090871974464, + "eval_rewards/rejected": -6.5683530261894205, + "eval_runtime": 52.8205, + "eval_samples_per_second": 16.584, + "eval_steps_per_second": 4.146, + "step": 1695 + }, + { + "epoch": 0.6260901665820682, + "grad_norm": 7.78125, + "kl": 0.0, + "learning_rate": 3.1581734478448447e-06, + "logits/chosen": 206888168.72727272, + "logits/rejected": 225613952.0, + "logps/chosen": -344.99678178267044, + "logps/rejected": -386.096728515625, + "loss": 0.1519, + "rewards/chosen": 1.7081017927689985, + "rewards/margins": 8.335659928755327, + "rewards/rejected": -6.6275581359863285, + "step": 1696 + }, + { + "epoch": 0.6264593235199114, + "grad_norm": 3.109375, + "kl": 0.0, + "learning_rate": 3.152703238699242e-06, + "logits/chosen": 207946461.86666667, + "logits/rejected": 208287412.70588234, + "logps/chosen": -430.84986979166666, + "logps/rejected": -434.5193301930147, + "loss": 0.0491, + "rewards/chosen": 3.210815938313802, + "rewards/margins": 10.122304190841376, + "rewards/rejected": -6.911488252527573, + "step": 1697 + }, + { + "epoch": 0.6268284804577546, + "grad_norm": 6.6875, + "kl": 0.0, + "learning_rate": 3.147235588959831e-06, + "logits/chosen": 247386693.8181818, + "logits/rejected": 192089356.8, + "logps/chosen": -372.99520596590907, + "logps/rejected": -409.3025390625, + "loss": 0.1276, + "rewards/chosen": 1.9426815726540305, + "rewards/margins": 8.220092842795632, + "rewards/rejected": -6.277411270141601, + "step": 1698 + }, + { + "epoch": 0.6271976373955978, + "grad_norm": 7.625, + "kl": 2.944716453552246, + "learning_rate": 3.1417705062019742e-06, + "logits/chosen": 266398310.4, + "logits/rejected": 168289440.0, + "logps/chosen": -405.892041015625, + "logps/rejected": -576.7840983072916, + "loss": 0.1461, + "rewards/chosen": 2.447518539428711, + "rewards/margins": 9.691415659586589, + "rewards/rejected": -7.243897120157878, + "step": 1699 + }, + { + "epoch": 0.627566794333441, + "grad_norm": 5.96875, + "kl": 0.0, + "learning_rate": 3.136307997997472e-06, + "logits/chosen": 323884960.0, + "logits/rejected": 228717888.0, + "logps/chosen": -324.49896240234375, + "logps/rejected": -496.1636962890625, + "loss": 0.0931, + "rewards/chosen": 2.792818069458008, + "rewards/margins": 9.462363243103027, + "rewards/rejected": -6.6695451736450195, + "step": 1700 + }, + { + "epoch": 0.6279359512712842, + "grad_norm": 5.9375, + "kl": 0.3125629425048828, + "learning_rate": 3.1308480719145594e-06, + "logits/chosen": 246652181.33333334, + "logits/rejected": 198750480.0, + "logps/chosen": -355.4556477864583, + "logps/rejected": -324.0204162597656, + "loss": 0.1023, + "rewards/chosen": 2.609989643096924, + "rewards/margins": 9.207754611968994, + "rewards/rejected": -6.59776496887207, + "step": 1701 + }, + { + "epoch": 0.6283051082091274, + "grad_norm": 4.4375, + "kl": 3.28700590133667, + "learning_rate": 3.125390735517898e-06, + "logits/chosen": 133318987.29411764, + "logits/rejected": 202986359.46666667, + "logps/chosen": -264.8616727941176, + "logps/rejected": -332.2091796875, + "loss": 0.0961, + "rewards/chosen": 3.4543008243336395, + "rewards/margins": 8.528711835075827, + "rewards/rejected": -5.074411010742187, + "step": 1702 + }, + { + "epoch": 0.6286742651469706, + "grad_norm": 5.96875, + "kl": 0.904144287109375, + "learning_rate": 3.119935996368556e-06, + "logits/chosen": 227780800.0, + "logits/rejected": 195705728.0, + "logps/chosen": -428.3711853027344, + "logps/rejected": -330.2705993652344, + "loss": 0.0945, + "rewards/chosen": 2.5496902465820312, + "rewards/margins": 8.076478958129883, + "rewards/rejected": -5.526788711547852, + "step": 1703 + }, + { + "epoch": 0.6290434220848138, + "grad_norm": 3.953125, + "kl": 0.0, + "learning_rate": 3.1144838620240038e-06, + "logits/chosen": 230832457.14285713, + "logits/rejected": 181074816.0, + "logps/chosen": -464.5595005580357, + "logps/rejected": -360.91834852430554, + "loss": 0.0613, + "rewards/chosen": 3.2871426173618863, + "rewards/margins": 8.538702828543528, + "rewards/rejected": -5.251560211181641, + "step": 1704 + }, + { + "epoch": 0.629412579022657, + "grad_norm": 7.78125, + "kl": 3.005979061126709, + "learning_rate": 3.109034340038106e-06, + "logits/chosen": 173521859.04761904, + "logits/rejected": 132900852.36363636, + "logps/chosen": -400.2373046875, + "logps/rejected": -284.96908291903407, + "loss": 0.1139, + "rewards/chosen": 3.1431681315104165, + "rewards/margins": 7.346821987267697, + "rewards/rejected": -4.20365385575728, + "step": 1705 + }, + { + "epoch": 0.6297817359605002, + "grad_norm": 5.46875, + "kl": 1.1188678741455078, + "learning_rate": 3.103587437961104e-06, + "logits/chosen": 161704622.54545453, + "logits/rejected": 193385203.2, + "logps/chosen": -369.7078302556818, + "logps/rejected": -712.1005859375, + "loss": 0.0763, + "rewards/chosen": 3.476016304709695, + "rewards/margins": 15.368748543479226, + "rewards/rejected": -11.892732238769531, + "step": 1706 + }, + { + "epoch": 0.6301508928983434, + "grad_norm": 3.203125, + "kl": 0.0, + "learning_rate": 3.0981431633396153e-06, + "logits/chosen": 179464115.2, + "logits/rejected": 220021760.0, + "logps/chosen": -378.5281982421875, + "logps/rejected": -415.3922674005682, + "loss": 0.0335, + "rewards/chosen": 3.5094882965087892, + "rewards/margins": 9.9805382121693, + "rewards/rejected": -6.471049915660512, + "step": 1707 + }, + { + "epoch": 0.6305200498361866, + "grad_norm": 3.296875, + "kl": 0.3451552391052246, + "learning_rate": 3.0927015237166104e-06, + "logits/chosen": 165059696.0, + "logits/rejected": 221410672.0, + "logps/chosen": -267.7331237792969, + "logps/rejected": -471.49151611328125, + "loss": 0.0447, + "rewards/chosen": 3.1415135860443115, + "rewards/margins": 9.913703203201294, + "rewards/rejected": -6.772189617156982, + "step": 1708 + }, + { + "epoch": 0.6308892067740298, + "grad_norm": 4.78125, + "kl": 0.16387939453125, + "learning_rate": 3.0872625266314104e-06, + "logits/chosen": 155617204.70588234, + "logits/rejected": 195450965.33333334, + "logps/chosen": -323.90119485294116, + "logps/rejected": -465.5981770833333, + "loss": 0.0816, + "rewards/chosen": 2.7005363913143383, + "rewards/margins": 9.763505344764859, + "rewards/rejected": -7.062968953450521, + "step": 1709 + }, + { + "epoch": 0.631258363711873, + "grad_norm": 6.875, + "kl": 0.3409137725830078, + "learning_rate": 3.081826179619681e-06, + "logits/chosen": 311919858.5263158, + "logits/rejected": 244020499.69230768, + "logps/chosen": -264.8466539884868, + "logps/rejected": -381.36925330528845, + "loss": 0.1264, + "rewards/chosen": 2.577246816534745, + "rewards/margins": 9.047259774767918, + "rewards/rejected": -6.470012958233173, + "step": 1710 + }, + { + "epoch": 0.6316275206497162, + "grad_norm": 4.09375, + "kl": 0.0, + "learning_rate": 3.076392490213409e-06, + "logits/chosen": 168389655.27272728, + "logits/rejected": 266894945.52380952, + "logps/chosen": -328.275634765625, + "logps/rejected": -597.0929129464286, + "loss": 0.0762, + "rewards/chosen": 2.4528172232887964, + "rewards/margins": 10.155942223288797, + "rewards/rejected": -7.703125, + "step": 1711 + }, + { + "epoch": 0.6319966775875594, + "grad_norm": 3.328125, + "kl": 0.0, + "learning_rate": 3.0709614659409013e-06, + "logits/chosen": 199501845.33333334, + "logits/rejected": 182604275.2, + "logps/chosen": -338.57212320963544, + "logps/rejected": -436.96328125, + "loss": 0.034, + "rewards/chosen": 3.6497815450032554, + "rewards/margins": 10.475450642903645, + "rewards/rejected": -6.8256690979003904, + "step": 1712 + }, + { + "epoch": 0.6323658345254026, + "grad_norm": 5.65625, + "kl": 0.0, + "learning_rate": 3.0655331143267758e-06, + "logits/chosen": 196738752.0, + "logits/rejected": 215400816.0, + "logps/chosen": -340.8554992675781, + "logps/rejected": -453.6380310058594, + "loss": 0.1028, + "rewards/chosen": 3.1648058891296387, + "rewards/margins": 10.030722618103027, + "rewards/rejected": -6.865916728973389, + "step": 1713 + }, + { + "epoch": 0.6327349914632459, + "grad_norm": 5.78125, + "kl": 2.911423683166504, + "learning_rate": 3.060107442891943e-06, + "logits/chosen": 326504925.8666667, + "logits/rejected": 218123384.47058824, + "logps/chosen": -444.40709635416664, + "logps/rejected": -473.5076688878676, + "loss": 0.0598, + "rewards/chosen": 3.387145487467448, + "rewards/margins": 10.404154100605087, + "rewards/rejected": -7.017008613137638, + "step": 1714 + }, + { + "epoch": 0.633104148401089, + "grad_norm": 5.90625, + "kl": 0.0, + "learning_rate": 3.054684459153601e-06, + "logits/chosen": 306788171.2941176, + "logits/rejected": 325157068.8, + "logps/chosen": -340.67750459558823, + "logps/rejected": -591.4481770833333, + "loss": 0.0922, + "rewards/chosen": 2.549258961397059, + "rewards/margins": 10.714816822725183, + "rewards/rejected": -8.165557861328125, + "step": 1715 + }, + { + "epoch": 0.6334733053389322, + "grad_norm": 5.8125, + "kl": 0.0, + "learning_rate": 3.049264170625227e-06, + "logits/chosen": 208158930.82352942, + "logits/rejected": 199689796.26666668, + "logps/chosen": -347.5497472426471, + "logps/rejected": -389.00869140625, + "loss": 0.109, + "rewards/chosen": 2.462581858915441, + "rewards/margins": 9.834623179716221, + "rewards/rejected": -7.372041320800781, + "step": 1716 + }, + { + "epoch": 0.6338424622767754, + "grad_norm": 6.8125, + "kl": 1.560791015625, + "learning_rate": 3.043846584816561e-06, + "logits/chosen": 199813302.85714287, + "logits/rejected": 239813445.8181818, + "logps/chosen": -348.8135463169643, + "logps/rejected": -423.89848188920456, + "loss": 0.0985, + "rewards/chosen": 3.236209687732515, + "rewards/margins": 8.853737042579816, + "rewards/rejected": -5.617527354847301, + "step": 1717 + }, + { + "epoch": 0.6342116192146187, + "grad_norm": 5.6875, + "kl": 0.0, + "learning_rate": 3.038431709233598e-06, + "logits/chosen": 235725688.47058824, + "logits/rejected": 168565230.93333334, + "logps/chosen": -360.5758272058824, + "logps/rejected": -398.24130859375, + "loss": 0.0904, + "rewards/chosen": 2.400935453527114, + "rewards/margins": 8.601213672114355, + "rewards/rejected": -6.20027821858724, + "step": 1718 + }, + { + "epoch": 0.6345807761524618, + "grad_norm": 5.65625, + "kl": 1.383622169494629, + "learning_rate": 3.033019551378581e-06, + "logits/chosen": 208551589.6470588, + "logits/rejected": 159239082.66666666, + "logps/chosen": -272.6219841452206, + "logps/rejected": -350.1263020833333, + "loss": 0.119, + "rewards/chosen": 2.6792147019330192, + "rewards/margins": 8.2838487961713, + "rewards/rejected": -5.604634094238281, + "step": 1719 + }, + { + "epoch": 0.634949933090305, + "grad_norm": 4.625, + "kl": 0.0, + "learning_rate": 3.0276101187499864e-06, + "logits/chosen": 218463872.0, + "logits/rejected": 244938816.0, + "logps/chosen": -356.3518981933594, + "logps/rejected": -426.837890625, + "loss": 0.0875, + "rewards/chosen": 2.3916492462158203, + "rewards/margins": 9.680180549621582, + "rewards/rejected": -7.288531303405762, + "step": 1720 + }, + { + "epoch": 0.6353190900281482, + "grad_norm": 6.3125, + "kl": 0.0, + "learning_rate": 3.022203418842512e-06, + "logits/chosen": 224217744.0, + "logits/rejected": 211110576.0, + "logps/chosen": -330.31396484375, + "logps/rejected": -508.4434814453125, + "loss": 0.092, + "rewards/chosen": 2.1178646087646484, + "rewards/margins": 9.978680610656738, + "rewards/rejected": -7.86081600189209, + "step": 1721 + }, + { + "epoch": 0.6356882469659915, + "grad_norm": 6.15625, + "kl": 2.0015640258789062, + "learning_rate": 3.016799459147074e-06, + "logits/chosen": 242313523.2, + "logits/rejected": 127538218.66666667, + "logps/chosen": -390.694873046875, + "logps/rejected": -403.1761067708333, + "loss": 0.1, + "rewards/chosen": 2.6829387664794924, + "rewards/margins": 8.995639673868816, + "rewards/rejected": -6.312700907389323, + "step": 1722 + }, + { + "epoch": 0.6360574039038346, + "grad_norm": 6.15625, + "kl": 1.099165916442871, + "learning_rate": 3.0113982471507873e-06, + "logits/chosen": 171252339.2, + "logits/rejected": 224115200.0, + "logps/chosen": -272.749658203125, + "logps/rejected": -348.495849609375, + "loss": 0.1043, + "rewards/chosen": 2.705405426025391, + "rewards/margins": 9.467370859781902, + "rewards/rejected": -6.761965433756511, + "step": 1723 + }, + { + "epoch": 0.6364265608416778, + "grad_norm": 5.0625, + "kl": 0.0, + "learning_rate": 3.0059997903369658e-06, + "logits/chosen": 317831890.8235294, + "logits/rejected": 200755797.33333334, + "logps/chosen": -365.8299345128676, + "logps/rejected": -429.47718098958336, + "loss": 0.0853, + "rewards/chosen": 3.054871951832491, + "rewards/margins": 9.2433266434015, + "rewards/rejected": -6.18845469156901, + "step": 1724 + }, + { + "epoch": 0.636795717779521, + "grad_norm": 3.828125, + "kl": 0.0, + "learning_rate": 3.0006040961851014e-06, + "logits/chosen": 164246340.26666668, + "logits/rejected": 279344700.2352941, + "logps/chosen": -362.4203776041667, + "logps/rejected": -419.62439682904414, + "loss": 0.0553, + "rewards/chosen": 3.438335418701172, + "rewards/margins": 9.699136846205768, + "rewards/rejected": -6.260801427504596, + "step": 1725 + }, + { + "epoch": 0.6371648747173643, + "grad_norm": 5.375, + "kl": 1.4157674312591553, + "learning_rate": 2.9952111721708576e-06, + "logits/chosen": 261540710.4, + "logits/rejected": 144063914.66666666, + "logps/chosen": -314.926025390625, + "logps/rejected": -349.3498942057292, + "loss": 0.1273, + "rewards/chosen": 2.674770545959473, + "rewards/margins": 6.6451208114624025, + "rewards/rejected": -3.9703502655029297, + "step": 1726 + }, + { + "epoch": 0.6375340316552074, + "grad_norm": 2.40625, + "kl": 0.0, + "learning_rate": 2.9898210257660664e-06, + "logits/chosen": 277655040.0, + "logits/rejected": 201287524.17391303, + "logps/chosen": -324.17475043402777, + "logps/rejected": -571.3809018342391, + "loss": 0.0302, + "rewards/chosen": 2.975285424126519, + "rewards/margins": 10.601173751020202, + "rewards/rejected": -7.625888326893682, + "step": 1727 + }, + { + "epoch": 0.6379031885930506, + "grad_norm": 5.5625, + "kl": 0.0, + "learning_rate": 2.984433664438704e-06, + "logits/chosen": 206773820.2352941, + "logits/rejected": 171841809.06666666, + "logps/chosen": -307.6460822610294, + "logps/rejected": -288.480859375, + "loss": 0.1015, + "rewards/chosen": 2.4090109432444855, + "rewards/margins": 8.828037127326517, + "rewards/rejected": -6.419026184082031, + "step": 1728 + }, + { + "epoch": 0.6382723455308938, + "grad_norm": 5.28125, + "kl": 0.7205114364624023, + "learning_rate": 2.979049095652892e-06, + "logits/chosen": 350067768.8888889, + "logits/rejected": 261268827.42857143, + "logps/chosen": -343.761962890625, + "logps/rejected": -443.3335658482143, + "loss": 0.074, + "rewards/chosen": 2.962757110595703, + "rewards/margins": 9.44514410836356, + "rewards/rejected": -6.482386997767857, + "step": 1729 + }, + { + "epoch": 0.6386415024687371, + "grad_norm": 6.34375, + "kl": 0.0, + "learning_rate": 2.973667326868882e-06, + "logits/chosen": 248687104.0, + "logits/rejected": 195116913.7777778, + "logps/chosen": -437.90806361607144, + "logps/rejected": -440.6433376736111, + "loss": 0.0925, + "rewards/chosen": 2.7246720450265065, + "rewards/margins": 8.508947099958148, + "rewards/rejected": -5.784275054931641, + "step": 1730 + }, + { + "epoch": 0.6390106594065802, + "grad_norm": 6.25, + "kl": 0.5029020309448242, + "learning_rate": 2.968288365543047e-06, + "logits/chosen": 290815129.6, + "logits/rejected": 188389568.0, + "logps/chosen": -325.9523681640625, + "logps/rejected": -474.4121500651042, + "loss": 0.1125, + "rewards/chosen": 2.4119802474975587, + "rewards/margins": 8.903685824076335, + "rewards/rejected": -6.491705576578776, + "step": 1731 + }, + { + "epoch": 0.6393798163444234, + "grad_norm": 5.1875, + "kl": 0.17247343063354492, + "learning_rate": 2.9629122191278677e-06, + "logits/chosen": 165298224.0, + "logits/rejected": 199404608.0, + "logps/chosen": -335.7440185546875, + "logps/rejected": -378.66973876953125, + "loss": 0.0735, + "rewards/chosen": 3.1482791900634766, + "rewards/margins": 9.56049108505249, + "rewards/rejected": -6.412211894989014, + "step": 1732 + }, + { + "epoch": 0.6397489732822667, + "grad_norm": 3.078125, + "kl": 0.0, + "learning_rate": 2.9575388950719286e-06, + "logits/chosen": 225346304.0, + "logits/rejected": 249143728.0, + "logps/chosen": -343.61480712890625, + "logps/rejected": -662.44677734375, + "loss": 0.0495, + "rewards/chosen": 3.2875852584838867, + "rewards/margins": 11.047679901123047, + "rewards/rejected": -7.76009464263916, + "step": 1733 + }, + { + "epoch": 0.6401181302201098, + "grad_norm": 3.609375, + "kl": 1.0683414936065674, + "learning_rate": 2.9521684008199012e-06, + "logits/chosen": 226036210.52631578, + "logits/rejected": 281064960.0, + "logps/chosen": -372.4350842927632, + "logps/rejected": -494.03170072115387, + "loss": 0.0603, + "rewards/chosen": 3.078982905337685, + "rewards/margins": 10.084237210663707, + "rewards/rejected": -7.005254305326021, + "step": 1734 + }, + { + "epoch": 0.640487287157953, + "grad_norm": 5.125, + "kl": 0.21534395217895508, + "learning_rate": 2.946800743812537e-06, + "logits/chosen": 218551093.89473686, + "logits/rejected": 206051820.30769232, + "logps/chosen": -297.9867393092105, + "logps/rejected": -397.7901141826923, + "loss": 0.1024, + "rewards/chosen": 2.623192837363795, + "rewards/margins": 8.83335507350412, + "rewards/rejected": -6.210162236140325, + "step": 1735 + }, + { + "epoch": 0.6408564440957962, + "grad_norm": 4.78125, + "kl": 0.0, + "learning_rate": 2.941435931486656e-06, + "logits/chosen": 160478592.0, + "logits/rejected": 209783232.0, + "logps/chosen": -306.3674621582031, + "logps/rejected": -383.089599609375, + "loss": 0.0949, + "rewards/chosen": 2.7154064178466797, + "rewards/margins": 9.063541889190674, + "rewards/rejected": -6.348135471343994, + "step": 1736 + }, + { + "epoch": 0.6412256010336395, + "grad_norm": 5.5, + "kl": 0.6685600280761719, + "learning_rate": 2.9360739712751394e-06, + "logits/chosen": 194130838.5882353, + "logits/rejected": 198286011.73333332, + "logps/chosen": -390.97443704044116, + "logps/rejected": -410.05751953125, + "loss": 0.0819, + "rewards/chosen": 3.0048881979549633, + "rewards/margins": 9.164789117551317, + "rewards/rejected": -6.159900919596354, + "step": 1737 + }, + { + "epoch": 0.6415947579714826, + "grad_norm": 4.875, + "kl": 0.0, + "learning_rate": 2.9307148706069145e-06, + "logits/chosen": 185211057.23076922, + "logits/rejected": 178529010.52631578, + "logps/chosen": -342.8097581129808, + "logps/rejected": -431.74917763157896, + "loss": 0.079, + "rewards/chosen": 2.3471136826735277, + "rewards/margins": 8.845779743271802, + "rewards/rejected": -6.498666060598273, + "step": 1738 + }, + { + "epoch": 0.6419639149093258, + "grad_norm": 5.4375, + "kl": 0.0, + "learning_rate": 2.9253586369069447e-06, + "logits/chosen": 252334694.4, + "logits/rejected": 248281991.52941176, + "logps/chosen": -374.78704427083335, + "logps/rejected": -431.64631204044116, + "loss": 0.1348, + "rewards/chosen": 1.4249900817871093, + "rewards/margins": 8.068281779569737, + "rewards/rejected": -6.643291697782629, + "step": 1739 + }, + { + "epoch": 0.642333071847169, + "grad_norm": 4.28125, + "kl": 0.0, + "learning_rate": 2.920005277596225e-06, + "logits/chosen": 193877520.0, + "logits/rejected": 183765552.0, + "logps/chosen": -370.78265380859375, + "logps/rejected": -415.1462707519531, + "loss": 0.0783, + "rewards/chosen": 2.9239823818206787, + "rewards/margins": 9.163837671279907, + "rewards/rejected": -6.2398552894592285, + "step": 1740 + }, + { + "epoch": 0.6427022287850123, + "grad_norm": 3.5625, + "kl": 0.0, + "learning_rate": 2.914654800091768e-06, + "logits/chosen": 161273325.7142857, + "logits/rejected": 208324039.1111111, + "logps/chosen": -382.85940987723217, + "logps/rejected": -378.0492892795139, + "loss": 0.0675, + "rewards/chosen": 2.4573868342808316, + "rewards/margins": 8.766233202010866, + "rewards/rejected": -6.308846367730035, + "step": 1741 + }, + { + "epoch": 0.6430713857228554, + "grad_norm": 6.59375, + "kl": 2.6381969451904297, + "learning_rate": 2.9093072118065903e-06, + "logits/chosen": 137447082.66666666, + "logits/rejected": 199172699.42857143, + "logps/chosen": -354.7816569010417, + "logps/rejected": -421.1435546875, + "loss": 0.1001, + "rewards/chosen": 2.903320736355252, + "rewards/margins": 8.098736233181423, + "rewards/rejected": -5.195415496826172, + "step": 1742 + }, + { + "epoch": 0.6434405426606986, + "grad_norm": 6.625, + "kl": 2.169375419616699, + "learning_rate": 2.9039625201497106e-06, + "logits/chosen": 234564024.8888889, + "logits/rejected": 209223588.57142857, + "logps/chosen": -386.63671875, + "logps/rejected": -441.42783900669644, + "loss": 0.1325, + "rewards/chosen": 3.019129859076606, + "rewards/margins": 9.688609774150546, + "rewards/rejected": -6.66947991507394, + "step": 1743 + }, + { + "epoch": 0.6438096995985418, + "grad_norm": 4.84375, + "kl": 0.02286243438720703, + "learning_rate": 2.8986207325261272e-06, + "logits/chosen": 222086798.2222222, + "logits/rejected": 168880530.2857143, + "logps/chosen": -379.16026475694446, + "logps/rejected": -354.386962890625, + "loss": 0.0738, + "rewards/chosen": 2.7428131103515625, + "rewards/margins": 9.069566454206194, + "rewards/rejected": -6.326753343854632, + "step": 1744 + }, + { + "epoch": 0.6441788565363851, + "grad_norm": 5.625, + "kl": 0.0, + "learning_rate": 2.893281856336822e-06, + "logits/chosen": 228165961.14285713, + "logits/rejected": 163432860.44444445, + "logps/chosen": -439.12904575892856, + "logps/rejected": -524.4853515625, + "loss": 0.079, + "rewards/chosen": 2.238978930882045, + "rewards/margins": 11.16459063878135, + "rewards/rejected": -8.925611707899305, + "step": 1745 + }, + { + "epoch": 0.6445480134742282, + "grad_norm": 4.53125, + "kl": 0.0, + "learning_rate": 2.887945898978741e-06, + "logits/chosen": 349169536.0, + "logits/rejected": 204379136.0, + "logps/chosen": -398.52386474609375, + "logps/rejected": -398.3068440755208, + "loss": 0.0756, + "rewards/chosen": 2.660767078399658, + "rewards/margins": 8.658344745635986, + "rewards/rejected": -5.997577667236328, + "step": 1746 + }, + { + "epoch": 0.6449171704120714, + "grad_norm": 5.6875, + "kl": 0.37863779067993164, + "learning_rate": 2.8826128678447806e-06, + "logits/chosen": 198699895.46666667, + "logits/rejected": 211912975.05882353, + "logps/chosen": -392.878125, + "logps/rejected": -400.17839499080884, + "loss": 0.0995, + "rewards/chosen": 2.302209981282552, + "rewards/margins": 7.715003309062883, + "rewards/rejected": -5.412793327780331, + "step": 1747 + }, + { + "epoch": 0.6452863273499146, + "grad_norm": 5.15625, + "kl": 1.8838310241699219, + "learning_rate": 2.8772827703237914e-06, + "logits/chosen": 327486866.28571427, + "logits/rejected": 204773944.8888889, + "logps/chosen": -319.0743931361607, + "logps/rejected": -344.8424479166667, + "loss": 0.0911, + "rewards/chosen": 2.8972345079694475, + "rewards/margins": 7.734599552457295, + "rewards/rejected": -4.837365044487847, + "step": 1748 + }, + { + "epoch": 0.6456554842877579, + "grad_norm": 4.84375, + "kl": 0.0, + "learning_rate": 2.871955613800557e-06, + "logits/chosen": 223180259.55555555, + "logits/rejected": 213087049.14285713, + "logps/chosen": -288.5857204861111, + "logps/rejected": -351.3093959263393, + "loss": 0.095, + "rewards/chosen": 2.3966632419162326, + "rewards/margins": 7.254569583468967, + "rewards/rejected": -4.857906341552734, + "step": 1749 + }, + { + "epoch": 0.646024641225601, + "grad_norm": 3.296875, + "kl": 0.0, + "learning_rate": 2.8666314056557815e-06, + "logits/chosen": 182728802.46153846, + "logits/rejected": 234576195.36842105, + "logps/chosen": -276.1807391826923, + "logps/rejected": -521.4027549342105, + "loss": 0.0422, + "rewards/chosen": 3.6575572674091044, + "rewards/margins": 11.100259711385256, + "rewards/rejected": -7.4427024439761515, + "step": 1750 + }, + { + "epoch": 0.6463937981634442, + "grad_norm": 4.84375, + "kl": 0.013498306274414062, + "learning_rate": 2.8613101532660894e-06, + "logits/chosen": 202411287.27272728, + "logits/rejected": 145442457.6, + "logps/chosen": -334.29487748579544, + "logps/rejected": -592.154736328125, + "loss": 0.0869, + "rewards/chosen": 3.0797160755504263, + "rewards/margins": 11.282604564319957, + "rewards/rejected": -8.202888488769531, + "step": 1751 + }, + { + "epoch": 0.6467629551012875, + "grad_norm": 3.21875, + "kl": 0.0, + "learning_rate": 2.85599186400401e-06, + "logits/chosen": 195185152.0, + "logits/rejected": 217606371.55555555, + "logps/chosen": -396.17640904017856, + "logps/rejected": -421.18353949652777, + "loss": 0.0393, + "rewards/chosen": 3.3711468832833424, + "rewards/margins": 9.996844972882952, + "rewards/rejected": -6.625698089599609, + "step": 1752 + }, + { + "epoch": 0.6471321120391307, + "grad_norm": 6.21875, + "kl": 0.0, + "learning_rate": 2.8506765452379604e-06, + "logits/chosen": 438446682.35294116, + "logits/rejected": 244424704.0, + "logps/chosen": -394.8489200367647, + "logps/rejected": -477.35003255208335, + "loss": 0.1113, + "rewards/chosen": 2.257961497587316, + "rewards/margins": 7.344695506376379, + "rewards/rejected": -5.086734008789063, + "step": 1753 + }, + { + "epoch": 0.6475012689769738, + "grad_norm": 3.78125, + "kl": 0.0, + "learning_rate": 2.8453642043322517e-06, + "logits/chosen": 238673646.93333334, + "logits/rejected": 281473867.2941176, + "logps/chosen": -262.16404622395834, + "logps/rejected": -419.0653722426471, + "loss": 0.068, + "rewards/chosen": 2.973915608723958, + "rewards/margins": 9.703847608379288, + "rewards/rejected": -6.729931999655331, + "step": 1754 + }, + { + "epoch": 0.647870425914817, + "grad_norm": 5.71875, + "kl": 0.09079170227050781, + "learning_rate": 2.8400548486470657e-06, + "logits/chosen": 193177720.47058824, + "logits/rejected": 206084454.4, + "logps/chosen": -275.3885857077206, + "logps/rejected": -422.46728515625, + "loss": 0.08, + "rewards/chosen": 2.54552728989545, + "rewards/margins": 9.429228868671492, + "rewards/rejected": -6.883701578776042, + "step": 1755 + }, + { + "epoch": 0.6482395828526603, + "grad_norm": 6.84375, + "kl": 0.32238292694091797, + "learning_rate": 2.834748485538444e-06, + "logits/chosen": 194127705.6, + "logits/rejected": 173491082.66666666, + "logps/chosen": -335.1042236328125, + "logps/rejected": -385.3439127604167, + "loss": 0.1042, + "rewards/chosen": 3.1111764907836914, + "rewards/margins": 8.977260271708172, + "rewards/rejected": -5.8660837809244795, + "step": 1756 + }, + { + "epoch": 0.6486087397905035, + "grad_norm": 5.5625, + "kl": 1.659231424331665, + "learning_rate": 2.829445122358285e-06, + "logits/chosen": 157861158.4, + "logits/rejected": 298940202.6666667, + "logps/chosen": -353.356201171875, + "logps/rejected": -504.9119873046875, + "loss": 0.0888, + "rewards/chosen": 2.9128486633300783, + "rewards/margins": 9.455155563354491, + "rewards/rejected": -6.542306900024414, + "step": 1757 + }, + { + "epoch": 0.6489778967283466, + "grad_norm": 4.71875, + "kl": 0.0, + "learning_rate": 2.824144766454333e-06, + "logits/chosen": 226170131.69230768, + "logits/rejected": 198095279.15789473, + "logps/chosen": -330.78012319711536, + "logps/rejected": -386.80592105263156, + "loss": 0.0532, + "rewards/chosen": 2.5264117901141825, + "rewards/margins": 9.179196840355754, + "rewards/rejected": -6.652785050241571, + "step": 1758 + }, + { + "epoch": 0.6493470536661898, + "grad_norm": 6.46875, + "kl": 0.0, + "learning_rate": 2.8188474251701647e-06, + "logits/chosen": 232630528.0, + "logits/rejected": 142135957.33333334, + "logps/chosen": -349.10048828125, + "logps/rejected": -325.44036865234375, + "loss": 0.1154, + "rewards/chosen": 2.500308609008789, + "rewards/margins": 8.35693130493164, + "rewards/rejected": -5.856622695922852, + "step": 1759 + }, + { + "epoch": 0.6497162106040331, + "grad_norm": 4.40625, + "kl": 0.0, + "learning_rate": 2.8135531058451746e-06, + "logits/chosen": 165291955.2, + "logits/rejected": 229487941.8181818, + "logps/chosen": -341.5548828125, + "logps/rejected": -352.5941051136364, + "loss": 0.0537, + "rewards/chosen": 2.967854309082031, + "rewards/margins": 8.31998554576527, + "rewards/rejected": -5.352131236683238, + "step": 1760 + }, + { + "epoch": 0.6500853675418763, + "grad_norm": 5.78125, + "kl": 0.0, + "learning_rate": 2.8082618158145792e-06, + "logits/chosen": 188394259.69230768, + "logits/rejected": 257848832.0, + "logps/chosen": -343.60903695913464, + "logps/rejected": -454.16334292763156, + "loss": 0.0721, + "rewards/chosen": 3.0641124431903544, + "rewards/margins": 9.314783428362023, + "rewards/rejected": -6.250670985171669, + "step": 1761 + }, + { + "epoch": 0.6504545244797194, + "grad_norm": 3.90625, + "kl": 0.37172603607177734, + "learning_rate": 2.8029735624093936e-06, + "logits/chosen": 175088828.63157895, + "logits/rejected": 207983497.84615386, + "logps/chosen": -342.3629214638158, + "logps/rejected": -350.92664513221155, + "loss": 0.0665, + "rewards/chosen": 3.5036833913702714, + "rewards/margins": 8.849877145126282, + "rewards/rejected": -5.34619375375601, + "step": 1762 + }, + { + "epoch": 0.6508236814175626, + "grad_norm": 4.5, + "kl": 0.0, + "learning_rate": 2.7976883529564226e-06, + "logits/chosen": 175527253.33333334, + "logits/rejected": 189247475.2, + "logps/chosen": -276.05657958984375, + "logps/rejected": -515.6251953125, + "loss": 0.0746, + "rewards/chosen": 2.064387480417887, + "rewards/margins": 9.438365904490153, + "rewards/rejected": -7.3739784240722654, + "step": 1763 + }, + { + "epoch": 0.6511928383554059, + "grad_norm": 6.5625, + "kl": 0.0, + "learning_rate": 2.7924061947782576e-06, + "logits/chosen": 244190192.94117647, + "logits/rejected": 220025326.93333334, + "logps/chosen": -421.1265510110294, + "logps/rejected": -615.2856119791667, + "loss": 0.0983, + "rewards/chosen": 2.3879843319163605, + "rewards/margins": 10.948951332241881, + "rewards/rejected": -8.56096700032552, + "step": 1764 + }, + { + "epoch": 0.6515619952932491, + "grad_norm": 4.5625, + "kl": 4.280933380126953, + "learning_rate": 2.7871270951932655e-06, + "logits/chosen": 175245212.44444445, + "logits/rejected": 193209417.14285713, + "logps/chosen": -342.7299533420139, + "logps/rejected": -553.3170340401786, + "loss": 0.1242, + "rewards/chosen": 3.0677931043836804, + "rewards/margins": 11.002535078260633, + "rewards/rejected": -7.934741973876953, + "step": 1765 + }, + { + "epoch": 0.6519311522310922, + "grad_norm": 4.78125, + "kl": 1.4990544319152832, + "learning_rate": 2.7818510615155667e-06, + "logits/chosen": 194669152.0, + "logits/rejected": 251271264.0, + "logps/chosen": -245.31753540039062, + "logps/rejected": -421.1958312988281, + "loss": 0.1144, + "rewards/chosen": 2.103545665740967, + "rewards/margins": 7.7687907218933105, + "rewards/rejected": -5.665245056152344, + "step": 1766 + }, + { + "epoch": 0.6523003091689354, + "grad_norm": 3.953125, + "kl": 0.8803324699401855, + "learning_rate": 2.776578101055041e-06, + "logits/chosen": 237098476.30769232, + "logits/rejected": 258725133.47368422, + "logps/chosen": -402.40478515625, + "logps/rejected": -320.72201377467104, + "loss": 0.0692, + "rewards/chosen": 2.5808739295372596, + "rewards/margins": 8.504486825302061, + "rewards/rejected": -5.923612895764802, + "step": 1767 + }, + { + "epoch": 0.6526694661067787, + "grad_norm": 6.21875, + "kl": 0.0, + "learning_rate": 2.771308221117309e-06, + "logits/chosen": 242395867.42857143, + "logits/rejected": 124982411.63636364, + "logps/chosen": -345.8014322916667, + "logps/rejected": -339.9007679332386, + "loss": 0.1259, + "rewards/chosen": 2.173197791689918, + "rewards/margins": 8.431719131800003, + "rewards/rejected": -6.258521340110085, + "step": 1768 + }, + { + "epoch": 0.6530386230446218, + "grad_norm": 4.75, + "kl": 0.0, + "learning_rate": 2.7660414290037203e-06, + "logits/chosen": 250680805.0526316, + "logits/rejected": 200946195.69230768, + "logps/chosen": -380.3867958470395, + "logps/rejected": -548.408203125, + "loss": 0.0605, + "rewards/chosen": 3.4683922215511926, + "rewards/margins": 11.91012007987451, + "rewards/rejected": -8.441727858323317, + "step": 1769 + }, + { + "epoch": 0.653407779982465, + "grad_norm": 4.375, + "kl": 0.5058460235595703, + "learning_rate": 2.7607777320113494e-06, + "logits/chosen": 198560398.2222222, + "logits/rejected": 206100809.14285713, + "logps/chosen": -271.2626953125, + "logps/rejected": -444.05960518973217, + "loss": 0.0748, + "rewards/chosen": 3.384264204237196, + "rewards/margins": 10.14882326882983, + "rewards/rejected": -6.764559064592634, + "step": 1770 + }, + { + "epoch": 0.6537769369203082, + "grad_norm": 4.15625, + "kl": 0.0, + "learning_rate": 2.7555171374329837e-06, + "logits/chosen": 180954166.85714287, + "logits/rejected": 241286599.1111111, + "logps/chosen": -271.377685546875, + "logps/rejected": -528.138671875, + "loss": 0.0662, + "rewards/chosen": 2.254962648664202, + "rewards/margins": 9.238931171477788, + "rewards/rejected": -6.983968522813585, + "step": 1771 + }, + { + "epoch": 0.6541460938581515, + "grad_norm": 5.59375, + "kl": 0.3452012538909912, + "learning_rate": 2.750259652557108e-06, + "logits/chosen": 193511006.31578946, + "logits/rejected": 135592605.53846154, + "logps/chosen": -270.64566200657896, + "logps/rejected": -338.78012319711536, + "loss": 0.0988, + "rewards/chosen": 2.442384619461863, + "rewards/margins": 7.820309025073342, + "rewards/rejected": -5.377924405611479, + "step": 1772 + }, + { + "epoch": 0.6545152507959946, + "grad_norm": 4.75, + "kl": 0.0, + "learning_rate": 2.7450052846678987e-06, + "logits/chosen": 226393107.69230768, + "logits/rejected": 213229325.47368422, + "logps/chosen": -262.47952974759613, + "logps/rejected": -486.376953125, + "loss": 0.0736, + "rewards/chosen": 2.0913999997652493, + "rewards/margins": 8.625333137357766, + "rewards/rejected": -6.533933137592516, + "step": 1773 + }, + { + "epoch": 0.6548844077338378, + "grad_norm": 4.34375, + "kl": 0.6195554733276367, + "learning_rate": 2.7397540410452206e-06, + "logits/chosen": 170858020.57142857, + "logits/rejected": 186309347.55555555, + "logps/chosen": -334.68690708705356, + "logps/rejected": -415.8386501736111, + "loss": 0.0544, + "rewards/chosen": 2.9754867553710938, + "rewards/margins": 9.83821275499132, + "rewards/rejected": -6.862725999620226, + "step": 1774 + }, + { + "epoch": 0.655253564671681, + "grad_norm": 5.40625, + "kl": 0.0, + "learning_rate": 2.734505928964601e-06, + "logits/chosen": 234017957.6470588, + "logits/rejected": 199132910.93333334, + "logps/chosen": -357.84894875919116, + "logps/rejected": -429.43577473958334, + "loss": 0.0779, + "rewards/chosen": 2.3353006699505974, + "rewards/margins": 8.033249888700597, + "rewards/rejected": -5.69794921875, + "step": 1775 + }, + { + "epoch": 0.6556227216095243, + "grad_norm": 6.53125, + "kl": 0.0, + "learning_rate": 2.7292609556972333e-06, + "logits/chosen": 176931106.13333333, + "logits/rejected": 302454904.4705882, + "logps/chosen": -309.84599609375, + "logps/rejected": -388.1822150735294, + "loss": 0.1069, + "rewards/chosen": 2.409600830078125, + "rewards/margins": 7.169103465360754, + "rewards/rejected": -4.759502635282629, + "step": 1776 + }, + { + "epoch": 0.6559918785473674, + "grad_norm": 4.5, + "kl": 0.0, + "learning_rate": 2.724019128509964e-06, + "logits/chosen": 214222259.2, + "logits/rejected": 183840234.66666666, + "logps/chosen": -389.955908203125, + "logps/rejected": -344.273193359375, + "loss": 0.0714, + "rewards/chosen": 3.2230201721191407, + "rewards/margins": 8.151259994506836, + "rewards/rejected": -4.928239822387695, + "step": 1777 + }, + { + "epoch": 0.6563610354852106, + "grad_norm": 6.5, + "kl": 1.36053466796875, + "learning_rate": 2.7187804546652742e-06, + "logits/chosen": 210204052.21052632, + "logits/rejected": 156490870.15384614, + "logps/chosen": -318.2978515625, + "logps/rejected": -328.3697040264423, + "loss": 0.1046, + "rewards/chosen": 2.4486941287392066, + "rewards/margins": 7.93364133332905, + "rewards/rejected": -5.484947204589844, + "step": 1778 + }, + { + "epoch": 0.6567301924230539, + "grad_norm": 5.78125, + "kl": 0.0, + "learning_rate": 2.7135449414212822e-06, + "logits/chosen": 241088691.2, + "logits/rejected": 151191338.66666666, + "logps/chosen": -382.9021728515625, + "logps/rejected": -447.0592041015625, + "loss": 0.1082, + "rewards/chosen": 2.406845474243164, + "rewards/margins": 9.465988540649414, + "rewards/rejected": -7.05914306640625, + "step": 1779 + }, + { + "epoch": 0.6570993493608971, + "grad_norm": 4.03125, + "kl": 2.6127262115478516, + "learning_rate": 2.708312596031727e-06, + "logits/chosen": 314582564.5714286, + "logits/rejected": 204415203.55555555, + "logps/chosen": -298.3312290736607, + "logps/rejected": -530.5474175347222, + "loss": 0.0808, + "rewards/chosen": 3.0516510009765625, + "rewards/margins": 11.294125027126736, + "rewards/rejected": -8.242474026150173, + "step": 1780 + }, + { + "epoch": 0.6574685062987402, + "grad_norm": 4.21875, + "kl": 0.0, + "learning_rate": 2.7030834257459513e-06, + "logits/chosen": 304173696.0, + "logits/rejected": 176728576.0, + "logps/chosen": -348.8450622558594, + "logps/rejected": -452.6874084472656, + "loss": 0.0627, + "rewards/chosen": 3.0545740127563477, + "rewards/margins": 9.754715919494629, + "rewards/rejected": -6.700141906738281, + "step": 1781 + }, + { + "epoch": 0.6578376632365834, + "grad_norm": 3.765625, + "kl": 0.0, + "learning_rate": 2.6978574378089085e-06, + "logits/chosen": 222620245.33333334, + "logits/rejected": 185687823.05882353, + "logps/chosen": -360.49664713541665, + "logps/rejected": -431.68370863970586, + "loss": 0.072, + "rewards/chosen": 3.1289212544759115, + "rewards/margins": 10.040444063672833, + "rewards/rejected": -6.911522809196921, + "step": 1782 + }, + { + "epoch": 0.6582068201744267, + "grad_norm": 5.0, + "kl": 0.0, + "learning_rate": 2.692634639461138e-06, + "logits/chosen": 261024466.82352942, + "logits/rejected": 224729941.33333334, + "logps/chosen": -271.43480009191177, + "logps/rejected": -364.47841796875, + "loss": 0.0866, + "rewards/chosen": 2.380812027875115, + "rewards/margins": 8.60257818184647, + "rewards/rejected": -6.2217661539713545, + "step": 1783 + }, + { + "epoch": 0.6585759771122699, + "grad_norm": 6.46875, + "kl": 0.0, + "learning_rate": 2.6874150379387583e-06, + "logits/chosen": 140955766.15384614, + "logits/rejected": 191594334.31578946, + "logps/chosen": -348.27137169471155, + "logps/rejected": -395.41524465460526, + "loss": 0.0679, + "rewards/chosen": 2.470271183894231, + "rewards/margins": 9.027530592945423, + "rewards/rejected": -6.557259409051192, + "step": 1784 + }, + { + "epoch": 0.658945134050113, + "grad_norm": 6.125, + "kl": 0.0, + "learning_rate": 2.6821986404734623e-06, + "logits/chosen": 220716112.0, + "logits/rejected": 265249040.0, + "logps/chosen": -315.5312194824219, + "logps/rejected": -405.416015625, + "loss": 0.1137, + "rewards/chosen": 1.7652438879013062, + "rewards/margins": 8.830973029136658, + "rewards/rejected": -7.065729141235352, + "step": 1785 + }, + { + "epoch": 0.6593142909879562, + "grad_norm": 6.71875, + "kl": 0.0, + "learning_rate": 2.6769854542925045e-06, + "logits/chosen": 256629681.23076922, + "logits/rejected": 221059570.52631578, + "logps/chosen": -350.99575570913464, + "logps/rejected": -386.7939196134868, + "loss": 0.0839, + "rewards/chosen": 2.617841573861929, + "rewards/margins": 8.361224881067933, + "rewards/rejected": -5.743383307206003, + "step": 1786 + }, + { + "epoch": 0.6596834479257995, + "grad_norm": 3.3125, + "kl": 0.0, + "learning_rate": 2.6717754866186845e-06, + "logits/chosen": 193043160.6153846, + "logits/rejected": 185988230.7368421, + "logps/chosen": -386.6610576923077, + "logps/rejected": -381.2797594572368, + "loss": 0.0402, + "rewards/chosen": 4.54543950007512, + "rewards/margins": 11.119144269811962, + "rewards/rejected": -6.5737047697368425, + "step": 1787 + }, + { + "epoch": 0.6600526048636427, + "grad_norm": 5.71875, + "kl": 0.6372947692871094, + "learning_rate": 2.666568744670348e-06, + "logits/chosen": 241969578.66666666, + "logits/rejected": 193996690.2857143, + "logps/chosen": -393.71620008680554, + "logps/rejected": -421.4208984375, + "loss": 0.1071, + "rewards/chosen": 2.320757124159071, + "rewards/margins": 8.073457929823133, + "rewards/rejected": -5.7527008056640625, + "step": 1788 + }, + { + "epoch": 0.6604217618014858, + "grad_norm": 5.5, + "kl": 0.0, + "learning_rate": 2.6613652356613716e-06, + "logits/chosen": 232634096.94117647, + "logits/rejected": 183087650.13333333, + "logps/chosen": -341.23265165441177, + "logps/rejected": -605.88515625, + "loss": 0.0902, + "rewards/chosen": 2.4114958819221046, + "rewards/margins": 9.760807201909085, + "rewards/rejected": -7.349311319986979, + "step": 1789 + }, + { + "epoch": 0.660790918739329, + "grad_norm": 6.46875, + "kl": 0.0, + "learning_rate": 2.656164966801149e-06, + "logits/chosen": 217020833.68421054, + "logits/rejected": 264423936.0, + "logps/chosen": -348.5938784950658, + "logps/rejected": -530.6551983173077, + "loss": 0.1181, + "rewards/chosen": 2.074400048506887, + "rewards/margins": 9.770792988147813, + "rewards/rejected": -7.696392939640925, + "step": 1790 + }, + { + "epoch": 0.6611600756771723, + "grad_norm": 4.53125, + "kl": 0.0, + "learning_rate": 2.6509679452945847e-06, + "logits/chosen": 255733956.92307693, + "logits/rejected": 243294693.0526316, + "logps/chosen": -391.8759765625, + "logps/rejected": -470.02842310855266, + "loss": 0.0612, + "rewards/chosen": 2.1625325129582333, + "rewards/margins": 9.040495930413003, + "rewards/rejected": -6.87796341745477, + "step": 1791 + }, + { + "epoch": 0.6615292326150155, + "grad_norm": 5.6875, + "kl": 0.11703372001647949, + "learning_rate": 2.6457741783420885e-06, + "logits/chosen": 167422584.47058824, + "logits/rejected": 195750382.93333334, + "logps/chosen": -312.08128446691177, + "logps/rejected": -414.07373046875, + "loss": 0.105, + "rewards/chosen": 2.7210585650275734, + "rewards/margins": 8.647771618412989, + "rewards/rejected": -5.926713053385416, + "step": 1792 + }, + { + "epoch": 0.6618983895528586, + "grad_norm": 5.625, + "kl": 0.0, + "learning_rate": 2.6405836731395594e-06, + "logits/chosen": 163558630.4, + "logits/rejected": 184879722.66666666, + "logps/chosen": -343.0431640625, + "logps/rejected": -467.4716389973958, + "loss": 0.0605, + "rewards/chosen": 3.7544532775878907, + "rewards/margins": 11.16934928894043, + "rewards/rejected": -7.414896011352539, + "step": 1793 + }, + { + "epoch": 0.6622675464907019, + "grad_norm": 7.0, + "kl": 2.0325074195861816, + "learning_rate": 2.635396436878374e-06, + "logits/chosen": 141362048.0, + "logits/rejected": 162375405.7142857, + "logps/chosen": -378.3771158854167, + "logps/rejected": -405.03065708705356, + "loss": 0.1359, + "rewards/chosen": 2.586467530992296, + "rewards/margins": 8.81890793452187, + "rewards/rejected": -6.2324404035295755, + "step": 1794 + }, + { + "epoch": 0.6626367034285451, + "grad_norm": 6.375, + "kl": 0.0, + "learning_rate": 2.630212476745383e-06, + "logits/chosen": 266082304.0, + "logits/rejected": 185589582.76923078, + "logps/chosen": -414.3531558388158, + "logps/rejected": -323.2741887019231, + "loss": 0.0967, + "rewards/chosen": 2.301826276277241, + "rewards/margins": 8.878067989581027, + "rewards/rejected": -6.576241713303786, + "step": 1795 + }, + { + "epoch": 0.6630058603663883, + "grad_norm": 3.90625, + "kl": 0.10737991333007812, + "learning_rate": 2.6250317999228993e-06, + "logits/chosen": 181629568.0, + "logits/rejected": 187502250.66666666, + "logps/chosen": -283.240234375, + "logps/rejected": -410.1000162760417, + "loss": 0.0453, + "rewards/chosen": 3.685531343732561, + "rewards/margins": 9.682185430375357, + "rewards/rejected": -5.9966540866427955, + "step": 1796 + }, + { + "epoch": 0.6633750173042314, + "grad_norm": 4.8125, + "kl": 0.0, + "learning_rate": 2.6198544135886818e-06, + "logits/chosen": 236164382.11764705, + "logits/rejected": 211212424.53333333, + "logps/chosen": -364.98658662683823, + "logps/rejected": -507.225390625, + "loss": 0.0992, + "rewards/chosen": 2.4520945829503678, + "rewards/margins": 10.85529587689568, + "rewards/rejected": -8.403201293945312, + "step": 1797 + }, + { + "epoch": 0.6637441742420747, + "grad_norm": 4.3125, + "kl": 0.0, + "learning_rate": 2.6146803249159335e-06, + "logits/chosen": 172746854.4, + "logits/rejected": 169662042.3529412, + "logps/chosen": -352.9928385416667, + "logps/rejected": -477.8509880514706, + "loss": 0.0599, + "rewards/chosen": 2.646600850423177, + "rewards/margins": 8.902225509344362, + "rewards/rejected": -6.255624658921185, + "step": 1798 + }, + { + "epoch": 0.6641133311799179, + "grad_norm": 4.375, + "kl": 0.0, + "learning_rate": 2.609509541073293e-06, + "logits/chosen": 275127182.2222222, + "logits/rejected": 132083126.85714285, + "logps/chosen": -308.13994683159723, + "logps/rejected": -411.1073521205357, + "loss": 0.0882, + "rewards/chosen": 2.9930091434054904, + "rewards/margins": 9.564711828080435, + "rewards/rejected": -6.5717026846749445, + "step": 1799 + }, + { + "epoch": 0.6644824881177611, + "grad_norm": 4.125, + "kl": 0.0, + "learning_rate": 2.60434206922481e-06, + "logits/chosen": 226521048.6153846, + "logits/rejected": 176059472.84210527, + "logps/chosen": -427.0149113581731, + "logps/rejected": -296.98183079769734, + "loss": 0.05, + "rewards/chosen": 3.143987802358774, + "rewards/margins": 8.680073633850345, + "rewards/rejected": -5.536085831491571, + "step": 1800 + }, + { + "epoch": 0.6648516450556042, + "grad_norm": 3.015625, + "kl": 0.0, + "learning_rate": 2.599177916529954e-06, + "logits/chosen": 239781868.30769232, + "logits/rejected": 210037638.7368421, + "logps/chosen": -420.86328125, + "logps/rejected": -448.9072265625, + "loss": 0.047, + "rewards/chosen": 3.019123370830829, + "rewards/margins": 10.133614883731735, + "rewards/rejected": -7.114491512900905, + "step": 1801 + }, + { + "epoch": 0.6652208019934475, + "grad_norm": 3.5, + "kl": 0.0, + "learning_rate": 2.5940170901435945e-06, + "logits/chosen": 135817674.66666666, + "logits/rejected": 153322124.8, + "logps/chosen": -297.2082112630208, + "logps/rejected": -392.41826171875, + "loss": 0.044, + "rewards/chosen": 3.3679466247558594, + "rewards/margins": 10.35202865600586, + "rewards/rejected": -6.98408203125, + "step": 1802 + }, + { + "epoch": 0.6655899589312907, + "grad_norm": 5.21875, + "kl": 0.0, + "learning_rate": 2.5888595972159864e-06, + "logits/chosen": 192598832.0, + "logits/rejected": 205365952.0, + "logps/chosen": -316.0784606933594, + "logps/rejected": -370.2534484863281, + "loss": 0.1019, + "rewards/chosen": 2.4792864322662354, + "rewards/margins": 8.121882677078247, + "rewards/rejected": -5.642596244812012, + "step": 1803 + }, + { + "epoch": 0.6659591158691338, + "grad_norm": 3.65625, + "kl": 0.0, + "learning_rate": 2.5837054448927733e-06, + "logits/chosen": 274676826.35294116, + "logits/rejected": 239028565.33333334, + "logps/chosen": -363.58521943933823, + "logps/rejected": -435.0589192708333, + "loss": 0.0612, + "rewards/chosen": 2.6953201293945312, + "rewards/margins": 9.341085306803386, + "rewards/rejected": -6.645765177408854, + "step": 1804 + }, + { + "epoch": 0.666328272806977, + "grad_norm": 7.6875, + "kl": 1.8729734420776367, + "learning_rate": 2.5785546403149696e-06, + "logits/chosen": 109107350.58823529, + "logits/rejected": 141621213.86666667, + "logps/chosen": -380.0120634191176, + "logps/rejected": -285.5900065104167, + "loss": 0.115, + "rewards/chosen": 2.3792969198787914, + "rewards/margins": 7.621913191851448, + "rewards/rejected": -5.242616271972656, + "step": 1805 + }, + { + "epoch": 0.6666974297448203, + "grad_norm": 5.84375, + "kl": 0.0, + "learning_rate": 2.573407190618948e-06, + "logits/chosen": 189150177.88235295, + "logits/rejected": 232475477.33333334, + "logps/chosen": -332.67888327205884, + "logps/rejected": -574.773828125, + "loss": 0.0991, + "rewards/chosen": 2.3499241997213924, + "rewards/margins": 9.95173745248832, + "rewards/rejected": -7.601813252766927, + "step": 1806 + }, + { + "epoch": 0.6670665866826635, + "grad_norm": 5.25, + "kl": 0.0984811782836914, + "learning_rate": 2.568263102936431e-06, + "logits/chosen": 218772645.6470588, + "logits/rejected": 166998306.13333333, + "logps/chosen": -360.9624885110294, + "logps/rejected": -393.54186197916664, + "loss": 0.0826, + "rewards/chosen": 2.602009717155905, + "rewards/margins": 8.363039787142885, + "rewards/rejected": -5.761030069986979, + "step": 1807 + }, + { + "epoch": 0.6674357436205066, + "grad_norm": 4.90625, + "kl": 0.0, + "learning_rate": 2.5631223843944937e-06, + "logits/chosen": 252130112.0, + "logits/rejected": 235778624.0, + "logps/chosen": -401.6929931640625, + "logps/rejected": -412.9393615722656, + "loss": 0.0779, + "rewards/chosen": 2.5182690620422363, + "rewards/margins": 8.493751525878906, + "rewards/rejected": -5.97548246383667, + "step": 1808 + }, + { + "epoch": 0.6678049005583498, + "grad_norm": 4.5625, + "kl": 0.0, + "learning_rate": 2.5579850421155294e-06, + "logits/chosen": 202772187.42857143, + "logits/rejected": 190404238.2222222, + "logps/chosen": -284.2642299107143, + "logps/rejected": -447.720458984375, + "loss": 0.0966, + "rewards/chosen": 2.965088435581752, + "rewards/margins": 9.279230450826978, + "rewards/rejected": -6.314142015245226, + "step": 1809 + }, + { + "epoch": 0.6681740574961931, + "grad_norm": 5.375, + "kl": 0.335296630859375, + "learning_rate": 2.5528510832172646e-06, + "logits/chosen": 280605147.4285714, + "logits/rejected": 175032064.0, + "logps/chosen": -378.2504185267857, + "logps/rejected": -351.4379611545139, + "loss": 0.0785, + "rewards/chosen": 3.2580743517194475, + "rewards/margins": 8.31357819693429, + "rewards/rejected": -5.055503845214844, + "step": 1810 + }, + { + "epoch": 0.6685432144340363, + "grad_norm": 4.84375, + "kl": 1.2327780723571777, + "learning_rate": 2.5477205148127347e-06, + "logits/chosen": 149932257.88235295, + "logits/rejected": 182673288.53333333, + "logps/chosen": -272.5544864430147, + "logps/rejected": -332.0185221354167, + "loss": 0.1033, + "rewards/chosen": 2.9926789227653954, + "rewards/margins": 9.070824267817478, + "rewards/rejected": -6.078145345052083, + "step": 1811 + }, + { + "epoch": 0.6689123713718794, + "grad_norm": 2.984375, + "kl": 0.0, + "learning_rate": 2.5425933440102737e-06, + "logits/chosen": 277967232.0, + "logits/rejected": 214354636.8, + "logps/chosen": -366.8612467447917, + "logps/rejected": -538.144189453125, + "loss": 0.0365, + "rewards/chosen": 3.3891754150390625, + "rewards/margins": 11.20102767944336, + "rewards/rejected": -7.811852264404297, + "step": 1812 + }, + { + "epoch": 0.6692815283097227, + "grad_norm": 4.875, + "kl": 0.0, + "learning_rate": 2.537469577913514e-06, + "logits/chosen": 167747456.0, + "logits/rejected": 207146419.2, + "logps/chosen": -283.3855387369792, + "logps/rejected": -337.3384033203125, + "loss": 0.0946, + "rewards/chosen": 2.3489584922790527, + "rewards/margins": 7.645992946624756, + "rewards/rejected": -5.297034454345703, + "step": 1813 + }, + { + "epoch": 0.6696506852475659, + "grad_norm": 7.875, + "kl": 0.0, + "learning_rate": 2.532349223621369e-06, + "logits/chosen": 190733280.0, + "logits/rejected": 143128224.0, + "logps/chosen": -368.41815185546875, + "logps/rejected": -422.35205078125, + "loss": 0.087, + "rewards/chosen": 2.731442451477051, + "rewards/margins": 9.412727355957031, + "rewards/rejected": -6.6812849044799805, + "step": 1814 + }, + { + "epoch": 0.6700198421854091, + "grad_norm": 6.625, + "kl": 0.0, + "learning_rate": 2.527232288228022e-06, + "logits/chosen": 163804302.2222222, + "logits/rejected": 315033490.28571427, + "logps/chosen": -339.28632269965277, + "logps/rejected": -523.3578404017857, + "loss": 0.1484, + "rewards/chosen": 1.672774314880371, + "rewards/margins": 9.023331369672503, + "rewards/rejected": -7.350557054792132, + "step": 1815 + }, + { + "epoch": 0.6703889991232522, + "grad_norm": 6.8125, + "kl": 0.0, + "learning_rate": 2.522118778822924e-06, + "logits/chosen": 204911570.82352942, + "logits/rejected": 169522585.6, + "logps/chosen": -308.8726447610294, + "logps/rejected": -420.91484375, + "loss": 0.0969, + "rewards/chosen": 2.747760772705078, + "rewards/margins": 8.810697174072265, + "rewards/rejected": -6.062936401367187, + "step": 1816 + }, + { + "epoch": 0.6707581560610955, + "grad_norm": 5.34375, + "kl": 0.0, + "learning_rate": 2.517008702490778e-06, + "logits/chosen": 222328521.14285713, + "logits/rejected": 158001379.55555555, + "logps/chosen": -406.834716796875, + "logps/rejected": -366.2516818576389, + "loss": 0.0733, + "rewards/chosen": 2.7637032100132535, + "rewards/margins": 8.956566674368723, + "rewards/rejected": -6.192863464355469, + "step": 1817 + }, + { + "epoch": 0.6711273129989387, + "grad_norm": 6.03125, + "kl": 0.8886761665344238, + "learning_rate": 2.511902066311527e-06, + "logits/chosen": 232751292.63157895, + "logits/rejected": 153044371.69230768, + "logps/chosen": -296.97640830592104, + "logps/rejected": -347.61474609375, + "loss": 0.1372, + "rewards/chosen": 2.4496700889185856, + "rewards/margins": 7.836488607923994, + "rewards/rejected": -5.386818519005408, + "step": 1818 + }, + { + "epoch": 0.6714964699367819, + "grad_norm": 7.9375, + "kl": 1.553786039352417, + "learning_rate": 2.5067988773603523e-06, + "logits/chosen": 271542723.7647059, + "logits/rejected": 239900928.0, + "logps/chosen": -388.2652803308824, + "logps/rejected": -364.43470052083336, + "loss": 0.1101, + "rewards/chosen": 2.340779921587776, + "rewards/margins": 6.8427208395565255, + "rewards/rejected": -4.50194091796875, + "step": 1819 + }, + { + "epoch": 0.671865626874625, + "grad_norm": 4.3125, + "kl": 0.0, + "learning_rate": 2.5016991427076585e-06, + "logits/chosen": 221708130.46153846, + "logits/rejected": 186320114.52631578, + "logps/chosen": -277.1242863581731, + "logps/rejected": -388.94541529605266, + "loss": 0.0594, + "rewards/chosen": 2.703343024620643, + "rewards/margins": 9.233077585938489, + "rewards/rejected": -6.529734561317845, + "step": 1820 + }, + { + "epoch": 0.6722347838124683, + "grad_norm": 4.65625, + "kl": 1.2350010871887207, + "learning_rate": 2.4966028694190607e-06, + "logits/chosen": 184798464.0, + "logits/rejected": 138437059.7647059, + "logps/chosen": -288.99951171875, + "logps/rejected": -453.7792394301471, + "loss": 0.1121, + "rewards/chosen": 2.7126284281412762, + "rewards/margins": 10.821361661424824, + "rewards/rejected": -8.108733233283548, + "step": 1821 + }, + { + "epoch": 0.6726039407503115, + "grad_norm": 4.75, + "kl": 0.0, + "learning_rate": 2.491510064555381e-06, + "logits/chosen": 258594764.8, + "logits/rejected": 207147474.82352942, + "logps/chosen": -361.90387369791665, + "logps/rejected": -482.4449103860294, + "loss": 0.0739, + "rewards/chosen": 2.9541702270507812, + "rewards/margins": 10.5582836375517, + "rewards/rejected": -7.604113410500919, + "step": 1822 + }, + { + "epoch": 0.6729730976881547, + "grad_norm": 4.1875, + "kl": 0.0, + "learning_rate": 2.48642073517264e-06, + "logits/chosen": 142433868.8, + "logits/rejected": 268448325.8181818, + "logps/chosen": -245.0970947265625, + "logps/rejected": -467.12411221590907, + "loss": 0.067, + "rewards/chosen": 2.290744590759277, + "rewards/margins": 8.877150258150968, + "rewards/rejected": -6.58640566739169, + "step": 1823 + }, + { + "epoch": 0.6733422546259978, + "grad_norm": 4.9375, + "kl": 0.9078273773193359, + "learning_rate": 2.481334888322035e-06, + "logits/chosen": 198629839.23809522, + "logits/rejected": 177507060.36363637, + "logps/chosen": -380.32061476934524, + "logps/rejected": -411.43954190340907, + "loss": 0.087, + "rewards/chosen": 3.1124874296642484, + "rewards/margins": 8.400392342439462, + "rewards/rejected": -5.287904912775213, + "step": 1824 + }, + { + "epoch": 0.6737114115638411, + "grad_norm": 5.0, + "kl": 0.4485936164855957, + "learning_rate": 2.4762525310499413e-06, + "logits/chosen": 267016428.30769232, + "logits/rejected": 252996608.0, + "logps/chosen": -396.93956580528845, + "logps/rejected": -512.9119037828947, + "loss": 0.0779, + "rewards/chosen": 2.0921651400052586, + "rewards/margins": 8.351567766444404, + "rewards/rejected": -6.259402626439145, + "step": 1825 + }, + { + "epoch": 0.6740805685016843, + "grad_norm": 3.203125, + "kl": 0.0, + "learning_rate": 2.4711736703979015e-06, + "logits/chosen": 279935786.6666667, + "logits/rejected": 306779340.8, + "logps/chosen": -354.8403727213542, + "logps/rejected": -477.2787109375, + "loss": 0.0364, + "rewards/chosen": 3.1940253575642905, + "rewards/margins": 9.788899930318196, + "rewards/rejected": -6.594874572753906, + "step": 1826 + }, + { + "epoch": 0.6744497254395275, + "grad_norm": 5.5, + "kl": 0.0, + "learning_rate": 2.4660983134026156e-06, + "logits/chosen": 190640097.88235295, + "logits/rejected": 369057211.73333335, + "logps/chosen": -325.4273897058824, + "logps/rejected": -496.0688151041667, + "loss": 0.1152, + "rewards/chosen": 2.0474541608025048, + "rewards/margins": 9.250072060379328, + "rewards/rejected": -7.202617899576823, + "step": 1827 + }, + { + "epoch": 0.6748188823773706, + "grad_norm": 4.78125, + "kl": 0.0, + "learning_rate": 2.461026467095921e-06, + "logits/chosen": 173292800.0, + "logits/rejected": 179116256.0, + "logps/chosen": -328.5021667480469, + "logps/rejected": -546.9633178710938, + "loss": 0.068, + "rewards/chosen": 2.7242352962493896, + "rewards/margins": 11.260011434555054, + "rewards/rejected": -8.535776138305664, + "step": 1828 + }, + { + "epoch": 0.6751880393152139, + "grad_norm": 5.8125, + "kl": 0.0, + "learning_rate": 2.4559581385047993e-06, + "logits/chosen": 183540304.0, + "logits/rejected": 281823840.0, + "logps/chosen": -294.3261413574219, + "logps/rejected": -576.218505859375, + "loss": 0.0981, + "rewards/chosen": 2.215763807296753, + "rewards/margins": 10.43056845664978, + "rewards/rejected": -8.214804649353027, + "step": 1829 + }, + { + "epoch": 0.6755571962530571, + "grad_norm": 5.34375, + "kl": 0.0, + "learning_rate": 2.4508933346513563e-06, + "logits/chosen": 203118336.0, + "logits/rejected": 196080526.2222222, + "logps/chosen": -304.23800223214283, + "logps/rejected": -479.3447265625, + "loss": 0.1076, + "rewards/chosen": 2.2619544437953403, + "rewards/margins": 9.300630387805757, + "rewards/rejected": -7.038675944010417, + "step": 1830 + }, + { + "epoch": 0.6759263531909003, + "grad_norm": 4.0, + "kl": 0.0, + "learning_rate": 2.445832062552811e-06, + "logits/chosen": 288651690.6666667, + "logits/rejected": 119445001.14285715, + "logps/chosen": -393.5196940104167, + "logps/rejected": -397.2771693638393, + "loss": 0.0558, + "rewards/chosen": 3.182908376057943, + "rewards/margins": 9.64964367094494, + "rewards/rejected": -6.466735294886997, + "step": 1831 + }, + { + "epoch": 0.6762955101287434, + "grad_norm": 4.9375, + "kl": 0.0, + "learning_rate": 2.440774329221492e-06, + "logits/chosen": 174102506.66666666, + "logits/rejected": 216418969.6, + "logps/chosen": -409.9210611979167, + "logps/rejected": -499.660205078125, + "loss": 0.0647, + "rewards/chosen": 2.478846867879232, + "rewards/margins": 10.027457555135092, + "rewards/rejected": -7.548610687255859, + "step": 1832 + }, + { + "epoch": 0.6766646670665867, + "grad_norm": 4.1875, + "kl": 0.0, + "learning_rate": 2.435720141664827e-06, + "logits/chosen": 256923989.33333334, + "logits/rejected": 204501900.8, + "logps/chosen": -385.3791910807292, + "logps/rejected": -439.72255859375, + "loss": 0.0509, + "rewards/chosen": 2.662472407023112, + "rewards/margins": 10.05457903544108, + "rewards/rejected": -7.392106628417968, + "step": 1833 + }, + { + "epoch": 0.6770338240044299, + "grad_norm": 5.5, + "kl": 0.7346725463867188, + "learning_rate": 2.430669506885326e-06, + "logits/chosen": 173840284.44444445, + "logits/rejected": 211496813.7142857, + "logps/chosen": -358.52227105034723, + "logps/rejected": -393.5325404575893, + "loss": 0.0805, + "rewards/chosen": 3.6085599263509116, + "rewards/margins": 9.373890649704705, + "rewards/rejected": -5.765330723353794, + "step": 1834 + }, + { + "epoch": 0.6774029809422731, + "grad_norm": 6.28125, + "kl": 0.17316246032714844, + "learning_rate": 2.425622431880579e-06, + "logits/chosen": 194568908.8, + "logits/rejected": 313696015.0588235, + "logps/chosen": -386.10400390625, + "logps/rejected": -468.0651424632353, + "loss": 0.0804, + "rewards/chosen": 2.3245010375976562, + "rewards/margins": 9.68913672952091, + "rewards/rejected": -7.364635691923254, + "step": 1835 + }, + { + "epoch": 0.6777721378801163, + "grad_norm": 4.84375, + "kl": 0.0, + "learning_rate": 2.420578923643248e-06, + "logits/chosen": 289938659.5555556, + "logits/rejected": 211892132.57142857, + "logps/chosen": -409.600341796875, + "logps/rejected": -383.0283900669643, + "loss": 0.0638, + "rewards/chosen": 2.9122286902533636, + "rewards/margins": 9.055076023888965, + "rewards/rejected": -6.142847333635602, + "step": 1836 + }, + { + "epoch": 0.6781412948179595, + "grad_norm": 4.65625, + "kl": 0.0, + "learning_rate": 2.4155389891610454e-06, + "logits/chosen": 178464244.36363637, + "logits/rejected": 209563233.52380952, + "logps/chosen": -338.15531782670456, + "logps/rejected": -460.88881138392856, + "loss": 0.0723, + "rewards/chosen": 2.1719032634388316, + "rewards/margins": 8.479856144298207, + "rewards/rejected": -6.307952880859375, + "step": 1837 + }, + { + "epoch": 0.6785104517558027, + "grad_norm": 3.71875, + "kl": 0.5800690650939941, + "learning_rate": 2.4105026354167376e-06, + "logits/chosen": 127698069.33333333, + "logits/rejected": 173133990.4, + "logps/chosen": -313.0507405598958, + "logps/rejected": -504.91640625, + "loss": 0.0488, + "rewards/chosen": 3.141171455383301, + "rewards/margins": 11.214068031311035, + "rewards/rejected": -8.072896575927734, + "step": 1838 + }, + { + "epoch": 0.6788796086936458, + "grad_norm": 4.53125, + "kl": 0.0, + "learning_rate": 2.405469869388131e-06, + "logits/chosen": 218682998.15384614, + "logits/rejected": 198342251.78947368, + "logps/chosen": -348.58871694711536, + "logps/rejected": -473.5491365131579, + "loss": 0.0644, + "rewards/chosen": 2.951761979323167, + "rewards/margins": 9.176676128557336, + "rewards/rejected": -6.224914149234169, + "step": 1839 + }, + { + "epoch": 0.6792487656314891, + "grad_norm": 6.625, + "kl": 2.063535451889038, + "learning_rate": 2.400440698048056e-06, + "logits/chosen": 269157259.6363636, + "logits/rejected": 238265011.2, + "logps/chosen": -432.36399147727275, + "logps/rejected": -428.474560546875, + "loss": 0.1318, + "rewards/chosen": 2.6777000427246094, + "rewards/margins": 9.768981170654296, + "rewards/rejected": -7.091281127929688, + "step": 1840 + }, + { + "epoch": 0.6796179225693323, + "grad_norm": 4.90625, + "kl": 0.0, + "learning_rate": 2.395415128364368e-06, + "logits/chosen": 214964896.0, + "logits/rejected": 179237440.0, + "logps/chosen": -362.07623291015625, + "logps/rejected": -372.8382873535156, + "loss": 0.0661, + "rewards/chosen": 2.994330883026123, + "rewards/margins": 9.405700206756592, + "rewards/rejected": -6.411369323730469, + "step": 1841 + }, + { + "epoch": 0.6799870795071755, + "grad_norm": 5.875, + "kl": 0.0, + "learning_rate": 2.390393167299929e-06, + "logits/chosen": 175937056.0, + "logits/rejected": 213884576.0, + "logps/chosen": -328.900146484375, + "logps/rejected": -344.4046325683594, + "loss": 0.1075, + "rewards/chosen": 2.630654811859131, + "rewards/margins": 7.7400922775268555, + "rewards/rejected": -5.109437465667725, + "step": 1842 + }, + { + "epoch": 0.6803562364450186, + "grad_norm": 7.40625, + "kl": 0.0, + "learning_rate": 2.3853748218126e-06, + "logits/chosen": 175933794.46153846, + "logits/rejected": 215172082.52631578, + "logps/chosen": -373.62545072115387, + "logps/rejected": -519.0461040296053, + "loss": 0.106, + "rewards/chosen": 1.74126463669997, + "rewards/margins": 9.403298520848818, + "rewards/rejected": -7.6620338841488485, + "step": 1843 + }, + { + "epoch": 0.6807253933828619, + "grad_norm": 4.0625, + "kl": 0.0, + "learning_rate": 2.3803600988552373e-06, + "logits/chosen": 258974947.55555555, + "logits/rejected": 177235053.7142857, + "logps/chosen": -404.1897786458333, + "logps/rejected": -424.7529994419643, + "loss": 0.0623, + "rewards/chosen": 3.068935818142361, + "rewards/margins": 10.964578779916915, + "rewards/rejected": -7.895642961774554, + "step": 1844 + }, + { + "epoch": 0.6810945503207051, + "grad_norm": 4.375, + "kl": 0.0, + "learning_rate": 2.3753490053756766e-06, + "logits/chosen": 193908856.47058824, + "logits/rejected": 366749627.73333335, + "logps/chosen": -312.54406020220586, + "logps/rejected": -525.6202473958333, + "loss": 0.0698, + "rewards/chosen": 2.7666161481071923, + "rewards/margins": 9.658186175776464, + "rewards/rejected": -6.891570027669271, + "step": 1845 + }, + { + "epoch": 0.6814637072585483, + "grad_norm": 5.125, + "kl": 0.0, + "learning_rate": 2.370341548316722e-06, + "logits/chosen": 204319285.89473686, + "logits/rejected": 185718941.53846154, + "logps/chosen": -327.47520045230266, + "logps/rejected": -483.86185396634613, + "loss": 0.0873, + "rewards/chosen": 3.147353925203022, + "rewards/margins": 8.539427336410954, + "rewards/rejected": -5.3920734112079325, + "step": 1846 + }, + { + "epoch": 0.6818328641963914, + "grad_norm": 5.28125, + "kl": 0.0, + "learning_rate": 2.3653377346161423e-06, + "logits/chosen": 205838892.52173913, + "logits/rejected": 183896504.8888889, + "logps/chosen": -307.63279127038044, + "logps/rejected": -576.7855902777778, + "loss": 0.1048, + "rewards/chosen": 2.3984728274138076, + "rewards/margins": 9.304741053189632, + "rewards/rejected": -6.9062682257758246, + "step": 1847 + }, + { + "epoch": 0.6822020211342347, + "grad_norm": 5.84375, + "kl": 0.0, + "learning_rate": 2.36033757120666e-06, + "logits/chosen": 250252066.13333333, + "logits/rejected": 227724754.82352942, + "logps/chosen": -448.1149088541667, + "logps/rejected": -503.20203354779414, + "loss": 0.0887, + "rewards/chosen": 2.461260223388672, + "rewards/margins": 9.130916864731732, + "rewards/rejected": -6.66965664134306, + "step": 1848 + }, + { + "epoch": 0.6825711780720779, + "grad_norm": 4.15625, + "kl": 0.0, + "learning_rate": 2.3553410650159347e-06, + "logits/chosen": 216432969.14285713, + "logits/rejected": 255824668.44444445, + "logps/chosen": -456.4632045200893, + "logps/rejected": -513.7255859375, + "loss": 0.0535, + "rewards/chosen": 3.4505983080182756, + "rewards/margins": 10.454006528097485, + "rewards/rejected": -7.00340822007921, + "step": 1849 + }, + { + "epoch": 0.6829403350099211, + "grad_norm": 5.5625, + "kl": 0.38340091705322266, + "learning_rate": 2.3503482229665637e-06, + "logits/chosen": 163844113.06666666, + "logits/rejected": 117372687.05882353, + "logps/chosen": -295.48401692708336, + "logps/rejected": -323.6543543198529, + "loss": 0.0951, + "rewards/chosen": 2.6498677571614584, + "rewards/margins": 8.73801015218099, + "rewards/rejected": -6.088142395019531, + "step": 1850 + }, + { + "epoch": 0.6833094919477642, + "grad_norm": 5.5, + "kl": 2.1624326705932617, + "learning_rate": 2.3453590519760676e-06, + "logits/chosen": 146437278.47619048, + "logits/rejected": 262569704.72727272, + "logps/chosen": -315.6162806919643, + "logps/rejected": -464.2385919744318, + "loss": 0.0998, + "rewards/chosen": 3.0067645481654575, + "rewards/margins": 8.621075766427175, + "rewards/rejected": -5.614311218261719, + "step": 1851 + }, + { + "epoch": 0.6836786488856075, + "grad_norm": 5.6875, + "kl": 0.0, + "learning_rate": 2.340373558956877e-06, + "logits/chosen": 184036061.86666667, + "logits/rejected": 208733033.4117647, + "logps/chosen": -329.8307291666667, + "logps/rejected": -358.6774471507353, + "loss": 0.0997, + "rewards/chosen": 2.8053207397460938, + "rewards/margins": 8.786771886488971, + "rewards/rejected": -5.981451146742877, + "step": 1852 + }, + { + "epoch": 0.6840478058234507, + "grad_norm": 5.3125, + "kl": 0.0, + "learning_rate": 2.3353917508163297e-06, + "logits/chosen": 136676050.2857143, + "logits/rejected": 154590464.0, + "logps/chosen": -299.5166015625, + "logps/rejected": -378.5890842013889, + "loss": 0.0796, + "rewards/chosen": 3.0878707340785434, + "rewards/margins": 9.212300648764959, + "rewards/rejected": -6.124429914686415, + "step": 1853 + }, + { + "epoch": 0.6844169627612939, + "grad_norm": 3.65625, + "kl": 1.226254940032959, + "learning_rate": 2.3304136344566603e-06, + "logits/chosen": 188061952.0, + "logits/rejected": 288413376.0, + "logps/chosen": -315.511572265625, + "logps/rejected": -511.5791015625, + "loss": 0.1145, + "rewards/chosen": 2.7315059661865235, + "rewards/margins": 10.771730677286783, + "rewards/rejected": -8.04022471110026, + "step": 1854 + }, + { + "epoch": 0.684786119699137, + "grad_norm": 3.71875, + "kl": 0.0, + "learning_rate": 2.325439216774982e-06, + "logits/chosen": 227240345.6, + "logits/rejected": 200221141.33333334, + "logps/chosen": -359.645849609375, + "logps/rejected": -478.586669921875, + "loss": 0.0516, + "rewards/chosen": 4.187619018554687, + "rewards/margins": 10.434032567342122, + "rewards/rejected": -6.246413548787435, + "step": 1855 + }, + { + "epoch": 0.6851552766369803, + "grad_norm": 5.0, + "kl": 1.0882787704467773, + "learning_rate": 2.3204685046632884e-06, + "logits/chosen": 239275573.89473686, + "logits/rejected": 267290446.76923078, + "logps/chosen": -354.08516652960526, + "logps/rejected": -379.90279447115387, + "loss": 0.0866, + "rewards/chosen": 2.736929642526727, + "rewards/margins": 7.9011024753091785, + "rewards/rejected": -5.164172832782452, + "step": 1856 + }, + { + "epoch": 0.6855244335748235, + "grad_norm": 4.15625, + "kl": 0.0, + "learning_rate": 2.3155015050084406e-06, + "logits/chosen": 234905524.70588234, + "logits/rejected": 211098385.06666666, + "logps/chosen": -338.1979549632353, + "logps/rejected": -387.25872395833335, + "loss": 0.0557, + "rewards/chosen": 3.1951076283174404, + "rewards/margins": 9.76600352268593, + "rewards/rejected": -6.57089589436849, + "step": 1857 + }, + { + "epoch": 0.6858935905126667, + "grad_norm": 4.8125, + "kl": 0.5370454788208008, + "learning_rate": 2.3105382246921516e-06, + "logits/chosen": 179827561.4117647, + "logits/rejected": 104756514.13333334, + "logps/chosen": -319.84949448529414, + "logps/rejected": -418.3751627604167, + "loss": 0.0878, + "rewards/chosen": 2.7162507001091454, + "rewards/margins": 9.584004301183363, + "rewards/rejected": -6.867753601074218, + "step": 1858 + }, + { + "epoch": 0.6862627474505099, + "grad_norm": 8.8125, + "kl": 0.0, + "learning_rate": 2.3055786705909803e-06, + "logits/chosen": 222828185.6, + "logits/rejected": 236533085.0909091, + "logps/chosen": -387.6325927734375, + "logps/rejected": -412.2109375, + "loss": 0.1058, + "rewards/chosen": 1.4919689178466797, + "rewards/margins": 7.152800369262695, + "rewards/rejected": -5.660831451416016, + "step": 1859 + }, + { + "epoch": 0.6866319043883531, + "grad_norm": 4.0, + "kl": 0.0, + "learning_rate": 2.3006228495763295e-06, + "logits/chosen": 189995673.6, + "logits/rejected": 217274819.7647059, + "logps/chosen": -350.5972005208333, + "logps/rejected": -446.9626034007353, + "loss": 0.0569, + "rewards/chosen": 3.0563819885253904, + "rewards/margins": 8.629403372371897, + "rewards/rejected": -5.573021383846507, + "step": 1860 + }, + { + "epoch": 0.6870010613261963, + "grad_norm": 5.0625, + "kl": 0.0, + "learning_rate": 2.295670768514427e-06, + "logits/chosen": 175601008.0, + "logits/rejected": 127685368.0, + "logps/chosen": -308.1645202636719, + "logps/rejected": -352.3638610839844, + "loss": 0.0719, + "rewards/chosen": 3.7057316303253174, + "rewards/margins": 10.698806047439575, + "rewards/rejected": -6.993074417114258, + "step": 1861 + }, + { + "epoch": 0.6873702182640395, + "grad_norm": 3.96875, + "kl": 0.0, + "learning_rate": 2.290722434266315e-06, + "logits/chosen": 240697580.30769232, + "logits/rejected": 236306512.84210527, + "logps/chosen": -428.48035606971155, + "logps/rejected": -408.68624074835526, + "loss": 0.0482, + "rewards/chosen": 3.28350830078125, + "rewards/margins": 8.597036662854645, + "rewards/rejected": -5.313528362073396, + "step": 1862 + }, + { + "epoch": 0.6877393752018827, + "grad_norm": 5.125, + "kl": 0.0, + "learning_rate": 2.285777853687849e-06, + "logits/chosen": 177070613.33333334, + "logits/rejected": 202039795.2, + "logps/chosen": -323.4339192708333, + "logps/rejected": -417.21376953125, + "loss": 0.0818, + "rewards/chosen": 1.8145491282145183, + "rewards/margins": 8.463316218058267, + "rewards/rejected": -6.64876708984375, + "step": 1863 + }, + { + "epoch": 0.6881085321397259, + "grad_norm": 4.84375, + "kl": 0.12174463272094727, + "learning_rate": 2.280837033629684e-06, + "logits/chosen": 204742898.52631578, + "logits/rejected": 228568772.92307693, + "logps/chosen": -378.6764494243421, + "logps/rejected": -432.06310096153845, + "loss": 0.079, + "rewards/chosen": 3.047152469032689, + "rewards/margins": 10.236660173547413, + "rewards/rejected": -7.189507704514724, + "step": 1864 + }, + { + "epoch": 0.6884776890775691, + "grad_norm": 5.65625, + "kl": 1.5550944805145264, + "learning_rate": 2.275899980937262e-06, + "logits/chosen": 190286697.4117647, + "logits/rejected": 154093158.4, + "logps/chosen": -312.80339499080884, + "logps/rejected": -419.3283203125, + "loss": 0.0895, + "rewards/chosen": 3.413514081169577, + "rewards/margins": 11.420911542107078, + "rewards/rejected": -8.0073974609375, + "step": 1865 + }, + { + "epoch": 0.6888468460154124, + "grad_norm": 6.3125, + "kl": 0.0, + "learning_rate": 2.270966702450806e-06, + "logits/chosen": 173030022.7368421, + "logits/rejected": 234075116.30769232, + "logps/chosen": -312.67017886513156, + "logps/rejected": -423.04244290865387, + "loss": 0.1288, + "rewards/chosen": 2.5264944779245475, + "rewards/margins": 8.096546991633982, + "rewards/rejected": -5.570052513709435, + "step": 1866 + }, + { + "epoch": 0.6892160029532555, + "grad_norm": 5.0, + "kl": 0.0, + "learning_rate": 2.2660372050053136e-06, + "logits/chosen": 247950745.6, + "logits/rejected": 199313392.94117647, + "logps/chosen": -337.07141927083336, + "logps/rejected": -403.4491325827206, + "loss": 0.0933, + "rewards/chosen": 2.1409678141276043, + "rewards/margins": 9.05040241316253, + "rewards/rejected": -6.909434599034927, + "step": 1867 + }, + { + "epoch": 0.6895851598910987, + "grad_norm": 6.46875, + "kl": 0.23051667213439941, + "learning_rate": 2.2611114954305372e-06, + "logits/chosen": 299191523.5555556, + "logits/rejected": 201482715.42857143, + "logps/chosen": -345.00303819444446, + "logps/rejected": -393.0337611607143, + "loss": 0.0955, + "rewards/chosen": 2.6857166290283203, + "rewards/margins": 8.56128774370466, + "rewards/rejected": -5.875571114676339, + "step": 1868 + }, + { + "epoch": 0.6899543168289419, + "grad_norm": 3.828125, + "kl": 0.0, + "learning_rate": 2.256189580550987e-06, + "logits/chosen": 211098919.3846154, + "logits/rejected": 195605908.21052632, + "logps/chosen": -346.85486778846155, + "logps/rejected": -444.0977076480263, + "loss": 0.0754, + "rewards/chosen": 2.444181148822491, + "rewards/margins": 9.364055641266981, + "rewards/rejected": -6.91987449244449, + "step": 1869 + }, + { + "epoch": 0.6903234737667852, + "grad_norm": 5.09375, + "kl": 0.6425275802612305, + "learning_rate": 2.2512714671859147e-06, + "logits/chosen": 195675322.1818182, + "logits/rejected": 102571545.6, + "logps/chosen": -298.97358842329544, + "logps/rejected": -325.745263671875, + "loss": 0.0958, + "rewards/chosen": 2.8381385803222656, + "rewards/margins": 8.393841552734376, + "rewards/rejected": -5.555702972412109, + "step": 1870 + }, + { + "epoch": 0.6906926307046283, + "grad_norm": 6.75, + "kl": 0.0, + "learning_rate": 2.2463571621493006e-06, + "logits/chosen": 390687118.2222222, + "logits/rejected": 139571684.57142857, + "logps/chosen": -393.3134765625, + "logps/rejected": -391.6472865513393, + "loss": 0.0911, + "rewards/chosen": 2.9927978515625, + "rewards/margins": 10.008419036865234, + "rewards/rejected": -7.015621185302734, + "step": 1871 + }, + { + "epoch": 0.6910617876424715, + "grad_norm": 6.53125, + "kl": 0.0, + "learning_rate": 2.241446672249854e-06, + "logits/chosen": 183519534.54545453, + "logits/rejected": 261138968.3809524, + "logps/chosen": -394.83931107954544, + "logps/rejected": -392.97446986607144, + "loss": 0.0883, + "rewards/chosen": 3.003652399236506, + "rewards/margins": 8.75092143929882, + "rewards/rejected": -5.747269040062314, + "step": 1872 + }, + { + "epoch": 0.6914309445803147, + "grad_norm": 6.125, + "kl": 0.15421104431152344, + "learning_rate": 2.2365400042909973e-06, + "logits/chosen": 310994944.0, + "logits/rejected": 233999360.0, + "logps/chosen": -457.1474609375, + "logps/rejected": -591.1267438616071, + "loss": 0.0903, + "rewards/chosen": 2.825791252983941, + "rewards/margins": 10.87295671493288, + "rewards/rejected": -8.04716546194894, + "step": 1873 + }, + { + "epoch": 0.691800101518158, + "grad_norm": 6.96875, + "kl": 5.457200050354004, + "learning_rate": 2.2316371650708534e-06, + "logits/chosen": 249261690.88, + "logits/rejected": 129353645.71428572, + "logps/chosen": -401.601796875, + "logps/rejected": -342.4813755580357, + "loss": 0.1388, + "rewards/chosen": 3.2359515380859376, + "rewards/margins": 9.185026419503348, + "rewards/rejected": -5.949074881417411, + "step": 1874 + }, + { + "epoch": 0.6921692584560011, + "grad_norm": 8.375, + "kl": 0.0, + "learning_rate": 2.2267381613822482e-06, + "logits/chosen": 273471441.45454544, + "logits/rejected": 133676147.2, + "logps/chosen": -337.8878284801136, + "logps/rejected": -299.0693115234375, + "loss": 0.1455, + "rewards/chosen": 2.140032161365856, + "rewards/margins": 9.218889583240856, + "rewards/rejected": -7.078857421875, + "step": 1875 + }, + { + "epoch": 0.6925384153938443, + "grad_norm": 7.4375, + "kl": 0.0, + "learning_rate": 2.2218430000126858e-06, + "logits/chosen": 250451591.52941176, + "logits/rejected": 206856840.53333333, + "logps/chosen": -387.38916015625, + "logps/rejected": -483.56702473958336, + "loss": 0.1217, + "rewards/chosen": 2.1813661911908317, + "rewards/margins": 8.755208049100988, + "rewards/rejected": -6.573841857910156, + "step": 1876 + }, + { + "epoch": 0.6929075723316875, + "grad_norm": 9.75, + "kl": 0.4453458786010742, + "learning_rate": 2.2169516877443487e-06, + "logits/chosen": 210615342.54545453, + "logits/rejected": 198302105.6, + "logps/chosen": -385.09916548295456, + "logps/rejected": -374.5117431640625, + "loss": 0.1252, + "rewards/chosen": 2.649918469515714, + "rewards/margins": 8.360650357333096, + "rewards/rejected": -5.710731887817383, + "step": 1877 + }, + { + "epoch": 0.6932767292695307, + "grad_norm": 5.34375, + "kl": 1.2092504501342773, + "learning_rate": 2.2120642313540906e-06, + "logits/chosen": 221200192.0, + "logits/rejected": 174025216.0, + "logps/chosen": -393.9343566894531, + "logps/rejected": -381.7397766113281, + "loss": 0.0685, + "rewards/chosen": 3.5963287353515625, + "rewards/margins": 10.548031330108643, + "rewards/rejected": -6.95170259475708, + "step": 1878 + }, + { + "epoch": 0.6936458862073739, + "grad_norm": 4.40625, + "kl": 0.0, + "learning_rate": 2.207180637613421e-06, + "logits/chosen": 200550400.0, + "logits/rejected": 299984384.0, + "logps/chosen": -364.6688537597656, + "logps/rejected": -592.6929931640625, + "loss": 0.0555, + "rewards/chosen": 3.2445406913757324, + "rewards/margins": 10.473860263824463, + "rewards/rejected": -7.2293195724487305, + "step": 1879 + }, + { + "epoch": 0.6940150431452171, + "grad_norm": 4.78125, + "kl": 0.0, + "learning_rate": 2.202300913288494e-06, + "logits/chosen": 206929316.57142857, + "logits/rejected": 196829809.7777778, + "logps/chosen": -354.35511997767856, + "logps/rejected": -545.97119140625, + "loss": 0.0602, + "rewards/chosen": 2.805321284702846, + "rewards/margins": 10.704447125631665, + "rewards/rejected": -7.89912584092882, + "step": 1880 + }, + { + "epoch": 0.6943842000830603, + "grad_norm": 4.40625, + "kl": 0.0, + "learning_rate": 2.197425065140107e-06, + "logits/chosen": 179180016.0, + "logits/rejected": 169384544.0, + "logps/chosen": -354.6195373535156, + "logps/rejected": -477.9481506347656, + "loss": 0.0821, + "rewards/chosen": 3.344226598739624, + "rewards/margins": 9.213842153549194, + "rewards/rejected": -5.86961555480957, + "step": 1881 + }, + { + "epoch": 0.6947533570209035, + "grad_norm": 4.34375, + "kl": 0.0, + "learning_rate": 2.1925530999236875e-06, + "logits/chosen": 166460514.46153846, + "logits/rejected": 282762778.94736844, + "logps/chosen": -288.79525991586536, + "logps/rejected": -481.4019839638158, + "loss": 0.0835, + "rewards/chosen": 2.8356241079477162, + "rewards/margins": 8.897830577031803, + "rewards/rejected": -6.062206469084087, + "step": 1882 + }, + { + "epoch": 0.6951225139587467, + "grad_norm": 5.78125, + "kl": 0.9243040084838867, + "learning_rate": 2.1876850243892787e-06, + "logits/chosen": 229116048.0, + "logits/rejected": 177367344.0, + "logps/chosen": -369.2049560546875, + "logps/rejected": -424.4156494140625, + "loss": 0.0851, + "rewards/chosen": 2.575141191482544, + "rewards/margins": 10.250917196273804, + "rewards/rejected": -7.67577600479126, + "step": 1883 + }, + { + "epoch": 0.6954916708965899, + "grad_norm": 5.375, + "kl": 0.0, + "learning_rate": 2.182820845281538e-06, + "logits/chosen": 126368752.0, + "logits/rejected": 275129472.0, + "logps/chosen": -323.2087707519531, + "logps/rejected": -523.3358764648438, + "loss": 0.0656, + "rewards/chosen": 2.6581242084503174, + "rewards/margins": 9.33663296699524, + "rewards/rejected": -6.678508758544922, + "step": 1884 + }, + { + "epoch": 0.6958608278344331, + "grad_norm": 5.5625, + "kl": 1.0886235237121582, + "learning_rate": 2.1779605693397264e-06, + "logits/chosen": 256209381.0526316, + "logits/rejected": 198895832.6153846, + "logps/chosen": -380.1189607319079, + "logps/rejected": -435.7831280048077, + "loss": 0.0889, + "rewards/chosen": 2.3513647380628084, + "rewards/margins": 9.145409819568217, + "rewards/rejected": -6.794045081505408, + "step": 1885 + }, + { + "epoch": 0.6962299847722763, + "grad_norm": 6.84375, + "kl": 0.0, + "learning_rate": 2.1731042032976903e-06, + "logits/chosen": 208388756.21052632, + "logits/rejected": 218430385.23076922, + "logps/chosen": -298.8988486842105, + "logps/rejected": -563.4758864182693, + "loss": 0.1364, + "rewards/chosen": 2.196565527664988, + "rewards/margins": 10.745938274059217, + "rewards/rejected": -8.54937274639423, + "step": 1886 + }, + { + "epoch": 0.6965991417101195, + "grad_norm": 5.46875, + "kl": 0.0, + "learning_rate": 2.1682517538838648e-06, + "logits/chosen": 275107840.0, + "logits/rejected": 179722410.66666666, + "logps/chosen": -317.5899873621324, + "logps/rejected": -446.8912109375, + "loss": 0.0867, + "rewards/chosen": 2.1526329938103173, + "rewards/margins": 9.307476133458755, + "rewards/rejected": -7.154843139648437, + "step": 1887 + }, + { + "epoch": 0.6969682986479627, + "grad_norm": 5.5, + "kl": 2.0032291412353516, + "learning_rate": 2.1634032278212597e-06, + "logits/chosen": 208810199.57894737, + "logits/rejected": 190906230.15384614, + "logps/chosen": -314.2378186677632, + "logps/rejected": -372.00304236778845, + "loss": 0.1175, + "rewards/chosen": 2.297694156044408, + "rewards/margins": 8.149820057486716, + "rewards/rejected": -5.8521259014423075, + "step": 1888 + }, + { + "epoch": 0.697337455585806, + "grad_norm": 5.0625, + "kl": 0.7916131019592285, + "learning_rate": 2.1585586318274423e-06, + "logits/chosen": 209717086.31578946, + "logits/rejected": 229515106.46153846, + "logps/chosen": -392.3624331825658, + "logps/rejected": -454.9216871995192, + "loss": 0.0749, + "rewards/chosen": 2.9920284873560856, + "rewards/margins": 8.61291565682724, + "rewards/rejected": -5.620887169471154, + "step": 1889 + }, + { + "epoch": 0.6977066125236491, + "grad_norm": 6.96875, + "kl": 1.8284282684326172, + "learning_rate": 2.1537179726145395e-06, + "logits/chosen": 275193856.0, + "logits/rejected": 200727488.0, + "logps/chosen": -419.99580078125, + "logps/rejected": -402.9980061848958, + "loss": 0.11, + "rewards/chosen": 2.6675891876220703, + "rewards/margins": 8.712098439534504, + "rewards/rejected": -6.044509251912435, + "step": 1890 + }, + { + "epoch": 0.6980757694614923, + "grad_norm": 4.5, + "kl": 0.0, + "learning_rate": 2.1488812568892263e-06, + "logits/chosen": 202894136.8888889, + "logits/rejected": 160022016.0, + "logps/chosen": -255.08230251736111, + "logps/rejected": -377.19883510044644, + "loss": 0.1015, + "rewards/chosen": 2.3910003238254123, + "rewards/margins": 8.512231402926975, + "rewards/rejected": -6.1212310791015625, + "step": 1891 + }, + { + "epoch": 0.6984449263993355, + "grad_norm": 8.0625, + "kl": 0.05467510223388672, + "learning_rate": 2.1440484913527066e-06, + "logits/chosen": 264945392.94117647, + "logits/rejected": 277396121.6, + "logps/chosen": -428.13913143382354, + "logps/rejected": -535.2615234375, + "loss": 0.0911, + "rewards/chosen": 2.4499630647547104, + "rewards/margins": 9.303072297339344, + "rewards/rejected": -6.853109232584635, + "step": 1892 + }, + { + "epoch": 0.6988140833371788, + "grad_norm": 4.0, + "kl": 0.9466266632080078, + "learning_rate": 2.1392196827007193e-06, + "logits/chosen": 144556458.66666666, + "logits/rejected": 125569987.76470588, + "logps/chosen": -297.48264973958334, + "logps/rejected": -372.5934627757353, + "loss": 0.057, + "rewards/chosen": 3.774865214029948, + "rewards/margins": 9.916206030752145, + "rewards/rejected": -6.141340816722197, + "step": 1893 + }, + { + "epoch": 0.6991832402750219, + "grad_norm": 4.75, + "kl": 0.0, + "learning_rate": 2.1343948376235146e-06, + "logits/chosen": 194202248.53333333, + "logits/rejected": 219029534.11764705, + "logps/chosen": -329.63291015625, + "logps/rejected": -420.8423713235294, + "loss": 0.087, + "rewards/chosen": 2.2693191528320313, + "rewards/margins": 8.472988532571232, + "rewards/rejected": -6.203669379739201, + "step": 1894 + }, + { + "epoch": 0.6995523972128651, + "grad_norm": 3.28125, + "kl": 0.0, + "learning_rate": 2.1295739628058567e-06, + "logits/chosen": 175871936.0, + "logits/rejected": 247275648.0, + "logps/chosen": -284.388671875, + "logps/rejected": -486.4717712402344, + "loss": 0.0475, + "rewards/chosen": 3.4464282989501953, + "rewards/margins": 10.540575504302979, + "rewards/rejected": -7.094147205352783, + "step": 1895 + }, + { + "epoch": 0.6999215541507083, + "grad_norm": 4.84375, + "kl": 0.20828676223754883, + "learning_rate": 2.1247570649270027e-06, + "logits/chosen": 196674981.6470588, + "logits/rejected": 227672951.46666667, + "logps/chosen": -293.55213120404414, + "logps/rejected": -483.15748697916666, + "loss": 0.0776, + "rewards/chosen": 3.1088225420783546, + "rewards/margins": 9.575617382573146, + "rewards/rejected": -6.466794840494791, + "step": 1896 + }, + { + "epoch": 0.7002907110885516, + "grad_norm": 6.15625, + "kl": 1.5593762397766113, + "learning_rate": 2.119944150660706e-06, + "logits/chosen": 197937710.54545453, + "logits/rejected": 259290470.4, + "logps/chosen": -405.86159446022725, + "logps/rejected": -479.7369140625, + "loss": 0.1013, + "rewards/chosen": 3.0018237720836294, + "rewards/margins": 9.383042873035778, + "rewards/rejected": -6.381219100952149, + "step": 1897 + }, + { + "epoch": 0.7006598680263947, + "grad_norm": 7.34375, + "kl": 2.1206016540527344, + "learning_rate": 2.1151352266751996e-06, + "logits/chosen": 184207009.68421054, + "logits/rejected": 114596716.3076923, + "logps/chosen": -416.84156558388156, + "logps/rejected": -453.06775841346155, + "loss": 0.145, + "rewards/chosen": 2.348549491480777, + "rewards/margins": 9.958492001058602, + "rewards/rejected": -7.609942509577825, + "step": 1898 + }, + { + "epoch": 0.7010290249642379, + "grad_norm": 5.5, + "kl": 1.0040435791015625, + "learning_rate": 2.1103302996331832e-06, + "logits/chosen": 260199224.8888889, + "logits/rejected": 136441398.85714287, + "logps/chosen": -358.10942925347223, + "logps/rejected": -358.73685128348217, + "loss": 0.0735, + "rewards/chosen": 3.4868380228678384, + "rewards/margins": 10.029635111490885, + "rewards/rejected": -6.542797088623047, + "step": 1899 + }, + { + "epoch": 0.7013981819020811, + "grad_norm": 5.375, + "kl": 0.0, + "learning_rate": 2.105529376191824e-06, + "logits/chosen": 217675622.4, + "logits/rejected": 216938914.9090909, + "logps/chosen": -456.1013671875, + "logps/rejected": -407.1402698863636, + "loss": 0.0581, + "rewards/chosen": 2.0191184997558596, + "rewards/margins": 7.795373673872515, + "rewards/rejected": -5.776255174116655, + "step": 1900 + }, + { + "epoch": 0.7017673388399244, + "grad_norm": 4.25, + "kl": 0.0, + "learning_rate": 2.1007324630027416e-06, + "logits/chosen": 153671792.0, + "logits/rejected": 188838112.0, + "logps/chosen": -270.4542236328125, + "logps/rejected": -388.3487548828125, + "loss": 0.1051, + "rewards/chosen": 2.224634885787964, + "rewards/margins": 8.639535665512085, + "rewards/rejected": -6.414900779724121, + "step": 1901 + }, + { + "epoch": 0.7021364957777675, + "grad_norm": 4.3125, + "kl": 0.0, + "learning_rate": 2.0959395667119946e-06, + "logits/chosen": 194119241.14285713, + "logits/rejected": 171671082.66666666, + "logps/chosen": -301.82320731026783, + "logps/rejected": -447.38091362847223, + "loss": 0.0687, + "rewards/chosen": 2.7478771209716797, + "rewards/margins": 8.838157018025715, + "rewards/rejected": -6.090279897054036, + "step": 1902 + }, + { + "epoch": 0.7025056527156107, + "grad_norm": 7.59375, + "kl": 1.9420709609985352, + "learning_rate": 2.091150693960083e-06, + "logits/chosen": 239211025.06666666, + "logits/rejected": 172132833.88235295, + "logps/chosen": -526.58740234375, + "logps/rejected": -400.46277573529414, + "loss": 0.0952, + "rewards/chosen": 2.504327646891276, + "rewards/margins": 8.606074598723767, + "rewards/rejected": -6.1017469518324905, + "step": 1903 + }, + { + "epoch": 0.702874809653454, + "grad_norm": 6.3125, + "kl": 1.5509862899780273, + "learning_rate": 2.0863658513819296e-06, + "logits/chosen": 235276364.8, + "logits/rejected": 351867989.3333333, + "logps/chosen": -359.85400390625, + "logps/rejected": -484.5271402994792, + "loss": 0.1125, + "rewards/chosen": 2.8017093658447267, + "rewards/margins": 9.583842468261718, + "rewards/rejected": -6.782133102416992, + "step": 1904 + }, + { + "epoch": 0.7032439665912972, + "grad_norm": 4.71875, + "kl": 0.7012090682983398, + "learning_rate": 2.0815850456068703e-06, + "logits/chosen": 139967872.0, + "logits/rejected": 209823445.33333334, + "logps/chosen": -265.56544712611606, + "logps/rejected": -332.98301866319446, + "loss": 0.0831, + "rewards/chosen": 2.2381924220493863, + "rewards/margins": 8.020117199610151, + "rewards/rejected": -5.781924777560764, + "step": 1905 + }, + { + "epoch": 0.7036131235291403, + "grad_norm": 5.40625, + "kl": 0.2014172077178955, + "learning_rate": 2.0768082832586524e-06, + "logits/chosen": 240578394.3529412, + "logits/rejected": 225672465.06666666, + "logps/chosen": -324.8343290441176, + "logps/rejected": -446.90553385416666, + "loss": 0.0998, + "rewards/chosen": 2.9591282115263096, + "rewards/margins": 9.727369076597924, + "rewards/rejected": -6.768240865071615, + "step": 1906 + }, + { + "epoch": 0.7039822804669835, + "grad_norm": 6.15625, + "kl": 1.0543155670166016, + "learning_rate": 2.072035570955421e-06, + "logits/chosen": 193902136.8888889, + "logits/rejected": 152999058.2857143, + "logps/chosen": -319.75523546006946, + "logps/rejected": -411.94663783482144, + "loss": 0.0991, + "rewards/chosen": 2.2134393056233725, + "rewards/margins": 9.27378018697103, + "rewards/rejected": -7.060340881347656, + "step": 1907 + }, + { + "epoch": 0.7043514374048268, + "grad_norm": 3.8125, + "kl": 0.3417062759399414, + "learning_rate": 2.067266915309704e-06, + "logits/chosen": 232320184.8888889, + "logits/rejected": 205803794.2857143, + "logps/chosen": -338.00390625, + "logps/rejected": -399.04178292410717, + "loss": 0.0671, + "rewards/chosen": 3.5443263583713107, + "rewards/margins": 10.222227490137493, + "rewards/rejected": -6.677901131766183, + "step": 1908 + }, + { + "epoch": 0.70472059434267, + "grad_norm": 4.28125, + "kl": 0.0, + "learning_rate": 2.062502322928417e-06, + "logits/chosen": 231128362.66666666, + "logits/rejected": 217985689.6, + "logps/chosen": -293.67104085286456, + "logps/rejected": -514.10927734375, + "loss": 0.0571, + "rewards/chosen": 2.770845095316569, + "rewards/margins": 9.681092135111491, + "rewards/rejected": -6.910247039794922, + "step": 1909 + }, + { + "epoch": 0.7050897512805131, + "grad_norm": 6.46875, + "kl": 0.0, + "learning_rate": 2.057741800412844e-06, + "logits/chosen": 222667878.4, + "logits/rejected": 184740065.88235295, + "logps/chosen": -408.5717447916667, + "logps/rejected": -556.45703125, + "loss": 0.082, + "rewards/chosen": 2.8569496154785154, + "rewards/margins": 11.333036400290098, + "rewards/rejected": -8.476086784811582, + "step": 1910 + }, + { + "epoch": 0.7054589082183563, + "grad_norm": 3.671875, + "kl": 0.0, + "learning_rate": 2.052985354358622e-06, + "logits/chosen": 202132596.36363637, + "logits/rejected": 326929456.7619048, + "logps/chosen": -360.4347478693182, + "logps/rejected": -547.1351376488095, + "loss": 0.0539, + "rewards/chosen": 3.662404840642756, + "rewards/margins": 11.07305009850176, + "rewards/rejected": -7.410645257859003, + "step": 1911 + }, + { + "epoch": 0.7058280651561996, + "grad_norm": 5.71875, + "kl": 0.0, + "learning_rate": 2.04823299135575e-06, + "logits/chosen": 186576099.55555555, + "logits/rejected": 166831305.14285713, + "logps/chosen": -372.3650716145833, + "logps/rejected": -414.4773646763393, + "loss": 0.0743, + "rewards/chosen": 3.3368837568495007, + "rewards/margins": 9.637060104854523, + "rewards/rejected": -6.300176348005023, + "step": 1912 + }, + { + "epoch": 0.7061972220940427, + "grad_norm": 6.0625, + "kl": 1.0083045959472656, + "learning_rate": 2.0434847179885687e-06, + "logits/chosen": 275299814.4, + "logits/rejected": 140593866.66666666, + "logps/chosen": -355.267919921875, + "logps/rejected": -409.7069498697917, + "loss": 0.0796, + "rewards/chosen": 3.1309268951416014, + "rewards/margins": 9.573061116536458, + "rewards/rejected": -6.4421342213948565, + "step": 1913 + }, + { + "epoch": 0.7065663790318859, + "grad_norm": 4.34375, + "kl": 0.0, + "learning_rate": 2.0387405408357464e-06, + "logits/chosen": 190766954.66666666, + "logits/rejected": 196659456.0, + "logps/chosen": -269.88527425130206, + "logps/rejected": -394.3842041015625, + "loss": 0.0702, + "rewards/chosen": 2.3561172485351562, + "rewards/margins": 9.014228057861327, + "rewards/rejected": -6.658110809326172, + "step": 1914 + }, + { + "epoch": 0.7069355359697291, + "grad_norm": 4.8125, + "kl": 0.3869476318359375, + "learning_rate": 2.034000466470283e-06, + "logits/chosen": 275809706.6666667, + "logits/rejected": 205228397.7142857, + "logps/chosen": -482.1310763888889, + "logps/rejected": -437.90956333705356, + "loss": 0.0565, + "rewards/chosen": 3.4596678415934243, + "rewards/margins": 10.464327221825009, + "rewards/rejected": -7.004659380231585, + "step": 1915 + }, + { + "epoch": 0.7073046929075724, + "grad_norm": 3.609375, + "kl": 0.0, + "learning_rate": 2.0292645014594917e-06, + "logits/chosen": 215412456.72727272, + "logits/rejected": 197005714.2857143, + "logps/chosen": -476.61985085227275, + "logps/rejected": -504.2891555059524, + "loss": 0.033, + "rewards/chosen": 2.8768173564564097, + "rewards/margins": 10.50279900521943, + "rewards/rejected": -7.6259816487630205, + "step": 1916 + }, + { + "epoch": 0.7076738498454155, + "grad_norm": 4.6875, + "kl": 0.7106161117553711, + "learning_rate": 2.0245326523649896e-06, + "logits/chosen": 302921999.0588235, + "logits/rejected": 199293576.53333333, + "logps/chosen": -321.52375344669116, + "logps/rejected": -440.3296223958333, + "loss": 0.0594, + "rewards/chosen": 3.260765748865464, + "rewards/margins": 9.950521506515203, + "rewards/rejected": -6.689755757649739, + "step": 1917 + }, + { + "epoch": 0.7080430067832587, + "grad_norm": 5.71875, + "kl": 0.0, + "learning_rate": 2.0198049257426943e-06, + "logits/chosen": 242605363.2, + "logits/rejected": 197598539.29411766, + "logps/chosen": -349.81946614583336, + "logps/rejected": -390.08498965992646, + "loss": 0.1043, + "rewards/chosen": 1.9504000345865886, + "rewards/margins": 9.407274657604741, + "rewards/rejected": -7.456874623018153, + "step": 1918 + }, + { + "epoch": 0.7084121637211019, + "grad_norm": 4.90625, + "kl": 0.26328277587890625, + "learning_rate": 2.015081328142813e-06, + "logits/chosen": 233906266.3529412, + "logits/rejected": 195190749.86666667, + "logps/chosen": -311.0184685202206, + "logps/rejected": -451.1787109375, + "loss": 0.1064, + "rewards/chosen": 2.7187531415153954, + "rewards/margins": 8.256485583735447, + "rewards/rejected": -5.537732442220052, + "step": 1919 + }, + { + "epoch": 0.7087813206589452, + "grad_norm": 6.15625, + "kl": 0.8950157165527344, + "learning_rate": 2.0103618661098274e-06, + "logits/chosen": 162587424.0, + "logits/rejected": 204542336.0, + "logps/chosen": -337.5411071777344, + "logps/rejected": -323.911865234375, + "loss": 0.1154, + "rewards/chosen": 2.793668270111084, + "rewards/margins": 9.023985862731934, + "rewards/rejected": -6.23031759262085, + "step": 1920 + }, + { + "epoch": 0.7091504775967883, + "grad_norm": 8.0, + "kl": 0.3069124221801758, + "learning_rate": 2.0056465461824932e-06, + "logits/chosen": 270130202.94736844, + "logits/rejected": 214505472.0, + "logps/chosen": -425.77803762335526, + "logps/rejected": -466.38089693509613, + "loss": 0.1245, + "rewards/chosen": 1.9351463317871094, + "rewards/margins": 9.502070206862228, + "rewards/rejected": -7.56692387507512, + "step": 1921 + }, + { + "epoch": 0.7095196345346315, + "grad_norm": 5.34375, + "kl": 0.0, + "learning_rate": 2.0009353748938277e-06, + "logits/chosen": 178743347.2, + "logits/rejected": 176975850.66666666, + "logps/chosen": -275.5816650390625, + "logps/rejected": -514.3889973958334, + "loss": 0.1041, + "rewards/chosen": 2.5834102630615234, + "rewards/margins": 9.344872792561848, + "rewards/rejected": -6.761462529500325, + "step": 1922 + }, + { + "epoch": 0.7098887914724747, + "grad_norm": 4.84375, + "kl": 0.0, + "learning_rate": 1.9962283587710962e-06, + "logits/chosen": 148956048.0, + "logits/rejected": 163765008.0, + "logps/chosen": -351.4565734863281, + "logps/rejected": -402.9085998535156, + "loss": 0.0704, + "rewards/chosen": 2.769812822341919, + "rewards/margins": 8.40341830253601, + "rewards/rejected": -5.633605480194092, + "step": 1923 + }, + { + "epoch": 0.710257948410318, + "grad_norm": 8.5, + "kl": 1.3281612396240234, + "learning_rate": 1.9915255043358105e-06, + "logits/chosen": 148844112.0, + "logits/rejected": 251195408.0, + "logps/chosen": -391.494384765625, + "logps/rejected": -369.34869384765625, + "loss": 0.136, + "rewards/chosen": 1.9688657522201538, + "rewards/margins": 8.156463503837585, + "rewards/rejected": -6.187597751617432, + "step": 1924 + }, + { + "epoch": 0.7106271053481611, + "grad_norm": 4.71875, + "kl": 0.017340660095214844, + "learning_rate": 1.9868268181037186e-06, + "logits/chosen": 200189169.7777778, + "logits/rejected": 184030317.7142857, + "logps/chosen": -345.74669053819446, + "logps/rejected": -492.35414341517856, + "loss": 0.0842, + "rewards/chosen": 2.7948248121473522, + "rewards/margins": 10.418192121717665, + "rewards/rejected": -7.6233673095703125, + "step": 1925 + }, + { + "epoch": 0.7109962622860043, + "grad_norm": 6.59375, + "kl": 0.8739814758300781, + "learning_rate": 1.9821323065847866e-06, + "logits/chosen": 174858126.2222222, + "logits/rejected": 233693275.42857143, + "logps/chosen": -288.35533311631946, + "logps/rejected": -469.9688197544643, + "loss": 0.1317, + "rewards/chosen": 2.3050409952799478, + "rewards/margins": 9.763692946661086, + "rewards/rejected": -7.458651951381138, + "step": 1926 + }, + { + "epoch": 0.7113654192238476, + "grad_norm": 6.15625, + "kl": 0.0, + "learning_rate": 1.9774419762832035e-06, + "logits/chosen": 248426060.8, + "logits/rejected": 258398208.0, + "logps/chosen": -397.96240234375, + "logps/rejected": -463.2222493489583, + "loss": 0.0883, + "rewards/chosen": 2.698749542236328, + "rewards/margins": 10.277338027954102, + "rewards/rejected": -7.578588485717773, + "step": 1927 + }, + { + "epoch": 0.7117345761616908, + "grad_norm": 8.625, + "kl": 0.0, + "learning_rate": 1.9727558336973594e-06, + "logits/chosen": 166953712.94117647, + "logits/rejected": 222781235.2, + "logps/chosen": -373.50071806066177, + "logps/rejected": -476.9498046875, + "loss": 0.0885, + "rewards/chosen": 2.1248449437758503, + "rewards/margins": 9.787028578215955, + "rewards/rejected": -7.662183634440104, + "step": 1928 + }, + { + "epoch": 0.7121037330995339, + "grad_norm": 4.4375, + "kl": 0.0, + "learning_rate": 1.968073885319849e-06, + "logits/chosen": 203901243.07692307, + "logits/rejected": 236743733.89473686, + "logps/chosen": -266.9533879206731, + "logps/rejected": -487.0498560855263, + "loss": 0.0625, + "rewards/chosen": 3.1863949115459738, + "rewards/margins": 9.443883764599017, + "rewards/rejected": -6.257488853053043, + "step": 1929 + }, + { + "epoch": 0.7124728900373771, + "grad_norm": 5.5, + "kl": 0.0, + "learning_rate": 1.963396137637448e-06, + "logits/chosen": 226853120.0, + "logits/rejected": 208155904.0, + "logps/chosen": -390.222802734375, + "logps/rejected": -444.3185628255208, + "loss": 0.0841, + "rewards/chosen": 2.760881233215332, + "rewards/margins": 9.359851010640462, + "rewards/rejected": -6.59896977742513, + "step": 1930 + }, + { + "epoch": 0.7128420469752204, + "grad_norm": 7.125, + "kl": 0.40119266510009766, + "learning_rate": 1.958722597131119e-06, + "logits/chosen": 144469915.42857143, + "logits/rejected": 262984305.7777778, + "logps/chosen": -428.85696847098217, + "logps/rejected": -445.03569878472223, + "loss": 0.0892, + "rewards/chosen": 2.525793892996652, + "rewards/margins": 9.374137636214968, + "rewards/rejected": -6.848343743218316, + "step": 1931 + }, + { + "epoch": 0.7132112039130636, + "grad_norm": 6.15625, + "kl": 0.0, + "learning_rate": 1.9540532702759944e-06, + "logits/chosen": 300666157.1764706, + "logits/rejected": 199942826.66666666, + "logps/chosen": -456.38884420955884, + "logps/rejected": -278.2496744791667, + "loss": 0.0908, + "rewards/chosen": 2.187096315271714, + "rewards/margins": 7.349308432784735, + "rewards/rejected": -5.162212117513021, + "step": 1932 + }, + { + "epoch": 0.7135803608509067, + "grad_norm": 4.8125, + "kl": 0.0, + "learning_rate": 1.949388163541364e-06, + "logits/chosen": 361066052.26666665, + "logits/rejected": 102111179.29411764, + "logps/chosen": -358.07783203125, + "logps/rejected": -415.4837431066176, + "loss": 0.0931, + "rewards/chosen": 2.704695638020833, + "rewards/margins": 8.365692916570925, + "rewards/rejected": -5.6609972785500915, + "step": 1933 + }, + { + "epoch": 0.7139495177887499, + "grad_norm": 5.15625, + "kl": 0.0, + "learning_rate": 1.944727283390675e-06, + "logits/chosen": 220751616.0, + "logits/rejected": 252872372.70588234, + "logps/chosen": -381.08470052083334, + "logps/rejected": -458.79377297794116, + "loss": 0.0655, + "rewards/chosen": 2.537303924560547, + "rewards/margins": 8.064960030948415, + "rewards/rejected": -5.527656106387868, + "step": 1934 + }, + { + "epoch": 0.7143186747265932, + "grad_norm": 5.6875, + "kl": 0.0, + "learning_rate": 1.9400706362815195e-06, + "logits/chosen": 160198058.66666666, + "logits/rejected": 219853552.94117647, + "logps/chosen": -331.6126953125, + "logps/rejected": -496.3446691176471, + "loss": 0.1013, + "rewards/chosen": 2.2537211100260417, + "rewards/margins": 10.284087895412071, + "rewards/rejected": -8.030366785386029, + "step": 1935 + }, + { + "epoch": 0.7146878316644364, + "grad_norm": 5.09375, + "kl": 1.8142404556274414, + "learning_rate": 1.9354182286656203e-06, + "logits/chosen": 214415329.88235295, + "logits/rejected": 179202662.4, + "logps/chosen": -360.91164981617646, + "logps/rejected": -389.9974609375, + "loss": 0.0784, + "rewards/chosen": 3.6012164845186123, + "rewards/margins": 8.691700595032936, + "rewards/rejected": -5.090484110514323, + "step": 1936 + }, + { + "epoch": 0.7150569886022795, + "grad_norm": 5.6875, + "kl": 1.3368611335754395, + "learning_rate": 1.9307700669888303e-06, + "logits/chosen": 248509885.2173913, + "logits/rejected": 156505543.1111111, + "logps/chosen": -353.37958559782606, + "logps/rejected": -360.2667643229167, + "loss": 0.1191, + "rewards/chosen": 2.727618341860564, + "rewards/margins": 8.713731996103185, + "rewards/rejected": -5.986113654242621, + "step": 1937 + }, + { + "epoch": 0.7154261455401227, + "grad_norm": 4.875, + "kl": 0.25182056427001953, + "learning_rate": 1.9261261576911196e-06, + "logits/chosen": 219354129.06666666, + "logits/rejected": 148370311.52941176, + "logps/chosen": -405.4877604166667, + "logps/rejected": -404.9160730698529, + "loss": 0.0654, + "rewards/chosen": 2.7761194864908854, + "rewards/margins": 8.611165244906555, + "rewards/rejected": -5.835045758415671, + "step": 1938 + }, + { + "epoch": 0.715795302477966, + "grad_norm": 7.09375, + "kl": 0.3834066390991211, + "learning_rate": 1.921486507206562e-06, + "logits/chosen": 212914858.66666666, + "logits/rejected": 218891776.0, + "logps/chosen": -340.8109654017857, + "logps/rejected": -407.62349076704544, + "loss": 0.1192, + "rewards/chosen": 2.1882296970912387, + "rewards/margins": 8.072237931288683, + "rewards/rejected": -5.884008234197443, + "step": 1939 + }, + { + "epoch": 0.7161644594158092, + "grad_norm": 5.0625, + "kl": 0.0, + "learning_rate": 1.916851121963337e-06, + "logits/chosen": 186858448.0, + "logits/rejected": 150735808.0, + "logps/chosen": -317.6627197265625, + "logps/rejected": -484.1087951660156, + "loss": 0.0848, + "rewards/chosen": 2.1850457191467285, + "rewards/margins": 11.262596607208252, + "rewards/rejected": -9.077550888061523, + "step": 1940 + }, + { + "epoch": 0.7165336163536523, + "grad_norm": 4.28125, + "kl": 0.0, + "learning_rate": 1.9122200083837134e-06, + "logits/chosen": 179930066.82352942, + "logits/rejected": 323953049.6, + "logps/chosen": -268.91067325367646, + "logps/rejected": -568.8732421875, + "loss": 0.0897, + "rewards/chosen": 2.928793514476103, + "rewards/margins": 9.63526731004902, + "rewards/rejected": -6.706473795572917, + "step": 1941 + }, + { + "epoch": 0.7169027732914955, + "grad_norm": 5.09375, + "kl": 0.0, + "learning_rate": 1.907593172884037e-06, + "logits/chosen": 288086768.9411765, + "logits/rejected": 156608836.26666668, + "logps/chosen": -411.2903262867647, + "logps/rejected": -490.55, + "loss": 0.0756, + "rewards/chosen": 2.5298141030704273, + "rewards/margins": 11.016224116905063, + "rewards/rejected": -8.486410013834636, + "step": 1942 + }, + { + "epoch": 0.7172719302293388, + "grad_norm": 6.09375, + "kl": 1.5411376953125, + "learning_rate": 1.9029706218747302e-06, + "logits/chosen": 205632857.6, + "logits/rejected": 254144277.33333334, + "logps/chosen": -333.3183837890625, + "logps/rejected": -492.1164957682292, + "loss": 0.0901, + "rewards/chosen": 2.9060726165771484, + "rewards/margins": 9.772721608479817, + "rewards/rejected": -6.866648991902669, + "step": 1943 + }, + { + "epoch": 0.717641087167182, + "grad_norm": 4.0625, + "kl": 0.0, + "learning_rate": 1.8983523617602834e-06, + "logits/chosen": 184640457.14285713, + "logits/rejected": 233771776.0, + "logps/chosen": -342.46885463169644, + "logps/rejected": -371.9724934895833, + "loss": 0.0663, + "rewards/chosen": 2.1710353578839983, + "rewards/margins": 7.911794041830396, + "rewards/rejected": -5.740758683946398, + "step": 1944 + }, + { + "epoch": 0.7180102441050251, + "grad_norm": 4.46875, + "kl": 0.0, + "learning_rate": 1.8937383989392294e-06, + "logits/chosen": 233321693.86666667, + "logits/rejected": 100491986.8235294, + "logps/chosen": -360.03489583333334, + "logps/rejected": -316.7017463235294, + "loss": 0.0738, + "rewards/chosen": 2.4162394205729165, + "rewards/margins": 9.411876753264782, + "rewards/rejected": -6.9956373326918655, + "step": 1945 + }, + { + "epoch": 0.7183794010428683, + "grad_norm": 5.375, + "kl": 0.0, + "learning_rate": 1.8891287398041591e-06, + "logits/chosen": 184467952.0, + "logits/rejected": 196561280.0, + "logps/chosen": -303.34649658203125, + "logps/rejected": -416.08795166015625, + "loss": 0.0951, + "rewards/chosen": 2.5533604621887207, + "rewards/margins": 9.280359745025635, + "rewards/rejected": -6.726999282836914, + "step": 1946 + }, + { + "epoch": 0.7187485579807116, + "grad_norm": 4.59375, + "kl": 0.0, + "learning_rate": 1.8845233907416987e-06, + "logits/chosen": 223055902.11764705, + "logits/rejected": 171782382.93333334, + "logps/chosen": -386.0620691636029, + "logps/rejected": -351.93323567708336, + "loss": 0.0602, + "rewards/chosen": 3.113920997170841, + "rewards/margins": 8.601460161396101, + "rewards/rejected": -5.48753916422526, + "step": 1947 + }, + { + "epoch": 0.7191177149185547, + "grad_norm": 5.96875, + "kl": 0.0, + "learning_rate": 1.8799223581324965e-06, + "logits/chosen": 251408745.4117647, + "logits/rejected": 220011059.2, + "logps/chosen": -360.06295955882354, + "logps/rejected": -449.2015625, + "loss": 0.106, + "rewards/chosen": 2.182321885052849, + "rewards/margins": 8.5191754210229, + "rewards/rejected": -6.336853535970052, + "step": 1948 + }, + { + "epoch": 0.7194868718563979, + "grad_norm": 5.6875, + "kl": 0.0, + "learning_rate": 1.8753256483512272e-06, + "logits/chosen": 186080862.31578946, + "logits/rejected": 267611076.92307693, + "logps/chosen": -284.4723478618421, + "logps/rejected": -394.4025691105769, + "loss": 0.0997, + "rewards/chosen": 2.1364463003058183, + "rewards/margins": 8.416010790990914, + "rewards/rejected": -6.279564490685096, + "step": 1949 + }, + { + "epoch": 0.7198560287942412, + "grad_norm": 6.65625, + "kl": 0.0, + "learning_rate": 1.8707332677665752e-06, + "logits/chosen": 332966400.0, + "logits/rejected": 267297905.7777778, + "logps/chosen": -344.90945870535717, + "logps/rejected": -506.7483723958333, + "loss": 0.112, + "rewards/chosen": 1.4673703057425362, + "rewards/margins": 7.614596427433074, + "rewards/rejected": -6.147226121690538, + "step": 1950 + }, + { + "epoch": 0.7202251857320844, + "grad_norm": 4.75, + "kl": 2.282345771789551, + "learning_rate": 1.866145222741222e-06, + "logits/chosen": 197406225.06666666, + "logits/rejected": 156951461.6470588, + "logps/chosen": -327.3379231770833, + "logps/rejected": -426.970703125, + "loss": 0.0927, + "rewards/chosen": 2.4710660298665363, + "rewards/margins": 8.284094253240847, + "rewards/rejected": -5.81302822337431, + "step": 1951 + }, + { + "epoch": 0.7205943426699275, + "grad_norm": 4.03125, + "kl": 0.0, + "learning_rate": 1.8615615196318476e-06, + "logits/chosen": 209571925.33333334, + "logits/rejected": 146826477.7142857, + "logps/chosen": -394.46601019965277, + "logps/rejected": -433.97977120535717, + "loss": 0.0542, + "rewards/chosen": 2.869706047905816, + "rewards/margins": 10.001438988579643, + "rewards/rejected": -7.131732940673828, + "step": 1952 + }, + { + "epoch": 0.7209634996077707, + "grad_norm": 3.921875, + "kl": 0.0, + "learning_rate": 1.8569821647891163e-06, + "logits/chosen": 183175728.0, + "logits/rejected": 232394773.33333334, + "logps/chosen": -362.81689453125, + "logps/rejected": -439.167724609375, + "loss": 0.0509, + "rewards/chosen": 3.231647491455078, + "rewards/margins": 10.076606114705402, + "rewards/rejected": -6.844958623250325, + "step": 1953 + }, + { + "epoch": 0.721332656545614, + "grad_norm": 4.71875, + "kl": 0.0, + "learning_rate": 1.8524071645576613e-06, + "logits/chosen": 187364368.0, + "logits/rejected": 181131488.0, + "logps/chosen": -333.1637878417969, + "logps/rejected": -450.42498779296875, + "loss": 0.0645, + "rewards/chosen": 2.8343231678009033, + "rewards/margins": 8.884578943252563, + "rewards/rejected": -6.05025577545166, + "step": 1954 + }, + { + "epoch": 0.7217018134834572, + "grad_norm": 4.0625, + "kl": 0.0, + "learning_rate": 1.847836525276091e-06, + "logits/chosen": 181083248.0, + "logits/rejected": 217940416.0, + "logps/chosen": -326.80474853515625, + "logps/rejected": -483.5968424479167, + "loss": 0.055, + "rewards/chosen": 2.265062093734741, + "rewards/margins": 9.277026255925495, + "rewards/rejected": -7.011964162190755, + "step": 1955 + }, + { + "epoch": 0.7220709704213003, + "grad_norm": 4.5, + "kl": 0.0, + "learning_rate": 1.8432702532769685e-06, + "logits/chosen": 174670967.46666667, + "logits/rejected": 176739584.0, + "logps/chosen": -306.91044921875, + "logps/rejected": -389.49500229779414, + "loss": 0.0577, + "rewards/chosen": 3.107995351155599, + "rewards/margins": 9.794059678619984, + "rewards/rejected": -6.6860643274643845, + "step": 1956 + }, + { + "epoch": 0.7224401273591435, + "grad_norm": 4.21875, + "kl": 1.0500802993774414, + "learning_rate": 1.8387083548868023e-06, + "logits/chosen": 313612662.15384614, + "logits/rejected": 290266650.94736844, + "logps/chosen": -457.33492337740387, + "logps/rejected": -493.89535361842104, + "loss": 0.0567, + "rewards/chosen": 3.15096194927509, + "rewards/margins": 10.368675957807163, + "rewards/rejected": -7.217714008532073, + "step": 1957 + }, + { + "epoch": 0.7228092842969868, + "grad_norm": 5.65625, + "kl": 0.5525369644165039, + "learning_rate": 1.8341508364260469e-06, + "logits/chosen": 192097253.0526316, + "logits/rejected": 230195830.15384614, + "logps/chosen": -373.14052220394734, + "logps/rejected": -455.29064002403845, + "loss": 0.0767, + "rewards/chosen": 2.785440344559519, + "rewards/margins": 10.562673761777067, + "rewards/rejected": -7.777233417217548, + "step": 1958 + }, + { + "epoch": 0.72317844123483, + "grad_norm": 6.78125, + "kl": 0.8182878494262695, + "learning_rate": 1.829597704209088e-06, + "logits/chosen": 180113874.82352942, + "logits/rejected": 109452654.93333334, + "logps/chosen": -407.8915441176471, + "logps/rejected": -281.07060546875, + "loss": 0.08, + "rewards/chosen": 3.3380158368278954, + "rewards/margins": 8.540854480219823, + "rewards/rejected": -5.202838643391927, + "step": 1959 + }, + { + "epoch": 0.7235475981726731, + "grad_norm": 4.96875, + "kl": 1.4094128608703613, + "learning_rate": 1.8250489645442283e-06, + "logits/chosen": 216809485.47368422, + "logits/rejected": 246860957.53846154, + "logps/chosen": -331.06113795230266, + "logps/rejected": -427.9768254206731, + "loss": 0.1112, + "rewards/chosen": 2.41390649895919, + "rewards/margins": 7.701135596765679, + "rewards/rejected": -5.28722909780649, + "step": 1960 + }, + { + "epoch": 0.7239167551105163, + "grad_norm": 5.1875, + "kl": 0.0, + "learning_rate": 1.820504623733692e-06, + "logits/chosen": 216939940.57142857, + "logits/rejected": 298678471.1111111, + "logps/chosen": -317.05643136160717, + "logps/rejected": -488.13134765625, + "loss": 0.0571, + "rewards/chosen": 2.6949770791190013, + "rewards/margins": 9.289422201731849, + "rewards/rejected": -6.594445122612847, + "step": 1961 + }, + { + "epoch": 0.7242859120483596, + "grad_norm": 5.25, + "kl": 0.0, + "learning_rate": 1.8159646880736036e-06, + "logits/chosen": 292800157.53846157, + "logits/rejected": 280564843.7894737, + "logps/chosen": -407.9113206129808, + "logps/rejected": -343.0209189967105, + "loss": 0.0593, + "rewards/chosen": 3.1352723928598256, + "rewards/margins": 8.649175558978246, + "rewards/rejected": -5.513903166118421, + "step": 1962 + }, + { + "epoch": 0.7246550689862028, + "grad_norm": 3.609375, + "kl": 0.0, + "learning_rate": 1.8114291638539883e-06, + "logits/chosen": 226092662.15384614, + "logits/rejected": 152610964.21052632, + "logps/chosen": -337.9017803485577, + "logps/rejected": -395.2838712993421, + "loss": 0.0668, + "rewards/chosen": 2.847702906681941, + "rewards/margins": 9.23207541709004, + "rewards/rejected": -6.384372510408101, + "step": 1963 + }, + { + "epoch": 0.7250242259240459, + "grad_norm": 6.78125, + "kl": 0.0, + "learning_rate": 1.8068980573587547e-06, + "logits/chosen": 193337007.15789473, + "logits/rejected": 224624167.3846154, + "logps/chosen": -302.7447060032895, + "logps/rejected": -424.3273737980769, + "loss": 0.1248, + "rewards/chosen": 2.1040211727744653, + "rewards/margins": 7.254209912257639, + "rewards/rejected": -5.150188739483173, + "step": 1964 + }, + { + "epoch": 0.7253933828618891, + "grad_norm": 4.75, + "kl": 0.0, + "learning_rate": 1.8023713748656946e-06, + "logits/chosen": 258472172.30769232, + "logits/rejected": 200186758.7368421, + "logps/chosen": -373.11568509615387, + "logps/rejected": -486.31224300986844, + "loss": 0.0651, + "rewards/chosen": 2.8837115948016825, + "rewards/margins": 9.675669249252753, + "rewards/rejected": -6.791957654451069, + "step": 1965 + }, + { + "epoch": 0.7257625397997324, + "grad_norm": 5.375, + "kl": 0.0, + "learning_rate": 1.7978491226464706e-06, + "logits/chosen": 297617426.28571427, + "logits/rejected": 150699278.2222222, + "logps/chosen": -300.3529575892857, + "logps/rejected": -337.31385633680554, + "loss": 0.0797, + "rewards/chosen": 2.253650392804827, + "rewards/margins": 8.463843602982779, + "rewards/rejected": -6.210193210177952, + "step": 1966 + }, + { + "epoch": 0.7261316967375756, + "grad_norm": 5.3125, + "kl": 0.0, + "learning_rate": 1.7933313069666026e-06, + "logits/chosen": 315698147.5555556, + "logits/rejected": 173017270.85714287, + "logps/chosen": -390.0925021701389, + "logps/rejected": -415.4541015625, + "loss": 0.074, + "rewards/chosen": 2.5073155297173395, + "rewards/margins": 9.216489004710365, + "rewards/rejected": -6.709173474993024, + "step": 1967 + }, + { + "epoch": 0.7265008536754187, + "grad_norm": 4.53125, + "kl": 0.0, + "learning_rate": 1.788817934085469e-06, + "logits/chosen": 268466806.15384614, + "logits/rejected": 225714095.15789473, + "logps/chosen": -339.1319110576923, + "logps/rejected": -459.1660670230263, + "loss": 0.0705, + "rewards/chosen": 2.680907909686749, + "rewards/margins": 8.152304908042012, + "rewards/rejected": -5.471396998355263, + "step": 1968 + }, + { + "epoch": 0.726870010613262, + "grad_norm": 8.5, + "kl": 1.5683889389038086, + "learning_rate": 1.784309010256291e-06, + "logits/chosen": 235932979.2, + "logits/rejected": 203446592.0, + "logps/chosen": -381.14970703125, + "logps/rejected": -393.1575113932292, + "loss": 0.1654, + "rewards/chosen": 1.72095947265625, + "rewards/margins": 7.722629674275716, + "rewards/rejected": -6.001670201619466, + "step": 1969 + }, + { + "epoch": 0.7272391675511052, + "grad_norm": 4.71875, + "kl": 0.0, + "learning_rate": 1.7798045417261234e-06, + "logits/chosen": 179442627.7647059, + "logits/rejected": 133775069.86666666, + "logps/chosen": -269.11399931066177, + "logps/rejected": -321.35615234375, + "loss": 0.109, + "rewards/chosen": 2.2209441241096046, + "rewards/margins": 9.273270072189032, + "rewards/rejected": -7.052325948079427, + "step": 1970 + }, + { + "epoch": 0.7276083244889484, + "grad_norm": 4.90625, + "kl": 0.0, + "learning_rate": 1.7753045347358505e-06, + "logits/chosen": 241216640.0, + "logits/rejected": 174060416.0, + "logps/chosen": -290.94384765625, + "logps/rejected": -415.8557400173611, + "loss": 0.0749, + "rewards/chosen": 2.2045722688947404, + "rewards/margins": 10.086085228692918, + "rewards/rejected": -7.881512959798177, + "step": 1971 + }, + { + "epoch": 0.7279774814267915, + "grad_norm": 4.15625, + "kl": 0.0, + "learning_rate": 1.7708089955201773e-06, + "logits/chosen": 323384618.6666667, + "logits/rejected": 197163724.8, + "logps/chosen": -396.8101399739583, + "logps/rejected": -445.17705078125, + "loss": 0.0651, + "rewards/chosen": 2.469858487447103, + "rewards/margins": 8.167914136250815, + "rewards/rejected": -5.698055648803711, + "step": 1972 + }, + { + "epoch": 0.7283466383646348, + "grad_norm": 5.375, + "kl": 0.8038606643676758, + "learning_rate": 1.7663179303076127e-06, + "logits/chosen": 372417501.8666667, + "logits/rejected": 174642311.52941176, + "logps/chosen": -444.88776041666665, + "logps/rejected": -390.8817497702206, + "loss": 0.0549, + "rewards/chosen": 3.106919860839844, + "rewards/margins": 9.256267502728631, + "rewards/rejected": -6.149347641888787, + "step": 1973 + }, + { + "epoch": 0.728715795302478, + "grad_norm": 6.53125, + "kl": 0.0, + "learning_rate": 1.7618313453204723e-06, + "logits/chosen": 215125248.0, + "logits/rejected": 174227370.66666666, + "logps/chosen": -314.31017348345586, + "logps/rejected": -469.187109375, + "loss": 0.0964, + "rewards/chosen": 2.475912655101103, + "rewards/margins": 8.700434546377144, + "rewards/rejected": -6.224521891276042, + "step": 1974 + }, + { + "epoch": 0.7290849522403212, + "grad_norm": 5.28125, + "kl": 0.0, + "learning_rate": 1.7573492467748636e-06, + "logits/chosen": 165123622.4, + "logits/rejected": 234482920.72727272, + "logps/chosen": -342.97412109375, + "logps/rejected": -411.39604048295456, + "loss": 0.0621, + "rewards/chosen": 3.3896514892578127, + "rewards/margins": 8.740463534268466, + "rewards/rejected": -5.350812045010653, + "step": 1975 + }, + { + "epoch": 0.7294541091781643, + "grad_norm": 4.75, + "kl": 0.9845809936523438, + "learning_rate": 1.752871640880675e-06, + "logits/chosen": 330316424.53333336, + "logits/rejected": 243622836.70588234, + "logps/chosen": -378.19622395833335, + "logps/rejected": -480.4811006433824, + "loss": 0.0964, + "rewards/chosen": 2.684514872233073, + "rewards/margins": 9.89557668648514, + "rewards/rejected": -7.211061814252068, + "step": 1976 + }, + { + "epoch": 0.7298232661160076, + "grad_norm": 3.890625, + "kl": 0.0, + "learning_rate": 1.7483985338415731e-06, + "logits/chosen": 238938389.33333334, + "logits/rejected": 205432192.0, + "logps/chosen": -424.0506998697917, + "logps/rejected": -493.80361328125, + "loss": 0.0494, + "rewards/chosen": 2.631830374399821, + "rewards/margins": 10.681983725229898, + "rewards/rejected": -8.050153350830078, + "step": 1977 + }, + { + "epoch": 0.7301924230538508, + "grad_norm": 6.125, + "kl": 2.497830390930176, + "learning_rate": 1.7439299318549936e-06, + "logits/chosen": 196460336.0, + "logits/rejected": 152579520.0, + "logps/chosen": -319.15435791015625, + "logps/rejected": -398.533203125, + "loss": 0.0963, + "rewards/chosen": 2.696653127670288, + "rewards/margins": 9.284360647201538, + "rewards/rejected": -6.58770751953125, + "step": 1978 + }, + { + "epoch": 0.730561579991694, + "grad_norm": 3.640625, + "kl": 0.2472209930419922, + "learning_rate": 1.739465841112125e-06, + "logits/chosen": 297867968.0, + "logits/rejected": 269101728.0, + "logps/chosen": -358.69635009765625, + "logps/rejected": -443.0214538574219, + "loss": 0.0468, + "rewards/chosen": 3.6019678115844727, + "rewards/margins": 10.397169589996338, + "rewards/rejected": -6.795201778411865, + "step": 1979 + }, + { + "epoch": 0.7309307369295371, + "grad_norm": 5.59375, + "kl": 0.0, + "learning_rate": 1.7350062677979075e-06, + "logits/chosen": 176509466.9473684, + "logits/rejected": 334718739.6923077, + "logps/chosen": -327.98362972861844, + "logps/rejected": -570.3757512019231, + "loss": 0.0889, + "rewards/chosen": 2.8887604161312708, + "rewards/margins": 11.2775854658984, + "rewards/rejected": -8.388825049767128, + "step": 1980 + }, + { + "epoch": 0.7312998938673804, + "grad_norm": 5.5625, + "kl": 0.0, + "learning_rate": 1.7305512180910244e-06, + "logits/chosen": 183463021.7142857, + "logits/rejected": 174789154.9090909, + "logps/chosen": -325.08965773809524, + "logps/rejected": -498.38356711647725, + "loss": 0.0614, + "rewards/chosen": 3.201124282110305, + "rewards/margins": 10.333662817488502, + "rewards/rejected": -7.132538535378196, + "step": 1981 + }, + { + "epoch": 0.7316690508052236, + "grad_norm": 4.3125, + "kl": 0.2650489807128906, + "learning_rate": 1.726100698163893e-06, + "logits/chosen": 183815100.63157895, + "logits/rejected": 326319773.53846157, + "logps/chosen": -311.0679224917763, + "logps/rejected": -374.52103365384613, + "loss": 0.0706, + "rewards/chosen": 3.138166728772615, + "rewards/margins": 9.209873647342327, + "rewards/rejected": -6.071706918569712, + "step": 1982 + }, + { + "epoch": 0.7320382077430668, + "grad_norm": 4.75, + "kl": 0.0, + "learning_rate": 1.7216547141826472e-06, + "logits/chosen": 218583808.0, + "logits/rejected": 178737356.8, + "logps/chosen": -248.83628216911765, + "logps/rejected": -394.4533203125, + "loss": 0.086, + "rewards/chosen": 2.903981825884651, + "rewards/margins": 8.994756870643766, + "rewards/rejected": -6.0907750447591145, + "step": 1983 + }, + { + "epoch": 0.73240736468091, + "grad_norm": 5.59375, + "kl": 0.16059017181396484, + "learning_rate": 1.7172132723071444e-06, + "logits/chosen": 202788096.0, + "logits/rejected": 202573027.55555555, + "logps/chosen": -314.49068777901783, + "logps/rejected": -437.104736328125, + "loss": 0.0868, + "rewards/chosen": 2.5483125959123885, + "rewards/margins": 8.33494386218843, + "rewards/rejected": -5.786631266276042, + "step": 1984 + }, + { + "epoch": 0.7327765216187532, + "grad_norm": 4.1875, + "kl": 0.9843502044677734, + "learning_rate": 1.7127763786909474e-06, + "logits/chosen": 233913965.7142857, + "logits/rejected": 210658645.33333334, + "logps/chosen": -299.83021763392856, + "logps/rejected": -419.4108072916667, + "loss": 0.0636, + "rewards/chosen": 3.074996130807059, + "rewards/margins": 10.008751793513223, + "rewards/rejected": -6.933755662706163, + "step": 1985 + }, + { + "epoch": 0.7331456785565964, + "grad_norm": 5.78125, + "kl": 0.0, + "learning_rate": 1.7083440394813116e-06, + "logits/chosen": 64724362.666666664, + "logits/rejected": 247452723.2, + "logps/chosen": -384.1635335286458, + "logps/rejected": -443.628173828125, + "loss": 0.0705, + "rewards/chosen": 3.3605006535847983, + "rewards/margins": 10.812923749287924, + "rewards/rejected": -7.452423095703125, + "step": 1986 + }, + { + "epoch": 0.7335148354944395, + "grad_norm": 6.25, + "kl": 0.0, + "learning_rate": 1.7039162608191895e-06, + "logits/chosen": 180926702.93333334, + "logits/rejected": 129995489.88235295, + "logps/chosen": -443.0734375, + "logps/rejected": -427.18138212316177, + "loss": 0.075, + "rewards/chosen": 2.4693280537923177, + "rewards/margins": 10.710406150069891, + "rewards/rejected": -8.241078096277574, + "step": 1987 + }, + { + "epoch": 0.7338839924322828, + "grad_norm": 5.53125, + "kl": 0.0, + "learning_rate": 1.6994930488392135e-06, + "logits/chosen": 222814902.85714287, + "logits/rejected": 259754240.0, + "logps/chosen": -328.49595424107144, + "logps/rejected": -381.0842013888889, + "loss": 0.0936, + "rewards/chosen": 2.339430945260184, + "rewards/margins": 7.662334048558795, + "rewards/rejected": -5.322903103298611, + "step": 1988 + }, + { + "epoch": 0.734253149370126, + "grad_norm": 5.84375, + "kl": 0.0, + "learning_rate": 1.6950744096696843e-06, + "logits/chosen": 191908384.0, + "logits/rejected": 220440336.0, + "logps/chosen": -300.9150390625, + "logps/rejected": -389.01922607421875, + "loss": 0.1423, + "rewards/chosen": 1.6398217678070068, + "rewards/margins": 7.163532972335815, + "rewards/rejected": -5.523711204528809, + "step": 1989 + }, + { + "epoch": 0.7346223063079692, + "grad_norm": 4.78125, + "kl": 0.0, + "learning_rate": 1.690660349432573e-06, + "logits/chosen": 231586167.46666667, + "logits/rejected": 250372065.88235295, + "logps/chosen": -347.29192708333335, + "logps/rejected": -432.8168083639706, + "loss": 0.0704, + "rewards/chosen": 2.5876068115234374, + "rewards/margins": 8.437704467773438, + "rewards/rejected": -5.85009765625, + "step": 1990 + }, + { + "epoch": 0.7349914632458123, + "grad_norm": 4.0, + "kl": 0.0, + "learning_rate": 1.6862508742435014e-06, + "logits/chosen": 172556110.76923078, + "logits/rejected": 281215164.6315789, + "logps/chosen": -337.63198617788464, + "logps/rejected": -473.4927014802632, + "loss": 0.0705, + "rewards/chosen": 2.916265634390024, + "rewards/margins": 8.641322332837804, + "rewards/rejected": -5.72505669844778, + "step": 1991 + }, + { + "epoch": 0.7353606201836556, + "grad_norm": 3.671875, + "kl": 0.0, + "learning_rate": 1.6818459902117429e-06, + "logits/chosen": 220767625.84615386, + "logits/rejected": 166514661.0526316, + "logps/chosen": -408.5750075120192, + "logps/rejected": -330.65879420230266, + "loss": 0.0459, + "rewards/chosen": 3.176727294921875, + "rewards/margins": 8.6823461432206, + "rewards/rejected": -5.505618848298726, + "step": 1992 + }, + { + "epoch": 0.7357297771214988, + "grad_norm": 6.03125, + "kl": 0.0, + "learning_rate": 1.6774457034402097e-06, + "logits/chosen": 236802951.52941176, + "logits/rejected": 202626065.06666666, + "logps/chosen": -405.49235983455884, + "logps/rejected": -464.87587890625, + "loss": 0.0789, + "rewards/chosen": 2.6426485847024357, + "rewards/margins": 9.391400415757124, + "rewards/rejected": -6.748751831054688, + "step": 1993 + }, + { + "epoch": 0.736098934059342, + "grad_norm": 4.375, + "kl": 0.0, + "learning_rate": 1.67305002002544e-06, + "logits/chosen": 184760554.66666666, + "logits/rejected": 195010432.0, + "logps/chosen": -294.2271321614583, + "logps/rejected": -475.140185546875, + "loss": 0.0667, + "rewards/chosen": 2.7120771408081055, + "rewards/margins": 9.855583763122558, + "rewards/rejected": -7.143506622314453, + "step": 1994 + }, + { + "epoch": 0.7364680909971851, + "grad_norm": 5.5625, + "kl": 0.0, + "learning_rate": 1.6686589460575992e-06, + "logits/chosen": 305472352.0, + "logits/rejected": 140496448.0, + "logps/chosen": -313.6943359375, + "logps/rejected": -324.4515075683594, + "loss": 0.1195, + "rewards/chosen": 1.9036287069320679, + "rewards/margins": 8.00057303905487, + "rewards/rejected": -6.096944332122803, + "step": 1995 + }, + { + "epoch": 0.7368372479350284, + "grad_norm": 6.5625, + "kl": 0.0, + "learning_rate": 1.6642724876204658e-06, + "logits/chosen": 181712870.4, + "logits/rejected": 315571200.0, + "logps/chosen": -284.9514404296875, + "logps/rejected": -422.90407492897725, + "loss": 0.0839, + "rewards/chosen": 3.3308353424072266, + "rewards/margins": 9.276566765525125, + "rewards/rejected": -5.9457314231178975, + "step": 1996 + }, + { + "epoch": 0.7372064048728716, + "grad_norm": 3.734375, + "kl": 0.0, + "learning_rate": 1.6598906507914214e-06, + "logits/chosen": 144628676.92307693, + "logits/rejected": 136917881.2631579, + "logps/chosen": -284.23940805288464, + "logps/rejected": -378.3180509868421, + "loss": 0.0458, + "rewards/chosen": 3.393572293795072, + "rewards/margins": 10.365710548061108, + "rewards/rejected": -6.972138254266036, + "step": 1997 + }, + { + "epoch": 0.7375755618107148, + "grad_norm": 6.125, + "kl": 0.0, + "learning_rate": 1.6555134416414426e-06, + "logits/chosen": 201869789.86666667, + "logits/rejected": 190495939.7647059, + "logps/chosen": -374.4681640625, + "logps/rejected": -369.6167566636029, + "loss": 0.0742, + "rewards/chosen": 2.4941327412923178, + "rewards/margins": 7.867177821140664, + "rewards/rejected": -5.373045079848346, + "step": 1998 + }, + { + "epoch": 0.7379447187485579, + "grad_norm": 7.0, + "kl": 0.0, + "learning_rate": 1.6511408662350993e-06, + "logits/chosen": 271803338.1052632, + "logits/rejected": 268921324.3076923, + "logps/chosen": -389.9468030427632, + "logps/rejected": -453.82718599759613, + "loss": 0.0986, + "rewards/chosen": 2.7211687188399467, + "rewards/margins": 9.597529098572519, + "rewards/rejected": -6.8763603797325725, + "step": 1999 + }, + { + "epoch": 0.7383138756864012, + "grad_norm": 4.25, + "kl": 0.5924568176269531, + "learning_rate": 1.6467729306305408e-06, + "logits/chosen": 213592497.23076922, + "logits/rejected": 194613140.21052632, + "logps/chosen": -309.06700721153845, + "logps/rejected": -486.70297080592104, + "loss": 0.0714, + "rewards/chosen": 3.1372481126051683, + "rewards/margins": 10.526519095849412, + "rewards/rejected": -7.389270983244243, + "step": 2000 + }, + { + "epoch": 0.7386830326242444, + "grad_norm": 5.28125, + "kl": 0.0, + "learning_rate": 1.6424096408794825e-06, + "logits/chosen": 201391160.8888889, + "logits/rejected": 130069878.85714285, + "logps/chosen": -331.0080837673611, + "logps/rejected": -392.10721261160717, + "loss": 0.0856, + "rewards/chosen": 2.792960272894965, + "rewards/margins": 9.192226107158358, + "rewards/rejected": -6.399265834263393, + "step": 2001 + }, + { + "epoch": 0.7390521895620876, + "grad_norm": 5.59375, + "kl": 0.0, + "learning_rate": 1.6380510030272089e-06, + "logits/chosen": 268438983.1111111, + "logits/rejected": 169453933.7142857, + "logps/chosen": -357.33018663194446, + "logps/rejected": -434.9497767857143, + "loss": 0.072, + "rewards/chosen": 3.0843755933973522, + "rewards/margins": 9.60452427939763, + "rewards/rejected": -6.520148686000279, + "step": 2002 + }, + { + "epoch": 0.7394213464999307, + "grad_norm": 4.15625, + "kl": 0.0, + "learning_rate": 1.63369702311256e-06, + "logits/chosen": 172412849.23076922, + "logits/rejected": 125573901.4736842, + "logps/chosen": -359.44388521634613, + "logps/rejected": -364.2909385279605, + "loss": 0.0732, + "rewards/chosen": 2.694271967961238, + "rewards/margins": 8.57610188225503, + "rewards/rejected": -5.8818299142937915, + "step": 2003 + }, + { + "epoch": 0.739790503437774, + "grad_norm": 6.34375, + "kl": 0.0, + "learning_rate": 1.6293477071679147e-06, + "logits/chosen": 361317216.0, + "logits/rejected": 120393240.0, + "logps/chosen": -425.313720703125, + "logps/rejected": -375.513671875, + "loss": 0.0795, + "rewards/chosen": 2.5696535110473633, + "rewards/margins": 8.76594066619873, + "rewards/rejected": -6.196287155151367, + "step": 2004 + }, + { + "epoch": 0.7401596603756172, + "grad_norm": 4.71875, + "kl": 0.0, + "learning_rate": 1.6250030612191974e-06, + "logits/chosen": 201490492.2352941, + "logits/rejected": 241929574.4, + "logps/chosen": -308.2275965073529, + "logps/rejected": -418.3667317708333, + "loss": 0.0683, + "rewards/chosen": 3.861492830164292, + "rewards/margins": 9.54660482967601, + "rewards/rejected": -5.685111999511719, + "step": 2005 + }, + { + "epoch": 0.7405288173134604, + "grad_norm": 4.46875, + "kl": 0.0, + "learning_rate": 1.6206630912858618e-06, + "logits/chosen": 216586313.14285713, + "logits/rejected": 164336284.44444445, + "logps/chosen": -285.67769949776783, + "logps/rejected": -409.0020345052083, + "loss": 0.0656, + "rewards/chosen": 2.3071561540876115, + "rewards/margins": 8.56096195039295, + "rewards/rejected": -6.253805796305339, + "step": 2006 + }, + { + "epoch": 0.7408979742513035, + "grad_norm": 6.46875, + "kl": 0.0, + "learning_rate": 1.6163278033808777e-06, + "logits/chosen": 194901611.78947368, + "logits/rejected": 148626510.76923078, + "logps/chosen": -315.2300575657895, + "logps/rejected": -447.6394606370192, + "loss": 0.1043, + "rewards/chosen": 1.950872120104338, + "rewards/margins": 9.628747345464914, + "rewards/rejected": -7.677875225360577, + "step": 2007 + }, + { + "epoch": 0.7412671311891468, + "grad_norm": 5.4375, + "kl": 2.5351457595825195, + "learning_rate": 1.6119972035107328e-06, + "logits/chosen": 242115072.0, + "logits/rejected": 183046459.07692307, + "logps/chosen": -383.6645250822368, + "logps/rejected": -356.5265925480769, + "loss": 0.1072, + "rewards/chosen": 2.8743804128546464, + "rewards/margins": 9.102488745561978, + "rewards/rejected": -6.228108332707332, + "step": 2008 + }, + { + "epoch": 0.74163628812699, + "grad_norm": 4.125, + "kl": 0.0, + "learning_rate": 1.6076712976754199e-06, + "logits/chosen": 245141728.0, + "logits/rejected": 207279840.0, + "logps/chosen": -283.381103515625, + "logps/rejected": -497.8667907714844, + "loss": 0.0819, + "rewards/chosen": 2.813563346862793, + "rewards/margins": 8.856824398040771, + "rewards/rejected": -6.0432610511779785, + "step": 2009 + }, + { + "epoch": 0.7420054450648332, + "grad_norm": 5.21875, + "kl": 1.3774967193603516, + "learning_rate": 1.6033500918684232e-06, + "logits/chosen": 217506290.52631578, + "logits/rejected": 190271409.23076922, + "logps/chosen": -357.13697574013156, + "logps/rejected": -396.33011568509613, + "loss": 0.0782, + "rewards/chosen": 2.747140181692023, + "rewards/margins": 8.131229138084752, + "rewards/rejected": -5.384088956392729, + "step": 2010 + }, + { + "epoch": 0.7423746020026764, + "grad_norm": 5.1875, + "kl": 0.5398502349853516, + "learning_rate": 1.5990335920767202e-06, + "logits/chosen": 219683384.8888889, + "logits/rejected": 192798482.2857143, + "logps/chosen": -428.6783854166667, + "logps/rejected": -323.10794503348217, + "loss": 0.0655, + "rewards/chosen": 2.8954310946994357, + "rewards/margins": 9.025215270027282, + "rewards/rejected": -6.129784175327846, + "step": 2011 + }, + { + "epoch": 0.7427437589405196, + "grad_norm": 5.375, + "kl": 0.6860685348510742, + "learning_rate": 1.5947218042807682e-06, + "logits/chosen": 271039326.31578946, + "logits/rejected": 197092982.15384614, + "logps/chosen": -370.8214689555921, + "logps/rejected": -459.287109375, + "loss": 0.092, + "rewards/chosen": 2.549482044420744, + "rewards/margins": 8.060586334722727, + "rewards/rejected": -5.511104290301983, + "step": 2012 + }, + { + "epoch": 0.7431129158783628, + "grad_norm": 4.125, + "kl": 0.0, + "learning_rate": 1.5904147344544928e-06, + "logits/chosen": 174150640.94117647, + "logits/rejected": 230866022.4, + "logps/chosen": -288.1578584558824, + "logps/rejected": -462.91484375, + "loss": 0.0885, + "rewards/chosen": 3.138924991383272, + "rewards/margins": 10.218454308603324, + "rewards/rejected": -7.079529317220052, + "step": 2013 + }, + { + "epoch": 0.743482072816206, + "grad_norm": 5.75, + "kl": 0.0, + "learning_rate": 1.5861123885652829e-06, + "logits/chosen": 246732528.94117647, + "logits/rejected": 221124983.46666667, + "logps/chosen": -312.7457490808824, + "logps/rejected": -361.0835286458333, + "loss": 0.0894, + "rewards/chosen": 2.776242873247932, + "rewards/margins": 7.8234835755591305, + "rewards/rejected": -5.047240702311198, + "step": 2014 + }, + { + "epoch": 0.7438512297540492, + "grad_norm": 6.21875, + "kl": 0.0, + "learning_rate": 1.5818147725739858e-06, + "logits/chosen": 246815772.44444445, + "logits/rejected": 235537737.14285713, + "logps/chosen": -261.94129774305554, + "logps/rejected": -472.2101353236607, + "loss": 0.1329, + "rewards/chosen": 1.9195866054958768, + "rewards/margins": 8.503754085964626, + "rewards/rejected": -6.58416748046875, + "step": 2015 + }, + { + "epoch": 0.7442203866918924, + "grad_norm": 3.890625, + "kl": 0.8639869689941406, + "learning_rate": 1.577521892434895e-06, + "logits/chosen": 208250658.13333333, + "logits/rejected": 215062467.7647059, + "logps/chosen": -339.48346354166665, + "logps/rejected": -428.42819393382354, + "loss": 0.0654, + "rewards/chosen": 3.054131825764974, + "rewards/margins": 8.912425246893191, + "rewards/rejected": -5.8582934211282165, + "step": 2016 + }, + { + "epoch": 0.7445895436297356, + "grad_norm": 5.8125, + "kl": 0.0, + "learning_rate": 1.573233754095738e-06, + "logits/chosen": 228985201.7777778, + "logits/rejected": 209506212.57142857, + "logps/chosen": -331.47357855902777, + "logps/rejected": -338.04457310267856, + "loss": 0.088, + "rewards/chosen": 2.504522959391276, + "rewards/margins": 8.704234350295295, + "rewards/rejected": -6.199711390904018, + "step": 2017 + }, + { + "epoch": 0.7449587005675788, + "grad_norm": 5.84375, + "kl": 0.0, + "learning_rate": 1.5689503634976788e-06, + "logits/chosen": 238221010.82352942, + "logits/rejected": 156762709.33333334, + "logps/chosen": -384.60793887867646, + "logps/rejected": -402.17542317708336, + "loss": 0.1014, + "rewards/chosen": 2.2589613970588234, + "rewards/margins": 8.865853283452052, + "rewards/rejected": -6.606891886393229, + "step": 2018 + }, + { + "epoch": 0.745327857505422, + "grad_norm": 4.28125, + "kl": 0.0, + "learning_rate": 1.5646717265753013e-06, + "logits/chosen": 217681840.0, + "logits/rejected": 267114624.0, + "logps/chosen": -383.573486328125, + "logps/rejected": -485.74139404296875, + "loss": 0.0528, + "rewards/chosen": 3.0307695865631104, + "rewards/margins": 9.61982798576355, + "rewards/rejected": -6.5890583992004395, + "step": 2019 + }, + { + "epoch": 0.7456970144432652, + "grad_norm": 5.625, + "kl": 0.0, + "learning_rate": 1.5603978492566002e-06, + "logits/chosen": 161109128.53333333, + "logits/rejected": 280909643.2941176, + "logps/chosen": -323.78619791666665, + "logps/rejected": -378.0980009191176, + "loss": 0.0754, + "rewards/chosen": 2.510356648763021, + "rewards/margins": 8.812264356426164, + "rewards/rejected": -6.301907707663143, + "step": 2020 + }, + { + "epoch": 0.7460661713811084, + "grad_norm": 6.53125, + "kl": 0.10735559463500977, + "learning_rate": 1.5561287374629786e-06, + "logits/chosen": 299575523.5555556, + "logits/rejected": 176786834.2857143, + "logps/chosen": -476.63829210069446, + "logps/rejected": -407.39034598214283, + "loss": 0.0985, + "rewards/chosen": 2.5047153896755643, + "rewards/margins": 8.450118897453187, + "rewards/rejected": -5.945403507777622, + "step": 2021 + }, + { + "epoch": 0.7464353283189515, + "grad_norm": 4.4375, + "kl": 0.4698352813720703, + "learning_rate": 1.551864397109239e-06, + "logits/chosen": 178531166.31578946, + "logits/rejected": 151076716.30769232, + "logps/chosen": -272.1553248355263, + "logps/rejected": -343.81869741586536, + "loss": 0.0769, + "rewards/chosen": 2.6957423561497738, + "rewards/margins": 9.327748487835471, + "rewards/rejected": -6.6320061316856975, + "step": 2022 + }, + { + "epoch": 0.7468044852567948, + "grad_norm": 4.75, + "kl": 0.0, + "learning_rate": 1.5476048341035678e-06, + "logits/chosen": 238155237.0526316, + "logits/rejected": 284767921.2307692, + "logps/chosen": -347.51182154605266, + "logps/rejected": -526.4988356370193, + "loss": 0.0865, + "rewards/chosen": 2.4573528892115544, + "rewards/margins": 8.882297253319127, + "rewards/rejected": -6.4249443641075725, + "step": 2023 + }, + { + "epoch": 0.747173642194638, + "grad_norm": 5.4375, + "kl": 0.06421756744384766, + "learning_rate": 1.5433500543475361e-06, + "logits/chosen": 216984997.6470588, + "logits/rejected": 172820155.73333332, + "logps/chosen": -272.55615234375, + "logps/rejected": -376.13824869791665, + "loss": 0.1056, + "rewards/chosen": 2.5987768734202668, + "rewards/margins": 9.406329465379901, + "rewards/rejected": -6.807552591959635, + "step": 2024 + }, + { + "epoch": 0.7475427991324812, + "grad_norm": 4.75, + "kl": 0.0, + "learning_rate": 1.5391000637360898e-06, + "logits/chosen": 151957775.05882353, + "logits/rejected": 141229849.6, + "logps/chosen": -270.2259306066176, + "logps/rejected": -495.0767578125, + "loss": 0.0918, + "rewards/chosen": 3.0420765596277572, + "rewards/margins": 9.710588522518382, + "rewards/rejected": -6.668511962890625, + "step": 2025 + }, + { + "epoch": 0.7479119560703243, + "grad_norm": 5.8125, + "kl": 0.0, + "learning_rate": 1.5348548681575332e-06, + "logits/chosen": 154007202.9090909, + "logits/rejected": 181869385.14285713, + "logps/chosen": -314.1388050426136, + "logps/rejected": -339.8056175595238, + "loss": 0.1016, + "rewards/chosen": 1.6030824834650212, + "rewards/margins": 7.811081576656986, + "rewards/rejected": -6.207999093191964, + "step": 2026 + }, + { + "epoch": 0.7482811130081676, + "grad_norm": 5.53125, + "kl": 0.0, + "learning_rate": 1.530614473493533e-06, + "logits/chosen": 210776051.2, + "logits/rejected": 295766421.3333333, + "logps/chosen": -361.5173828125, + "logps/rejected": -382.2325439453125, + "loss": 0.0851, + "rewards/chosen": 3.072002410888672, + "rewards/margins": 9.160166422526043, + "rewards/rejected": -6.08816401163737, + "step": 2027 + }, + { + "epoch": 0.7486502699460108, + "grad_norm": 6.21875, + "kl": 0.0, + "learning_rate": 1.5263788856191026e-06, + "logits/chosen": 121425648.94117647, + "logits/rejected": 164545604.26666668, + "logps/chosen": -286.2415556066176, + "logps/rejected": -464.680078125, + "loss": 0.091, + "rewards/chosen": 2.960814083323759, + "rewards/margins": 10.593226803050321, + "rewards/rejected": -7.632412719726562, + "step": 2028 + }, + { + "epoch": 0.749019426883854, + "grad_norm": 5.25, + "kl": 0.0, + "learning_rate": 1.522148110402593e-06, + "logits/chosen": 239950189.7142857, + "logits/rejected": 213130780.44444445, + "logps/chosen": -386.4874790736607, + "logps/rejected": -440.2548828125, + "loss": 0.0577, + "rewards/chosen": 2.6148643493652344, + "rewards/margins": 10.012137095133465, + "rewards/rejected": -7.3972727457682295, + "step": 2029 + }, + { + "epoch": 0.7493885838216972, + "grad_norm": 5.03125, + "kl": 0.0, + "learning_rate": 1.517922153705692e-06, + "logits/chosen": 275123765.8947368, + "logits/rejected": 140176502.15384614, + "logps/chosen": -321.8645662006579, + "logps/rejected": -395.6305964543269, + "loss": 0.0873, + "rewards/chosen": 3.057404769094367, + "rewards/margins": 9.154178356834752, + "rewards/rejected": -6.096773587740385, + "step": 2030 + }, + { + "epoch": 0.7497577407595404, + "grad_norm": 6.46875, + "kl": 0.7340707778930664, + "learning_rate": 1.5137010213834086e-06, + "logits/chosen": 301115818.6666667, + "logits/rejected": 243197805.7142857, + "logps/chosen": -456.31781684027777, + "logps/rejected": -601.7674386160714, + "loss": 0.0926, + "rewards/chosen": 2.752646976047092, + "rewards/margins": 9.42572033594525, + "rewards/rejected": -6.673073359898159, + "step": 2031 + }, + { + "epoch": 0.7501268976973836, + "grad_norm": 6.875, + "kl": 0.0, + "learning_rate": 1.5094847192840644e-06, + "logits/chosen": 210346922.66666666, + "logits/rejected": 188628684.8, + "logps/chosen": -339.1022135416667, + "logps/rejected": -462.228125, + "loss": 0.0847, + "rewards/chosen": 2.2992590268452964, + "rewards/margins": 8.005397256215414, + "rewards/rejected": -5.706138229370117, + "step": 2032 + }, + { + "epoch": 0.7504960546352268, + "grad_norm": 4.78125, + "kl": 0.0, + "learning_rate": 1.5052732532492959e-06, + "logits/chosen": 234450582.5882353, + "logits/rejected": 235398058.66666666, + "logps/chosen": -302.21263212316177, + "logps/rejected": -461.08649088541665, + "loss": 0.0911, + "rewards/chosen": 2.507434171788833, + "rewards/margins": 9.440286344640395, + "rewards/rejected": -6.932852172851563, + "step": 2033 + }, + { + "epoch": 0.75086521157307, + "grad_norm": 4.125, + "kl": 0.6207456588745117, + "learning_rate": 1.5010666291140363e-06, + "logits/chosen": 246045312.0, + "logits/rejected": 196842709.33333334, + "logps/chosen": -312.80634765625, + "logps/rejected": -403.041015625, + "loss": 0.0916, + "rewards/chosen": 3.1786407470703124, + "rewards/margins": 10.007992808024088, + "rewards/rejected": -6.829352060953776, + "step": 2034 + }, + { + "epoch": 0.75086521157307, + "eval_kl": 0.3725018799304962, + "eval_logits/chosen": 222581920.49448124, + "eval_logits/rejected": 189182194.08037826, + "eval_logps/chosen": -355.36465231788077, + "eval_logps/rejected": -444.5080526004728, + "eval_loss": 0.08018526434898376, + "eval_rewards/chosen": 2.818092059878587, + "eval_rewards/margins": 9.443531859302347, + "eval_rewards/rejected": -6.625439799423759, + "eval_runtime": 47.0121, + "eval_samples_per_second": 18.633, + "eval_steps_per_second": 4.658, + "step": 2034 + }, + { + "epoch": 0.7512343685109132, + "grad_norm": 4.0625, + "kl": 0.0, + "learning_rate": 1.4968648527065066e-06, + "logits/chosen": 180033792.0, + "logits/rejected": 165899673.6, + "logps/chosen": -383.8947323069853, + "logps/rejected": -415.67063802083334, + "loss": 0.0519, + "rewards/chosen": 3.4851859597598804, + "rewards/margins": 11.003443100873161, + "rewards/rejected": -7.518257141113281, + "step": 2035 + }, + { + "epoch": 0.7516035254487564, + "grad_norm": 4.46875, + "kl": 0.8612871170043945, + "learning_rate": 1.4926679298482172e-06, + "logits/chosen": 213675081.14285713, + "logits/rejected": 264922065.45454547, + "logps/chosen": -264.96751767113096, + "logps/rejected": -506.42631392045456, + "loss": 0.0775, + "rewards/chosen": 3.0075291224888394, + "rewards/margins": 9.141598391842532, + "rewards/rejected": -6.134069269353693, + "step": 2036 + }, + { + "epoch": 0.7519726823865996, + "grad_norm": 6.21875, + "kl": 0.0, + "learning_rate": 1.4884758663539517e-06, + "logits/chosen": 193185264.0, + "logits/rejected": 142216576.0, + "logps/chosen": -348.81512451171875, + "logps/rejected": -505.3362731933594, + "loss": 0.0961, + "rewards/chosen": 2.401319980621338, + "rewards/margins": 9.300276279449463, + "rewards/rejected": -6.898956298828125, + "step": 2037 + }, + { + "epoch": 0.7523418393244428, + "grad_norm": 4.71875, + "kl": 0.3740377426147461, + "learning_rate": 1.4842886680317592e-06, + "logits/chosen": 254732603.07692307, + "logits/rejected": 192218772.21052632, + "logps/chosen": -440.03699669471155, + "logps/rejected": -397.25868626644734, + "loss": 0.0507, + "rewards/chosen": 3.3766940190241885, + "rewards/margins": 9.44066006741543, + "rewards/rejected": -6.063966048391242, + "step": 2038 + }, + { + "epoch": 0.752710996262286, + "grad_norm": 5.90625, + "kl": 1.651082992553711, + "learning_rate": 1.4801063406829497e-06, + "logits/chosen": 230180352.0, + "logits/rejected": 192155426.13333333, + "logps/chosen": -308.53150850183823, + "logps/rejected": -497.1212565104167, + "loss": 0.0962, + "rewards/chosen": 2.829288258272059, + "rewards/margins": 10.274445627249923, + "rewards/rejected": -7.445157368977864, + "step": 2039 + }, + { + "epoch": 0.7530801532001292, + "grad_norm": 5.46875, + "kl": 0.0, + "learning_rate": 1.4759288901020875e-06, + "logits/chosen": 225217251.55555555, + "logits/rejected": 157521371.42857143, + "logps/chosen": -337.7683919270833, + "logps/rejected": -355.51583426339283, + "loss": 0.0793, + "rewards/chosen": 2.8781325022379556, + "rewards/margins": 10.407558531988235, + "rewards/rejected": -7.529426029750279, + "step": 2040 + }, + { + "epoch": 0.7534493101379725, + "grad_norm": 7.6875, + "kl": 3.023190498352051, + "learning_rate": 1.4717563220769733e-06, + "logits/chosen": 162917440.0, + "logits/rejected": 190797440.0, + "logps/chosen": -365.356396484375, + "logps/rejected": -418.6086832682292, + "loss": 0.1516, + "rewards/chosen": 2.1046653747558595, + "rewards/margins": 8.34147923787435, + "rewards/rejected": -6.236813863118489, + "step": 2041 + }, + { + "epoch": 0.7538184670758156, + "grad_norm": 4.96875, + "kl": 0.14215946197509766, + "learning_rate": 1.4675886423886488e-06, + "logits/chosen": 213293752.8888889, + "logits/rejected": 313981513.14285713, + "logps/chosen": -315.84727647569446, + "logps/rejected": -480.1629115513393, + "loss": 0.0767, + "rewards/chosen": 2.559202618069119, + "rewards/margins": 8.603661279829721, + "rewards/rejected": -6.044458661760602, + "step": 2042 + }, + { + "epoch": 0.7541876240136588, + "grad_norm": 4.875, + "kl": 0.22188377380371094, + "learning_rate": 1.4634258568113835e-06, + "logits/chosen": 203819158.5882353, + "logits/rejected": 205638929.06666666, + "logps/chosen": -349.06560202205884, + "logps/rejected": -430.22630208333334, + "loss": 0.0786, + "rewards/chosen": 2.738374149098116, + "rewards/margins": 8.824616316253064, + "rewards/rejected": -6.086242167154948, + "step": 2043 + }, + { + "epoch": 0.754556780951502, + "grad_norm": 5.03125, + "kl": 0.0, + "learning_rate": 1.45926797111266e-06, + "logits/chosen": 227693283.55555555, + "logits/rejected": 131468269.71428572, + "logps/chosen": -381.9368489583333, + "logps/rejected": -343.3383091517857, + "loss": 0.0796, + "rewards/chosen": 2.7015762329101562, + "rewards/margins": 7.832838875906808, + "rewards/rejected": -5.131262642996652, + "step": 2044 + }, + { + "epoch": 0.7549259378893453, + "grad_norm": 5.59375, + "kl": 0.9968338012695312, + "learning_rate": 1.4551149910531781e-06, + "logits/chosen": 115935718.4, + "logits/rejected": 137437696.0, + "logps/chosen": -238.299609375, + "logps/rejected": -379.4276936848958, + "loss": 0.1354, + "rewards/chosen": 2.7615686416625977, + "rewards/margins": 8.162554359436035, + "rewards/rejected": -5.4009857177734375, + "step": 2045 + }, + { + "epoch": 0.7552950948271884, + "grad_norm": 8.25, + "kl": 1.883927345275879, + "learning_rate": 1.45096692238684e-06, + "logits/chosen": 178261224.72727272, + "logits/rejected": 223078835.2, + "logps/chosen": -328.83369584517044, + "logps/rejected": -609.2728515625, + "loss": 0.1817, + "rewards/chosen": 2.150650371204723, + "rewards/margins": 11.115371287952769, + "rewards/rejected": -8.964720916748046, + "step": 2046 + }, + { + "epoch": 0.7556642517650316, + "grad_norm": 3.578125, + "kl": 0.0, + "learning_rate": 1.4468237708607397e-06, + "logits/chosen": 194048736.0, + "logits/rejected": 145393600.0, + "logps/chosen": -349.609619140625, + "logps/rejected": -295.155029296875, + "loss": 0.051, + "rewards/chosen": 3.534348487854004, + "rewards/margins": 9.521968841552734, + "rewards/rejected": -5.9876203536987305, + "step": 2047 + }, + { + "epoch": 0.7560334087028748, + "grad_norm": 4.0625, + "kl": 0.0, + "learning_rate": 1.4426855422151636e-06, + "logits/chosen": 159161344.0, + "logits/rejected": 217985792.0, + "logps/chosen": -265.79218207465277, + "logps/rejected": -622.3097098214286, + "loss": 0.0633, + "rewards/chosen": 2.90093994140625, + "rewards/margins": 12.803860255650111, + "rewards/rejected": -9.902920314243861, + "step": 2048 + }, + { + "epoch": 0.7564025656407181, + "grad_norm": 4.65625, + "kl": 0.4319491386413574, + "learning_rate": 1.4385522421835724e-06, + "logits/chosen": 176261984.0, + "logits/rejected": 215506432.0, + "logps/chosen": -315.5356140136719, + "logps/rejected": -526.6458129882812, + "loss": 0.0801, + "rewards/chosen": 2.359410285949707, + "rewards/margins": 9.383461952209473, + "rewards/rejected": -7.024051666259766, + "step": 2049 + }, + { + "epoch": 0.7567717225785612, + "grad_norm": 5.0, + "kl": 0.0, + "learning_rate": 1.4344238764926032e-06, + "logits/chosen": 188420539.73333332, + "logits/rejected": 262099636.70588234, + "logps/chosen": -337.0097981770833, + "logps/rejected": -420.10759420955884, + "loss": 0.0746, + "rewards/chosen": 2.633990732828776, + "rewards/margins": 8.575799994375192, + "rewards/rejected": -5.941809261546416, + "step": 2050 + }, + { + "epoch": 0.7571408795164044, + "grad_norm": 5.09375, + "kl": 0.0, + "learning_rate": 1.4303004508620515e-06, + "logits/chosen": 233205088.0, + "logits/rejected": 237002768.0, + "logps/chosen": -355.7894592285156, + "logps/rejected": -512.1624145507812, + "loss": 0.0896, + "rewards/chosen": 2.3242340087890625, + "rewards/margins": 9.197521686553955, + "rewards/rejected": -6.873287677764893, + "step": 2051 + }, + { + "epoch": 0.7575100364542476, + "grad_norm": 4.34375, + "kl": 0.0, + "learning_rate": 1.4261819710048725e-06, + "logits/chosen": 232735286.85714287, + "logits/rejected": 176711765.33333334, + "logps/chosen": -377.16336495535717, + "logps/rejected": -356.78900824652777, + "loss": 0.0583, + "rewards/chosen": 3.089609146118164, + "rewards/margins": 8.747755474514431, + "rewards/rejected": -5.658146328396267, + "step": 2052 + }, + { + "epoch": 0.7578791933920909, + "grad_norm": 3.390625, + "kl": 0.0, + "learning_rate": 1.4220684426271692e-06, + "logits/chosen": 172222544.0, + "logits/rejected": 186968784.0, + "logps/chosen": -365.1658020019531, + "logps/rejected": -393.4367980957031, + "loss": 0.0428, + "rewards/chosen": 3.806063413619995, + "rewards/margins": 10.241100072860718, + "rewards/rejected": -6.435036659240723, + "step": 2053 + }, + { + "epoch": 0.758248350329934, + "grad_norm": 4.84375, + "kl": 0.027656078338623047, + "learning_rate": 1.41795987142818e-06, + "logits/chosen": 139266654.31578946, + "logits/rejected": 154972800.0, + "logps/chosen": -372.5641447368421, + "logps/rejected": -348.07774939903845, + "loss": 0.076, + "rewards/chosen": 3.5347679539730676, + "rewards/margins": 9.221372967306902, + "rewards/rejected": -5.686605013333834, + "step": 2054 + }, + { + "epoch": 0.7586175072677772, + "grad_norm": 5.40625, + "kl": 0.0, + "learning_rate": 1.4138562631002794e-06, + "logits/chosen": 223540870.7368421, + "logits/rejected": 177242407.3846154, + "logps/chosen": -280.4502210115132, + "logps/rejected": -280.0652418870192, + "loss": 0.0955, + "rewards/chosen": 2.5243982013903166, + "rewards/margins": 7.863254238236771, + "rewards/rejected": -5.338856036846455, + "step": 2055 + }, + { + "epoch": 0.7589866642056204, + "grad_norm": 5.59375, + "kl": 0.0, + "learning_rate": 1.4097576233289662e-06, + "logits/chosen": 171888836.92307693, + "logits/rejected": 199917568.0, + "logps/chosen": -305.9304387019231, + "logps/rejected": -410.1089124177632, + "loss": 0.0818, + "rewards/chosen": 2.8048843970665565, + "rewards/margins": 8.80783014644978, + "rewards/rejected": -6.0029457493832235, + "step": 2056 + }, + { + "epoch": 0.7593558211434636, + "grad_norm": 5.8125, + "kl": 0.0, + "learning_rate": 1.405663957792851e-06, + "logits/chosen": 274168079.0588235, + "logits/rejected": 183380172.8, + "logps/chosen": -255.32120289522058, + "logps/rejected": -455.67864583333335, + "loss": 0.1114, + "rewards/chosen": 2.107464509851792, + "rewards/margins": 9.551300976323146, + "rewards/rejected": -7.443836466471354, + "step": 2057 + }, + { + "epoch": 0.7597249780813068, + "grad_norm": 6.3125, + "kl": 1.0240211486816406, + "learning_rate": 1.4015752721636573e-06, + "logits/chosen": 170864241.7777778, + "logits/rejected": 187520219.42857143, + "logps/chosen": -386.1383463541667, + "logps/rejected": -382.09427315848217, + "loss": 0.088, + "rewards/chosen": 2.9352493286132812, + "rewards/margins": 9.238615853445872, + "rewards/rejected": -6.303366524832589, + "step": 2058 + }, + { + "epoch": 0.76009413501915, + "grad_norm": 6.78125, + "kl": 1.558638572692871, + "learning_rate": 1.397491572106207e-06, + "logits/chosen": 238446942.31578946, + "logits/rejected": 213932662.15384614, + "logps/chosen": -366.59120579769734, + "logps/rejected": -529.6616586538462, + "loss": 0.1077, + "rewards/chosen": 2.5255470275878906, + "rewards/margins": 9.2017942575308, + "rewards/rejected": -6.676247229942908, + "step": 2059 + }, + { + "epoch": 0.7604632919569932, + "grad_norm": 4.96875, + "kl": 2.5463199615478516, + "learning_rate": 1.3934128632784132e-06, + "logits/chosen": 163795305.4117647, + "logits/rejected": 243709491.2, + "logps/chosen": -391.78857421875, + "logps/rejected": -402.69908854166664, + "loss": 0.0808, + "rewards/chosen": 3.2681958815630745, + "rewards/margins": 8.942994974173752, + "rewards/rejected": -5.6747990926106775, + "step": 2060 + }, + { + "epoch": 0.7608324488948364, + "grad_norm": 4.15625, + "kl": 0.0, + "learning_rate": 1.3893391513312759e-06, + "logits/chosen": 213951795.2, + "logits/rejected": 265808715.29411766, + "logps/chosen": -417.98310546875, + "logps/rejected": -497.3024471507353, + "loss": 0.0439, + "rewards/chosen": 3.1281728108723956, + "rewards/margins": 9.130725965312882, + "rewards/rejected": -6.002553154440487, + "step": 2061 + }, + { + "epoch": 0.7612016058326796, + "grad_norm": 6.03125, + "kl": 0.0, + "learning_rate": 1.385270441908873e-06, + "logits/chosen": 272829741.1764706, + "logits/rejected": 252228539.73333332, + "logps/chosen": -369.7796415441176, + "logps/rejected": -387.42760416666664, + "loss": 0.0969, + "rewards/chosen": 2.5946500441607308, + "rewards/margins": 9.053705626843023, + "rewards/rejected": -6.459055582682292, + "step": 2062 + }, + { + "epoch": 0.7615707627705228, + "grad_norm": 4.78125, + "kl": 0.5661907196044922, + "learning_rate": 1.381206740648347e-06, + "logits/chosen": 150051584.0, + "logits/rejected": 152990674.82352942, + "logps/chosen": -292.99498697916664, + "logps/rejected": -405.1525448069853, + "loss": 0.0832, + "rewards/chosen": 3.3124430338541666, + "rewards/margins": 9.962736271876913, + "rewards/rejected": -6.650293238022748, + "step": 2063 + }, + { + "epoch": 0.761939919708366, + "grad_norm": 4.4375, + "kl": 0.0, + "learning_rate": 1.3771480531799054e-06, + "logits/chosen": 136388480.0, + "logits/rejected": 226279808.0, + "logps/chosen": -310.494873046875, + "logps/rejected": -311.253271484375, + "loss": 0.0711, + "rewards/chosen": 2.8324178059895835, + "rewards/margins": 8.31457379659017, + "rewards/rejected": -5.482155990600586, + "step": 2064 + }, + { + "epoch": 0.7623090766462092, + "grad_norm": 6.8125, + "kl": 0.294891357421875, + "learning_rate": 1.3730943851268109e-06, + "logits/chosen": 253446629.0526316, + "logits/rejected": 214413174.15384614, + "logps/chosen": -416.80306846217104, + "logps/rejected": -407.87744140625, + "loss": 0.1162, + "rewards/chosen": 1.999752446224815, + "rewards/margins": 8.507695819684852, + "rewards/rejected": -6.507943373460036, + "step": 2065 + }, + { + "epoch": 0.7626782335840524, + "grad_norm": 6.125, + "kl": 0.0, + "learning_rate": 1.3690457421053638e-06, + "logits/chosen": 199990237.86666667, + "logits/rejected": 226780099.7647059, + "logps/chosen": -302.0140625, + "logps/rejected": -439.3986385569853, + "loss": 0.0818, + "rewards/chosen": 2.3807449340820312, + "rewards/margins": 9.178120781393613, + "rewards/rejected": -6.797375847311581, + "step": 2066 + }, + { + "epoch": 0.7630473905218956, + "grad_norm": 4.1875, + "kl": 0.10230422019958496, + "learning_rate": 1.36500212972491e-06, + "logits/chosen": 171373994.66666666, + "logits/rejected": 281909308.2352941, + "logps/chosen": -332.75999348958334, + "logps/rejected": -381.8255399816176, + "loss": 0.0633, + "rewards/chosen": 3.3810193379720053, + "rewards/margins": 9.179813803878485, + "rewards/rejected": -5.79879446590648, + "step": 2067 + }, + { + "epoch": 0.7634165474597389, + "grad_norm": 5.25, + "kl": 1.2594213485717773, + "learning_rate": 1.3609635535878246e-06, + "logits/chosen": 215665178.9473684, + "logits/rejected": 288584625.2307692, + "logps/chosen": -308.79124691611844, + "logps/rejected": -398.95425180288464, + "loss": 0.1019, + "rewards/chosen": 2.9236257452713814, + "rewards/margins": 8.806800934949868, + "rewards/rejected": -5.883175189678486, + "step": 2068 + }, + { + "epoch": 0.763785704397582, + "grad_norm": 5.8125, + "kl": 0.0, + "learning_rate": 1.3569300192895006e-06, + "logits/chosen": 168640716.8, + "logits/rejected": 197478430.11764705, + "logps/chosen": -394.95026041666665, + "logps/rejected": -426.9935087316176, + "loss": 0.0831, + "rewards/chosen": 2.2309860229492187, + "rewards/margins": 9.351334785012638, + "rewards/rejected": -7.120348762063419, + "step": 2069 + }, + { + "epoch": 0.7641548613354252, + "grad_norm": 5.71875, + "kl": 0.8533849716186523, + "learning_rate": 1.3529015324183509e-06, + "logits/chosen": 163015367.1111111, + "logits/rejected": 159042194.2857143, + "logps/chosen": -345.876708984375, + "logps/rejected": -435.34322684151783, + "loss": 0.0749, + "rewards/chosen": 3.6477737426757812, + "rewards/margins": 10.573795318603516, + "rewards/rejected": -6.926021575927734, + "step": 2070 + }, + { + "epoch": 0.7645240182732684, + "grad_norm": 5.46875, + "kl": 0.0, + "learning_rate": 1.348878098555793e-06, + "logits/chosen": 340717945.2631579, + "logits/rejected": 233613981.53846154, + "logps/chosen": -338.8394068667763, + "logps/rejected": -444.24962439903845, + "loss": 0.0903, + "rewards/chosen": 2.36551907188014, + "rewards/margins": 9.190135137272268, + "rewards/rejected": -6.824616065392127, + "step": 2071 + }, + { + "epoch": 0.7648931752111117, + "grad_norm": 2.5625, + "kl": 0.0, + "learning_rate": 1.344859723276241e-06, + "logits/chosen": 211530786.13333333, + "logits/rejected": 227574076.2352941, + "logps/chosen": -321.9653645833333, + "logps/rejected": -428.2917911305147, + "loss": 0.0491, + "rewards/chosen": 3.2689748128255207, + "rewards/margins": 10.327153434005439, + "rewards/rejected": -7.058178621179917, + "step": 2072 + }, + { + "epoch": 0.7652623321489548, + "grad_norm": 6.5, + "kl": 0.7450284957885742, + "learning_rate": 1.3408464121471048e-06, + "logits/chosen": 180038144.0, + "logits/rejected": 129549236.70588236, + "logps/chosen": -406.09625651041665, + "logps/rejected": -446.91604434742646, + "loss": 0.0807, + "rewards/chosen": 2.8983858744303386, + "rewards/margins": 9.475793711344402, + "rewards/rejected": -6.5774078369140625, + "step": 2073 + }, + { + "epoch": 0.765631489086798, + "grad_norm": 4.125, + "kl": 0.0, + "learning_rate": 1.3368381707287764e-06, + "logits/chosen": 166729192.72727272, + "logits/rejected": 222379715.04761904, + "logps/chosen": -304.86476828835225, + "logps/rejected": -398.14276413690476, + "loss": 0.0596, + "rewards/chosen": 3.1573125665838067, + "rewards/margins": 9.361361086626589, + "rewards/rejected": -6.204048520042782, + "step": 2074 + }, + { + "epoch": 0.7660006460246412, + "grad_norm": 3.0, + "kl": 0.0, + "learning_rate": 1.3328350045746213e-06, + "logits/chosen": 180408285.0909091, + "logits/rejected": 261519189.33333334, + "logps/chosen": -366.45556640625, + "logps/rejected": -556.5956101190476, + "loss": 0.0333, + "rewards/chosen": 4.227797768332741, + "rewards/margins": 12.778472933418307, + "rewards/rejected": -8.550675165085565, + "step": 2075 + }, + { + "epoch": 0.7663698029624845, + "grad_norm": 7.25, + "kl": 0.0, + "learning_rate": 1.3288369192309764e-06, + "logits/chosen": 220151628.8, + "logits/rejected": 195418794.66666666, + "logps/chosen": -345.7255615234375, + "logps/rejected": -348.382568359375, + "loss": 0.1052, + "rewards/chosen": 2.4893352508544924, + "rewards/margins": 8.073662948608398, + "rewards/rejected": -5.584327697753906, + "step": 2076 + }, + { + "epoch": 0.7667389599003276, + "grad_norm": 5.9375, + "kl": 0.0, + "learning_rate": 1.3248439202371399e-06, + "logits/chosen": 236732017.7777778, + "logits/rejected": 146512054.85714287, + "logps/chosen": -407.692626953125, + "logps/rejected": -356.70595005580356, + "loss": 0.0911, + "rewards/chosen": 2.6543295118543835, + "rewards/margins": 8.552532680450923, + "rewards/rejected": -5.89820316859654, + "step": 2077 + }, + { + "epoch": 0.7671081168381708, + "grad_norm": 6.34375, + "kl": 0.0, + "learning_rate": 1.3208560131253578e-06, + "logits/chosen": 235978550.85714287, + "logits/rejected": 201448206.2222222, + "logps/chosen": -381.7466517857143, + "logps/rejected": -510.2931857638889, + "loss": 0.1062, + "rewards/chosen": 2.0109476361955916, + "rewards/margins": 9.154503171406095, + "rewards/rejected": -7.143555535210504, + "step": 2078 + }, + { + "epoch": 0.767477273776014, + "grad_norm": 5.90625, + "kl": 0.0, + "learning_rate": 1.3168732034208264e-06, + "logits/chosen": 156878672.0, + "logits/rejected": 168989920.0, + "logps/chosen": -322.17083740234375, + "logps/rejected": -429.2776794433594, + "loss": 0.1, + "rewards/chosen": 2.799645185470581, + "rewards/margins": 9.46143889427185, + "rewards/rejected": -6.6617937088012695, + "step": 2079 + }, + { + "epoch": 0.7678464307138573, + "grad_norm": 6.59375, + "kl": 0.0, + "learning_rate": 1.3128954966416801e-06, + "logits/chosen": 180110762.66666666, + "logits/rejected": 255761993.14285713, + "logps/chosen": -283.0333658854167, + "logps/rejected": -473.80908203125, + "loss": 0.0937, + "rewards/chosen": 2.8729659186469183, + "rewards/margins": 9.355853186713325, + "rewards/rejected": -6.482887268066406, + "step": 2080 + }, + { + "epoch": 0.7682155876517004, + "grad_norm": 5.25, + "kl": 0.1291513442993164, + "learning_rate": 1.3089228982989771e-06, + "logits/chosen": 183993618.2857143, + "logits/rejected": 159394645.33333334, + "logps/chosen": -382.145751953125, + "logps/rejected": -355.3448893229167, + "loss": 0.0731, + "rewards/chosen": 2.7244581495012556, + "rewards/margins": 8.98492059253511, + "rewards/rejected": -6.2604624430338545, + "step": 2081 + }, + { + "epoch": 0.7685847445895436, + "grad_norm": 4.15625, + "kl": 0.28879213333129883, + "learning_rate": 1.3049554138967052e-06, + "logits/chosen": 214783590.4, + "logits/rejected": 170813364.70588234, + "logps/chosen": -374.72552083333335, + "logps/rejected": -405.1849149816176, + "loss": 0.0629, + "rewards/chosen": 3.441567230224609, + "rewards/margins": 9.593546115650849, + "rewards/rejected": -6.1519788854262405, + "step": 2082 + }, + { + "epoch": 0.7689539015273869, + "grad_norm": 5.9375, + "kl": 0.0, + "learning_rate": 1.3009930489317613e-06, + "logits/chosen": 186877719.27272728, + "logits/rejected": 166308864.0, + "logps/chosen": -284.6370960582386, + "logps/rejected": -433.052001953125, + "loss": 0.1302, + "rewards/chosen": 2.4233824990012427, + "rewards/margins": 9.971346213600851, + "rewards/rejected": -7.547963714599609, + "step": 2083 + }, + { + "epoch": 0.7693230584652301, + "grad_norm": 4.40625, + "kl": 0.08494281768798828, + "learning_rate": 1.2970358088939534e-06, + "logits/chosen": 236002208.0, + "logits/rejected": 178731008.0, + "logps/chosen": -283.1667785644531, + "logps/rejected": -435.9864807128906, + "loss": 0.1003, + "rewards/chosen": 2.2115607261657715, + "rewards/margins": 8.241061210632324, + "rewards/rejected": -6.029500484466553, + "step": 2084 + }, + { + "epoch": 0.7696922154030732, + "grad_norm": 5.09375, + "kl": 0.0, + "learning_rate": 1.2930836992659857e-06, + "logits/chosen": 180508882.82352942, + "logits/rejected": 141243869.86666667, + "logps/chosen": -381.9695829503676, + "logps/rejected": -413.40257161458334, + "loss": 0.0716, + "rewards/chosen": 3.0583413067985985, + "rewards/margins": 8.66882511213714, + "rewards/rejected": -5.610483805338542, + "step": 2085 + }, + { + "epoch": 0.7700613723409164, + "grad_norm": 5.25, + "kl": 0.0, + "learning_rate": 1.2891367255234566e-06, + "logits/chosen": 222490436.26666668, + "logits/rejected": 225973519.05882353, + "logps/chosen": -408.00416666666666, + "logps/rejected": -447.5673253676471, + "loss": 0.0734, + "rewards/chosen": 2.72001215616862, + "rewards/margins": 9.634247439515356, + "rewards/rejected": -6.914235283346737, + "step": 2086 + }, + { + "epoch": 0.7704305292787597, + "grad_norm": 4.78125, + "kl": 0.8841285705566406, + "learning_rate": 1.2851948931348495e-06, + "logits/chosen": 248163754.66666666, + "logits/rejected": 270095283.2, + "logps/chosen": -414.9983723958333, + "logps/rejected": -516.453466796875, + "loss": 0.0728, + "rewards/chosen": 2.5470606486002603, + "rewards/margins": 8.82170321146647, + "rewards/rejected": -6.274642562866211, + "step": 2087 + }, + { + "epoch": 0.7707996862166029, + "grad_norm": 6.28125, + "kl": 1.2821502685546875, + "learning_rate": 1.281258207561521e-06, + "logits/chosen": 244468337.7777778, + "logits/rejected": 346618916.5714286, + "logps/chosen": -403.48822699652777, + "logps/rejected": -501.03037806919644, + "loss": 0.0715, + "rewards/chosen": 2.927623536851671, + "rewards/margins": 10.206266463748992, + "rewards/rejected": -7.278642926897321, + "step": 2088 + }, + { + "epoch": 0.771168843154446, + "grad_norm": 6.53125, + "kl": 1.2869596481323242, + "learning_rate": 1.277326674257699e-06, + "logits/chosen": 242106316.8, + "logits/rejected": 230803498.66666666, + "logps/chosen": -339.240673828125, + "logps/rejected": -411.9602864583333, + "loss": 0.1169, + "rewards/chosen": 2.1568490982055666, + "rewards/margins": 9.105579566955566, + "rewards/rejected": -6.94873046875, + "step": 2089 + }, + { + "epoch": 0.7715380000922892, + "grad_norm": 3.421875, + "kl": 0.0, + "learning_rate": 1.2734002986704757e-06, + "logits/chosen": 194305536.0, + "logits/rejected": 144226073.6, + "logps/chosen": -376.774169921875, + "logps/rejected": -446.8349609375, + "loss": 0.037, + "rewards/chosen": 3.9056078592936196, + "rewards/margins": 10.618733469645182, + "rewards/rejected": -6.713125610351563, + "step": 2090 + }, + { + "epoch": 0.7719071570301325, + "grad_norm": 5.65625, + "kl": 0.0, + "learning_rate": 1.269479086239791e-06, + "logits/chosen": 205131758.93333334, + "logits/rejected": 207209456.94117647, + "logps/chosen": -312.9109375, + "logps/rejected": -342.13180721507354, + "loss": 0.0898, + "rewards/chosen": 2.493427276611328, + "rewards/margins": 8.784912333768958, + "rewards/rejected": -6.291485057157629, + "step": 2091 + }, + { + "epoch": 0.7722763139679756, + "grad_norm": 5.28125, + "kl": 0.0018100738525390625, + "learning_rate": 1.2655630423984367e-06, + "logits/chosen": 306802880.0, + "logits/rejected": 177041184.0, + "logps/chosen": -367.11859130859375, + "logps/rejected": -465.9686584472656, + "loss": 0.0972, + "rewards/chosen": 2.8651158809661865, + "rewards/margins": 9.129492998123169, + "rewards/rejected": -6.264377117156982, + "step": 2092 + }, + { + "epoch": 0.7726454709058188, + "grad_norm": 5.96875, + "kl": 0.0, + "learning_rate": 1.2616521725720427e-06, + "logits/chosen": 193731150.76923078, + "logits/rejected": 191712000.0, + "logps/chosen": -337.38900991586536, + "logps/rejected": -376.3764391447368, + "loss": 0.0922, + "rewards/chosen": 2.0855236053466797, + "rewards/margins": 8.102863612927887, + "rewards/rejected": -6.0173400075812085, + "step": 2093 + }, + { + "epoch": 0.773014627843662, + "grad_norm": 7.625, + "kl": 0.0, + "learning_rate": 1.2577464821790675e-06, + "logits/chosen": 206147584.0, + "logits/rejected": 294963825.7777778, + "logps/chosen": -345.45908900669644, + "logps/rejected": -450.9920247395833, + "loss": 0.0894, + "rewards/chosen": 2.6330977848597934, + "rewards/margins": 9.154905137561617, + "rewards/rejected": -6.521807352701823, + "step": 2094 + }, + { + "epoch": 0.7733837847815053, + "grad_norm": 5.28125, + "kl": 0.0, + "learning_rate": 1.253845976630796e-06, + "logits/chosen": 252412233.14285713, + "logits/rejected": 179950848.0, + "logps/chosen": -357.91591099330356, + "logps/rejected": -436.3151041666667, + "loss": 0.0721, + "rewards/chosen": 2.7145462036132812, + "rewards/margins": 9.370969984266493, + "rewards/rejected": -6.656423780653212, + "step": 2095 + }, + { + "epoch": 0.7737529417193484, + "grad_norm": 7.53125, + "kl": 0.6796684265136719, + "learning_rate": 1.2499506613313307e-06, + "logits/chosen": 207718424.3809524, + "logits/rejected": 188140171.63636363, + "logps/chosen": -369.4365001860119, + "logps/rejected": -396.7750799005682, + "loss": 0.1243, + "rewards/chosen": 2.846226828438895, + "rewards/margins": 8.659866035758675, + "rewards/rejected": -5.81363920731978, + "step": 2096 + }, + { + "epoch": 0.7741220986571916, + "grad_norm": 4.9375, + "kl": 0.11193275451660156, + "learning_rate": 1.2460605416775789e-06, + "logits/chosen": 150616968.53333333, + "logits/rejected": 186017249.88235295, + "logps/chosen": -348.4946614583333, + "logps/rejected": -390.0765739889706, + "loss": 0.0739, + "rewards/chosen": 2.556749216715495, + "rewards/margins": 8.583167566037645, + "rewards/rejected": -6.02641834932215, + "step": 2097 + }, + { + "epoch": 0.7744912555950348, + "grad_norm": 4.3125, + "kl": 0.0, + "learning_rate": 1.2421756230592535e-06, + "logits/chosen": 150903995.07692307, + "logits/rejected": 188929266.52631578, + "logps/chosen": -279.1123610276442, + "logps/rejected": -456.6274671052632, + "loss": 0.0548, + "rewards/chosen": 2.35666876572829, + "rewards/margins": 9.9535323617912, + "rewards/rejected": -7.596863596062911, + "step": 2098 + }, + { + "epoch": 0.7748604125328781, + "grad_norm": 5.03125, + "kl": 0.05439138412475586, + "learning_rate": 1.2382959108588627e-06, + "logits/chosen": 212691996.44444445, + "logits/rejected": 200139026.2857143, + "logps/chosen": -404.0850423177083, + "logps/rejected": -479.076904296875, + "loss": 0.0805, + "rewards/chosen": 2.9103601243760853, + "rewards/margins": 10.239476945665148, + "rewards/rejected": -7.3291168212890625, + "step": 2099 + }, + { + "epoch": 0.7752295694707212, + "grad_norm": 6.8125, + "kl": 1.180574893951416, + "learning_rate": 1.2344214104516921e-06, + "logits/chosen": 203919800.8888889, + "logits/rejected": 143610962.2857143, + "logps/chosen": -380.39716254340277, + "logps/rejected": -411.89034598214283, + "loss": 0.1093, + "rewards/chosen": 2.520215140448676, + "rewards/margins": 8.493893729315865, + "rewards/rejected": -5.9736785888671875, + "step": 2100 + }, + { + "epoch": 0.7755987264085644, + "grad_norm": 6.75, + "kl": 0.0, + "learning_rate": 1.2305521272058163e-06, + "logits/chosen": 192881273.9047619, + "logits/rejected": 192925975.27272728, + "logps/chosen": -361.28538876488096, + "logps/rejected": -429.93496981534093, + "loss": 0.1136, + "rewards/chosen": 2.6226198105585006, + "rewards/margins": 10.300451980524766, + "rewards/rejected": -7.677832169966265, + "step": 2101 + }, + { + "epoch": 0.7759678833464076, + "grad_norm": 8.625, + "kl": 0.0, + "learning_rate": 1.2266880664820797e-06, + "logits/chosen": 338889999.0588235, + "logits/rejected": 183719833.6, + "logps/chosen": -400.45226332720586, + "logps/rejected": -417.4399739583333, + "loss": 0.0706, + "rewards/chosen": 3.0873444501091454, + "rewards/margins": 9.629585355870864, + "rewards/rejected": -6.542240905761719, + "step": 2102 + }, + { + "epoch": 0.7763370402842509, + "grad_norm": 3.09375, + "kl": 0.0, + "learning_rate": 1.2228292336340857e-06, + "logits/chosen": 187327418.1818182, + "logits/rejected": 247350710.85714287, + "logps/chosen": -351.0528675426136, + "logps/rejected": -480.84109933035717, + "loss": 0.0307, + "rewards/chosen": 3.405180497602983, + "rewards/margins": 11.841831818287506, + "rewards/rejected": -8.436651320684524, + "step": 2103 + }, + { + "epoch": 0.776706197222094, + "grad_norm": 6.71875, + "kl": 0.0, + "learning_rate": 1.2189756340082004e-06, + "logits/chosen": 348575829.3333333, + "logits/rejected": 167191283.2, + "logps/chosen": -327.8875732421875, + "logps/rejected": -361.6175048828125, + "loss": 0.0905, + "rewards/chosen": 1.9615591367085774, + "rewards/margins": 8.016285165150961, + "rewards/rejected": -6.054726028442383, + "step": 2104 + }, + { + "epoch": 0.7770753541599372, + "grad_norm": 5.03125, + "kl": 0.0, + "learning_rate": 1.2151272729435376e-06, + "logits/chosen": 260810806.85714287, + "logits/rejected": 267760099.55555555, + "logps/chosen": -309.07958984375, + "logps/rejected": -436.8357204861111, + "loss": 0.1051, + "rewards/chosen": 2.1775899614606584, + "rewards/margins": 8.370444101000588, + "rewards/rejected": -6.19285413953993, + "step": 2105 + }, + { + "epoch": 0.7774445110977805, + "grad_norm": 3.8125, + "kl": 0.0, + "learning_rate": 1.2112841557719506e-06, + "logits/chosen": 246268040.53333333, + "logits/rejected": 220001656.47058824, + "logps/chosen": -357.710546875, + "logps/rejected": -443.1353975183824, + "loss": 0.0519, + "rewards/chosen": 2.883777364095052, + "rewards/margins": 10.461927496218214, + "rewards/rejected": -7.578150132123162, + "step": 2106 + }, + { + "epoch": 0.7778136680356237, + "grad_norm": 6.1875, + "kl": 1.6250643730163574, + "learning_rate": 1.207446287818031e-06, + "logits/chosen": 252142080.0, + "logits/rejected": 199786456.6153846, + "logps/chosen": -352.20839329769734, + "logps/rejected": -551.396484375, + "loss": 0.133, + "rewards/chosen": 2.484780562551398, + "rewards/margins": 10.468656130647853, + "rewards/rejected": -7.983875568096455, + "step": 2107 + }, + { + "epoch": 0.7781828249734668, + "grad_norm": 4.8125, + "kl": 0.0, + "learning_rate": 1.2036136743990968e-06, + "logits/chosen": 286539203.7647059, + "logits/rejected": 151701811.2, + "logps/chosen": -369.0139590992647, + "logps/rejected": -397.18772786458334, + "loss": 0.0807, + "rewards/chosen": 2.5256491268382355, + "rewards/margins": 8.436912147671569, + "rewards/rejected": -5.911263020833333, + "step": 2108 + }, + { + "epoch": 0.77855198191131, + "grad_norm": 3.96875, + "kl": 0.0, + "learning_rate": 1.1997863208251825e-06, + "logits/chosen": 183273834.66666666, + "logits/rejected": 200911641.6, + "logps/chosen": -313.802734375, + "logps/rejected": -401.618115234375, + "loss": 0.0496, + "rewards/chosen": 3.0420405069986978, + "rewards/margins": 10.127097574869792, + "rewards/rejected": -7.085057067871094, + "step": 2109 + }, + { + "epoch": 0.7789211388491533, + "grad_norm": 5.25, + "kl": 0.0, + "learning_rate": 1.19596423239904e-06, + "logits/chosen": 311905152.0, + "logits/rejected": 302206924.8, + "logps/chosen": -317.4608968098958, + "logps/rejected": -566.938916015625, + "loss": 0.0698, + "rewards/chosen": 2.645625432332357, + "rewards/margins": 9.32955945332845, + "rewards/rejected": -6.683934020996094, + "step": 2110 + }, + { + "epoch": 0.7792902957869965, + "grad_norm": 5.125, + "kl": 0.0, + "learning_rate": 1.1921474144161249e-06, + "logits/chosen": 218007241.14285713, + "logits/rejected": 298769578.6666667, + "logps/chosen": -347.6524135044643, + "logps/rejected": -460.4811197916667, + "loss": 0.0688, + "rewards/chosen": 2.416468484061105, + "rewards/margins": 8.380035582042876, + "rewards/rejected": -5.9635670979817705, + "step": 2111 + }, + { + "epoch": 0.7796594527248396, + "grad_norm": 4.875, + "kl": 0.0, + "learning_rate": 1.1883358721645876e-06, + "logits/chosen": 168425625.6, + "logits/rejected": 219515241.4117647, + "logps/chosen": -290.20397135416664, + "logps/rejected": -486.8756318933824, + "loss": 0.0818, + "rewards/chosen": 2.316575368245443, + "rewards/margins": 8.629978778315525, + "rewards/rejected": -6.313403410070083, + "step": 2112 + }, + { + "epoch": 0.7800286096626828, + "grad_norm": 4.125, + "kl": 0.0, + "learning_rate": 1.1845296109252724e-06, + "logits/chosen": 204439736.8888889, + "logits/rejected": 110330020.57142857, + "logps/chosen": -359.19249131944446, + "logps/rejected": -300.0165318080357, + "loss": 0.0589, + "rewards/chosen": 3.3416536119249134, + "rewards/margins": 9.218757689945281, + "rewards/rejected": -5.877104078020368, + "step": 2113 + }, + { + "epoch": 0.7803977666005261, + "grad_norm": 4.09375, + "kl": 0.0, + "learning_rate": 1.180728635971708e-06, + "logits/chosen": 118090163.2, + "logits/rejected": 178698682.1818182, + "logps/chosen": -252.5218017578125, + "logps/rejected": -405.69375887784093, + "loss": 0.0792, + "rewards/chosen": 2.7231502532958984, + "rewards/margins": 8.526918064464223, + "rewards/rejected": -5.803767811168324, + "step": 2114 + }, + { + "epoch": 0.7807669235383693, + "grad_norm": 4.78125, + "kl": 0.04145193099975586, + "learning_rate": 1.1769329525700934e-06, + "logits/chosen": 241724455.3846154, + "logits/rejected": 175140796.63157895, + "logps/chosen": -387.5441331129808, + "logps/rejected": -365.9119294819079, + "loss": 0.0786, + "rewards/chosen": 3.1453475952148438, + "rewards/margins": 9.043391177528783, + "rewards/rejected": -5.898043582313939, + "step": 2115 + }, + { + "epoch": 0.7811360804762124, + "grad_norm": 6.59375, + "kl": 0.0, + "learning_rate": 1.1731425659793028e-06, + "logits/chosen": 248982357.33333334, + "logits/rejected": 258592329.14285713, + "logps/chosen": -412.818115234375, + "logps/rejected": -373.06612723214283, + "loss": 0.0903, + "rewards/chosen": 2.200332429673937, + "rewards/margins": 8.347198819357253, + "rewards/rejected": -6.146866389683315, + "step": 2116 + }, + { + "epoch": 0.7815052374140556, + "grad_norm": 4.59375, + "kl": 0.0, + "learning_rate": 1.1693574814508657e-06, + "logits/chosen": 219747669.33333334, + "logits/rejected": 200676080.94117647, + "logps/chosen": -319.65930989583336, + "logps/rejected": -394.03728170955884, + "loss": 0.0579, + "rewards/chosen": 3.566841634114583, + "rewards/margins": 10.180888067507276, + "rewards/rejected": -6.614046433392693, + "step": 2117 + }, + { + "epoch": 0.7818743943518989, + "grad_norm": 4.8125, + "kl": 0.976801872253418, + "learning_rate": 1.1655777042289724e-06, + "logits/chosen": 200834116.26666668, + "logits/rejected": 159882044.2352941, + "logps/chosen": -336.8942057291667, + "logps/rejected": -425.50338924632354, + "loss": 0.0649, + "rewards/chosen": 3.0952430725097657, + "rewards/margins": 10.903081961239085, + "rewards/rejected": -7.80783888872932, + "step": 2118 + }, + { + "epoch": 0.7822435512897421, + "grad_norm": 5.5, + "kl": 2.952413558959961, + "learning_rate": 1.161803239550452e-06, + "logits/chosen": 189391925.89473686, + "logits/rejected": 286817280.0, + "logps/chosen": -376.29893092105266, + "logps/rejected": -390.9596604567308, + "loss": 0.0885, + "rewards/chosen": 2.890784213417455, + "rewards/margins": 8.468199092849547, + "rewards/rejected": -5.577414879432092, + "step": 2119 + }, + { + "epoch": 0.7826127082275852, + "grad_norm": 5.0625, + "kl": 0.0, + "learning_rate": 1.1580340926447797e-06, + "logits/chosen": 346093252.9230769, + "logits/rejected": 236307402.10526314, + "logps/chosen": -381.1506535456731, + "logps/rejected": -428.7721011513158, + "loss": 0.0775, + "rewards/chosen": 2.416669405423678, + "rewards/margins": 10.171270331873103, + "rewards/rejected": -7.754600926449425, + "step": 2120 + }, + { + "epoch": 0.7829818651654284, + "grad_norm": 5.1875, + "kl": 0.0, + "learning_rate": 1.1542702687340612e-06, + "logits/chosen": 347996348.6315789, + "logits/rejected": 190685991.3846154, + "logps/chosen": -372.64535361842104, + "logps/rejected": -465.52291165865387, + "loss": 0.0855, + "rewards/chosen": 2.838553177682977, + "rewards/margins": 9.024519082505694, + "rewards/rejected": -6.185965904822717, + "step": 2121 + }, + { + "epoch": 0.7833510221032717, + "grad_norm": 6.15625, + "kl": 0.0, + "learning_rate": 1.150511773033025e-06, + "logits/chosen": 333376808.42105263, + "logits/rejected": 225278542.76923078, + "logps/chosen": -405.7284642269737, + "logps/rejected": -441.1944110576923, + "loss": 0.1215, + "rewards/chosen": 2.238503104762027, + "rewards/margins": 8.346721958052292, + "rewards/rejected": -6.108218853290264, + "step": 2122 + }, + { + "epoch": 0.7837201790411149, + "grad_norm": 4.5625, + "kl": 0.0, + "learning_rate": 1.1467586107490202e-06, + "logits/chosen": 244632832.0, + "logits/rejected": 245919534.54545453, + "logps/chosen": -366.825390625, + "logps/rejected": -544.9570756392045, + "loss": 0.0499, + "rewards/chosen": 3.126139831542969, + "rewards/margins": 10.751822315562855, + "rewards/rejected": -7.625682484019887, + "step": 2123 + }, + { + "epoch": 0.784089335978958, + "grad_norm": 4.28125, + "kl": 0.2247905731201172, + "learning_rate": 1.143010787082006e-06, + "logits/chosen": 152488391.1111111, + "logits/rejected": 220083090.2857143, + "logps/chosen": -323.9550509982639, + "logps/rejected": -471.21714564732144, + "loss": 0.05, + "rewards/chosen": 3.8814794752332897, + "rewards/margins": 10.402191101558625, + "rewards/rejected": -6.520711626325335, + "step": 2124 + }, + { + "epoch": 0.7844584929168013, + "grad_norm": 3.015625, + "kl": 0.0, + "learning_rate": 1.139268307224543e-06, + "logits/chosen": 163520128.0, + "logits/rejected": 267332560.0, + "logps/chosen": -320.4345703125, + "logps/rejected": -639.7425537109375, + "loss": 0.0343, + "rewards/chosen": 3.6419434547424316, + "rewards/margins": 13.624903202056885, + "rewards/rejected": -9.982959747314453, + "step": 2125 + }, + { + "epoch": 0.7848276498546445, + "grad_norm": 6.96875, + "kl": 0.0, + "learning_rate": 1.13553117636179e-06, + "logits/chosen": 250492928.0, + "logits/rejected": 205055812.26666668, + "logps/chosen": -513.583984375, + "logps/rejected": -444.9342447916667, + "loss": 0.1103, + "rewards/chosen": 2.547796361586627, + "rewards/margins": 9.08488437428194, + "rewards/rejected": -6.537088012695312, + "step": 2126 + }, + { + "epoch": 0.7851968067924877, + "grad_norm": 6.5625, + "kl": 0.0, + "learning_rate": 1.131799399671496e-06, + "logits/chosen": 223805952.0, + "logits/rejected": 235039274.66666666, + "logps/chosen": -355.6201171875, + "logps/rejected": -462.3304036458333, + "loss": 0.0847, + "rewards/chosen": 2.8675201416015623, + "rewards/margins": 9.988936233520509, + "rewards/rejected": -7.121416091918945, + "step": 2127 + }, + { + "epoch": 0.7855659637303308, + "grad_norm": 5.6875, + "kl": 0.0, + "learning_rate": 1.1280729823239872e-06, + "logits/chosen": 274388460.3076923, + "logits/rejected": 169137704.42105263, + "logps/chosen": -363.0997971754808, + "logps/rejected": -348.47286184210526, + "loss": 0.0914, + "rewards/chosen": 2.2696546407846303, + "rewards/margins": 7.817080964926284, + "rewards/rejected": -5.547426324141653, + "step": 2128 + }, + { + "epoch": 0.7859351206681741, + "grad_norm": 6.09375, + "kl": 0.0, + "learning_rate": 1.1243519294821693e-06, + "logits/chosen": 197717699.04761904, + "logits/rejected": 205322938.1818182, + "logps/chosen": -354.7519298735119, + "logps/rejected": -570.8431729403409, + "loss": 0.0937, + "rewards/chosen": 2.4541407993861606, + "rewards/margins": 9.583352918748732, + "rewards/rejected": -7.129212119362571, + "step": 2129 + }, + { + "epoch": 0.7863042776060173, + "grad_norm": 5.15625, + "kl": 0.0, + "learning_rate": 1.1206362463015146e-06, + "logits/chosen": 262660147.2, + "logits/rejected": 151237135.05882353, + "logps/chosen": -324.934765625, + "logps/rejected": -514.3410500919117, + "loss": 0.0752, + "rewards/chosen": 2.130934651692708, + "rewards/margins": 9.616218447217754, + "rewards/rejected": -7.485283795525046, + "step": 2130 + }, + { + "epoch": 0.7866734345438604, + "grad_norm": 4.59375, + "kl": 0.0, + "learning_rate": 1.1169259379300524e-06, + "logits/chosen": 315403136.0, + "logits/rejected": 234115072.0, + "logps/chosen": -277.9078776041667, + "logps/rejected": -429.846142578125, + "loss": 0.0642, + "rewards/chosen": 2.768209457397461, + "rewards/margins": 8.643983840942383, + "rewards/rejected": -5.875774383544922, + "step": 2131 + }, + { + "epoch": 0.7870425914817036, + "grad_norm": 6.15625, + "kl": 0.160491943359375, + "learning_rate": 1.1132210095083696e-06, + "logits/chosen": 138048668.44444445, + "logits/rejected": 177476096.0, + "logps/chosen": -346.2598470052083, + "logps/rejected": -502.7974330357143, + "loss": 0.0807, + "rewards/chosen": 2.379980934990777, + "rewards/margins": 9.985051321604896, + "rewards/rejected": -7.605070386614118, + "step": 2132 + }, + { + "epoch": 0.7874117484195469, + "grad_norm": 5.53125, + "kl": 0.0, + "learning_rate": 1.1095214661695985e-06, + "logits/chosen": 213273856.0, + "logits/rejected": 153304896.0, + "logps/chosen": -339.1483154296875, + "logps/rejected": -370.2804260253906, + "loss": 0.085, + "rewards/chosen": 2.5116398334503174, + "rewards/margins": 8.749587297439575, + "rewards/rejected": -6.237947463989258, + "step": 2133 + }, + { + "epoch": 0.7877809053573901, + "grad_norm": 5.25, + "kl": 0.0, + "learning_rate": 1.1058273130394075e-06, + "logits/chosen": 289614677.3333333, + "logits/rejected": 221016395.29411766, + "logps/chosen": -431.94720052083335, + "logps/rejected": -424.59406594669116, + "loss": 0.0662, + "rewards/chosen": 2.5075963338216147, + "rewards/margins": 8.806703156115962, + "rewards/rejected": -6.299106822294347, + "step": 2134 + }, + { + "epoch": 0.7881500622952332, + "grad_norm": 4.46875, + "kl": 1.9527454376220703, + "learning_rate": 1.1021385552359982e-06, + "logits/chosen": 258511392.0, + "logits/rejected": 278840928.0, + "logps/chosen": -325.1587219238281, + "logps/rejected": -654.6736450195312, + "loss": 0.0957, + "rewards/chosen": 2.5751848220825195, + "rewards/margins": 11.568174362182617, + "rewards/rejected": -8.992989540100098, + "step": 2135 + }, + { + "epoch": 0.7885192192330764, + "grad_norm": 4.875, + "kl": 0.0, + "learning_rate": 1.0984551978701001e-06, + "logits/chosen": 203875041.88235295, + "logits/rejected": 201416533.33333334, + "logps/chosen": -357.61891084558823, + "logps/rejected": -404.30377604166665, + "loss": 0.0641, + "rewards/chosen": 3.5338421989889706, + "rewards/margins": 10.153275284112668, + "rewards/rejected": -6.619433085123698, + "step": 2136 + }, + { + "epoch": 0.7888883761709197, + "grad_norm": 5.84375, + "kl": 0.8565759658813477, + "learning_rate": 1.0947772460449558e-06, + "logits/chosen": 274005162.6666667, + "logits/rejected": 157159412.36363637, + "logps/chosen": -367.9280598958333, + "logps/rejected": -484.76620205965907, + "loss": 0.1091, + "rewards/chosen": 2.5040797278994607, + "rewards/margins": 10.557810085676449, + "rewards/rejected": -8.053730357776988, + "step": 2137 + }, + { + "epoch": 0.7892575331087629, + "grad_norm": 5.46875, + "kl": 1.3555335998535156, + "learning_rate": 1.0911047048563212e-06, + "logits/chosen": 193745042.2857143, + "logits/rejected": 153038264.8888889, + "logps/chosen": -267.91367885044644, + "logps/rejected": -370.66460503472223, + "loss": 0.1107, + "rewards/chosen": 2.260298047746931, + "rewards/margins": 8.278738294328962, + "rewards/rejected": -6.018440246582031, + "step": 2138 + }, + { + "epoch": 0.789626690046606, + "grad_norm": 9.25, + "kl": 1.776442527770996, + "learning_rate": 1.0874375793924575e-06, + "logits/chosen": 164086406.7368421, + "logits/rejected": 197760571.07692307, + "logps/chosen": -383.94497841282896, + "logps/rejected": -379.28793569711536, + "loss": 0.1521, + "rewards/chosen": 2.5808209870990955, + "rewards/margins": 8.377292802941945, + "rewards/rejected": -5.796471815842849, + "step": 2139 + }, + { + "epoch": 0.7899958469844492, + "grad_norm": 5.4375, + "kl": 0.7060518264770508, + "learning_rate": 1.0837758747341176e-06, + "logits/chosen": 208033056.0, + "logits/rejected": 212175312.0, + "logps/chosen": -318.814697265625, + "logps/rejected": -476.04443359375, + "loss": 0.1184, + "rewards/chosen": 2.0320534706115723, + "rewards/margins": 10.050857067108154, + "rewards/rejected": -8.018803596496582, + "step": 2140 + }, + { + "epoch": 0.7903650039222925, + "grad_norm": 4.5625, + "kl": 0.0, + "learning_rate": 1.0801195959545486e-06, + "logits/chosen": 235024448.0, + "logits/rejected": 199646480.0, + "logps/chosen": -342.8032531738281, + "logps/rejected": -344.89361572265625, + "loss": 0.0809, + "rewards/chosen": 2.5856175422668457, + "rewards/margins": 8.66103982925415, + "rewards/rejected": -6.075422286987305, + "step": 2141 + }, + { + "epoch": 0.7907341608601357, + "grad_norm": 4.03125, + "kl": 0.0, + "learning_rate": 1.0764687481194786e-06, + "logits/chosen": 184733006.76923078, + "logits/rejected": 176828793.2631579, + "logps/chosen": -393.90380859375, + "logps/rejected": -402.30250308388156, + "loss": 0.0538, + "rewards/chosen": 3.010088407076322, + "rewards/margins": 9.010292794540343, + "rewards/rejected": -6.000204387464021, + "step": 2142 + }, + { + "epoch": 0.7911033177979788, + "grad_norm": 3.328125, + "kl": 0.0, + "learning_rate": 1.0728233362871087e-06, + "logits/chosen": 315782400.0, + "logits/rejected": 235634816.0, + "logps/chosen": -283.99114990234375, + "logps/rejected": -486.1525065104167, + "loss": 0.0356, + "rewards/chosen": 3.1858506202697754, + "rewards/margins": 9.124207655588787, + "rewards/rejected": -5.938357035319011, + "step": 2143 + }, + { + "epoch": 0.791472474735822, + "grad_norm": 6.53125, + "kl": 1.6356334686279297, + "learning_rate": 1.0691833655081124e-06, + "logits/chosen": 238442704.0, + "logits/rejected": 301749216.0, + "logps/chosen": -418.55499267578125, + "logps/rejected": -386.3564758300781, + "loss": 0.0748, + "rewards/chosen": 2.8683993816375732, + "rewards/margins": 8.976774454116821, + "rewards/rejected": -6.108375072479248, + "step": 2144 + }, + { + "epoch": 0.7918416316736653, + "grad_norm": 6.875, + "kl": 0.6936721801757812, + "learning_rate": 1.0655488408256243e-06, + "logits/chosen": 243846180.57142857, + "logits/rejected": 171288177.7777778, + "logps/chosen": -335.92159598214283, + "logps/rejected": -434.02308485243054, + "loss": 0.1034, + "rewards/chosen": 2.402036394391741, + "rewards/margins": 9.717931474958148, + "rewards/rejected": -7.315895080566406, + "step": 2145 + }, + { + "epoch": 0.7922107886115085, + "grad_norm": 5.21875, + "kl": 0.8618106842041016, + "learning_rate": 1.0619197672752285e-06, + "logits/chosen": 169415488.0, + "logits/rejected": 242550016.0, + "logps/chosen": -380.1358947753906, + "logps/rejected": -590.2781372070312, + "loss": 0.0592, + "rewards/chosen": 3.306767702102661, + "rewards/margins": 12.822512865066528, + "rewards/rejected": -9.515745162963867, + "step": 2146 + }, + { + "epoch": 0.7925799455493516, + "grad_norm": 5.375, + "kl": 0.0, + "learning_rate": 1.058296149884963e-06, + "logits/chosen": 203051064.8888889, + "logits/rejected": 209635181.7142857, + "logps/chosen": -400.7217610677083, + "logps/rejected": -450.19517299107144, + "loss": 0.0771, + "rewards/chosen": 2.5405201382107205, + "rewards/margins": 9.965497032044427, + "rewards/rejected": -7.424976893833706, + "step": 2147 + }, + { + "epoch": 0.7929491024871949, + "grad_norm": 5.65625, + "kl": 0.0, + "learning_rate": 1.0546779936753037e-06, + "logits/chosen": 271380058.35294116, + "logits/rejected": 186583022.93333334, + "logps/chosen": -273.2796415441176, + "logps/rejected": -456.1283854166667, + "loss": 0.1159, + "rewards/chosen": 2.032319461598116, + "rewards/margins": 8.000726647470511, + "rewards/rejected": -5.968407185872396, + "step": 2148 + }, + { + "epoch": 0.7933182594250381, + "grad_norm": 4.78125, + "kl": 0.0, + "learning_rate": 1.0510653036591583e-06, + "logits/chosen": 213681038.2222222, + "logits/rejected": 238516357.5652174, + "logps/chosen": -392.2087673611111, + "logps/rejected": -468.30693783967394, + "loss": 0.0585, + "rewards/chosen": 1.8088077969021268, + "rewards/margins": 8.232046799959193, + "rewards/rejected": -6.423239003057065, + "step": 2149 + }, + { + "epoch": 0.7936874163628813, + "grad_norm": 4.15625, + "kl": 0.001331329345703125, + "learning_rate": 1.0474580848418643e-06, + "logits/chosen": 262906587.42857143, + "logits/rejected": 133138332.44444445, + "logps/chosen": -282.92171805245533, + "logps/rejected": -354.0216471354167, + "loss": 0.0652, + "rewards/chosen": 2.3500238146100725, + "rewards/margins": 9.530610735454257, + "rewards/rejected": -7.180586920844184, + "step": 2150 + }, + { + "epoch": 0.7940565733007244, + "grad_norm": 4.375, + "kl": 0.9958133697509766, + "learning_rate": 1.0438563422211784e-06, + "logits/chosen": 193040412.44444445, + "logits/rejected": 239555309.7142857, + "logps/chosen": -337.03773328993054, + "logps/rejected": -422.9397670200893, + "loss": 0.0682, + "rewards/chosen": 2.7725401984320746, + "rewards/margins": 9.200185200524707, + "rewards/rejected": -6.427645002092634, + "step": 2151 + }, + { + "epoch": 0.7944257302385677, + "grad_norm": 5.34375, + "kl": 0.179107666015625, + "learning_rate": 1.0402600807872676e-06, + "logits/chosen": 205026880.0, + "logits/rejected": 260176960.0, + "logps/chosen": -333.47442626953125, + "logps/rejected": -532.892333984375, + "loss": 0.0885, + "rewards/chosen": 2.300112724304199, + "rewards/margins": 9.958656311035156, + "rewards/rejected": -7.658543586730957, + "step": 2152 + }, + { + "epoch": 0.7947948871764109, + "grad_norm": 4.15625, + "kl": 0.0, + "learning_rate": 1.0366693055227063e-06, + "logits/chosen": 197267939.55555555, + "logits/rejected": 171165421.7142857, + "logps/chosen": -312.53046332465277, + "logps/rejected": -394.36502511160717, + "loss": 0.087, + "rewards/chosen": 2.7227613661024304, + "rewards/margins": 8.572101532466828, + "rewards/rejected": -5.849340166364398, + "step": 2153 + }, + { + "epoch": 0.7951640441142541, + "grad_norm": 8.625, + "kl": 0.5175771713256836, + "learning_rate": 1.033084021402468e-06, + "logits/chosen": 240101102.93333334, + "logits/rejected": 174183273.4117647, + "logps/chosen": -435.95745442708335, + "logps/rejected": -366.68577665441177, + "loss": 0.1378, + "rewards/chosen": 1.8346558888753255, + "rewards/margins": 7.384031535129921, + "rewards/rejected": -5.549375646254596, + "step": 2154 + }, + { + "epoch": 0.7955332010520972, + "grad_norm": 6.1875, + "kl": 0.0, + "learning_rate": 1.0295042333939204e-06, + "logits/chosen": 164634774.5882353, + "logits/rejected": 195532083.2, + "logps/chosen": -305.7161075367647, + "logps/rejected": -394.56858723958334, + "loss": 0.0874, + "rewards/chosen": 2.872540193445542, + "rewards/margins": 8.79884075089997, + "rewards/rejected": -5.926300557454427, + "step": 2155 + }, + { + "epoch": 0.7959023579899405, + "grad_norm": 6.78125, + "kl": 2.6610116958618164, + "learning_rate": 1.0259299464568112e-06, + "logits/chosen": 272512602.35294116, + "logits/rejected": 164068761.6, + "logps/chosen": -410.26921530330884, + "logps/rejected": -430.11748046875, + "loss": 0.1117, + "rewards/chosen": 2.4175136790556064, + "rewards/margins": 9.034121345071231, + "rewards/rejected": -6.616607666015625, + "step": 2156 + }, + { + "epoch": 0.7962715149277837, + "grad_norm": 5.25, + "kl": 0.34206676483154297, + "learning_rate": 1.0223611655432713e-06, + "logits/chosen": 215682611.2, + "logits/rejected": 179464213.33333334, + "logps/chosen": -318.5669921875, + "logps/rejected": -435.4808349609375, + "loss": 0.1026, + "rewards/chosen": 2.484393501281738, + "rewards/margins": 9.402975145975748, + "rewards/rejected": -6.918581644694011, + "step": 2157 + }, + { + "epoch": 0.7966406718656269, + "grad_norm": 5.21875, + "kl": 1.176997184753418, + "learning_rate": 1.0187978955978028e-06, + "logits/chosen": 170045344.0, + "logits/rejected": 162049424.0, + "logps/chosen": -274.91192626953125, + "logps/rejected": -498.3582458496094, + "loss": 0.1127, + "rewards/chosen": 2.1121280193328857, + "rewards/margins": 9.932083368301392, + "rewards/rejected": -7.819955348968506, + "step": 2158 + }, + { + "epoch": 0.79700982880347, + "grad_norm": 5.40625, + "kl": 0.0, + "learning_rate": 1.0152401415572677e-06, + "logits/chosen": 217825792.0, + "logits/rejected": 176361944.6153846, + "logps/chosen": -303.52271792763156, + "logps/rejected": -489.4976337139423, + "loss": 0.1164, + "rewards/chosen": 2.7518143904836556, + "rewards/margins": 9.454047925076505, + "rewards/rejected": -6.702233534592849, + "step": 2159 + }, + { + "epoch": 0.7973789857413133, + "grad_norm": 4.90625, + "kl": 0.0, + "learning_rate": 1.0116879083508908e-06, + "logits/chosen": 310119837.53846157, + "logits/rejected": 186201249.68421054, + "logps/chosen": -431.73670372596155, + "logps/rejected": -487.6156969572368, + "loss": 0.0673, + "rewards/chosen": 2.715100508469802, + "rewards/margins": 9.986629856742828, + "rewards/rejected": -7.2715293482730265, + "step": 2160 + }, + { + "epoch": 0.7977481426791565, + "grad_norm": 5.3125, + "kl": 0.0, + "learning_rate": 1.0081412009002466e-06, + "logits/chosen": 144896682.66666666, + "logits/rejected": 175890492.2352941, + "logps/chosen": -331.99684244791666, + "logps/rejected": -492.5244715073529, + "loss": 0.0932, + "rewards/chosen": 2.109185282389323, + "rewards/margins": 10.340323623956419, + "rewards/rejected": -8.231138341567096, + "step": 2161 + }, + { + "epoch": 0.7981172996169997, + "grad_norm": 5.9375, + "kl": 0.0, + "learning_rate": 1.0046000241192516e-06, + "logits/chosen": 207599985.7777778, + "logits/rejected": 260462811.42857143, + "logps/chosen": -358.08485243055554, + "logps/rejected": -548.7638811383929, + "loss": 0.0641, + "rewards/chosen": 3.1563487582736545, + "rewards/margins": 12.492427947029235, + "rewards/rejected": -9.33607918875558, + "step": 2162 + }, + { + "epoch": 0.7984864565548428, + "grad_norm": 6.46875, + "kl": 0.13544464111328125, + "learning_rate": 1.0010643829141624e-06, + "logits/chosen": 310193664.0, + "logits/rejected": 188471509.33333334, + "logps/chosen": -526.9755161830357, + "logps/rejected": -370.5953776041667, + "loss": 0.0752, + "rewards/chosen": 2.365980420793806, + "rewards/margins": 8.100651271759517, + "rewards/rejected": -5.734670850965712, + "step": 2163 + }, + { + "epoch": 0.7988556134926861, + "grad_norm": 6.375, + "kl": 1.137376308441162, + "learning_rate": 9.975342821835654e-07, + "logits/chosen": 146949072.0, + "logits/rejected": 182010304.0, + "logps/chosen": -398.4840393066406, + "logps/rejected": -389.9901123046875, + "loss": 0.0956, + "rewards/chosen": 2.3752002716064453, + "rewards/margins": 8.78531789779663, + "rewards/rejected": -6.4101176261901855, + "step": 2164 + }, + { + "epoch": 0.7992247704305293, + "grad_norm": 6.28125, + "kl": 0.0, + "learning_rate": 9.94009726818369e-07, + "logits/chosen": 225815936.0, + "logits/rejected": 237508565.33333334, + "logps/chosen": -375.22499302455356, + "logps/rejected": -479.56385633680554, + "loss": 0.0878, + "rewards/chosen": 2.3598929813929965, + "rewards/margins": 10.757275929526678, + "rewards/rejected": -8.39738294813368, + "step": 2165 + }, + { + "epoch": 0.7995939273683724, + "grad_norm": 6.78125, + "kl": 0.0, + "learning_rate": 9.904907217018e-07, + "logits/chosen": 170182707.2, + "logits/rejected": 256898965.33333334, + "logps/chosen": -309.77978515625, + "logps/rejected": -540.6988118489584, + "loss": 0.102, + "rewards/chosen": 2.752933311462402, + "rewards/margins": 10.561858940124512, + "rewards/rejected": -7.808925628662109, + "step": 2166 + }, + { + "epoch": 0.7999630843062157, + "grad_norm": 5.28125, + "kl": 0.8875770568847656, + "learning_rate": 9.869772717093974e-07, + "logits/chosen": 263803943.3846154, + "logits/rejected": 210732355.36842105, + "logps/chosen": -467.55081881009613, + "logps/rejected": -505.3715049342105, + "loss": 0.0805, + "rewards/chosen": 2.3858338869535007, + "rewards/margins": 9.506305238978584, + "rewards/rejected": -7.120471352025082, + "step": 2167 + }, + { + "epoch": 0.8003322412440589, + "grad_norm": 3.625, + "kl": 0.17687225341796875, + "learning_rate": 9.834693817089996e-07, + "logits/chosen": 175922580.21052632, + "logits/rejected": 262201383.3846154, + "logps/chosen": -269.0180150082237, + "logps/rejected": -484.37642728365387, + "loss": 0.0761, + "rewards/chosen": 3.081810198332134, + "rewards/margins": 8.976809775781053, + "rewards/rejected": -5.894999577448918, + "step": 2168 + }, + { + "epoch": 0.8007013981819021, + "grad_norm": 4.90625, + "kl": 0.19923782348632812, + "learning_rate": 9.799670565607427e-07, + "logits/chosen": 242914389.33333334, + "logits/rejected": 268557522.8235294, + "logps/chosen": -477.23020833333334, + "logps/rejected": -531.6632008272059, + "loss": 0.0651, + "rewards/chosen": 2.86517333984375, + "rewards/margins": 10.22061606014476, + "rewards/rejected": -7.355442720301011, + "step": 2169 + }, + { + "epoch": 0.8010705551197452, + "grad_norm": 5.15625, + "kl": 0.0, + "learning_rate": 9.764703011170568e-07, + "logits/chosen": 149009635.55555555, + "logits/rejected": 165114660.57142857, + "logps/chosen": -319.52568901909723, + "logps/rejected": -363.6665736607143, + "loss": 0.0669, + "rewards/chosen": 2.9796379937065973, + "rewards/margins": 9.133463481115918, + "rewards/rejected": -6.1538254874093195, + "step": 2170 + }, + { + "epoch": 0.8014397120575885, + "grad_norm": 5.1875, + "kl": 1.1780805587768555, + "learning_rate": 9.729791202226484e-07, + "logits/chosen": 233114166.85714287, + "logits/rejected": 161097784.8888889, + "logps/chosen": -329.896240234375, + "logps/rejected": -363.0109592013889, + "loss": 0.0791, + "rewards/chosen": 2.8811228615897044, + "rewards/margins": 9.117989282759408, + "rewards/rejected": -6.2368664211697045, + "step": 2171 + }, + { + "epoch": 0.8018088689954317, + "grad_norm": 4.09375, + "kl": 0.0, + "learning_rate": 9.69493518714506e-07, + "logits/chosen": 247911680.0, + "logits/rejected": 134002249.14285715, + "logps/chosen": -359.08671431107956, + "logps/rejected": -473.5709635416667, + "loss": 0.0574, + "rewards/chosen": 2.2840009169145064, + "rewards/margins": 9.877293153242631, + "rewards/rejected": -7.593292236328125, + "step": 2172 + }, + { + "epoch": 0.8021780259332749, + "grad_norm": 8.125, + "kl": 1.2153935432434082, + "learning_rate": 9.66013501421888e-07, + "logits/chosen": 236182499.55555555, + "logits/rejected": 180862482.2857143, + "logps/chosen": -442.1673177083333, + "logps/rejected": -516.8133719308036, + "loss": 0.1038, + "rewards/chosen": 2.2946039835611978, + "rewards/margins": 8.82269541422526, + "rewards/rejected": -6.5280914306640625, + "step": 2173 + }, + { + "epoch": 0.802547182871118, + "grad_norm": 5.21875, + "kl": 0.0, + "learning_rate": 9.625390731663114e-07, + "logits/chosen": 159453070.2222222, + "logits/rejected": 175486738.2857143, + "logps/chosen": -226.28333875868054, + "logps/rejected": -378.8361118861607, + "loss": 0.1213, + "rewards/chosen": 2.321712917751736, + "rewards/margins": 7.958373902336, + "rewards/rejected": -5.636660984584263, + "step": 2174 + }, + { + "epoch": 0.8029163398089613, + "grad_norm": 5.8125, + "kl": 0.20135974884033203, + "learning_rate": 9.59070238761553e-07, + "logits/chosen": 227924400.0, + "logits/rejected": 220602016.0, + "logps/chosen": -323.8587951660156, + "logps/rejected": -368.94012451171875, + "loss": 0.1133, + "rewards/chosen": 2.0277891159057617, + "rewards/margins": 7.750703811645508, + "rewards/rejected": -5.722914695739746, + "step": 2175 + }, + { + "epoch": 0.8032854967468045, + "grad_norm": 5.75, + "kl": 0.0, + "learning_rate": 9.556070030136411e-07, + "logits/chosen": 156515682.46153846, + "logits/rejected": 197146044.63157895, + "logps/chosen": -463.39888822115387, + "logps/rejected": -497.19120065789474, + "loss": 0.058, + "rewards/chosen": 3.003648904653696, + "rewards/margins": 10.356440138720309, + "rewards/rejected": -7.352791234066612, + "step": 2176 + }, + { + "epoch": 0.8036546536846477, + "grad_norm": 5.25, + "kl": 0.0, + "learning_rate": 9.521493707208412e-07, + "logits/chosen": 199860540.2352941, + "logits/rejected": 172584226.13333333, + "logps/chosen": -406.9131433823529, + "logps/rejected": -485.2704752604167, + "loss": 0.071, + "rewards/chosen": 2.9892571393181298, + "rewards/margins": 9.852945798986099, + "rewards/rejected": -6.863688659667969, + "step": 2177 + }, + { + "epoch": 0.8040238106224908, + "grad_norm": 5.46875, + "kl": 0.30146074295043945, + "learning_rate": 9.48697346673661e-07, + "logits/chosen": 241824376.47058824, + "logits/rejected": 188172578.13333333, + "logps/chosen": -358.3801700367647, + "logps/rejected": -465.1123046875, + "loss": 0.0847, + "rewards/chosen": 2.0889448278090534, + "rewards/margins": 8.22268913119447, + "rewards/rejected": -6.133744303385416, + "step": 2178 + }, + { + "epoch": 0.8043929675603341, + "grad_norm": 4.75, + "kl": 0.0, + "learning_rate": 9.452509356548367e-07, + "logits/chosen": 169739123.2, + "logits/rejected": 252719936.0, + "logps/chosen": -333.311572265625, + "logps/rejected": -481.7845865885417, + "loss": 0.0823, + "rewards/chosen": 2.393213081359863, + "rewards/margins": 8.764457511901856, + "rewards/rejected": -6.371244430541992, + "step": 2179 + }, + { + "epoch": 0.8047621244981773, + "grad_norm": 3.890625, + "kl": 1.8994879722595215, + "learning_rate": 9.41810142439325e-07, + "logits/chosen": 243778400.0, + "logits/rejected": 276543072.0, + "logps/chosen": -365.5503845214844, + "logps/rejected": -517.6238403320312, + "loss": 0.0482, + "rewards/chosen": 3.073272466659546, + "rewards/margins": 10.604759931564331, + "rewards/rejected": -7.531487464904785, + "step": 2180 + }, + { + "epoch": 0.8051312814360205, + "grad_norm": 4.15625, + "kl": 0.0, + "learning_rate": 9.383749717943024e-07, + "logits/chosen": 202368472.6153846, + "logits/rejected": 192564574.31578946, + "logps/chosen": -372.0922100360577, + "logps/rejected": -467.76978824013156, + "loss": 0.0509, + "rewards/chosen": 3.1376146169809194, + "rewards/margins": 11.17711881876957, + "rewards/rejected": -8.03950420178865, + "step": 2181 + }, + { + "epoch": 0.8055004383738636, + "grad_norm": 4.9375, + "kl": 0.0, + "learning_rate": 9.349454284791549e-07, + "logits/chosen": 290255644.4444444, + "logits/rejected": 255778029.7142857, + "logps/chosen": -343.8614908854167, + "logps/rejected": -438.3056640625, + "loss": 0.0793, + "rewards/chosen": 2.620375315348307, + "rewards/margins": 8.817286718459357, + "rewards/rejected": -6.1969114031110495, + "step": 2182 + }, + { + "epoch": 0.8058695953117069, + "grad_norm": 5.5625, + "kl": 0.2715930938720703, + "learning_rate": 9.315215172454689e-07, + "logits/chosen": 164713701.0526316, + "logits/rejected": 245746274.46153846, + "logps/chosen": -280.3202611019737, + "logps/rejected": -486.15249399038464, + "loss": 0.1027, + "rewards/chosen": 2.991847188849198, + "rewards/margins": 11.575647871503945, + "rewards/rejected": -8.583800682654747, + "step": 2183 + }, + { + "epoch": 0.8062387522495501, + "grad_norm": 4.15625, + "kl": 0.0, + "learning_rate": 9.281032428370318e-07, + "logits/chosen": 177431214.54545453, + "logits/rejected": 162102796.19047618, + "logps/chosen": -342.9318181818182, + "logps/rejected": -437.24972098214283, + "loss": 0.068, + "rewards/chosen": 2.131172700361772, + "rewards/margins": 8.75805679750649, + "rewards/rejected": -6.626884097144718, + "step": 2184 + }, + { + "epoch": 0.8066079091873933, + "grad_norm": 3.515625, + "kl": 0.04087352752685547, + "learning_rate": 9.246906099898196e-07, + "logits/chosen": 207834166.85714287, + "logits/rejected": 164681884.44444445, + "logps/chosen": -438.0668247767857, + "logps/rejected": -479.6214192708333, + "loss": 0.0403, + "rewards/chosen": 3.9982264382498607, + "rewards/margins": 10.789446361481197, + "rewards/rejected": -6.791219923231337, + "step": 2185 + }, + { + "epoch": 0.8069770661252365, + "grad_norm": 5.3125, + "kl": 0.0, + "learning_rate": 9.212836234319911e-07, + "logits/chosen": 202091264.0, + "logits/rejected": 238590515.2, + "logps/chosen": -345.89352596507354, + "logps/rejected": -414.33815104166666, + "loss": 0.0568, + "rewards/chosen": 3.271116144516889, + "rewards/margins": 9.754551203110639, + "rewards/rejected": -6.48343505859375, + "step": 2186 + }, + { + "epoch": 0.8073462230630797, + "grad_norm": 4.40625, + "kl": 0.0, + "learning_rate": 9.178822878838817e-07, + "logits/chosen": 187663721.4117647, + "logits/rejected": 193641352.53333333, + "logps/chosen": -279.4383903952206, + "logps/rejected": -458.6181966145833, + "loss": 0.0596, + "rewards/chosen": 3.4237062790814567, + "rewards/margins": 10.944574707629634, + "rewards/rejected": -7.5208684285481775, + "step": 2187 + }, + { + "epoch": 0.8077153800009229, + "grad_norm": 4.9375, + "kl": 0.0, + "learning_rate": 9.144866080579995e-07, + "logits/chosen": 193178240.0, + "logits/rejected": 187421824.0, + "logps/chosen": -373.41582573784723, + "logps/rejected": -442.14278738839283, + "loss": 0.0827, + "rewards/chosen": 2.607272889879015, + "rewards/margins": 9.046957621498713, + "rewards/rejected": -6.439684731619699, + "step": 2188 + }, + { + "epoch": 0.8080845369387661, + "grad_norm": 4.71875, + "kl": 0.0, + "learning_rate": 9.110965886590167e-07, + "logits/chosen": 187361024.0, + "logits/rejected": 230814272.0, + "logps/chosen": -334.88177490234375, + "logps/rejected": -501.99761962890625, + "loss": 0.0725, + "rewards/chosen": 2.6856424808502197, + "rewards/margins": 10.619609117507935, + "rewards/rejected": -7.933966636657715, + "step": 2189 + }, + { + "epoch": 0.8084536938766093, + "grad_norm": 5.21875, + "kl": 0.0, + "learning_rate": 9.077122343837608e-07, + "logits/chosen": 211359547.07692307, + "logits/rejected": 135583434.10526314, + "logps/chosen": -458.1045673076923, + "logps/rejected": -421.3347810444079, + "loss": 0.0562, + "rewards/chosen": 3.07960451566256, + "rewards/margins": 10.431467372878842, + "rewards/rejected": -7.3518628572162825, + "step": 2190 + }, + { + "epoch": 0.8088228508144525, + "grad_norm": 3.9375, + "kl": 0.0, + "learning_rate": 9.043335499212119e-07, + "logits/chosen": 201033819.42857143, + "logits/rejected": 223354040.8888889, + "logps/chosen": -343.7724609375, + "logps/rejected": -462.50938585069446, + "loss": 0.0593, + "rewards/chosen": 2.670590809413365, + "rewards/margins": 9.674035057188973, + "rewards/rejected": -7.003444247775608, + "step": 2191 + }, + { + "epoch": 0.8091920077522957, + "grad_norm": 5.625, + "kl": 0.0, + "learning_rate": 9.009605399524957e-07, + "logits/chosen": 165882736.0, + "logits/rejected": 185606256.0, + "logps/chosen": -379.96624755859375, + "logps/rejected": -423.5087890625, + "loss": 0.0924, + "rewards/chosen": 2.8407328128814697, + "rewards/margins": 10.464741468429565, + "rewards/rejected": -7.624008655548096, + "step": 2192 + }, + { + "epoch": 0.809561164690139, + "grad_norm": 4.875, + "kl": 0.010650157928466797, + "learning_rate": 8.975932091508727e-07, + "logits/chosen": 242111984.0, + "logits/rejected": 248013104.0, + "logps/chosen": -313.30047607421875, + "logps/rejected": -449.46990966796875, + "loss": 0.0877, + "rewards/chosen": 2.9029407501220703, + "rewards/margins": 8.768155097961426, + "rewards/rejected": -5.8652143478393555, + "step": 2193 + }, + { + "epoch": 0.8099303216279821, + "grad_norm": 5.625, + "kl": 0.7027826309204102, + "learning_rate": 8.942315621817377e-07, + "logits/chosen": 170378012.44444445, + "logits/rejected": 171663323.42857143, + "logps/chosen": -310.51958550347223, + "logps/rejected": -385.20228794642856, + "loss": 0.0781, + "rewards/chosen": 2.5057830810546875, + "rewards/margins": 7.990247453962054, + "rewards/rejected": -5.484464372907366, + "step": 2194 + }, + { + "epoch": 0.8102994785658253, + "grad_norm": 5.09375, + "kl": 0.0, + "learning_rate": 8.908756037026112e-07, + "logits/chosen": 312769809.06666666, + "logits/rejected": 231125760.0, + "logps/chosen": -394.79651692708336, + "logps/rejected": -413.02062270220586, + "loss": 0.0792, + "rewards/chosen": 2.651067860921224, + "rewards/margins": 9.6112128014658, + "rewards/rejected": -6.960144940544577, + "step": 2195 + }, + { + "epoch": 0.8106686355036685, + "grad_norm": 4.46875, + "kl": 1.67889404296875, + "learning_rate": 8.875253383631288e-07, + "logits/chosen": 170744416.0, + "logits/rejected": 233283072.0, + "logps/chosen": -375.4677734375, + "logps/rejected": -490.16494140625, + "loss": 0.0626, + "rewards/chosen": 2.5313332875569663, + "rewards/margins": 8.887360127766927, + "rewards/rejected": -6.356026840209961, + "step": 2196 + }, + { + "epoch": 0.8110377924415118, + "grad_norm": 6.28125, + "kl": 0.8798799514770508, + "learning_rate": 8.841807708050415e-07, + "logits/chosen": 204676632.3809524, + "logits/rejected": 241360337.45454547, + "logps/chosen": -389.1020275297619, + "logps/rejected": -431.4798473011364, + "loss": 0.1022, + "rewards/chosen": 2.950846717471168, + "rewards/margins": 9.981872690704478, + "rewards/rejected": -7.03102597323331, + "step": 2197 + }, + { + "epoch": 0.8114069493793549, + "grad_norm": 4.34375, + "kl": 0.0, + "learning_rate": 8.808419056622064e-07, + "logits/chosen": 281808605.8666667, + "logits/rejected": 225169874.82352942, + "logps/chosen": -363.0587565104167, + "logps/rejected": -398.1053825827206, + "loss": 0.0625, + "rewards/chosen": 3.0761065165201824, + "rewards/margins": 8.24597319060681, + "rewards/rejected": -5.169866674086627, + "step": 2198 + }, + { + "epoch": 0.8117761063171981, + "grad_norm": 6.125, + "kl": 0.8667831420898438, + "learning_rate": 8.775087475605765e-07, + "logits/chosen": 190331932.44444445, + "logits/rejected": 133283638.85714285, + "logps/chosen": -355.30693901909723, + "logps/rejected": -428.47523716517856, + "loss": 0.0852, + "rewards/chosen": 3.6121804979112415, + "rewards/margins": 10.342901017930773, + "rewards/rejected": -6.730720520019531, + "step": 2199 + }, + { + "epoch": 0.8121452632550413, + "grad_norm": 5.65625, + "kl": 0.6688394546508789, + "learning_rate": 8.741813011182015e-07, + "logits/chosen": 221019672.3809524, + "logits/rejected": 219333585.45454547, + "logps/chosen": -380.7613002232143, + "logps/rejected": -307.9525035511364, + "loss": 0.1001, + "rewards/chosen": 2.360990978422619, + "rewards/margins": 7.491879830628763, + "rewards/rejected": -5.1308888522061435, + "step": 2200 + }, + { + "epoch": 0.8125144201928844, + "grad_norm": 6.625, + "kl": 0.0, + "learning_rate": 8.708595709452166e-07, + "logits/chosen": 245897318.4, + "logits/rejected": 268917205.3333333, + "logps/chosen": -399.46826171875, + "logps/rejected": -578.4976806640625, + "loss": 0.0824, + "rewards/chosen": 2.7246400833129885, + "rewards/margins": 9.418894640604655, + "rewards/rejected": -6.694254557291667, + "step": 2201 + }, + { + "epoch": 0.8128835771307277, + "grad_norm": 3.65625, + "kl": 0.0, + "learning_rate": 8.675435616438349e-07, + "logits/chosen": 330824546.46153843, + "logits/rejected": 214331850.10526314, + "logps/chosen": -351.50428185096155, + "logps/rejected": -387.87037417763156, + "loss": 0.0716, + "rewards/chosen": 2.4225858541635366, + "rewards/margins": 8.706982778634137, + "rewards/rejected": -6.284396924470601, + "step": 2202 + }, + { + "epoch": 0.8132527340685709, + "grad_norm": 3.71875, + "kl": 0.0, + "learning_rate": 8.642332778083473e-07, + "logits/chosen": 143786513.06666666, + "logits/rejected": 176044950.5882353, + "logps/chosen": -309.10699869791665, + "logps/rejected": -468.3780158547794, + "loss": 0.0508, + "rewards/chosen": 3.4208658854166667, + "rewards/margins": 9.941563983992035, + "rewards/rejected": -6.520698098575368, + "step": 2203 + }, + { + "epoch": 0.8136218910064141, + "grad_norm": 4.71875, + "kl": 1.247091293334961, + "learning_rate": 8.60928724025108e-07, + "logits/chosen": 200896924.44444445, + "logits/rejected": 298920047.3043478, + "logps/chosen": -374.444091796875, + "logps/rejected": -447.3761039402174, + "loss": 0.0584, + "rewards/chosen": 3.5426038106282554, + "rewards/margins": 9.70450885053994, + "rewards/rejected": -6.161905039911685, + "step": 2204 + }, + { + "epoch": 0.8139910479442573, + "grad_norm": 5.125, + "kl": 0.5767602920532227, + "learning_rate": 8.576299048725362e-07, + "logits/chosen": 266885927.3846154, + "logits/rejected": 130410509.4736842, + "logps/chosen": -426.0279071514423, + "logps/rejected": -350.9915707236842, + "loss": 0.0713, + "rewards/chosen": 3.211008512056791, + "rewards/margins": 8.592298345527185, + "rewards/rejected": -5.381289833470395, + "step": 2205 + }, + { + "epoch": 0.8143602048821005, + "grad_norm": 6.28125, + "kl": 0.0, + "learning_rate": 8.543368249211015e-07, + "logits/chosen": 237883440.76190478, + "logits/rejected": 158573102.54545453, + "logps/chosen": -320.31508091517856, + "logps/rejected": -341.6748712713068, + "loss": 0.1007, + "rewards/chosen": 2.7685572306315103, + "rewards/margins": 8.789084463408498, + "rewards/rejected": -6.020527232776988, + "step": 2206 + }, + { + "epoch": 0.8147293618199437, + "grad_norm": 3.859375, + "kl": 0.0, + "learning_rate": 8.510494887333276e-07, + "logits/chosen": 155229149.86666667, + "logits/rejected": 175698582.5882353, + "logps/chosen": -331.2802734375, + "logps/rejected": -393.2592198988971, + "loss": 0.0699, + "rewards/chosen": 2.935907491048177, + "rewards/margins": 9.505490710688573, + "rewards/rejected": -6.569583219640395, + "step": 2207 + }, + { + "epoch": 0.8150985187577869, + "grad_norm": 7.84375, + "kl": 0.39079904556274414, + "learning_rate": 8.477679008637735e-07, + "logits/chosen": 211662364.44444445, + "logits/rejected": 141590043.42857143, + "logps/chosen": -445.9649251302083, + "logps/rejected": -319.2057407924107, + "loss": 0.0625, + "rewards/chosen": 2.842575920952691, + "rewards/margins": 9.434485602000404, + "rewards/rejected": -6.591909681047712, + "step": 2208 + }, + { + "epoch": 0.8154676756956301, + "grad_norm": 2.09375, + "kl": 0.0, + "learning_rate": 8.444920658590388e-07, + "logits/chosen": 144206148.92307693, + "logits/rejected": 187858256.84210527, + "logps/chosen": -317.9050105168269, + "logps/rejected": -355.2819181743421, + "loss": 0.025, + "rewards/chosen": 4.107722649207482, + "rewards/margins": 10.850357472655261, + "rewards/rejected": -6.74263482344778, + "step": 2209 + }, + { + "epoch": 0.8158368326334733, + "grad_norm": 4.4375, + "kl": 0.0, + "learning_rate": 8.412219882577538e-07, + "logits/chosen": 163318235.42857143, + "logits/rejected": 183601479.1111111, + "logps/chosen": -329.74546595982144, + "logps/rejected": -307.93120659722223, + "loss": 0.0564, + "rewards/chosen": 3.551511492047991, + "rewards/margins": 9.451429942297557, + "rewards/rejected": -5.899918450249566, + "step": 2210 + }, + { + "epoch": 0.8162059895713165, + "grad_norm": 5.84375, + "kl": 0.0, + "learning_rate": 8.379576725905653e-07, + "logits/chosen": 236951902.31578946, + "logits/rejected": 181972992.0, + "logps/chosen": -368.94510690789474, + "logps/rejected": -434.2141676682692, + "loss": 0.1094, + "rewards/chosen": 2.3481889022024056, + "rewards/margins": 7.935600049099942, + "rewards/rejected": -5.587411146897536, + "step": 2211 + }, + { + "epoch": 0.8165751465091597, + "grad_norm": 5.28125, + "kl": 0.14129114151000977, + "learning_rate": 8.346991233801438e-07, + "logits/chosen": 241005537.88235295, + "logits/rejected": 197837568.0, + "logps/chosen": -347.31281594669116, + "logps/rejected": -381.2875651041667, + "loss": 0.092, + "rewards/chosen": 2.868298923268038, + "rewards/margins": 8.371272352630017, + "rewards/rejected": -5.502973429361979, + "step": 2212 + }, + { + "epoch": 0.8169443034470029, + "grad_norm": 4.40625, + "kl": 0.17800331115722656, + "learning_rate": 8.314463451411681e-07, + "logits/chosen": 209266240.0, + "logits/rejected": 202656192.0, + "logps/chosen": -396.7255554199219, + "logps/rejected": -352.6144104003906, + "loss": 0.0708, + "rewards/chosen": 2.740125894546509, + "rewards/margins": 8.096072435379028, + "rewards/rejected": -5.3559465408325195, + "step": 2213 + }, + { + "epoch": 0.8173134603848461, + "grad_norm": 3.59375, + "kl": 0.0, + "learning_rate": 8.281993423803192e-07, + "logits/chosen": 158781661.86666667, + "logits/rejected": 145371151.05882353, + "logps/chosen": -316.6557942708333, + "logps/rejected": -334.35475068933823, + "loss": 0.0607, + "rewards/chosen": 3.8279083251953123, + "rewards/margins": 9.838708406336167, + "rewards/rejected": -6.010800081140855, + "step": 2214 + }, + { + "epoch": 0.8176826173226893, + "grad_norm": 8.25, + "kl": 2.107386589050293, + "learning_rate": 8.249581195962792e-07, + "logits/chosen": 163712614.4, + "logits/rejected": 165953258.66666666, + "logps/chosen": -413.102099609375, + "logps/rejected": -447.963134765625, + "loss": 0.0961, + "rewards/chosen": 2.8659101486206056, + "rewards/margins": 9.37831808725993, + "rewards/rejected": -6.512407938639323, + "step": 2215 + }, + { + "epoch": 0.8180517742605325, + "grad_norm": 4.46875, + "kl": 0.014876365661621094, + "learning_rate": 8.217226812797225e-07, + "logits/chosen": 235655133.86666667, + "logits/rejected": 281952105.4117647, + "logps/chosen": -404.9876953125, + "logps/rejected": -459.8113798253676, + "loss": 0.0592, + "rewards/chosen": 2.8950121561686197, + "rewards/margins": 9.089436953675513, + "rewards/rejected": -6.194424797506893, + "step": 2216 + }, + { + "epoch": 0.8184209311983757, + "grad_norm": 5.125, + "kl": 0.0, + "learning_rate": 8.18493031913305e-07, + "logits/chosen": 274369365.3333333, + "logits/rejected": 316673792.0, + "logps/chosen": -374.6178792317708, + "logps/rejected": -453.41005859375, + "loss": 0.0617, + "rewards/chosen": 2.484605153401693, + "rewards/margins": 8.241373952229818, + "rewards/rejected": -5.756768798828125, + "step": 2217 + }, + { + "epoch": 0.8187900881362189, + "grad_norm": 6.9375, + "kl": 0.0, + "learning_rate": 8.152691759716657e-07, + "logits/chosen": 220492373.33333334, + "logits/rejected": 156734886.4, + "logps/chosen": -450.4256184895833, + "logps/rejected": -436.96083984375, + "loss": 0.0755, + "rewards/chosen": 2.5489797592163086, + "rewards/margins": 9.433021354675294, + "rewards/rejected": -6.884041595458984, + "step": 2218 + }, + { + "epoch": 0.8191592450740621, + "grad_norm": 6.5625, + "kl": 0.0, + "learning_rate": 8.120511179214186e-07, + "logits/chosen": 193255936.0, + "logits/rejected": 292889837.71428573, + "logps/chosen": -329.61176215277777, + "logps/rejected": -518.2361537388393, + "loss": 0.1255, + "rewards/chosen": 1.7329427931043837, + "rewards/margins": 9.73669700017051, + "rewards/rejected": -8.003754207066127, + "step": 2219 + }, + { + "epoch": 0.8195284020119054, + "grad_norm": 6.375, + "kl": 0.0, + "learning_rate": 8.088388622211401e-07, + "logits/chosen": 285535901.53846157, + "logits/rejected": 196511124.21052632, + "logps/chosen": -411.7781325120192, + "logps/rejected": -320.57737972861844, + "loss": 0.0871, + "rewards/chosen": 2.4586796393761268, + "rewards/margins": 8.616635411374482, + "rewards/rejected": -6.157955771998355, + "step": 2220 + }, + { + "epoch": 0.8198975589497485, + "grad_norm": 4.78125, + "kl": 0.0, + "learning_rate": 8.056324133213689e-07, + "logits/chosen": 247226666.66666666, + "logits/rejected": 189889651.2, + "logps/chosen": -329.8964029947917, + "logps/rejected": -477.11337890625, + "loss": 0.0665, + "rewards/chosen": 2.66107447942098, + "rewards/margins": 9.893147691090903, + "rewards/rejected": -7.232073211669922, + "step": 2221 + }, + { + "epoch": 0.8202667158875917, + "grad_norm": 4.90625, + "kl": 0.649078369140625, + "learning_rate": 8.024317756645999e-07, + "logits/chosen": 266706448.0, + "logits/rejected": 224355424.0, + "logps/chosen": -398.36083984375, + "logps/rejected": -523.9309692382812, + "loss": 0.0746, + "rewards/chosen": 3.1200995445251465, + "rewards/margins": 9.732861518859863, + "rewards/rejected": -6.612761974334717, + "step": 2222 + }, + { + "epoch": 0.8206358728254349, + "grad_norm": 7.0, + "kl": 0.0, + "learning_rate": 7.992369536852773e-07, + "logits/chosen": 184376391.1111111, + "logits/rejected": 203821714.2857143, + "logps/chosen": -364.7522243923611, + "logps/rejected": -373.63232421875, + "loss": 0.104, + "rewards/chosen": 2.3224800957573786, + "rewards/margins": 8.595330677335225, + "rewards/rejected": -6.272850581577846, + "step": 2223 + }, + { + "epoch": 0.8210050297632782, + "grad_norm": 6.84375, + "kl": 1.6794748306274414, + "learning_rate": 7.960479518097841e-07, + "logits/chosen": 209955328.0, + "logits/rejected": 295154218.6666667, + "logps/chosen": -449.728662109375, + "logps/rejected": -561.7718912760416, + "loss": 0.0804, + "rewards/chosen": 3.094488525390625, + "rewards/margins": 10.126372400919596, + "rewards/rejected": -7.031883875528972, + "step": 2224 + }, + { + "epoch": 0.8213741867011213, + "grad_norm": 6.9375, + "kl": 0.0, + "learning_rate": 7.928647744564427e-07, + "logits/chosen": 295235968.0, + "logits/rejected": 136098752.0, + "logps/chosen": -403.0686767578125, + "logps/rejected": -429.7810465494792, + "loss": 0.1301, + "rewards/chosen": 1.9315574645996094, + "rewards/margins": 8.126310475667317, + "rewards/rejected": -6.194753011067708, + "step": 2225 + }, + { + "epoch": 0.8217433436389645, + "grad_norm": 5.8125, + "kl": 0.0, + "learning_rate": 7.896874260355064e-07, + "logits/chosen": 174462192.94117647, + "logits/rejected": 275144772.26666665, + "logps/chosen": -335.0710018382353, + "logps/rejected": -548.9227864583333, + "loss": 0.0894, + "rewards/chosen": 2.257269691018497, + "rewards/margins": 10.899762366799747, + "rewards/rejected": -8.64249267578125, + "step": 2226 + }, + { + "epoch": 0.8221125005768077, + "grad_norm": 5.25, + "kl": 0.0, + "learning_rate": 7.865159109491488e-07, + "logits/chosen": 217404256.0, + "logits/rejected": 263200144.0, + "logps/chosen": -251.48944091796875, + "logps/rejected": -437.0841979980469, + "loss": 0.0965, + "rewards/chosen": 2.8730735778808594, + "rewards/margins": 9.684673309326172, + "rewards/rejected": -6.8115997314453125, + "step": 2227 + }, + { + "epoch": 0.822481657514651, + "grad_norm": 6.65625, + "kl": 0.0, + "learning_rate": 7.83350233591465e-07, + "logits/chosen": 232918823.3846154, + "logits/rejected": 215653173.89473686, + "logps/chosen": -314.3251201923077, + "logps/rejected": -431.4075863486842, + "loss": 0.101, + "rewards/chosen": 2.5053787231445312, + "rewards/margins": 8.624479996530633, + "rewards/rejected": -6.119101273386102, + "step": 2228 + }, + { + "epoch": 0.8228508144524941, + "grad_norm": 4.9375, + "kl": 0.005753993988037109, + "learning_rate": 7.801903983484616e-07, + "logits/chosen": 210433877.33333334, + "logits/rejected": 199405296.94117647, + "logps/chosen": -233.90144856770834, + "logps/rejected": -426.55109719669116, + "loss": 0.0709, + "rewards/chosen": 2.4105372111002605, + "rewards/margins": 8.987460805855545, + "rewards/rejected": -6.576923594755285, + "step": 2229 + }, + { + "epoch": 0.8232199713903373, + "grad_norm": 3.6875, + "kl": 0.0, + "learning_rate": 7.770364095980481e-07, + "logits/chosen": 215125992.72727272, + "logits/rejected": 219702515.80952382, + "logps/chosen": -403.53151633522725, + "logps/rejected": -542.9456845238095, + "loss": 0.0488, + "rewards/chosen": 2.928021864457564, + "rewards/margins": 10.386201156682267, + "rewards/rejected": -7.4581792922247026, + "step": 2230 + }, + { + "epoch": 0.8235891283281805, + "grad_norm": 4.75, + "kl": 0.6740274429321289, + "learning_rate": 7.738882717100365e-07, + "logits/chosen": 317814874.35294116, + "logits/rejected": 175065395.2, + "logps/chosen": -366.49885110294116, + "logps/rejected": -418.7819010416667, + "loss": 0.0851, + "rewards/chosen": 2.6468169268439796, + "rewards/margins": 9.54109437231924, + "rewards/rejected": -6.89427744547526, + "step": 2231 + }, + { + "epoch": 0.8239582852660238, + "grad_norm": 6.3125, + "kl": 2.2548866271972656, + "learning_rate": 7.707459890461338e-07, + "logits/chosen": 225721258.66666666, + "logits/rejected": 187503104.0, + "logps/chosen": -399.55775282118054, + "logps/rejected": -489.9930943080357, + "loss": 0.1375, + "rewards/chosen": 2.3573981391059027, + "rewards/margins": 9.75863998655289, + "rewards/rejected": -7.401241847446987, + "step": 2232 + }, + { + "epoch": 0.8243274422038669, + "grad_norm": 3.96875, + "kl": 0.0, + "learning_rate": 7.676095659599298e-07, + "logits/chosen": 169454260.70588234, + "logits/rejected": 174238242.13333333, + "logps/chosen": -353.8731043198529, + "logps/rejected": -446.1015625, + "loss": 0.0911, + "rewards/chosen": 2.502554500804228, + "rewards/margins": 8.802190832998239, + "rewards/rejected": -6.2996363321940105, + "step": 2233 + }, + { + "epoch": 0.8246965991417101, + "grad_norm": 4.625, + "kl": 0.0, + "learning_rate": 7.644790067969005e-07, + "logits/chosen": 327592192.0, + "logits/rejected": 194502180.57142857, + "logps/chosen": -374.01669034090907, + "logps/rejected": -361.44747488839283, + "loss": 0.0577, + "rewards/chosen": 2.427405270663175, + "rewards/margins": 8.141939485227907, + "rewards/rejected": -5.714534214564732, + "step": 2234 + }, + { + "epoch": 0.8250657560795533, + "grad_norm": 6.21875, + "kl": 0.0, + "learning_rate": 7.613543158943965e-07, + "logits/chosen": 214001170.2857143, + "logits/rejected": 178139079.1111111, + "logps/chosen": -348.53658621651783, + "logps/rejected": -452.89683702256946, + "loss": 0.1046, + "rewards/chosen": 1.728123392377581, + "rewards/margins": 8.18544498322502, + "rewards/rejected": -6.457321590847439, + "step": 2235 + }, + { + "epoch": 0.8254349130173965, + "grad_norm": 3.875, + "kl": 0.0, + "learning_rate": 7.582354975816348e-07, + "logits/chosen": 251203328.0, + "logits/rejected": 239717862.4, + "logps/chosen": -422.1104736328125, + "logps/rejected": -526.373095703125, + "loss": 0.0329, + "rewards/chosen": 3.227815310160319, + "rewards/margins": 11.618314425150553, + "rewards/rejected": -8.390499114990234, + "step": 2236 + }, + { + "epoch": 0.8258040699552397, + "grad_norm": 6.1875, + "kl": 0.3584728240966797, + "learning_rate": 7.551225561797021e-07, + "logits/chosen": 212725517.47368422, + "logits/rejected": 173063955.69230768, + "logps/chosen": -357.6249486019737, + "logps/rejected": -475.2722731370192, + "loss": 0.0963, + "rewards/chosen": 3.481387088173314, + "rewards/margins": 9.789336447773675, + "rewards/rejected": -6.307949359600361, + "step": 2237 + }, + { + "epoch": 0.8261732268930829, + "grad_norm": 6.09375, + "kl": 0.5473766326904297, + "learning_rate": 7.520154960015352e-07, + "logits/chosen": 209464285.86666667, + "logits/rejected": 154916276.70588234, + "logps/chosen": -393.2007161458333, + "logps/rejected": -445.02605124080884, + "loss": 0.0715, + "rewards/chosen": 2.9914843241373696, + "rewards/margins": 9.531929090911266, + "rewards/rejected": -6.540444766773897, + "step": 2238 + }, + { + "epoch": 0.8265423838309262, + "grad_norm": 5.34375, + "kl": 0.0, + "learning_rate": 7.489143213519301e-07, + "logits/chosen": 233216091.42857143, + "logits/rejected": 135883776.0, + "logps/chosen": -379.1404506138393, + "logps/rejected": -353.70429144965277, + "loss": 0.0756, + "rewards/chosen": 2.1461579459054128, + "rewards/margins": 8.19704600742885, + "rewards/rejected": -6.0508880615234375, + "step": 2239 + }, + { + "epoch": 0.8269115407687693, + "grad_norm": 4.53125, + "kl": 0.8360004425048828, + "learning_rate": 7.45819036527522e-07, + "logits/chosen": 217350222.76923078, + "logits/rejected": 173843280.84210527, + "logps/chosen": -372.7619441105769, + "logps/rejected": -320.2835115131579, + "loss": 0.0618, + "rewards/chosen": 3.64370111318735, + "rewards/margins": 9.20837107361087, + "rewards/rejected": -5.56466996042352, + "step": 2240 + }, + { + "epoch": 0.8272806977066125, + "grad_norm": 4.53125, + "kl": 0.0, + "learning_rate": 7.427296458167898e-07, + "logits/chosen": 351168621.71428573, + "logits/rejected": 273206784.0, + "logps/chosen": -367.8233119419643, + "logps/rejected": -385.9309895833333, + "loss": 0.0777, + "rewards/chosen": 1.8721674510410853, + "rewards/margins": 8.63525440579369, + "rewards/rejected": -6.7630869547526045, + "step": 2241 + }, + { + "epoch": 0.8276498546444557, + "grad_norm": 4.0625, + "kl": 0.0, + "learning_rate": 7.396461535000471e-07, + "logits/chosen": 163144721.06666666, + "logits/rejected": 171321313.88235295, + "logps/chosen": -321.39410807291665, + "logps/rejected": -486.03883272058823, + "loss": 0.0745, + "rewards/chosen": 2.419383748372396, + "rewards/margins": 10.602649883195465, + "rewards/rejected": -8.18326613482307, + "step": 2242 + }, + { + "epoch": 0.828019011582299, + "grad_norm": 5.71875, + "kl": 0.9658050537109375, + "learning_rate": 7.365685638494297e-07, + "logits/chosen": 212542366.47619048, + "logits/rejected": 228823063.27272728, + "logps/chosen": -367.13648623511904, + "logps/rejected": -495.24032315340907, + "loss": 0.0964, + "rewards/chosen": 2.843150184268043, + "rewards/margins": 8.68159428716222, + "rewards/rejected": -5.838444102894176, + "step": 2243 + }, + { + "epoch": 0.8283881685201421, + "grad_norm": 3.8125, + "kl": 0.0, + "learning_rate": 7.334968811289006e-07, + "logits/chosen": 195562093.7142857, + "logits/rejected": 237440042.66666666, + "logps/chosen": -319.07589285714283, + "logps/rejected": -472.7428385416667, + "loss": 0.0529, + "rewards/chosen": 2.7191805158342635, + "rewards/margins": 10.142622932555184, + "rewards/rejected": -7.4234424167209205, + "step": 2244 + }, + { + "epoch": 0.8287573254579853, + "grad_norm": 6.03125, + "kl": 0.3974437713623047, + "learning_rate": 7.30431109594239e-07, + "logits/chosen": 208642446.2222222, + "logits/rejected": 136030336.0, + "logps/chosen": -331.45917426215277, + "logps/rejected": -405.36893136160717, + "loss": 0.1179, + "rewards/chosen": 2.462312486436632, + "rewards/margins": 10.342664506700304, + "rewards/rejected": -7.880352020263672, + "step": 2245 + }, + { + "epoch": 0.8291264823958285, + "grad_norm": 4.03125, + "kl": 0.0, + "learning_rate": 7.273712534930294e-07, + "logits/chosen": 221024995.55555555, + "logits/rejected": 204497792.0, + "logps/chosen": -304.17730034722223, + "logps/rejected": -539.2054268973214, + "loss": 0.0817, + "rewards/chosen": 2.8108963436550565, + "rewards/margins": 9.733146273900593, + "rewards/rejected": -6.922249930245536, + "step": 2246 + }, + { + "epoch": 0.8294956393336718, + "grad_norm": 5.1875, + "kl": 0.0, + "learning_rate": 7.243173170646644e-07, + "logits/chosen": 251812369.06666666, + "logits/rejected": 157892156.2352941, + "logps/chosen": -291.46350911458336, + "logps/rejected": -492.57042738970586, + "loss": 0.0942, + "rewards/chosen": 2.2820119222005206, + "rewards/margins": 9.210741708792892, + "rewards/rejected": -6.928729786592371, + "step": 2247 + }, + { + "epoch": 0.8298647962715149, + "grad_norm": 5.0, + "kl": 3.307943344116211, + "learning_rate": 7.212693045403363e-07, + "logits/chosen": 221326066.52631578, + "logits/rejected": 190167886.76923078, + "logps/chosen": -376.07308799342104, + "logps/rejected": -423.27674278846155, + "loss": 0.0682, + "rewards/chosen": 3.1389326798288444, + "rewards/margins": 9.818640720506428, + "rewards/rejected": -6.679708040677584, + "step": 2248 + }, + { + "epoch": 0.8302339532093581, + "grad_norm": 5.5625, + "kl": 0.0, + "learning_rate": 7.182272201430246e-07, + "logits/chosen": 218899931.42857143, + "logits/rejected": 196158862.2222222, + "logps/chosen": -377.12716238839283, + "logps/rejected": -370.16834852430554, + "loss": 0.0742, + "rewards/chosen": 2.502903802054269, + "rewards/margins": 7.75618528941321, + "rewards/rejected": -5.253281487358941, + "step": 2249 + }, + { + "epoch": 0.8306031101472013, + "grad_norm": 4.375, + "kl": 0.0, + "learning_rate": 7.151910680875001e-07, + "logits/chosen": 221701785.6, + "logits/rejected": 227692182.5882353, + "logps/chosen": -343.920703125, + "logps/rejected": -522.8283547794117, + "loss": 0.0714, + "rewards/chosen": 3.3875249226888022, + "rewards/margins": 10.778778225767846, + "rewards/rejected": -7.391253303079044, + "step": 2250 + }, + { + "epoch": 0.8309722670850446, + "grad_norm": 4.5, + "kl": 0.5036249160766602, + "learning_rate": 7.121608525803142e-07, + "logits/chosen": 181034352.0, + "logits/rejected": 205658352.0, + "logps/chosen": -343.06536865234375, + "logps/rejected": -452.3147277832031, + "loss": 0.0668, + "rewards/chosen": 2.8729145526885986, + "rewards/margins": 9.754152059555054, + "rewards/rejected": -6.881237506866455, + "step": 2251 + }, + { + "epoch": 0.8313414240228877, + "grad_norm": 7.40625, + "kl": 0.005719184875488281, + "learning_rate": 7.091365778197895e-07, + "logits/chosen": 192222870.5882353, + "logits/rejected": 226175914.66666666, + "logps/chosen": -398.94091796875, + "logps/rejected": -486.07952473958335, + "loss": 0.1168, + "rewards/chosen": 1.9473603192497702, + "rewards/margins": 8.88280953799977, + "rewards/rejected": -6.93544921875, + "step": 2252 + }, + { + "epoch": 0.8317105809607309, + "grad_norm": 3.90625, + "kl": 2.6213412284851074, + "learning_rate": 7.061182479960221e-07, + "logits/chosen": 213627549.53846154, + "logits/rejected": 150584266.10526314, + "logps/chosen": -412.06971153846155, + "logps/rejected": -339.7454255756579, + "loss": 0.0441, + "rewards/chosen": 3.9640174278846154, + "rewards/margins": 10.731157063472608, + "rewards/rejected": -6.767139635587993, + "step": 2253 + }, + { + "epoch": 0.8320797378985741, + "grad_norm": 5.53125, + "kl": 0.0, + "learning_rate": 7.031058672908692e-07, + "logits/chosen": 241062480.84210527, + "logits/rejected": 277575030.15384614, + "logps/chosen": -275.24550267269734, + "logps/rejected": -479.7761042668269, + "loss": 0.1121, + "rewards/chosen": 2.8983863027472245, + "rewards/margins": 9.761067764961767, + "rewards/rejected": -6.862681462214543, + "step": 2254 + }, + { + "epoch": 0.8324488948364174, + "grad_norm": 5.4375, + "kl": 0.0, + "learning_rate": 7.000994398779443e-07, + "logits/chosen": 246523857.45454547, + "logits/rejected": 203742073.9047619, + "logps/chosen": -434.10395951704544, + "logps/rejected": -440.5559895833333, + "loss": 0.0629, + "rewards/chosen": 2.2391081723299893, + "rewards/margins": 8.309725121502236, + "rewards/rejected": -6.070616949172247, + "step": 2255 + }, + { + "epoch": 0.8328180517742605, + "grad_norm": 6.4375, + "kl": 2.2107553482055664, + "learning_rate": 6.970989699226161e-07, + "logits/chosen": 256846412.8, + "logits/rejected": 150090389.33333334, + "logps/chosen": -371.128125, + "logps/rejected": -365.6356608072917, + "loss": 0.1282, + "rewards/chosen": 2.4641345977783202, + "rewards/margins": 7.811912918090821, + "rewards/rejected": -5.3477783203125, + "step": 2256 + }, + { + "epoch": 0.8331872087121037, + "grad_norm": 4.5, + "kl": 0.0, + "learning_rate": 6.941044615819981e-07, + "logits/chosen": 240152771.7647059, + "logits/rejected": 219632708.26666668, + "logps/chosen": -368.7770565257353, + "logps/rejected": -400.04192708333335, + "loss": 0.0729, + "rewards/chosen": 2.6082092733944164, + "rewards/margins": 9.04689691580978, + "rewards/rejected": -6.438687642415364, + "step": 2257 + }, + { + "epoch": 0.833556365649947, + "grad_norm": 6.09375, + "kl": 0.0, + "learning_rate": 6.911159190049416e-07, + "logits/chosen": 225465193.4117647, + "logits/rejected": 164741427.2, + "logps/chosen": -459.01935891544116, + "logps/rejected": -413.5498372395833, + "loss": 0.0772, + "rewards/chosen": 2.446490119485294, + "rewards/margins": 8.5816478953642, + "rewards/rejected": -6.135157775878906, + "step": 2258 + }, + { + "epoch": 0.8339255225877902, + "grad_norm": 4.25, + "kl": 0.0, + "learning_rate": 6.881333463320355e-07, + "logits/chosen": 203692800.0, + "logits/rejected": 233878155.63636363, + "logps/chosen": -360.56129092261904, + "logps/rejected": -342.75679154829544, + "loss": 0.0613, + "rewards/chosen": 3.4928516206287203, + "rewards/margins": 8.829719213180212, + "rewards/rejected": -5.336867592551491, + "step": 2259 + }, + { + "epoch": 0.8342946795256333, + "grad_norm": 5.125, + "kl": 0.0, + "learning_rate": 6.851567476955978e-07, + "logits/chosen": 194247568.0, + "logits/rejected": 184613520.0, + "logps/chosen": -332.3338928222656, + "logps/rejected": -487.324462890625, + "loss": 0.0918, + "rewards/chosen": 2.104646921157837, + "rewards/margins": 9.480818033218384, + "rewards/rejected": -7.376171112060547, + "step": 2260 + }, + { + "epoch": 0.8346638364634765, + "grad_norm": 4.625, + "kl": 0.9186468124389648, + "learning_rate": 6.821861272196651e-07, + "logits/chosen": 192900271.15789473, + "logits/rejected": 182712871.3846154, + "logps/chosen": -291.35860402960526, + "logps/rejected": -378.43900240384613, + "loss": 0.0832, + "rewards/chosen": 2.7806573165090462, + "rewards/margins": 10.14661061522449, + "rewards/rejected": -7.365953298715445, + "step": 2261 + }, + { + "epoch": 0.8350329934013198, + "grad_norm": 6.5, + "kl": 0.7193470001220703, + "learning_rate": 6.79221489019996e-07, + "logits/chosen": 195234667.78947368, + "logits/rejected": 182913280.0, + "logps/chosen": -447.6363589638158, + "logps/rejected": -429.84923377403845, + "loss": 0.071, + "rewards/chosen": 2.727814925344367, + "rewards/margins": 9.516885054738898, + "rewards/rejected": -6.789070129394531, + "step": 2262 + }, + { + "epoch": 0.835402150339163, + "grad_norm": 2.578125, + "kl": 0.0, + "learning_rate": 6.762628372040603e-07, + "logits/chosen": 136522524.44444445, + "logits/rejected": 149916772.57142857, + "logps/chosen": -234.76567925347223, + "logps/rejected": -407.42916434151783, + "loss": 0.0644, + "rewards/chosen": 3.277847078111437, + "rewards/margins": 9.073391293722487, + "rewards/rejected": -5.7955442156110495, + "step": 2263 + }, + { + "epoch": 0.8357713072770061, + "grad_norm": 4.96875, + "kl": 0.0, + "learning_rate": 6.733101758710297e-07, + "logits/chosen": 266062576.0, + "logits/rejected": 150758320.0, + "logps/chosen": -308.7560119628906, + "logps/rejected": -369.24688720703125, + "loss": 0.0735, + "rewards/chosen": 2.656834840774536, + "rewards/margins": 9.75713038444519, + "rewards/rejected": -7.100295543670654, + "step": 2264 + }, + { + "epoch": 0.8361404642148493, + "grad_norm": 3.671875, + "kl": 0.0, + "learning_rate": 6.703635091117804e-07, + "logits/chosen": 195652187.42857143, + "logits/rejected": 211545287.1111111, + "logps/chosen": -322.88755580357144, + "logps/rejected": -404.5279947916667, + "loss": 0.0521, + "rewards/chosen": 3.1713665553501675, + "rewards/margins": 9.924985007634238, + "rewards/rejected": -6.753618452284071, + "step": 2265 + }, + { + "epoch": 0.8365096211526926, + "grad_norm": 5.625, + "kl": 4.494109630584717, + "learning_rate": 6.674228410088828e-07, + "logits/chosen": 151067294.47619048, + "logits/rejected": 115198731.63636364, + "logps/chosen": -303.1924293154762, + "logps/rejected": -295.12790749289775, + "loss": 0.1051, + "rewards/chosen": 3.6255580357142856, + "rewards/margins": 9.608162322601714, + "rewards/rejected": -5.982604286887429, + "step": 2266 + }, + { + "epoch": 0.8368787780905358, + "grad_norm": 5.09375, + "kl": 0.20731210708618164, + "learning_rate": 6.644881756365934e-07, + "logits/chosen": 253145457.7777778, + "logits/rejected": 205598244.57142857, + "logps/chosen": -404.1185709635417, + "logps/rejected": -446.6300571986607, + "loss": 0.0787, + "rewards/chosen": 2.574159410264757, + "rewards/margins": 9.323955596439422, + "rewards/rejected": -6.749796186174665, + "step": 2267 + }, + { + "epoch": 0.8372479350283789, + "grad_norm": 7.34375, + "kl": 0.0, + "learning_rate": 6.615595170608541e-07, + "logits/chosen": 188364445.53846154, + "logits/rejected": 233505280.0, + "logps/chosen": -330.1096003605769, + "logps/rejected": -502.0713404605263, + "loss": 0.1168, + "rewards/chosen": 1.810858653141902, + "rewards/margins": 8.429792882942477, + "rewards/rejected": -6.618934229800575, + "step": 2268 + }, + { + "epoch": 0.8376170919662221, + "grad_norm": 4.96875, + "kl": 0.11595296859741211, + "learning_rate": 6.586368693392859e-07, + "logits/chosen": 170131410.82352942, + "logits/rejected": 220616772.26666668, + "logps/chosen": -321.26809512867646, + "logps/rejected": -489.6927734375, + "loss": 0.0654, + "rewards/chosen": 2.9846103892606846, + "rewards/margins": 10.429637549905216, + "rewards/rejected": -7.445027160644531, + "step": 2269 + }, + { + "epoch": 0.8379862489040654, + "grad_norm": 6.15625, + "kl": 0.0, + "learning_rate": 6.557202365211778e-07, + "logits/chosen": 165040768.0, + "logits/rejected": 145788528.0, + "logps/chosen": -346.5352478027344, + "logps/rejected": -309.6989440917969, + "loss": 0.0909, + "rewards/chosen": 2.7902820110321045, + "rewards/margins": 8.78134036064148, + "rewards/rejected": -5.991058349609375, + "step": 2270 + }, + { + "epoch": 0.8383554058419086, + "grad_norm": 7.59375, + "kl": 0.34261655807495117, + "learning_rate": 6.528096226474894e-07, + "logits/chosen": 238438528.0, + "logits/rejected": 186987264.0, + "logps/chosen": -378.752490234375, + "logps/rejected": -458.3627115885417, + "loss": 0.11, + "rewards/chosen": 2.2870054244995117, + "rewards/margins": 8.975277264912922, + "rewards/rejected": -6.688271840413411, + "step": 2271 + }, + { + "epoch": 0.8387245627797517, + "grad_norm": 4.15625, + "kl": 0.0, + "learning_rate": 6.499050317508371e-07, + "logits/chosen": 214991817.14285713, + "logits/rejected": 264009813.33333334, + "logps/chosen": -265.9362269810268, + "logps/rejected": -451.6370442708333, + "loss": 0.0717, + "rewards/chosen": 2.5608553205217635, + "rewards/margins": 9.691077217223153, + "rewards/rejected": -7.130221896701389, + "step": 2272 + }, + { + "epoch": 0.8390937197175949, + "grad_norm": 5.59375, + "kl": 0.0, + "learning_rate": 6.470064678554971e-07, + "logits/chosen": 164639886.2222222, + "logits/rejected": 324201289.14285713, + "logps/chosen": -346.59776475694446, + "logps/rejected": -547.8495047433036, + "loss": 0.0809, + "rewards/chosen": 2.6874987284342446, + "rewards/margins": 10.806122734433128, + "rewards/rejected": -8.118624005998884, + "step": 2273 + }, + { + "epoch": 0.8394628766554382, + "grad_norm": 5.65625, + "kl": 0.35704994201660156, + "learning_rate": 6.441139349773906e-07, + "logits/chosen": 151154627.7647059, + "logits/rejected": 288032187.73333335, + "logps/chosen": -339.6246553308824, + "logps/rejected": -425.12041015625, + "loss": 0.0766, + "rewards/chosen": 2.9798828573787914, + "rewards/margins": 9.258479862587125, + "rewards/rejected": -6.278597005208334, + "step": 2274 + }, + { + "epoch": 0.8398320335932813, + "grad_norm": 5.09375, + "kl": 0.0, + "learning_rate": 6.412274371240867e-07, + "logits/chosen": 262308233.84615386, + "logits/rejected": 239227904.0, + "logps/chosen": -284.67093599759613, + "logps/rejected": -367.8876953125, + "loss": 0.0803, + "rewards/chosen": 2.7559981712928185, + "rewards/margins": 8.702655050918642, + "rewards/rejected": -5.946656879625823, + "step": 2275 + }, + { + "epoch": 0.8402011905311245, + "grad_norm": 6.0, + "kl": 0.48291778564453125, + "learning_rate": 6.383469782947915e-07, + "logits/chosen": 162614493.86666667, + "logits/rejected": 173428329.4117647, + "logps/chosen": -344.6974283854167, + "logps/rejected": -331.5452665441176, + "loss": 0.1082, + "rewards/chosen": 3.1415247599283855, + "rewards/margins": 8.640080560422412, + "rewards/rejected": -5.498555800494025, + "step": 2276 + }, + { + "epoch": 0.8405703474689677, + "grad_norm": 3.875, + "kl": 0.0, + "learning_rate": 6.354725624803426e-07, + "logits/chosen": 235948130.46153846, + "logits/rejected": 204395627.78947368, + "logps/chosen": -355.4271709735577, + "logps/rejected": -306.6904810855263, + "loss": 0.0623, + "rewards/chosen": 3.061436579777644, + "rewards/margins": 9.122555659367489, + "rewards/rejected": -6.061119079589844, + "step": 2277 + }, + { + "epoch": 0.840939504406811, + "grad_norm": 6.5, + "kl": 2.0252416133880615, + "learning_rate": 6.326041936632077e-07, + "logits/chosen": 269379705.9047619, + "logits/rejected": 176138693.8181818, + "logps/chosen": -395.4024367559524, + "logps/rejected": -467.7223011363636, + "loss": 0.1319, + "rewards/chosen": 2.4584083557128906, + "rewards/margins": 10.151573527943004, + "rewards/rejected": -7.693165172230113, + "step": 2278 + }, + { + "epoch": 0.8413086613446541, + "grad_norm": 4.375, + "kl": 0.0, + "learning_rate": 6.297418758174767e-07, + "logits/chosen": 276231372.8, + "logits/rejected": 198143720.72727272, + "logps/chosen": -309.3978271484375, + "logps/rejected": -390.71120383522725, + "loss": 0.086, + "rewards/chosen": 2.24450798034668, + "rewards/margins": 8.505762030861595, + "rewards/rejected": -6.261254050514915, + "step": 2279 + }, + { + "epoch": 0.8416778182824973, + "grad_norm": 5.125, + "kl": 0.27222537994384766, + "learning_rate": 6.268856129088518e-07, + "logits/chosen": 215191969.68421054, + "logits/rejected": 143950651.07692307, + "logps/chosen": -339.15547902960526, + "logps/rejected": -517.2358022836538, + "loss": 0.1015, + "rewards/chosen": 2.5792156018708883, + "rewards/margins": 9.799484438259109, + "rewards/rejected": -7.220268836388221, + "step": 2280 + }, + { + "epoch": 0.8420469752203406, + "grad_norm": 5.25, + "kl": 0.0, + "learning_rate": 6.240354088946504e-07, + "logits/chosen": 189457877.33333334, + "logits/rejected": 248276224.0, + "logps/chosen": -378.0313313802083, + "logps/rejected": -499.98486328125, + "loss": 0.0844, + "rewards/chosen": 2.4059009552001953, + "rewards/margins": 9.43882713317871, + "rewards/rejected": -7.032926177978515, + "step": 2281 + }, + { + "epoch": 0.8424161321581838, + "grad_norm": 3.921875, + "kl": 0.0, + "learning_rate": 6.211912677237947e-07, + "logits/chosen": 195927488.0, + "logits/rejected": 225234272.0, + "logps/chosen": -366.4732360839844, + "logps/rejected": -570.1511840820312, + "loss": 0.0535, + "rewards/chosen": 2.8311541080474854, + "rewards/margins": 10.694364786148071, + "rewards/rejected": -7.863210678100586, + "step": 2282 + }, + { + "epoch": 0.8427852890960269, + "grad_norm": 5.75, + "kl": 0.3394503593444824, + "learning_rate": 6.183531933368048e-07, + "logits/chosen": 245277642.10526314, + "logits/rejected": 249049068.30769232, + "logps/chosen": -317.49784128289474, + "logps/rejected": -338.7063551682692, + "loss": 0.1177, + "rewards/chosen": 2.236759386564556, + "rewards/margins": 7.485826252925733, + "rewards/rejected": -5.2490668663611775, + "step": 2283 + }, + { + "epoch": 0.8431544460338701, + "grad_norm": 6.375, + "kl": 0.4672260284423828, + "learning_rate": 6.15521189665797e-07, + "logits/chosen": 297456256.0, + "logits/rejected": 158266083.55555555, + "logps/chosen": -415.97244698660717, + "logps/rejected": -398.8268771701389, + "loss": 0.0998, + "rewards/chosen": 2.739652633666992, + "rewards/margins": 9.044897079467773, + "rewards/rejected": -6.305244445800781, + "step": 2284 + }, + { + "epoch": 0.8435236029717134, + "grad_norm": 6.59375, + "kl": 1.851017951965332, + "learning_rate": 6.126952606344777e-07, + "logits/chosen": 210536519.1111111, + "logits/rejected": 126937645.71428572, + "logps/chosen": -395.02037217881946, + "logps/rejected": -402.61216517857144, + "loss": 0.1133, + "rewards/chosen": 2.364955054389106, + "rewards/margins": 8.47053007095579, + "rewards/rejected": -6.105575016566685, + "step": 2285 + }, + { + "epoch": 0.8438927599095566, + "grad_norm": 4.8125, + "kl": 0.0, + "learning_rate": 6.098754101581334e-07, + "logits/chosen": 171994294.85714287, + "logits/rejected": 201757440.0, + "logps/chosen": -353.35379464285717, + "logps/rejected": -411.88392469618054, + "loss": 0.0908, + "rewards/chosen": 2.336557388305664, + "rewards/margins": 7.967923270331489, + "rewards/rejected": -5.6313658820258246, + "step": 2286 + }, + { + "epoch": 0.8442619168473997, + "grad_norm": 4.90625, + "kl": 0.0, + "learning_rate": 6.070616421436326e-07, + "logits/chosen": 230518198.85714287, + "logits/rejected": 190941710.2222222, + "logps/chosen": -346.8643275669643, + "logps/rejected": -470.6564670138889, + "loss": 0.0717, + "rewards/chosen": 2.2185162135532925, + "rewards/margins": 9.022440350244917, + "rewards/rejected": -6.803924136691624, + "step": 2287 + }, + { + "epoch": 0.8446310737852429, + "grad_norm": 6.0625, + "kl": 0.6500749588012695, + "learning_rate": 6.04253960489416e-07, + "logits/chosen": 462592736.0, + "logits/rejected": 201246512.0, + "logps/chosen": -415.04705810546875, + "logps/rejected": -384.6214599609375, + "loss": 0.0887, + "rewards/chosen": 2.3113551139831543, + "rewards/margins": 8.368534564971924, + "rewards/rejected": -6.0571794509887695, + "step": 2288 + }, + { + "epoch": 0.8450002307230862, + "grad_norm": 4.875, + "kl": 0.0, + "learning_rate": 6.014523690854895e-07, + "logits/chosen": 262628352.0, + "logits/rejected": 256479171.7647059, + "logps/chosen": -348.14313151041665, + "logps/rejected": -466.0289522058824, + "loss": 0.0706, + "rewards/chosen": 2.5371971130371094, + "rewards/margins": 9.56878235760857, + "rewards/rejected": -7.0315852445714615, + "step": 2289 + }, + { + "epoch": 0.8453693876609294, + "grad_norm": 4.125, + "kl": 0.0, + "learning_rate": 5.986568718134223e-07, + "logits/chosen": 202129481.14285713, + "logits/rejected": 175078272.0, + "logps/chosen": -221.20347377232142, + "logps/rejected": -401.2842068142361, + "loss": 0.0717, + "rewards/chosen": 2.7764374869210378, + "rewards/margins": 9.240011366586836, + "rewards/rejected": -6.463573879665798, + "step": 2290 + }, + { + "epoch": 0.8457385445987725, + "grad_norm": 5.9375, + "kl": 0.7261333465576172, + "learning_rate": 5.95867472546341e-07, + "logits/chosen": 159612021.33333334, + "logits/rejected": 190149542.4, + "logps/chosen": -376.71533203125, + "logps/rejected": -374.67099609375, + "loss": 0.0859, + "rewards/chosen": 2.3125410079956055, + "rewards/margins": 9.074889945983887, + "rewards/rejected": -6.762348937988281, + "step": 2291 + }, + { + "epoch": 0.8461077015366157, + "grad_norm": 3.734375, + "kl": 0.0, + "learning_rate": 5.930841751489219e-07, + "logits/chosen": 145084188.44444445, + "logits/rejected": 224830080.0, + "logps/chosen": -309.70399305555554, + "logps/rejected": -499.28006417410717, + "loss": 0.064, + "rewards/chosen": 3.678217569986979, + "rewards/margins": 10.893220447358631, + "rewards/rejected": -7.215002877371652, + "step": 2292 + }, + { + "epoch": 0.846476858474459, + "grad_norm": 5.8125, + "kl": 0.0, + "learning_rate": 5.903069834773883e-07, + "logits/chosen": 211889362.82352942, + "logits/rejected": 218116369.06666666, + "logps/chosen": -227.06870404411765, + "logps/rejected": -463.847265625, + "loss": 0.1411, + "rewards/chosen": 2.0487437528722428, + "rewards/margins": 8.15410670860141, + "rewards/rejected": -6.105362955729166, + "step": 2293 + }, + { + "epoch": 0.8468460154123022, + "grad_norm": 5.5, + "kl": 0.588688850402832, + "learning_rate": 5.875359013795062e-07, + "logits/chosen": 282447001.6, + "logits/rejected": 262948713.4117647, + "logps/chosen": -438.75107421875, + "logps/rejected": -454.89114200367646, + "loss": 0.0478, + "rewards/chosen": 3.229425557454427, + "rewards/margins": 9.365649743173636, + "rewards/rejected": -6.136224185719209, + "step": 2294 + }, + { + "epoch": 0.8472151723501453, + "grad_norm": 5.84375, + "kl": 0.0, + "learning_rate": 5.847709326945717e-07, + "logits/chosen": 209342976.0, + "logits/rejected": 163441426.2857143, + "logps/chosen": -329.95152452256946, + "logps/rejected": -488.2609165736607, + "loss": 0.1161, + "rewards/chosen": 2.5610603756374783, + "rewards/margins": 11.253713486686586, + "rewards/rejected": -8.692653111049108, + "step": 2295 + }, + { + "epoch": 0.8475843292879885, + "grad_norm": 5.40625, + "kl": 0.0, + "learning_rate": 5.820120812534147e-07, + "logits/chosen": 215846992.0, + "logits/rejected": 267014208.0, + "logps/chosen": -333.3243713378906, + "logps/rejected": -446.81103515625, + "loss": 0.0845, + "rewards/chosen": 2.435922145843506, + "rewards/margins": 9.09076738357544, + "rewards/rejected": -6.654845237731934, + "step": 2296 + }, + { + "epoch": 0.8479534862258318, + "grad_norm": 4.5625, + "kl": 0.0, + "learning_rate": 5.792593508783906e-07, + "logits/chosen": 204940509.86666667, + "logits/rejected": 216997632.0, + "logps/chosen": -334.05914713541665, + "logps/rejected": -513.5005170036765, + "loss": 0.0703, + "rewards/chosen": 2.8116978963216144, + "rewards/margins": 10.456318903904336, + "rewards/rejected": -7.644621007582721, + "step": 2297 + }, + { + "epoch": 0.848322643163675, + "grad_norm": 4.75, + "kl": 1.2122163772583008, + "learning_rate": 5.765127453833696e-07, + "logits/chosen": 240682038.85714287, + "logits/rejected": 166263267.55555555, + "logps/chosen": -362.55772181919644, + "logps/rejected": -388.6031901041667, + "loss": 0.0696, + "rewards/chosen": 3.5241767338344028, + "rewards/margins": 9.437389827909923, + "rewards/rejected": -5.9132130940755205, + "step": 2298 + }, + { + "epoch": 0.8486918001015181, + "grad_norm": 5.03125, + "kl": 0.0, + "learning_rate": 5.737722685737401e-07, + "logits/chosen": 187866843.42857143, + "logits/rejected": 204113749.33333334, + "logps/chosen": -256.62538364955356, + "logps/rejected": -505.22911241319446, + "loss": 0.0905, + "rewards/chosen": 2.3808694566999162, + "rewards/margins": 9.568973026578389, + "rewards/rejected": -7.188103569878472, + "step": 2299 + }, + { + "epoch": 0.8490609570393614, + "grad_norm": 4.84375, + "kl": 0.35678672790527344, + "learning_rate": 5.710379242463993e-07, + "logits/chosen": 229873648.0, + "logits/rejected": 158550384.0, + "logps/chosen": -303.2879333496094, + "logps/rejected": -404.9228515625, + "loss": 0.0794, + "rewards/chosen": 2.7918660640716553, + "rewards/margins": 9.216131448745728, + "rewards/rejected": -6.424265384674072, + "step": 2300 + }, + { + "epoch": 0.8494301139772046, + "grad_norm": 5.09375, + "kl": 0.0, + "learning_rate": 5.683097161897433e-07, + "logits/chosen": 299807904.0, + "logits/rejected": 186369584.0, + "logps/chosen": -328.7945556640625, + "logps/rejected": -356.32269287109375, + "loss": 0.0646, + "rewards/chosen": 2.769191265106201, + "rewards/margins": 8.756298542022705, + "rewards/rejected": -5.987107276916504, + "step": 2301 + }, + { + "epoch": 0.8497992709150478, + "grad_norm": 5.5625, + "kl": 0.0, + "learning_rate": 5.655876481836719e-07, + "logits/chosen": 160908306.2857143, + "logits/rejected": 254960526.2222222, + "logps/chosen": -355.5305873325893, + "logps/rejected": -571.2170681423611, + "loss": 0.0809, + "rewards/chosen": 2.085216930934361, + "rewards/margins": 11.084697284395734, + "rewards/rejected": -8.999480353461372, + "step": 2302 + }, + { + "epoch": 0.8501684278528909, + "grad_norm": 5.84375, + "kl": 0.0, + "learning_rate": 5.628717239995762e-07, + "logits/chosen": 279945066.6666667, + "logits/rejected": 282069683.2, + "logps/chosen": -276.4638671875, + "logps/rejected": -547.752099609375, + "loss": 0.0865, + "rewards/chosen": 2.1119422912597656, + "rewards/margins": 9.093997192382812, + "rewards/rejected": -6.982054901123047, + "step": 2303 + }, + { + "epoch": 0.8505375847907342, + "grad_norm": 4.96875, + "kl": 3.551253318786621, + "learning_rate": 5.601619474003328e-07, + "logits/chosen": 239411307.78947368, + "logits/rejected": 177558331.07692307, + "logps/chosen": -390.7345548930921, + "logps/rejected": -342.28384164663464, + "loss": 0.0906, + "rewards/chosen": 3.221390573601974, + "rewards/margins": 9.392350046258224, + "rewards/rejected": -6.17095947265625, + "step": 2304 + }, + { + "epoch": 0.8509067417285774, + "grad_norm": 5.5, + "kl": 2.144731044769287, + "learning_rate": 5.574583221403041e-07, + "logits/chosen": 235595392.0, + "logits/rejected": 197227808.0, + "logps/chosen": -353.314453125, + "logps/rejected": -452.67413330078125, + "loss": 0.1029, + "rewards/chosen": 3.242062568664551, + "rewards/margins": 9.862363815307617, + "rewards/rejected": -6.620301246643066, + "step": 2305 + }, + { + "epoch": 0.8512758986664206, + "grad_norm": 4.46875, + "kl": 0.0, + "learning_rate": 5.547608519653286e-07, + "logits/chosen": 236250256.0, + "logits/rejected": 217001568.0, + "logps/chosen": -316.744873046875, + "logps/rejected": -438.45001220703125, + "loss": 0.0564, + "rewards/chosen": 2.7695443630218506, + "rewards/margins": 10.560119390487671, + "rewards/rejected": -7.79057502746582, + "step": 2306 + }, + { + "epoch": 0.8516450556042637, + "grad_norm": 4.71875, + "kl": 0.0, + "learning_rate": 5.520695406127163e-07, + "logits/chosen": 169635441.7777778, + "logits/rejected": 131031698.28571428, + "logps/chosen": -323.3640407986111, + "logps/rejected": -466.3318568638393, + "loss": 0.0771, + "rewards/chosen": 2.7219136555989585, + "rewards/margins": 9.98601550147647, + "rewards/rejected": -7.264101845877511, + "step": 2307 + }, + { + "epoch": 0.852014212542107, + "grad_norm": 3.96875, + "kl": 0.769442081451416, + "learning_rate": 5.493843918112445e-07, + "logits/chosen": 248084660.70588234, + "logits/rejected": 270270958.93333334, + "logps/chosen": -297.0126953125, + "logps/rejected": -493.44895833333334, + "loss": 0.0619, + "rewards/chosen": 2.965333377613741, + "rewards/margins": 10.738194095387179, + "rewards/rejected": -7.772860717773438, + "step": 2308 + }, + { + "epoch": 0.8523833694799502, + "grad_norm": 4.5, + "kl": 0.0, + "learning_rate": 5.467054092811536e-07, + "logits/chosen": 214655863.46666667, + "logits/rejected": 312655269.64705884, + "logps/chosen": -427.72682291666666, + "logps/rejected": -601.3087660845588, + "loss": 0.0562, + "rewards/chosen": 3.0234479268391925, + "rewards/margins": 10.059751278746361, + "rewards/rejected": -7.036303351907169, + "step": 2309 + }, + { + "epoch": 0.8527525264177933, + "grad_norm": 5.46875, + "kl": 1.2536039352416992, + "learning_rate": 5.440325967341404e-07, + "logits/chosen": 211029504.0, + "logits/rejected": 229706325.33333334, + "logps/chosen": -357.9739774816176, + "logps/rejected": -490.9111653645833, + "loss": 0.0735, + "rewards/chosen": 3.2495696123908546, + "rewards/margins": 10.772282319910385, + "rewards/rejected": -7.522712707519531, + "step": 2310 + }, + { + "epoch": 0.8531216833556365, + "grad_norm": 6.84375, + "kl": 0.0, + "learning_rate": 5.413659578733505e-07, + "logits/chosen": 321909920.0, + "logits/rejected": 200760416.0, + "logps/chosen": -408.6298828125, + "logps/rejected": -418.57952880859375, + "loss": 0.1001, + "rewards/chosen": 1.9269086122512817, + "rewards/margins": 7.581988453865051, + "rewards/rejected": -5.6550798416137695, + "step": 2311 + }, + { + "epoch": 0.8534908402934798, + "grad_norm": 4.25, + "kl": 0.0, + "learning_rate": 5.387054963933803e-07, + "logits/chosen": 150553700.57142857, + "logits/rejected": 266056789.33333334, + "logps/chosen": -347.67117745535717, + "logps/rejected": -605.03271484375, + "loss": 0.0544, + "rewards/chosen": 3.7608119419642856, + "rewards/margins": 11.240728832426525, + "rewards/rejected": -7.479916890462239, + "step": 2312 + }, + { + "epoch": 0.853859997231323, + "grad_norm": 5.8125, + "kl": 0.0, + "learning_rate": 5.36051215980265e-07, + "logits/chosen": 324011068.2352941, + "logits/rejected": 247785574.4, + "logps/chosen": -397.96955422794116, + "logps/rejected": -489.49254557291664, + "loss": 0.0982, + "rewards/chosen": 2.4842872619628906, + "rewards/margins": 9.553443145751952, + "rewards/rejected": -7.069155883789063, + "step": 2313 + }, + { + "epoch": 0.8542291541691661, + "grad_norm": 5.96875, + "kl": 3.4437503814697266, + "learning_rate": 5.334031203114753e-07, + "logits/chosen": 226531274.10526314, + "logits/rejected": 175563008.0, + "logps/chosen": -410.67210629111844, + "logps/rejected": -450.37357271634613, + "loss": 0.0671, + "rewards/chosen": 3.4352493286132812, + "rewards/margins": 11.00516392634465, + "rewards/rejected": -7.56991459773137, + "step": 2314 + }, + { + "epoch": 0.8545983111070093, + "grad_norm": 8.3125, + "kl": 0.0, + "learning_rate": 5.307612130559154e-07, + "logits/chosen": 204327584.0, + "logits/rejected": 168447968.0, + "logps/chosen": -293.574462890625, + "logps/rejected": -464.98211669921875, + "loss": 0.1178, + "rewards/chosen": 2.172290086746216, + "rewards/margins": 8.362090826034546, + "rewards/rejected": -6.18980073928833, + "step": 2315 + }, + { + "epoch": 0.8549674680448526, + "grad_norm": 5.59375, + "kl": 0.691650390625, + "learning_rate": 5.281254978739142e-07, + "logits/chosen": 134074951.1111111, + "logits/rejected": 151224758.85714287, + "logps/chosen": -258.3561740451389, + "logps/rejected": -379.55908203125, + "loss": 0.099, + "rewards/chosen": 2.7751185099283853, + "rewards/margins": 9.28834243047805, + "rewards/rejected": -6.513223920549665, + "step": 2316 + }, + { + "epoch": 0.8553366249826958, + "grad_norm": 4.9375, + "kl": 0.0, + "learning_rate": 5.254959784172197e-07, + "logits/chosen": 281229454.2222222, + "logits/rejected": 192887718.95652175, + "logps/chosen": -378.67371961805554, + "logps/rejected": -429.2851987092391, + "loss": 0.0591, + "rewards/chosen": 2.686701668633355, + "rewards/margins": 9.31489672637792, + "rewards/rejected": -6.628195057744565, + "step": 2317 + }, + { + "epoch": 0.8557057819205389, + "grad_norm": 5.75, + "kl": 0.0, + "learning_rate": 5.22872658329e-07, + "logits/chosen": 250236864.0, + "logits/rejected": 140722192.0, + "logps/chosen": -400.1304626464844, + "logps/rejected": -390.40966796875, + "loss": 0.0845, + "rewards/chosen": 2.586435079574585, + "rewards/margins": 9.053261518478394, + "rewards/rejected": -6.466826438903809, + "step": 2318 + }, + { + "epoch": 0.8560749388583822, + "grad_norm": 5.8125, + "kl": 0.1867966651916504, + "learning_rate": 5.202555412438309e-07, + "logits/chosen": 207850172.63157895, + "logits/rejected": 235575709.53846154, + "logps/chosen": -307.07491262335526, + "logps/rejected": -542.7975886418269, + "loss": 0.0936, + "rewards/chosen": 2.3227087322034334, + "rewards/margins": 9.171978305708542, + "rewards/rejected": -6.849269573505108, + "step": 2319 + }, + { + "epoch": 0.8564440957962254, + "grad_norm": 6.25, + "kl": 0.0, + "learning_rate": 5.176446307876948e-07, + "logits/chosen": 199573696.0, + "logits/rejected": 172551936.0, + "logps/chosen": -389.8174743652344, + "logps/rejected": -393.1667175292969, + "loss": 0.077, + "rewards/chosen": 2.2466697692871094, + "rewards/margins": 8.292772769927979, + "rewards/rejected": -6.046103000640869, + "step": 2320 + }, + { + "epoch": 0.8568132527340686, + "grad_norm": 5.34375, + "kl": 0.0, + "learning_rate": 5.150399305779747e-07, + "logits/chosen": 182120743.3846154, + "logits/rejected": 199765665.68421054, + "logps/chosen": -298.059326171875, + "logps/rejected": -434.16390830592104, + "loss": 0.0918, + "rewards/chosen": 1.6102418165940504, + "rewards/margins": 7.274085071888047, + "rewards/rejected": -5.663843255293997, + "step": 2321 + }, + { + "epoch": 0.8571824096719117, + "grad_norm": 4.3125, + "kl": 0.0, + "learning_rate": 5.124414442234504e-07, + "logits/chosen": 377806438.4, + "logits/rejected": 174935687.52941176, + "logps/chosen": -377.1549479166667, + "logps/rejected": -377.2403779871324, + "loss": 0.0506, + "rewards/chosen": 2.882440439860026, + "rewards/margins": 9.09583101459578, + "rewards/rejected": -6.213390574735754, + "step": 2322 + }, + { + "epoch": 0.857551566609755, + "grad_norm": 4.78125, + "kl": 0.0, + "learning_rate": 5.098491753242918e-07, + "logits/chosen": 247900288.0, + "logits/rejected": 153458016.0, + "logps/chosen": -321.5390319824219, + "logps/rejected": -496.5819091796875, + "loss": 0.073, + "rewards/chosen": 2.9293458461761475, + "rewards/margins": 10.857386350631714, + "rewards/rejected": -7.928040504455566, + "step": 2323 + }, + { + "epoch": 0.8579207235475982, + "grad_norm": 5.78125, + "kl": 0.0, + "learning_rate": 5.07263127472053e-07, + "logits/chosen": 250049731.04761904, + "logits/rejected": 209593157.8181818, + "logps/chosen": -363.32586960565476, + "logps/rejected": -411.0087890625, + "loss": 0.093, + "rewards/chosen": 2.6969926016671315, + "rewards/margins": 8.021114250282189, + "rewards/rejected": -5.324121648615057, + "step": 2324 + }, + { + "epoch": 0.8582898804854414, + "grad_norm": 4.9375, + "kl": 0.0, + "learning_rate": 5.046833042496719e-07, + "logits/chosen": 218111609.9047619, + "logits/rejected": 165177227.63636363, + "logps/chosen": -337.0273204985119, + "logps/rejected": -459.8582652698864, + "loss": 0.0733, + "rewards/chosen": 3.1527826218377974, + "rewards/margins": 10.238129884133606, + "rewards/rejected": -7.08534726229581, + "step": 2325 + }, + { + "epoch": 0.8586590374232845, + "grad_norm": 5.625, + "kl": 0.6214451789855957, + "learning_rate": 5.021097092314598e-07, + "logits/chosen": 190055860.70588234, + "logits/rejected": 247816942.93333334, + "logps/chosen": -350.61951401654414, + "logps/rejected": -564.10859375, + "loss": 0.072, + "rewards/chosen": 3.3077536190257355, + "rewards/margins": 11.14154166427313, + "rewards/rejected": -7.833788045247396, + "step": 2326 + }, + { + "epoch": 0.8590281943611278, + "grad_norm": 4.9375, + "kl": 0.0, + "learning_rate": 4.995423459831006e-07, + "logits/chosen": 197867120.0, + "logits/rejected": 236436992.0, + "logps/chosen": -361.9465637207031, + "logps/rejected": -495.6031494140625, + "loss": 0.0667, + "rewards/chosen": 3.091110944747925, + "rewards/margins": 9.393370866775513, + "rewards/rejected": -6.302259922027588, + "step": 2327 + }, + { + "epoch": 0.859397351298971, + "grad_norm": 4.0, + "kl": 0.0, + "learning_rate": 4.96981218061644e-07, + "logits/chosen": 212865075.2, + "logits/rejected": 184271826.82352942, + "logps/chosen": -329.8585611979167, + "logps/rejected": -455.27809053308823, + "loss": 0.0638, + "rewards/chosen": 2.354436238606771, + "rewards/margins": 9.674959668926164, + "rewards/rejected": -7.320523430319393, + "step": 2328 + }, + { + "epoch": 0.8597665082368142, + "grad_norm": 3.3125, + "kl": 0.0, + "learning_rate": 4.944263290154983e-07, + "logits/chosen": 211196462.54545453, + "logits/rejected": 154914633.14285713, + "logps/chosen": -275.6236683238636, + "logps/rejected": -417.0397135416667, + "loss": 0.052, + "rewards/chosen": 3.028709064830433, + "rewards/margins": 9.174888990658186, + "rewards/rejected": -6.146179925827753, + "step": 2329 + }, + { + "epoch": 0.8601356651746573, + "grad_norm": 3.890625, + "kl": 0.0, + "learning_rate": 4.918776823844312e-07, + "logits/chosen": 190055402.66666666, + "logits/rejected": 166711360.0, + "logps/chosen": -319.6204427083333, + "logps/rejected": -482.12001953125, + "loss": 0.049, + "rewards/chosen": 3.0440731048583984, + "rewards/margins": 11.975757217407226, + "rewards/rejected": -8.931684112548828, + "step": 2330 + }, + { + "epoch": 0.8605048221125006, + "grad_norm": 2.625, + "kl": 0.0, + "learning_rate": 4.893352816995611e-07, + "logits/chosen": 282774169.6, + "logits/rejected": 183421195.63636363, + "logps/chosen": -383.258349609375, + "logps/rejected": -459.00936612215907, + "loss": 0.029, + "rewards/chosen": 4.4305259704589846, + "rewards/margins": 11.955487962202593, + "rewards/rejected": -7.524961991743608, + "step": 2331 + }, + { + "epoch": 0.8608739790503438, + "grad_norm": 4.0625, + "kl": 0.0, + "learning_rate": 4.867991304833502e-07, + "logits/chosen": 223131840.0, + "logits/rejected": 228381849.6, + "logps/chosen": -409.3682454427083, + "logps/rejected": -431.4228515625, + "loss": 0.0445, + "rewards/chosen": 3.394333839416504, + "rewards/margins": 9.992411231994629, + "rewards/rejected": -6.598077392578125, + "step": 2332 + }, + { + "epoch": 0.861243135988187, + "grad_norm": 3.3125, + "kl": 0.0, + "learning_rate": 4.842692322496039e-07, + "logits/chosen": 130390173.53846154, + "logits/rejected": 148212291.36842105, + "logps/chosen": -325.3584735576923, + "logps/rejected": -495.72281044407896, + "loss": 0.048, + "rewards/chosen": 3.0143503042367787, + "rewards/margins": 10.485488706272141, + "rewards/rejected": -7.471138402035362, + "step": 2333 + }, + { + "epoch": 0.8616122929260301, + "grad_norm": 6.65625, + "kl": 0.0, + "learning_rate": 4.817455905034657e-07, + "logits/chosen": 160077324.8, + "logits/rejected": 152601066.66666666, + "logps/chosen": -308.294287109375, + "logps/rejected": -383.7631022135417, + "loss": 0.1338, + "rewards/chosen": 2.3596206665039063, + "rewards/margins": 7.527501996358236, + "rewards/rejected": -5.167881329854329, + "step": 2334 + }, + { + "epoch": 0.8619814498638734, + "grad_norm": 6.375, + "kl": 0.7586555480957031, + "learning_rate": 4.792282087414068e-07, + "logits/chosen": 283823292.6315789, + "logits/rejected": 189755982.76923078, + "logps/chosen": -290.04178659539474, + "logps/rejected": -536.9509840745193, + "loss": 0.1023, + "rewards/chosen": 3.190833041542455, + "rewards/margins": 9.527096258001288, + "rewards/rejected": -6.336263216458834, + "step": 2335 + }, + { + "epoch": 0.8623506068017166, + "grad_norm": 5.09375, + "kl": 1.9972648620605469, + "learning_rate": 4.7671709045122914e-07, + "logits/chosen": 274647120.84210527, + "logits/rejected": 112351665.23076923, + "logps/chosen": -323.8086194490132, + "logps/rejected": -295.5359074519231, + "loss": 0.1126, + "rewards/chosen": 3.1933304636101973, + "rewards/margins": 8.949262024419992, + "rewards/rejected": -5.755931560809795, + "step": 2336 + }, + { + "epoch": 0.8627197637395598, + "grad_norm": 4.625, + "kl": 0.3808140754699707, + "learning_rate": 4.742122391120557e-07, + "logits/chosen": 184622169.6, + "logits/rejected": 194256704.0, + "logps/chosen": -281.263623046875, + "logps/rejected": -353.3801676432292, + "loss": 0.0851, + "rewards/chosen": 2.988857650756836, + "rewards/margins": 8.902510706583659, + "rewards/rejected": -5.913653055826823, + "step": 2337 + }, + { + "epoch": 0.863088920677403, + "grad_norm": 4.4375, + "kl": 0.0, + "learning_rate": 4.7171365819432435e-07, + "logits/chosen": 219918478.2222222, + "logits/rejected": 234881645.7142857, + "logps/chosen": -258.52772352430554, + "logps/rejected": -461.9252232142857, + "loss": 0.0851, + "rewards/chosen": 2.4643063015407987, + "rewards/margins": 10.122593349880642, + "rewards/rejected": -7.658287048339844, + "step": 2338 + }, + { + "epoch": 0.8634580776152462, + "grad_norm": 6.0, + "kl": 0.0, + "learning_rate": 4.6922135115978873e-07, + "logits/chosen": 235775846.4, + "logits/rejected": 202548138.66666666, + "logps/chosen": -333.991162109375, + "logps/rejected": -503.4857177734375, + "loss": 0.1108, + "rewards/chosen": 2.1897052764892577, + "rewards/margins": 9.58264897664388, + "rewards/rejected": -7.392943700154622, + "step": 2339 + }, + { + "epoch": 0.8638272345530894, + "grad_norm": 5.15625, + "kl": 0.15521526336669922, + "learning_rate": 4.667353214615089e-07, + "logits/chosen": 230836701.86666667, + "logits/rejected": 279039216.9411765, + "logps/chosen": -348.9681640625, + "logps/rejected": -449.2318761488971, + "loss": 0.0764, + "rewards/chosen": 2.5382369995117187, + "rewards/margins": 9.44831740435432, + "rewards/rejected": -6.910080404842601, + "step": 2340 + }, + { + "epoch": 0.8641963914909326, + "grad_norm": 6.875, + "kl": 0.0, + "learning_rate": 4.642555725438463e-07, + "logits/chosen": 161768259.36842105, + "logits/rejected": 175775724.30769232, + "logps/chosen": -330.4044253700658, + "logps/rejected": -471.2894756610577, + "loss": 0.1006, + "rewards/chosen": 2.502549020867599, + "rewards/margins": 10.250875835959246, + "rewards/rejected": -7.748326815091646, + "step": 2341 + }, + { + "epoch": 0.8645655484287758, + "grad_norm": 6.71875, + "kl": 0.0, + "learning_rate": 4.6178210784246116e-07, + "logits/chosen": 226959524.57142857, + "logits/rejected": 195275107.55555555, + "logps/chosen": -389.9744349888393, + "logps/rejected": -445.1386447482639, + "loss": 0.1199, + "rewards/chosen": 2.1873249326433455, + "rewards/margins": 7.885750649467347, + "rewards/rejected": -5.698425716824001, + "step": 2342 + }, + { + "epoch": 0.864934705366619, + "grad_norm": 5.6875, + "kl": 0.0, + "learning_rate": 4.593149307843081e-07, + "logits/chosen": 358459693.1764706, + "logits/rejected": 336787933.8666667, + "logps/chosen": -302.5567267922794, + "logps/rejected": -537.98564453125, + "loss": 0.0928, + "rewards/chosen": 2.577473584343405, + "rewards/margins": 9.389438509473614, + "rewards/rejected": -6.811964925130209, + "step": 2343 + }, + { + "epoch": 0.8653038623044622, + "grad_norm": 6.34375, + "kl": 0.0, + "learning_rate": 4.568540447876307e-07, + "logits/chosen": 182995219.69230768, + "logits/rejected": 274764557.4736842, + "logps/chosen": -306.8242938701923, + "logps/rejected": -445.2724609375, + "loss": 0.1287, + "rewards/chosen": 2.2761990473820615, + "rewards/margins": 8.88298888727721, + "rewards/rejected": -6.606789839895148, + "step": 2344 + }, + { + "epoch": 0.8656730192423053, + "grad_norm": 3.5, + "kl": 0.0, + "learning_rate": 4.543994532619533e-07, + "logits/chosen": 230186102.15384614, + "logits/rejected": 213689074.52631578, + "logps/chosen": -416.1496394230769, + "logps/rejected": -532.6570209703947, + "loss": 0.0376, + "rewards/chosen": 3.4513265169583836, + "rewards/margins": 12.407984328173432, + "rewards/rejected": -8.956657811215049, + "step": 2345 + }, + { + "epoch": 0.8660421761801486, + "grad_norm": 4.125, + "kl": 0.0, + "learning_rate": 4.5195115960808166e-07, + "logits/chosen": 263306569.14285713, + "logits/rejected": 310932935.1111111, + "logps/chosen": -426.66796875, + "logps/rejected": -491.4720052083333, + "loss": 0.0419, + "rewards/chosen": 3.3978775569370816, + "rewards/margins": 10.876510165986561, + "rewards/rejected": -7.4786326090494795, + "step": 2346 + }, + { + "epoch": 0.8664113331179918, + "grad_norm": 4.53125, + "kl": 0.3226604461669922, + "learning_rate": 4.4950916721809733e-07, + "logits/chosen": 194748943.05882353, + "logits/rejected": 273471027.2, + "logps/chosen": -301.4031767003676, + "logps/rejected": -475.76868489583336, + "loss": 0.0707, + "rewards/chosen": 3.081815607407514, + "rewards/margins": 10.375831618963504, + "rewards/rejected": -7.29401601155599, + "step": 2347 + }, + { + "epoch": 0.866780490055835, + "grad_norm": 5.9375, + "kl": 0.0, + "learning_rate": 4.470734794753473e-07, + "logits/chosen": 219829473.88235295, + "logits/rejected": 122538205.86666666, + "logps/chosen": -372.7135225183824, + "logps/rejected": -449.8400390625, + "loss": 0.1248, + "rewards/chosen": 2.046792423023897, + "rewards/margins": 8.546673913095512, + "rewards/rejected": -6.499881490071615, + "step": 2348 + }, + { + "epoch": 0.8671496469936781, + "grad_norm": 5.46875, + "kl": 0.3916769027709961, + "learning_rate": 4.446440997544471e-07, + "logits/chosen": 231850432.0, + "logits/rejected": 163571808.0, + "logps/chosen": -397.5549011230469, + "logps/rejected": -370.2587890625, + "loss": 0.1079, + "rewards/chosen": 2.41162109375, + "rewards/margins": 7.802154541015625, + "rewards/rejected": -5.390533447265625, + "step": 2349 + }, + { + "epoch": 0.8675188039315214, + "grad_norm": 4.78125, + "kl": 0.0, + "learning_rate": 4.422210314212727e-07, + "logits/chosen": 270233206.15384614, + "logits/rejected": 196793802.10526314, + "logps/chosen": -402.5741436298077, + "logps/rejected": -503.39967105263156, + "loss": 0.0689, + "rewards/chosen": 2.613350794865535, + "rewards/margins": 9.90040154399177, + "rewards/rejected": -7.287050749126234, + "step": 2350 + }, + { + "epoch": 0.8678879608693646, + "grad_norm": 5.875, + "kl": 0.0, + "learning_rate": 4.39804277832952e-07, + "logits/chosen": 206873792.0, + "logits/rejected": 182249888.0, + "logps/chosen": -279.89892578125, + "logps/rejected": -413.4878845214844, + "loss": 0.1027, + "rewards/chosen": 2.2966976165771484, + "rewards/margins": 9.298822402954102, + "rewards/rejected": -7.002124786376953, + "step": 2351 + }, + { + "epoch": 0.8682571178072078, + "grad_norm": 6.375, + "kl": 0.0, + "learning_rate": 4.373938423378682e-07, + "logits/chosen": 214387086.2222222, + "logits/rejected": 167511625.14285713, + "logps/chosen": -371.17841254340277, + "logps/rejected": -420.51576450892856, + "loss": 0.1116, + "rewards/chosen": 2.5117664337158203, + "rewards/margins": 9.067981447492327, + "rewards/rejected": -6.556215013776507, + "step": 2352 + }, + { + "epoch": 0.8686262747450509, + "grad_norm": 5.46875, + "kl": 0.0, + "learning_rate": 4.349897282756488e-07, + "logits/chosen": 199512448.0, + "logits/rejected": 139510624.0, + "logps/chosen": -385.2086181640625, + "logps/rejected": -374.5047912597656, + "loss": 0.0974, + "rewards/chosen": 2.248175621032715, + "rewards/margins": 8.610945701599121, + "rewards/rejected": -6.362770080566406, + "step": 2353 + }, + { + "epoch": 0.8689954316828942, + "grad_norm": 5.21875, + "kl": 0.0, + "learning_rate": 4.325919389771627e-07, + "logits/chosen": 297767062.5882353, + "logits/rejected": 186157073.06666666, + "logps/chosen": -309.1208065257353, + "logps/rejected": -507.174609375, + "loss": 0.1181, + "rewards/chosen": 2.085877586813534, + "rewards/margins": 8.463792524150774, + "rewards/rejected": -6.3779149373372395, + "step": 2354 + }, + { + "epoch": 0.8693645886207374, + "grad_norm": 3.703125, + "kl": 0.0, + "learning_rate": 4.3020047776451633e-07, + "logits/chosen": 131440977.45454545, + "logits/rejected": 168507331.04761904, + "logps/chosen": -330.54549893465907, + "logps/rejected": -355.7900855654762, + "loss": 0.055, + "rewards/chosen": 2.426920110529119, + "rewards/margins": 8.750418064398167, + "rewards/rejected": -6.3234979538690474, + "step": 2355 + }, + { + "epoch": 0.8697337455585806, + "grad_norm": 3.515625, + "kl": 0.0, + "learning_rate": 4.2781534795104995e-07, + "logits/chosen": 166559476.36363637, + "logits/rejected": 182160164.57142857, + "logps/chosen": -297.5641424005682, + "logps/rejected": -451.9810267857143, + "loss": 0.0483, + "rewards/chosen": 3.2127050919966265, + "rewards/margins": 10.041032089299454, + "rewards/rejected": -6.8283269973028276, + "step": 2356 + }, + { + "epoch": 0.8701029024964237, + "grad_norm": 8.3125, + "kl": 1.7197093963623047, + "learning_rate": 4.2543655284132957e-07, + "logits/chosen": 246930997.89473686, + "logits/rejected": 194417545.84615386, + "logps/chosen": -366.70631167763156, + "logps/rejected": -601.9010667067307, + "loss": 0.1346, + "rewards/chosen": 2.5030455338327506, + "rewards/margins": 10.098581399029566, + "rewards/rejected": -7.595535865196815, + "step": 2357 + }, + { + "epoch": 0.870472059434267, + "grad_norm": 4.71875, + "kl": 0.0, + "learning_rate": 4.2306409573114715e-07, + "logits/chosen": 203086293.33333334, + "logits/rejected": 206898624.0, + "logps/chosen": -322.21722412109375, + "logps/rejected": -496.199951171875, + "loss": 0.071, + "rewards/chosen": 2.6223249435424805, + "rewards/margins": 10.651383781433106, + "rewards/rejected": -8.029058837890625, + "step": 2358 + }, + { + "epoch": 0.8708412163721102, + "grad_norm": 4.875, + "kl": 0.0, + "learning_rate": 4.2069797990751007e-07, + "logits/chosen": 218948403.2, + "logits/rejected": 169118192.94117647, + "logps/chosen": -357.55283203125, + "logps/rejected": -452.01146024816177, + "loss": 0.0858, + "rewards/chosen": 2.438371276855469, + "rewards/margins": 9.42628272561466, + "rewards/rejected": -6.987911448759191, + "step": 2359 + }, + { + "epoch": 0.8712103733099534, + "grad_norm": 5.4375, + "kl": 0.0, + "learning_rate": 4.183382086486415e-07, + "logits/chosen": 296578508.8, + "logits/rejected": 281758890.6666667, + "logps/chosen": -359.895166015625, + "logps/rejected": -381.5768229166667, + "loss": 0.0771, + "rewards/chosen": 3.2508338928222655, + "rewards/margins": 10.427148564656575, + "rewards/rejected": -7.17631467183431, + "step": 2360 + }, + { + "epoch": 0.8715795302477966, + "grad_norm": 4.59375, + "kl": 0.0, + "learning_rate": 4.1598478522397567e-07, + "logits/chosen": 357681268.3636364, + "logits/rejected": 221709726.47619048, + "logps/chosen": -368.02468039772725, + "logps/rejected": -485.220703125, + "loss": 0.074, + "rewards/chosen": 2.4441552595658735, + "rewards/margins": 9.318430103780903, + "rewards/rejected": -6.87427484421503, + "step": 2361 + }, + { + "epoch": 0.8719486871856398, + "grad_norm": 5.0, + "kl": 1.3688039779663086, + "learning_rate": 4.1363771289415154e-07, + "logits/chosen": 174937104.0, + "logits/rejected": 196331360.0, + "logps/chosen": -318.8180847167969, + "logps/rejected": -424.1701965332031, + "loss": 0.0918, + "rewards/chosen": 3.415560245513916, + "rewards/margins": 9.698769569396973, + "rewards/rejected": -6.283209323883057, + "step": 2362 + }, + { + "epoch": 0.872317844123483, + "grad_norm": 2.328125, + "kl": 0.0, + "learning_rate": 4.1129699491100626e-07, + "logits/chosen": 194347776.0, + "logits/rejected": 143646912.0, + "logps/chosen": -377.73382568359375, + "logps/rejected": -371.9732259114583, + "loss": 0.0395, + "rewards/chosen": 3.601435899734497, + "rewards/margins": 9.902727683385212, + "rewards/rejected": -6.301291783650716, + "step": 2363 + }, + { + "epoch": 0.8726870010613262, + "grad_norm": 6.90625, + "kl": 0.31605052947998047, + "learning_rate": 4.089626345175757e-07, + "logits/chosen": 181514607.3043478, + "logits/rejected": 195831352.8888889, + "logps/chosen": -373.0056046195652, + "logps/rejected": -378.52842881944446, + "loss": 0.1069, + "rewards/chosen": 2.8659707774286685, + "rewards/margins": 8.358653340362697, + "rewards/rejected": -5.492682562934028, + "step": 2364 + }, + { + "epoch": 0.8730561579991694, + "grad_norm": 2.9375, + "kl": 0.0, + "learning_rate": 4.0663463494808706e-07, + "logits/chosen": 232458435.7647059, + "logits/rejected": 234383206.4, + "logps/chosen": -443.7559455422794, + "logps/rejected": -531.0608072916667, + "loss": 0.034, + "rewards/chosen": 3.8927971335018383, + "rewards/margins": 11.617208144244026, + "rewards/rejected": -7.7244110107421875, + "step": 2365 + }, + { + "epoch": 0.8734253149370126, + "grad_norm": 3.59375, + "kl": 0.0, + "learning_rate": 4.043129994279527e-07, + "logits/chosen": 270708004.5714286, + "logits/rejected": 172694030.2222222, + "logps/chosen": -317.927734375, + "logps/rejected": -423.0834689670139, + "loss": 0.0755, + "rewards/chosen": 2.429337910243443, + "rewards/margins": 9.020023436773391, + "rewards/rejected": -6.590685526529948, + "step": 2366 + }, + { + "epoch": 0.8737944718748558, + "grad_norm": 4.84375, + "kl": 1.208512306213379, + "learning_rate": 4.019977311737699e-07, + "logits/chosen": 157480372.70588234, + "logits/rejected": 198809139.2, + "logps/chosen": -310.52975643382354, + "logps/rejected": -338.4576822916667, + "loss": 0.098, + "rewards/chosen": 2.727826286764706, + "rewards/margins": 8.247959241680071, + "rewards/rejected": -5.520132954915365, + "step": 2367 + }, + { + "epoch": 0.874163628812699, + "grad_norm": 5.25, + "kl": 0.0, + "learning_rate": 3.9968883339331467e-07, + "logits/chosen": 274944426.6666667, + "logits/rejected": 249364940.8, + "logps/chosen": -364.7259928385417, + "logps/rejected": -418.1052734375, + "loss": 0.0799, + "rewards/chosen": 1.897715409596761, + "rewards/margins": 9.434993394215901, + "rewards/rejected": -7.5372779846191404, + "step": 2368 + }, + { + "epoch": 0.8745327857505422, + "grad_norm": 4.125, + "kl": 0.0, + "learning_rate": 3.973863092855335e-07, + "logits/chosen": 197739370.66666666, + "logits/rejected": 273260902.4, + "logps/chosen": -384.07763671875, + "logps/rejected": -493.629150390625, + "loss": 0.0445, + "rewards/chosen": 3.242212931315104, + "rewards/margins": 11.244285074869792, + "rewards/rejected": -8.002072143554688, + "step": 2369 + }, + { + "epoch": 0.8749019426883854, + "grad_norm": 5.78125, + "kl": 1.9404621124267578, + "learning_rate": 3.9509016204054506e-07, + "logits/chosen": 206693512.53333333, + "logits/rejected": 227226744.47058824, + "logps/chosen": -328.7638671875, + "logps/rejected": -390.86931295955884, + "loss": 0.0845, + "rewards/chosen": 2.8309008280436196, + "rewards/margins": 9.234829248166552, + "rewards/rejected": -6.403928420122932, + "step": 2370 + }, + { + "epoch": 0.8752710996262286, + "grad_norm": 4.90625, + "kl": 0.4418368339538574, + "learning_rate": 3.928003948396336e-07, + "logits/chosen": 245502890.66666666, + "logits/rejected": 241042853.6470588, + "logps/chosen": -405.0413411458333, + "logps/rejected": -434.2925379136029, + "loss": 0.069, + "rewards/chosen": 2.5184906005859373, + "rewards/margins": 8.191582713407628, + "rewards/rejected": -5.673092112821691, + "step": 2371 + }, + { + "epoch": 0.8756402565640719, + "grad_norm": 6.9375, + "kl": 0.7189197540283203, + "learning_rate": 3.9051701085523973e-07, + "logits/chosen": 235435385.2631579, + "logits/rejected": 298039552.0, + "logps/chosen": -370.27670127467104, + "logps/rejected": -339.37646484375, + "loss": 0.1508, + "rewards/chosen": 2.30128318385074, + "rewards/margins": 7.561487792474539, + "rewards/rejected": -5.260204608623798, + "step": 2372 + }, + { + "epoch": 0.876009413501915, + "grad_norm": 3.765625, + "kl": 1.6571731567382812, + "learning_rate": 3.8824001325096504e-07, + "logits/chosen": 246567901.86666667, + "logits/rejected": 226049008.94117647, + "logps/chosen": -334.2991536458333, + "logps/rejected": -342.15894990808823, + "loss": 0.0633, + "rewards/chosen": 3.545510101318359, + "rewards/margins": 8.942845826990464, + "rewards/rejected": -5.397335725672105, + "step": 2373 + }, + { + "epoch": 0.876009413501915, + "eval_kl": 0.3605581521987915, + "eval_logits/chosen": 224181964.57395142, + "eval_logits/rejected": 190655785.75886524, + "eval_logps/chosen": -355.36023730684326, + "eval_logps/rejected": -444.5424793144208, + "eval_loss": 0.08010450005531311, + "eval_rewards/chosen": 2.8185283332471025, + "eval_rewards/margins": 9.447414959650768, + "eval_rewards/rejected": -6.6288866264036646, + "eval_runtime": 48.5126, + "eval_samples_per_second": 18.057, + "eval_steps_per_second": 4.514, + "step": 2373 + }, + { + "epoch": 0.8763785704397582, + "grad_norm": 6.53125, + "kl": 0.5550765991210938, + "learning_rate": 3.8596940518156047e-07, + "logits/chosen": 308555296.0, + "logits/rejected": 212651936.0, + "logps/chosen": -337.2405090332031, + "logps/rejected": -344.78814697265625, + "loss": 0.1192, + "rewards/chosen": 1.5978388786315918, + "rewards/margins": 6.766910076141357, + "rewards/rejected": -5.169071197509766, + "step": 2374 + }, + { + "epoch": 0.8767477273776014, + "grad_norm": 5.25, + "kl": 1.7372190952301025, + "learning_rate": 3.83705189792925e-07, + "logits/chosen": 160198339.7647059, + "logits/rejected": 238707148.8, + "logps/chosen": -327.3175838694853, + "logps/rejected": -503.25537109375, + "loss": 0.1201, + "rewards/chosen": 2.2652845943675324, + "rewards/margins": 10.371852994432636, + "rewards/rejected": -8.106568400065104, + "step": 2375 + }, + { + "epoch": 0.8771168843154447, + "grad_norm": 7.0625, + "kl": 1.4096832275390625, + "learning_rate": 3.8144737022209835e-07, + "logits/chosen": 186570965.33333334, + "logits/rejected": 198578102.85714287, + "logps/chosen": -337.80995008680554, + "logps/rejected": -462.8597935267857, + "loss": 0.1197, + "rewards/chosen": 2.618130366007487, + "rewards/margins": 8.514882314772834, + "rewards/rejected": -5.896751948765346, + "step": 2376 + }, + { + "epoch": 0.8774860412532878, + "grad_norm": 4.09375, + "kl": 0.0, + "learning_rate": 3.791959495972619e-07, + "logits/chosen": 195796027.07692307, + "logits/rejected": 244941824.0, + "logps/chosen": -375.7772686298077, + "logps/rejected": -418.0029296875, + "loss": 0.0527, + "rewards/chosen": 2.8850044837364783, + "rewards/margins": 9.678736976283764, + "rewards/rejected": -6.793732492547286, + "step": 2377 + }, + { + "epoch": 0.877855198191131, + "grad_norm": 7.0625, + "kl": 0.7211742401123047, + "learning_rate": 3.769509310377317e-07, + "logits/chosen": 253208746.66666666, + "logits/rejected": 140280742.4, + "logps/chosen": -390.7544759114583, + "logps/rejected": -314.252587890625, + "loss": 0.1126, + "rewards/chosen": 1.5857391357421875, + "rewards/margins": 6.870536804199219, + "rewards/rejected": -5.284797668457031, + "step": 2378 + }, + { + "epoch": 0.8782243551289742, + "grad_norm": 5.65625, + "kl": 1.362600326538086, + "learning_rate": 3.7471231765395077e-07, + "logits/chosen": 215684167.1111111, + "logits/rejected": 213885421.7142857, + "logps/chosen": -414.0828450520833, + "logps/rejected": -592.1346261160714, + "loss": 0.1215, + "rewards/chosen": 2.2960311041937933, + "rewards/margins": 10.477734944177051, + "rewards/rejected": -8.181703839983259, + "step": 2379 + }, + { + "epoch": 0.8785935120668174, + "grad_norm": 4.75, + "kl": 0.0, + "learning_rate": 3.7248011254748974e-07, + "logits/chosen": 254748000.0, + "logits/rejected": 156971872.0, + "logps/chosen": -312.8929138183594, + "logps/rejected": -433.34515380859375, + "loss": 0.0828, + "rewards/chosen": 2.077698230743408, + "rewards/margins": 9.122942924499512, + "rewards/rejected": -7.0452446937561035, + "step": 2380 + }, + { + "epoch": 0.8789626690046606, + "grad_norm": 4.59375, + "kl": 1.0567779541015625, + "learning_rate": 3.7025431881104137e-07, + "logits/chosen": 190825876.21052632, + "logits/rejected": 169077897.84615386, + "logps/chosen": -381.63301809210526, + "logps/rejected": -457.4934645432692, + "loss": 0.064, + "rewards/chosen": 2.9487077813399467, + "rewards/margins": 9.051670568674682, + "rewards/rejected": -6.102962787334736, + "step": 2381 + }, + { + "epoch": 0.8793318259425038, + "grad_norm": 6.4375, + "kl": 1.006718635559082, + "learning_rate": 3.680349395284133e-07, + "logits/chosen": 226122051.36842105, + "logits/rejected": 214455453.53846154, + "logps/chosen": -354.95751953125, + "logps/rejected": -466.4038837139423, + "loss": 0.109, + "rewards/chosen": 2.317934939735814, + "rewards/margins": 10.33762875452698, + "rewards/rejected": -8.019693814791166, + "step": 2382 + }, + { + "epoch": 0.879700982880347, + "grad_norm": 4.75, + "kl": 0.0, + "learning_rate": 3.658219777745281e-07, + "logits/chosen": 199671793.7777778, + "logits/rejected": 232382683.42857143, + "logps/chosen": -362.67225477430554, + "logps/rejected": -548.9273507254464, + "loss": 0.1059, + "rewards/chosen": 2.4880553351508246, + "rewards/margins": 11.0219603644477, + "rewards/rejected": -8.533905029296875, + "step": 2383 + }, + { + "epoch": 0.8800701398181902, + "grad_norm": 7.84375, + "kl": 0.24051284790039062, + "learning_rate": 3.6361543661541654e-07, + "logits/chosen": 194144469.33333334, + "logits/rejected": 300344868.5714286, + "logps/chosen": -349.83287217881946, + "logps/rejected": -395.9246303013393, + "loss": 0.1286, + "rewards/chosen": 1.9648556179470487, + "rewards/margins": 8.479941110762338, + "rewards/rejected": -6.51508549281529, + "step": 2384 + }, + { + "epoch": 0.8804392967560334, + "grad_norm": 4.78125, + "kl": 0.029039382934570312, + "learning_rate": 3.614153191082126e-07, + "logits/chosen": 221773027.55555555, + "logits/rejected": 245212452.57142857, + "logps/chosen": -385.0612521701389, + "logps/rejected": -369.36655970982144, + "loss": 0.0766, + "rewards/chosen": 2.5159748925103083, + "rewards/margins": 9.385342219519238, + "rewards/rejected": -6.869367327008929, + "step": 2385 + }, + { + "epoch": 0.8808084536938766, + "grad_norm": 3.46875, + "kl": 0.0, + "learning_rate": 3.592216283011513e-07, + "logits/chosen": 179190961.23076922, + "logits/rejected": 142381096.42105263, + "logps/chosen": -261.7035381610577, + "logps/rejected": -343.14599609375, + "loss": 0.0486, + "rewards/chosen": 3.0745711693396935, + "rewards/margins": 9.108289880791174, + "rewards/rejected": -6.03371871145148, + "step": 2386 + }, + { + "epoch": 0.8811776106317198, + "grad_norm": 6.03125, + "kl": 0.9245071411132812, + "learning_rate": 3.570343672335641e-07, + "logits/chosen": 270432512.0, + "logits/rejected": 245086833.7777778, + "logps/chosen": -426.6202915736607, + "logps/rejected": -472.7819552951389, + "loss": 0.0676, + "rewards/chosen": 3.0570507049560547, + "rewards/margins": 9.1696474287245, + "rewards/rejected": -6.112596723768446, + "step": 2387 + }, + { + "epoch": 0.881546767569563, + "grad_norm": 4.25, + "kl": 0.0, + "learning_rate": 3.5485353893587204e-07, + "logits/chosen": 160036187.42857143, + "logits/rejected": 292250083.5555556, + "logps/chosen": -322.8795689174107, + "logps/rejected": -530.9676649305555, + "loss": 0.0671, + "rewards/chosen": 2.66967282976423, + "rewards/margins": 10.545618874686106, + "rewards/rejected": -7.875946044921875, + "step": 2388 + }, + { + "epoch": 0.8819159245074062, + "grad_norm": 4.78125, + "kl": 0.0, + "learning_rate": 3.5267914642958534e-07, + "logits/chosen": 245426483.2, + "logits/rejected": 225847582.11764705, + "logps/chosen": -322.04694010416665, + "logps/rejected": -478.9707605698529, + "loss": 0.1142, + "rewards/chosen": 2.2820200602213543, + "rewards/margins": 9.133011701995251, + "rewards/rejected": -6.850991641773897, + "step": 2389 + }, + { + "epoch": 0.8822850814452494, + "grad_norm": 4.9375, + "kl": 0.0, + "learning_rate": 3.505111927272992e-07, + "logits/chosen": 185561216.0, + "logits/rejected": 344856405.3333333, + "logps/chosen": -284.85191127232144, + "logps/rejected": -404.33997938368054, + "loss": 0.0803, + "rewards/chosen": 2.7275167192731584, + "rewards/margins": 8.57572737194243, + "rewards/rejected": -5.8482106526692705, + "step": 2390 + }, + { + "epoch": 0.8826542383830926, + "grad_norm": 5.46875, + "kl": 0.28514671325683594, + "learning_rate": 3.4834968083268307e-07, + "logits/chosen": 236498944.0, + "logits/rejected": 200037173.89473686, + "logps/chosen": -361.6807391826923, + "logps/rejected": -416.4261924342105, + "loss": 0.0915, + "rewards/chosen": 2.1195386739877553, + "rewards/margins": 7.641481615753792, + "rewards/rejected": -5.521942941766036, + "step": 2391 + }, + { + "epoch": 0.8830233953209358, + "grad_norm": 7.15625, + "kl": 1.0230369567871094, + "learning_rate": 3.461946137404865e-07, + "logits/chosen": 139467040.0, + "logits/rejected": 197402528.0, + "logps/chosen": -315.1826477050781, + "logps/rejected": -399.96697998046875, + "loss": 0.0883, + "rewards/chosen": 2.462623119354248, + "rewards/margins": 7.8680739402771, + "rewards/rejected": -5.405450820922852, + "step": 2392 + }, + { + "epoch": 0.883392552258779, + "grad_norm": 5.96875, + "kl": 0.6524658203125, + "learning_rate": 3.440459944365271e-07, + "logits/chosen": 192294354.82352942, + "logits/rejected": 195260893.86666667, + "logps/chosen": -378.77409811580884, + "logps/rejected": -358.10595703125, + "loss": 0.0853, + "rewards/chosen": 2.5843059315400967, + "rewards/margins": 8.695979413799211, + "rewards/rejected": -6.111673482259115, + "step": 2393 + }, + { + "epoch": 0.8837617091966222, + "grad_norm": 5.09375, + "kl": 0.5478372573852539, + "learning_rate": 3.4190382589768755e-07, + "logits/chosen": 184102049.68421054, + "logits/rejected": 154569885.53846154, + "logps/chosen": -339.7999845805921, + "logps/rejected": -436.97412109375, + "loss": 0.083, + "rewards/chosen": 2.6389298689992806, + "rewards/margins": 9.663859209068391, + "rewards/rejected": -7.024929340069111, + "step": 2394 + }, + { + "epoch": 0.8841308661344655, + "grad_norm": 4.21875, + "kl": 0.0, + "learning_rate": 3.397681110919171e-07, + "logits/chosen": 288056718.2222222, + "logits/rejected": 203357807.3043478, + "logps/chosen": -541.8347981770834, + "logps/rejected": -406.06687330163044, + "loss": 0.0482, + "rewards/chosen": 2.910777833726671, + "rewards/margins": 8.828259629327894, + "rewards/rejected": -5.917481795601223, + "step": 2395 + }, + { + "epoch": 0.8845000230723086, + "grad_norm": 6.625, + "kl": 0.0, + "learning_rate": 3.3763885297822153e-07, + "logits/chosen": 112379616.0, + "logits/rejected": 172934528.0, + "logps/chosen": -244.5753173828125, + "logps/rejected": -425.73828125, + "loss": 0.149, + "rewards/chosen": 0.7609333197275797, + "rewards/margins": 6.985821930567424, + "rewards/rejected": -6.224888610839844, + "step": 2396 + }, + { + "epoch": 0.8848691800101518, + "grad_norm": 5.9375, + "kl": 0.0, + "learning_rate": 3.355160545066599e-07, + "logits/chosen": 275781308.6315789, + "logits/rejected": 264890112.0, + "logps/chosen": -362.947265625, + "logps/rejected": -476.5178034855769, + "loss": 0.1129, + "rewards/chosen": 2.2174758911132812, + "rewards/margins": 9.105518634502705, + "rewards/rejected": -6.888042743389423, + "step": 2397 + }, + { + "epoch": 0.885238336947995, + "grad_norm": 5.0625, + "kl": 0.047646522521972656, + "learning_rate": 3.333997186183435e-07, + "logits/chosen": 120460902.4, + "logits/rejected": 168082552.47058824, + "logps/chosen": -247.57760416666667, + "logps/rejected": -398.49181410845586, + "loss": 0.0944, + "rewards/chosen": 2.6777776082356772, + "rewards/margins": 8.376307020000382, + "rewards/rejected": -5.698529411764706, + "step": 2398 + }, + { + "epoch": 0.8856074938858383, + "grad_norm": 4.875, + "kl": 0.0, + "learning_rate": 3.312898482454285e-07, + "logits/chosen": 229761670.7368421, + "logits/rejected": 238423512.6153846, + "logps/chosen": -325.96060341282896, + "logps/rejected": -559.7919170673077, + "loss": 0.0683, + "rewards/chosen": 2.871525814658717, + "rewards/margins": 11.13128204963468, + "rewards/rejected": -8.259756234975962, + "step": 2399 + }, + { + "epoch": 0.8859766508236814, + "grad_norm": 6.46875, + "kl": 2.187628746032715, + "learning_rate": 3.2918644631111274e-07, + "logits/chosen": 206259925.33333334, + "logits/rejected": 187791506.2857143, + "logps/chosen": -414.4433322482639, + "logps/rejected": -404.79690987723217, + "loss": 0.0887, + "rewards/chosen": 2.8238143920898438, + "rewards/margins": 8.755008152553014, + "rewards/rejected": -5.931193760463169, + "step": 2400 + }, + { + "epoch": 0.8863458077615246, + "grad_norm": 4.28125, + "kl": 0.0, + "learning_rate": 3.270895157296339e-07, + "logits/chosen": 225154167.46666667, + "logits/rejected": 249051437.17647058, + "logps/chosen": -356.637109375, + "logps/rejected": -499.92365579044116, + "loss": 0.0604, + "rewards/chosen": 2.719254811604818, + "rewards/margins": 10.607921301149855, + "rewards/rejected": -7.888666489545037, + "step": 2401 + }, + { + "epoch": 0.8867149646993678, + "grad_norm": 5.96875, + "kl": 0.09569168090820312, + "learning_rate": 3.24999059406263e-07, + "logits/chosen": 233827000.8888889, + "logits/rejected": 284755620.5714286, + "logps/chosen": -320.0812717013889, + "logps/rejected": -639.5714983258929, + "loss": 0.1087, + "rewards/chosen": 2.262028376261393, + "rewards/margins": 10.116134189424061, + "rewards/rejected": -7.8541058131626675, + "step": 2402 + }, + { + "epoch": 0.8870841216372111, + "grad_norm": 5.8125, + "kl": 0.0, + "learning_rate": 3.229150802372988e-07, + "logits/chosen": 281764160.0, + "logits/rejected": 196319712.0, + "logps/chosen": -374.4599914550781, + "logps/rejected": -498.29766845703125, + "loss": 0.081, + "rewards/chosen": 2.5944786071777344, + "rewards/margins": 9.810835361480713, + "rewards/rejected": -7.2163567543029785, + "step": 2403 + }, + { + "epoch": 0.8874532785750542, + "grad_norm": 4.1875, + "kl": 0.0, + "learning_rate": 3.2083758111006946e-07, + "logits/chosen": 165729133.7142857, + "logits/rejected": 272868465.7777778, + "logps/chosen": -280.49330357142856, + "logps/rejected": -409.82093641493054, + "loss": 0.0522, + "rewards/chosen": 3.485445295061384, + "rewards/margins": 11.026031130836124, + "rewards/rejected": -7.540585835774739, + "step": 2404 + }, + { + "epoch": 0.8878224355128974, + "grad_norm": 4.40625, + "kl": 2.5812015533447266, + "learning_rate": 3.187665649029242e-07, + "logits/chosen": 282979529.14285713, + "logits/rejected": 317854037.3333333, + "logps/chosen": -418.00184849330356, + "logps/rejected": -423.08973524305554, + "loss": 0.0787, + "rewards/chosen": 3.057632173810686, + "rewards/margins": 10.10694942777119, + "rewards/rejected": -7.049317253960504, + "step": 2405 + }, + { + "epoch": 0.8881915924507406, + "grad_norm": 3.96875, + "kl": 0.0, + "learning_rate": 3.1670203448522784e-07, + "logits/chosen": 243868208.0, + "logits/rejected": 97592672.0, + "logps/chosen": -379.14459228515625, + "logps/rejected": -329.173095703125, + "loss": 0.0635, + "rewards/chosen": 2.519547700881958, + "rewards/margins": 8.898432493209839, + "rewards/rejected": -6.378884792327881, + "step": 2406 + }, + { + "epoch": 0.8885607493885839, + "grad_norm": 6.84375, + "kl": 0.0, + "learning_rate": 3.1464399271736225e-07, + "logits/chosen": 229983505.06666666, + "logits/rejected": 213652961.88235295, + "logps/chosen": -388.12877604166664, + "logps/rejected": -435.99235983455884, + "loss": 0.0873, + "rewards/chosen": 2.110985692342122, + "rewards/margins": 8.067298432892445, + "rewards/rejected": -5.956312740550322, + "step": 2407 + }, + { + "epoch": 0.888929906326427, + "grad_norm": 4.84375, + "kl": 0.0, + "learning_rate": 3.125924424507182e-07, + "logits/chosen": 198119523.55555555, + "logits/rejected": 195070464.0, + "logps/chosen": -347.71875, + "logps/rejected": -399.4667271205357, + "loss": 0.1002, + "rewards/chosen": 3.2341637081570096, + "rewards/margins": 9.930590099758572, + "rewards/rejected": -6.6964263916015625, + "step": 2408 + }, + { + "epoch": 0.8892990632642702, + "grad_norm": 5.3125, + "kl": 0.0, + "learning_rate": 3.1054738652769256e-07, + "logits/chosen": 203664742.4, + "logits/rejected": 176490767.05882353, + "logps/chosen": -305.818359375, + "logps/rejected": -406.59912109375, + "loss": 0.0856, + "rewards/chosen": 2.7723566691080728, + "rewards/margins": 8.836831784715839, + "rewards/rejected": -6.064475115607767, + "step": 2409 + }, + { + "epoch": 0.8896682202021134, + "grad_norm": 4.96875, + "kl": 0.0, + "learning_rate": 3.0850882778168333e-07, + "logits/chosen": 231044315.42857143, + "logits/rejected": 195864901.8181818, + "logps/chosen": -445.3004557291667, + "logps/rejected": -440.91153231534093, + "loss": 0.0584, + "rewards/chosen": 2.9588835580008372, + "rewards/margins": 8.545757739574878, + "rewards/rejected": -5.586874181574041, + "step": 2410 + }, + { + "epoch": 0.8900373771399567, + "grad_norm": 5.21875, + "kl": 0.0, + "learning_rate": 3.0647676903708846e-07, + "logits/chosen": 237927694.2222222, + "logits/rejected": 208136411.42857143, + "logps/chosen": -322.03282335069446, + "logps/rejected": -312.21156529017856, + "loss": 0.0858, + "rewards/chosen": 2.4353699154324002, + "rewards/margins": 7.91467899746365, + "rewards/rejected": -5.47930908203125, + "step": 2411 + }, + { + "epoch": 0.8904065340777998, + "grad_norm": 6.5, + "kl": 0.7608969211578369, + "learning_rate": 3.044512131092997e-07, + "logits/chosen": 141407171.7647059, + "logits/rejected": 197116381.86666667, + "logps/chosen": -318.00318818933823, + "logps/rejected": -493.0148111979167, + "loss": 0.1232, + "rewards/chosen": 2.04997724645278, + "rewards/margins": 9.451669446159812, + "rewards/rejected": -7.401692199707031, + "step": 2412 + }, + { + "epoch": 0.890775691015643, + "grad_norm": 6.25, + "kl": 0.0, + "learning_rate": 3.0243216280469834e-07, + "logits/chosen": 239872928.0, + "logits/rejected": 214664928.0, + "logps/chosen": -319.3060302734375, + "logps/rejected": -414.5733337402344, + "loss": 0.1353, + "rewards/chosen": 1.562589406967163, + "rewards/margins": 7.8866002559661865, + "rewards/rejected": -6.324010848999023, + "step": 2413 + }, + { + "epoch": 0.8911448479534863, + "grad_norm": 4.40625, + "kl": 0.5581231117248535, + "learning_rate": 3.004196209206539e-07, + "logits/chosen": 160712221.53846154, + "logits/rejected": 161912778.10526314, + "logps/chosen": -273.4887883112981, + "logps/rejected": -393.2115542763158, + "loss": 0.0622, + "rewards/chosen": 3.0934213491586537, + "rewards/margins": 9.99395974348431, + "rewards/rejected": -6.9005383943256575, + "step": 2414 + }, + { + "epoch": 0.8915140048913295, + "grad_norm": 4.15625, + "kl": 0.0, + "learning_rate": 2.984135902455171e-07, + "logits/chosen": 338152960.0, + "logits/rejected": 188488857.6, + "logps/chosen": -390.0658365885417, + "logps/rejected": -416.7892578125, + "loss": 0.0636, + "rewards/chosen": 2.369453271230062, + "rewards/margins": 8.924257500966391, + "rewards/rejected": -6.5548042297363285, + "step": 2415 + }, + { + "epoch": 0.8918831618291726, + "grad_norm": 4.46875, + "kl": 0.0, + "learning_rate": 2.9641407355861796e-07, + "logits/chosen": 185153398.15384614, + "logits/rejected": 164624006.7368421, + "logps/chosen": -351.27768179086536, + "logps/rejected": -442.56064967105266, + "loss": 0.0668, + "rewards/chosen": 2.8995290902944713, + "rewards/margins": 8.924502106330655, + "rewards/rejected": -6.024973016036184, + "step": 2416 + }, + { + "epoch": 0.8922523187670158, + "grad_norm": 3.90625, + "kl": 0.0, + "learning_rate": 2.9442107363026106e-07, + "logits/chosen": 160727089.23076922, + "logits/rejected": 213899264.0, + "logps/chosen": -342.58169320913464, + "logps/rejected": -407.7382298519737, + "loss": 0.0675, + "rewards/chosen": 2.885485429030198, + "rewards/margins": 8.179327142383405, + "rewards/rejected": -5.293841713353207, + "step": 2417 + }, + { + "epoch": 0.8926214757048591, + "grad_norm": 4.84375, + "kl": 0.0, + "learning_rate": 2.9243459322172317e-07, + "logits/chosen": 165836784.0, + "logits/rejected": 183115648.0, + "logps/chosen": -326.54852294921875, + "logps/rejected": -612.623291015625, + "loss": 0.0695, + "rewards/chosen": 2.485311508178711, + "rewards/margins": 10.4779372215271, + "rewards/rejected": -7.992625713348389, + "step": 2418 + }, + { + "epoch": 0.8929906326427022, + "grad_norm": 6.71875, + "kl": 0.8346176147460938, + "learning_rate": 2.904546350852472e-07, + "logits/chosen": 142137057.88235295, + "logits/rejected": 168481177.6, + "logps/chosen": -352.10472196691177, + "logps/rejected": -439.7731119791667, + "loss": 0.0613, + "rewards/chosen": 3.146544063792509, + "rewards/margins": 9.985855282054228, + "rewards/rejected": -6.839311218261718, + "step": 2419 + }, + { + "epoch": 0.8933597895805454, + "grad_norm": 4.9375, + "kl": 0.26264142990112305, + "learning_rate": 2.884812019640404e-07, + "logits/chosen": 188412040.53333333, + "logits/rejected": 178575721.4117647, + "logps/chosen": -327.54778645833335, + "logps/rejected": -324.31540096507354, + "loss": 0.105, + "rewards/chosen": 2.634791056315104, + "rewards/margins": 8.336026210410923, + "rewards/rejected": -5.701235154095818, + "step": 2420 + }, + { + "epoch": 0.8937289465183886, + "grad_norm": 5.40625, + "kl": 0.8884649276733398, + "learning_rate": 2.8651429659226906e-07, + "logits/chosen": 140280939.78947368, + "logits/rejected": 282640068.9230769, + "logps/chosen": -376.49599095394734, + "logps/rejected": -403.4341571514423, + "loss": 0.0938, + "rewards/chosen": 3.017716257195724, + "rewards/margins": 9.178832050277155, + "rewards/rejected": -6.16111579308143, + "step": 2421 + }, + { + "epoch": 0.8940981034562319, + "grad_norm": 4.25, + "kl": 0.0, + "learning_rate": 2.8455392169505546e-07, + "logits/chosen": 229465024.0, + "logits/rejected": 152792912.0, + "logps/chosen": -368.2649841308594, + "logps/rejected": -434.55682373046875, + "loss": 0.0628, + "rewards/chosen": 3.0676419734954834, + "rewards/margins": 11.06454062461853, + "rewards/rejected": -7.996898651123047, + "step": 2422 + }, + { + "epoch": 0.894467260394075, + "grad_norm": 7.15625, + "kl": 1.2971467971801758, + "learning_rate": 2.826000799884737e-07, + "logits/chosen": 190641584.0, + "logits/rejected": 277565600.0, + "logps/chosen": -349.8748779296875, + "logps/rejected": -611.8491821289062, + "loss": 0.0986, + "rewards/chosen": 2.659759759902954, + "rewards/margins": 9.734580278396606, + "rewards/rejected": -7.074820518493652, + "step": 2423 + }, + { + "epoch": 0.8948364173319182, + "grad_norm": 5.1875, + "kl": 0.0, + "learning_rate": 2.806527741795478e-07, + "logits/chosen": 179114832.0, + "logits/rejected": 123169488.0, + "logps/chosen": -291.00927734375, + "logps/rejected": -307.616455078125, + "loss": 0.1, + "rewards/chosen": 2.0297739505767822, + "rewards/margins": 7.596264600753784, + "rewards/rejected": -5.566490650177002, + "step": 2424 + }, + { + "epoch": 0.8952055742697614, + "grad_norm": 5.5625, + "kl": 0.0, + "learning_rate": 2.787120069662441e-07, + "logits/chosen": 207642387.69230768, + "logits/rejected": 185727784.42105263, + "logps/chosen": -331.3247821514423, + "logps/rejected": -458.75755550986844, + "loss": 0.0743, + "rewards/chosen": 3.122128413273738, + "rewards/margins": 9.830658236978508, + "rewards/rejected": -6.70852982370477, + "step": 2425 + }, + { + "epoch": 0.8955747312076047, + "grad_norm": 5.25, + "kl": 0.0, + "learning_rate": 2.767777810374722e-07, + "logits/chosen": 278384338.8235294, + "logits/rejected": 259242871.46666667, + "logps/chosen": -318.3545496323529, + "logps/rejected": -489.00807291666666, + "loss": 0.0933, + "rewards/chosen": 2.1809167300953582, + "rewards/margins": 8.740025209913068, + "rewards/rejected": -6.559108479817708, + "step": 2426 + }, + { + "epoch": 0.8959438881454478, + "grad_norm": 5.3125, + "kl": 0.007433891296386719, + "learning_rate": 2.748500990730768e-07, + "logits/chosen": 267786953.14285713, + "logits/rejected": 203095523.55555555, + "logps/chosen": -340.1162109375, + "logps/rejected": -418.7096896701389, + "loss": 0.0713, + "rewards/chosen": 3.0567989349365234, + "rewards/margins": 9.065029356214735, + "rewards/rejected": -6.008230421278212, + "step": 2427 + }, + { + "epoch": 0.896313045083291, + "grad_norm": 3.6875, + "kl": 0.0, + "learning_rate": 2.7292896374383595e-07, + "logits/chosen": 171818086.4, + "logits/rejected": 221026695.52941176, + "logps/chosen": -295.60455729166665, + "logps/rejected": -461.75212545955884, + "loss": 0.063, + "rewards/chosen": 3.0328933715820314, + "rewards/margins": 11.380344974293429, + "rewards/rejected": -8.347451602711397, + "step": 2428 + }, + { + "epoch": 0.8966822020211342, + "grad_norm": 6.03125, + "kl": 0.6717386245727539, + "learning_rate": 2.710143777114588e-07, + "logits/chosen": 249442560.0, + "logits/rejected": 157585472.0, + "logps/chosen": -284.805859375, + "logps/rejected": -232.91743977864584, + "loss": 0.1093, + "rewards/chosen": 2.354068946838379, + "rewards/margins": 7.396914672851563, + "rewards/rejected": -5.042845726013184, + "step": 2429 + }, + { + "epoch": 0.8970513589589775, + "grad_norm": 5.75, + "kl": 1.2946195602416992, + "learning_rate": 2.691063436285812e-07, + "logits/chosen": 138635937.68421054, + "logits/rejected": 119309489.23076923, + "logps/chosen": -311.5382144325658, + "logps/rejected": -334.3466796875, + "loss": 0.1081, + "rewards/chosen": 3.466582047311883, + "rewards/margins": 10.374991165964227, + "rewards/rejected": -6.908409118652344, + "step": 2430 + }, + { + "epoch": 0.8974205158968206, + "grad_norm": 4.21875, + "kl": 0.0, + "learning_rate": 2.672048641387581e-07, + "logits/chosen": 184212272.0, + "logits/rejected": 171339152.0, + "logps/chosen": -332.00982666015625, + "logps/rejected": -365.75286865234375, + "loss": 0.0633, + "rewards/chosen": 2.773268222808838, + "rewards/margins": 7.884509563446045, + "rewards/rejected": -5.111241340637207, + "step": 2431 + }, + { + "epoch": 0.8977896728346638, + "grad_norm": 4.125, + "kl": 1.0036978721618652, + "learning_rate": 2.6530994187646653e-07, + "logits/chosen": 251103378.2857143, + "logits/rejected": 181071701.33333334, + "logps/chosen": -350.2025669642857, + "logps/rejected": -401.9564615885417, + "loss": 0.0646, + "rewards/chosen": 3.0830661228724887, + "rewards/margins": 10.176476190960596, + "rewards/rejected": -7.093410068088108, + "step": 2432 + }, + { + "epoch": 0.898158829772507, + "grad_norm": 2.390625, + "kl": 0.0, + "learning_rate": 2.6342157946709745e-07, + "logits/chosen": 198646857.14285713, + "logits/rejected": 250264917.33333334, + "logps/chosen": -376.52633231026783, + "logps/rejected": -497.5133463541667, + "loss": 0.0293, + "rewards/chosen": 3.0629801068987166, + "rewards/margins": 10.341327546134828, + "rewards/rejected": -7.278347439236111, + "step": 2433 + }, + { + "epoch": 0.8985279867103503, + "grad_norm": 6.1875, + "kl": 0.0, + "learning_rate": 2.615397795269514e-07, + "logits/chosen": 173362291.2, + "logits/rejected": 241153493.33333334, + "logps/chosen": -274.4294189453125, + "logps/rejected": -608.3531901041666, + "loss": 0.1261, + "rewards/chosen": 2.293244743347168, + "rewards/margins": 13.044960085550944, + "rewards/rejected": -10.751715342203775, + "step": 2434 + }, + { + "epoch": 0.8988971436481934, + "grad_norm": 5.1875, + "kl": 2.080820083618164, + "learning_rate": 2.5966454466323956e-07, + "logits/chosen": 220371124.70588234, + "logits/rejected": 153730082.13333333, + "logps/chosen": -330.0174345128676, + "logps/rejected": -414.0305989583333, + "loss": 0.1132, + "rewards/chosen": 2.631900562959559, + "rewards/margins": 9.879643788057216, + "rewards/rejected": -7.247743225097656, + "step": 2435 + }, + { + "epoch": 0.8992663005860366, + "grad_norm": 6.34375, + "kl": 0.0, + "learning_rate": 2.577958774740763e-07, + "logits/chosen": 272924507.4285714, + "logits/rejected": 230989824.0, + "logps/chosen": -429.86771065848217, + "logps/rejected": -542.4944118923611, + "loss": 0.0665, + "rewards/chosen": 2.336597170148577, + "rewards/margins": 9.429526586381217, + "rewards/rejected": -7.092929416232639, + "step": 2436 + }, + { + "epoch": 0.8996354575238799, + "grad_norm": 4.53125, + "kl": 0.0, + "learning_rate": 2.5593378054847516e-07, + "logits/chosen": 174398112.0, + "logits/rejected": 184620752.0, + "logps/chosen": -316.752685546875, + "logps/rejected": -515.65087890625, + "loss": 0.0631, + "rewards/chosen": 2.9403793811798096, + "rewards/margins": 11.211297273635864, + "rewards/rejected": -8.270917892456055, + "step": 2437 + }, + { + "epoch": 0.9000046144617231, + "grad_norm": 3.375, + "kl": 0.0, + "learning_rate": 2.54078256466348e-07, + "logits/chosen": 216206043.42857143, + "logits/rejected": 267981909.33333334, + "logps/chosen": -411.1057826450893, + "logps/rejected": -486.33989800347223, + "loss": 0.052, + "rewards/chosen": 3.089612143380301, + "rewards/margins": 10.961453574044363, + "rewards/rejected": -7.8718414306640625, + "step": 2438 + }, + { + "epoch": 0.9003737713995662, + "grad_norm": 5.46875, + "kl": 0.0, + "learning_rate": 2.522293077985011e-07, + "logits/chosen": 267553952.0, + "logits/rejected": 145738480.0, + "logps/chosen": -338.98223876953125, + "logps/rejected": -406.61456298828125, + "loss": 0.099, + "rewards/chosen": 2.326539993286133, + "rewards/margins": 8.175984859466553, + "rewards/rejected": -5.84944486618042, + "step": 2439 + }, + { + "epoch": 0.9007429283374094, + "grad_norm": 5.15625, + "kl": 0.0, + "learning_rate": 2.5038693710662754e-07, + "logits/chosen": 199920713.14285713, + "logits/rejected": 160883911.1111111, + "logps/chosen": -428.64222935267856, + "logps/rejected": -378.90668402777777, + "loss": 0.0768, + "rewards/chosen": 2.4730469839913503, + "rewards/margins": 8.169347066727896, + "rewards/rejected": -5.6963000827365455, + "step": 2440 + }, + { + "epoch": 0.9011120852752527, + "grad_norm": 3.90625, + "kl": 0.0, + "learning_rate": 2.4855114694330995e-07, + "logits/chosen": 271615078.4, + "logits/rejected": 211108412.2352941, + "logps/chosen": -408.49700520833335, + "logps/rejected": -501.7913602941176, + "loss": 0.0641, + "rewards/chosen": 2.5069544474283854, + "rewards/margins": 10.16061404359107, + "rewards/rejected": -7.653659596162684, + "step": 2441 + }, + { + "epoch": 0.9014812422130959, + "grad_norm": 5.34375, + "kl": 0.8121843338012695, + "learning_rate": 2.467219398520121e-07, + "logits/chosen": 159606608.84210527, + "logits/rejected": 209107140.92307693, + "logps/chosen": -337.83750513980266, + "logps/rejected": -428.68607271634613, + "loss": 0.107, + "rewards/chosen": 2.820749182450144, + "rewards/margins": 9.542032728311021, + "rewards/rejected": -6.721283545860877, + "step": 2442 + }, + { + "epoch": 0.901850399150939, + "grad_norm": 7.3125, + "kl": 0.6459779739379883, + "learning_rate": 2.448993183670756e-07, + "logits/chosen": 228856974.2222222, + "logits/rejected": 239545051.42857143, + "logps/chosen": -389.5944552951389, + "logps/rejected": -513.6239188058036, + "loss": 0.087, + "rewards/chosen": 2.8051954905192056, + "rewards/margins": 11.944938750494094, + "rewards/rejected": -9.139743259974889, + "step": 2443 + }, + { + "epoch": 0.9022195560887822, + "grad_norm": 5.625, + "kl": 0.0, + "learning_rate": 2.4308328501372213e-07, + "logits/chosen": 181248512.0, + "logits/rejected": 266476992.0, + "logps/chosen": -356.7856689453125, + "logps/rejected": -346.7073567708333, + "loss": 0.0857, + "rewards/chosen": 3.7660285949707033, + "rewards/margins": 8.985064061482747, + "rewards/rejected": -5.219035466512044, + "step": 2444 + }, + { + "epoch": 0.9025887130266255, + "grad_norm": 5.78125, + "kl": 0.0, + "learning_rate": 2.4127384230803963e-07, + "logits/chosen": 195285104.0, + "logits/rejected": 206284720.0, + "logps/chosen": -343.84942626953125, + "logps/rejected": -473.9733581542969, + "loss": 0.1008, + "rewards/chosen": 1.8881261348724365, + "rewards/margins": 8.801591634750366, + "rewards/rejected": -6.91346549987793, + "step": 2445 + }, + { + "epoch": 0.9029578699644687, + "grad_norm": 4.90625, + "kl": 0.0, + "learning_rate": 2.3947099275699103e-07, + "logits/chosen": 321869004.8, + "logits/rejected": 293207762.8235294, + "logps/chosen": -662.1067057291667, + "logps/rejected": -492.0117761948529, + "loss": 0.0596, + "rewards/chosen": 3.424554951985677, + "rewards/margins": 10.157440424900429, + "rewards/rejected": -6.732885472914752, + "step": 2446 + }, + { + "epoch": 0.9033270269023118, + "grad_norm": 3.203125, + "kl": 0.0, + "learning_rate": 2.3767473885839943e-07, + "logits/chosen": 247482304.0, + "logits/rejected": 202005478.4, + "logps/chosen": -335.29571533203125, + "logps/rejected": -430.33310546875, + "loss": 0.0293, + "rewards/chosen": 3.75177796681722, + "rewards/margins": 10.273540051778157, + "rewards/rejected": -6.521762084960938, + "step": 2447 + }, + { + "epoch": 0.903696183840155, + "grad_norm": 4.34375, + "kl": 0.0, + "learning_rate": 2.3588508310095183e-07, + "logits/chosen": 199104464.0, + "logits/rejected": 202734624.0, + "logps/chosen": -370.7049865722656, + "logps/rejected": -523.548583984375, + "loss": 0.0533, + "rewards/chosen": 3.2024903297424316, + "rewards/margins": 10.049434661865234, + "rewards/rejected": -6.846944332122803, + "step": 2448 + }, + { + "epoch": 0.9040653407779983, + "grad_norm": 4.875, + "kl": 0.0006742477416992188, + "learning_rate": 2.3410202796419534e-07, + "logits/chosen": 179838208.0, + "logits/rejected": 216477997.17647058, + "logps/chosen": -389.35638020833335, + "logps/rejected": -391.9203239889706, + "loss": 0.0695, + "rewards/chosen": 3.335132853190104, + "rewards/margins": 9.80757188983992, + "rewards/rejected": -6.472439036649816, + "step": 2449 + }, + { + "epoch": 0.9044344977158415, + "grad_norm": 4.09375, + "kl": 0.0, + "learning_rate": 2.3232557591852777e-07, + "logits/chosen": 193334308.57142857, + "logits/rejected": 234243214.2222222, + "logps/chosen": -437.5593959263393, + "logps/rejected": -554.6503363715278, + "loss": 0.0414, + "rewards/chosen": 3.788837160382952, + "rewards/margins": 10.786115161956303, + "rewards/rejected": -6.997278001573351, + "step": 2450 + }, + { + "epoch": 0.9048036546536846, + "grad_norm": 3.65625, + "kl": 0.0, + "learning_rate": 2.3055572942520256e-07, + "logits/chosen": 183903352.47058824, + "logits/rejected": 159819639.46666667, + "logps/chosen": -323.98655790441177, + "logps/rejected": -377.8375, + "loss": 0.0458, + "rewards/chosen": 3.569001590504366, + "rewards/margins": 11.028112613453585, + "rewards/rejected": -7.459111022949219, + "step": 2451 + }, + { + "epoch": 0.9051728115915278, + "grad_norm": 4.4375, + "kl": 0.0, + "learning_rate": 2.2879249093631928e-07, + "logits/chosen": 166033136.0, + "logits/rejected": 179863744.0, + "logps/chosen": -279.458984375, + "logps/rejected": -434.05438232421875, + "loss": 0.0581, + "rewards/chosen": 3.5792341232299805, + "rewards/margins": 9.540136337280273, + "rewards/rejected": -5.960902214050293, + "step": 2452 + }, + { + "epoch": 0.9055419685293711, + "grad_norm": 5.6875, + "kl": 0.0, + "learning_rate": 2.2703586289482215e-07, + "logits/chosen": 154458718.31578946, + "logits/rejected": 279051736.61538464, + "logps/chosen": -297.7225277549342, + "logps/rejected": -601.1666917067307, + "loss": 0.1036, + "rewards/chosen": 2.4669980500873767, + "rewards/margins": 10.866281239127341, + "rewards/rejected": -8.399283189039965, + "step": 2453 + }, + { + "epoch": 0.9059111254672142, + "grad_norm": 3.140625, + "kl": 1.0806787014007568, + "learning_rate": 2.2528584773449657e-07, + "logits/chosen": 129666768.0, + "logits/rejected": 148053344.0, + "logps/chosen": -261.21807861328125, + "logps/rejected": -347.1720275878906, + "loss": 0.0692, + "rewards/chosen": 2.9488563537597656, + "rewards/margins": 10.301757335662842, + "rewards/rejected": -7.352900981903076, + "step": 2454 + }, + { + "epoch": 0.9062802824050574, + "grad_norm": 6.25, + "kl": 0.0, + "learning_rate": 2.2354244787996748e-07, + "logits/chosen": 157569737.14285713, + "logits/rejected": 209889166.2222222, + "logps/chosen": -389.68282645089283, + "logps/rejected": -483.4499782986111, + "loss": 0.0628, + "rewards/chosen": 2.7381629943847656, + "rewards/margins": 9.879712846544054, + "rewards/rejected": -7.141549852159288, + "step": 2455 + }, + { + "epoch": 0.9066494393429007, + "grad_norm": 6.375, + "kl": 0.0, + "learning_rate": 2.2180566574669215e-07, + "logits/chosen": 177356629.33333334, + "logits/rejected": 192490887.52941176, + "logps/chosen": -388.72242838541666, + "logps/rejected": -420.48566750919116, + "loss": 0.0771, + "rewards/chosen": 2.3798189798990887, + "rewards/margins": 8.80034896252202, + "rewards/rejected": -6.420529982622932, + "step": 2456 + }, + { + "epoch": 0.9070185962807439, + "grad_norm": 5.03125, + "kl": 0.0, + "learning_rate": 2.2007550374096077e-07, + "logits/chosen": 269771488.0, + "logits/rejected": 179707568.0, + "logps/chosen": -438.09136962890625, + "logps/rejected": -515.1878051757812, + "loss": 0.0677, + "rewards/chosen": 3.084714889526367, + "rewards/margins": 10.50588321685791, + "rewards/rejected": -7.421168327331543, + "step": 2457 + }, + { + "epoch": 0.907387753218587, + "grad_norm": 4.75, + "kl": 0.0, + "learning_rate": 2.183519642598908e-07, + "logits/chosen": 216863924.70588234, + "logits/rejected": 248058948.26666668, + "logps/chosen": -290.14318129595586, + "logps/rejected": -383.38463541666664, + "loss": 0.0901, + "rewards/chosen": 2.8070371291216683, + "rewards/margins": 8.38454007915422, + "rewards/rejected": -5.577502950032552, + "step": 2458 + }, + { + "epoch": 0.9077569101564302, + "grad_norm": 4.25, + "kl": 0.5824804306030273, + "learning_rate": 2.1663504969142378e-07, + "logits/chosen": 208617642.66666666, + "logits/rejected": 155584356.17391303, + "logps/chosen": -360.9455295138889, + "logps/rejected": -440.5570652173913, + "loss": 0.071, + "rewards/chosen": 2.282873365614149, + "rewards/margins": 7.884841882088334, + "rewards/rejected": -5.601968516474185, + "step": 2459 + }, + { + "epoch": 0.9081260670942735, + "grad_norm": 4.21875, + "kl": 0.0, + "learning_rate": 2.1492476241432303e-07, + "logits/chosen": 181321943.57894737, + "logits/rejected": 152011884.30769232, + "logps/chosen": -378.19315378289474, + "logps/rejected": -418.201171875, + "loss": 0.0541, + "rewards/chosen": 3.300124921296772, + "rewards/margins": 10.814762795019728, + "rewards/rejected": -7.514637873722957, + "step": 2460 + }, + { + "epoch": 0.9084952240321167, + "grad_norm": 4.59375, + "kl": 0.2718524932861328, + "learning_rate": 2.1322110479817138e-07, + "logits/chosen": 232109808.94117647, + "logits/rejected": 183577361.06666666, + "logps/chosen": -340.9950310202206, + "logps/rejected": -517.0752604166667, + "loss": 0.0918, + "rewards/chosen": 2.283634858972886, + "rewards/margins": 9.425921541101793, + "rewards/rejected": -7.142286682128907, + "step": 2461 + }, + { + "epoch": 0.9088643809699598, + "grad_norm": 5.9375, + "kl": 0.0, + "learning_rate": 2.1152407920336348e-07, + "logits/chosen": 258414051.55555555, + "logits/rejected": 210928640.0, + "logps/chosen": -373.5416666666667, + "logps/rejected": -431.2040318080357, + "loss": 0.0775, + "rewards/chosen": 3.019762251112196, + "rewards/margins": 8.512699611603267, + "rewards/rejected": -5.492937360491071, + "step": 2462 + }, + { + "epoch": 0.909233537907803, + "grad_norm": 5.0625, + "kl": 0.6262483596801758, + "learning_rate": 2.0983368798110582e-07, + "logits/chosen": 251924198.4, + "logits/rejected": 202684266.66666666, + "logps/chosen": -376.1967529296875, + "logps/rejected": -301.6429036458333, + "loss": 0.0875, + "rewards/chosen": 2.6716461181640625, + "rewards/margins": 8.483323415120442, + "rewards/rejected": -5.81167729695638, + "step": 2463 + }, + { + "epoch": 0.9096026948456463, + "grad_norm": 7.03125, + "kl": 0.0, + "learning_rate": 2.081499334734155e-07, + "logits/chosen": 225255213.17647058, + "logits/rejected": 187407325.86666667, + "logps/chosen": -350.2533318014706, + "logps/rejected": -450.6988932291667, + "loss": 0.1269, + "rewards/chosen": 1.964761173023897, + "rewards/margins": 9.250358910654105, + "rewards/rejected": -7.285597737630209, + "step": 2464 + }, + { + "epoch": 0.9099718517834895, + "grad_norm": 4.34375, + "kl": 0.0, + "learning_rate": 2.0647281801311257e-07, + "logits/chosen": 188366139.07692307, + "logits/rejected": 202947988.21052632, + "logps/chosen": -400.7422626201923, + "logps/rejected": -447.9672594572368, + "loss": 0.0448, + "rewards/chosen": 3.5037272526667667, + "rewards/margins": 10.061013163825278, + "rewards/rejected": -6.557285911158512, + "step": 2465 + }, + { + "epoch": 0.9103410087213326, + "grad_norm": 5.28125, + "kl": 0.01732635498046875, + "learning_rate": 2.0480234392381893e-07, + "logits/chosen": 196889801.14285713, + "logits/rejected": 195091470.2222222, + "logps/chosen": -371.02267020089283, + "logps/rejected": -374.93315972222223, + "loss": 0.0653, + "rewards/chosen": 2.4009908948625838, + "rewards/margins": 8.094551177251907, + "rewards/rejected": -5.693560282389323, + "step": 2466 + }, + { + "epoch": 0.9107101656591758, + "grad_norm": 4.0, + "kl": 0.0, + "learning_rate": 2.031385135199554e-07, + "logits/chosen": 217197728.0, + "logits/rejected": 232031408.0, + "logps/chosen": -365.82623291015625, + "logps/rejected": -385.9376525878906, + "loss": 0.0477, + "rewards/chosen": 3.104665756225586, + "rewards/margins": 8.546820163726807, + "rewards/rejected": -5.442154407501221, + "step": 2467 + }, + { + "epoch": 0.9110793225970191, + "grad_norm": 5.875, + "kl": 2.4467058181762695, + "learning_rate": 2.0148132910673857e-07, + "logits/chosen": 207396378.9473684, + "logits/rejected": 245269661.53846154, + "logps/chosen": -343.9763826069079, + "logps/rejected": -382.22201772836536, + "loss": 0.1251, + "rewards/chosen": 2.8185625578227795, + "rewards/margins": 9.495333868482335, + "rewards/rejected": -6.676771310659555, + "step": 2468 + }, + { + "epoch": 0.9114484795348623, + "grad_norm": 3.953125, + "kl": 0.34409046173095703, + "learning_rate": 1.9983079298017517e-07, + "logits/chosen": 187103680.0, + "logits/rejected": 232010608.0, + "logps/chosen": -325.7419738769531, + "logps/rejected": -406.04840087890625, + "loss": 0.0819, + "rewards/chosen": 2.9415550231933594, + "rewards/margins": 10.981578826904297, + "rewards/rejected": -8.040023803710938, + "step": 2469 + }, + { + "epoch": 0.9118176364727054, + "grad_norm": 4.0, + "kl": 0.0, + "learning_rate": 1.9818690742706258e-07, + "logits/chosen": 205200332.8, + "logits/rejected": 150286080.0, + "logps/chosen": -314.05504557291664, + "logps/rejected": -463.50247012867646, + "loss": 0.0821, + "rewards/chosen": 2.63575922648112, + "rewards/margins": 9.046568014107498, + "rewards/rejected": -6.410808787626379, + "step": 2470 + }, + { + "epoch": 0.9121867934105486, + "grad_norm": 3.375, + "kl": 0.4823218584060669, + "learning_rate": 1.9654967472498342e-07, + "logits/chosen": 201209344.0, + "logits/rejected": 188990293.33333334, + "logps/chosen": -342.45177504595586, + "logps/rejected": -390.82190755208336, + "loss": 0.0542, + "rewards/chosen": 3.4348005406996784, + "rewards/margins": 8.423212507659313, + "rewards/rejected": -4.988411966959635, + "step": 2471 + }, + { + "epoch": 0.9125559503483919, + "grad_norm": 6.9375, + "kl": 0.3194422721862793, + "learning_rate": 1.9491909714230207e-07, + "logits/chosen": 210119193.6, + "logits/rejected": 172420480.0, + "logps/chosen": -327.50400390625, + "logps/rejected": -413.052490234375, + "loss": 0.1051, + "rewards/chosen": 2.3984142303466798, + "rewards/margins": 9.047868982950847, + "rewards/rejected": -6.649454752604167, + "step": 2472 + }, + { + "epoch": 0.9129251072862351, + "grad_norm": 5.78125, + "kl": 0.0, + "learning_rate": 1.9329517693816468e-07, + "logits/chosen": 210684711.3846154, + "logits/rejected": 183194880.0, + "logps/chosen": -416.1701847956731, + "logps/rejected": -322.4984580592105, + "loss": 0.068, + "rewards/chosen": 2.9456863403320312, + "rewards/margins": 8.348640040347451, + "rewards/rejected": -5.402953700015419, + "step": 2473 + }, + { + "epoch": 0.9132942642240782, + "grad_norm": 6.09375, + "kl": 0.0, + "learning_rate": 1.9167791636249044e-07, + "logits/chosen": 274033948.4444444, + "logits/rejected": 176910628.57142857, + "logps/chosen": -334.2013888888889, + "logps/rejected": -402.73011997767856, + "loss": 0.1149, + "rewards/chosen": 2.286056730482313, + "rewards/margins": 9.22931746831016, + "rewards/rejected": -6.943260737827846, + "step": 2474 + }, + { + "epoch": 0.9136634211619215, + "grad_norm": 4.59375, + "kl": 0.15740108489990234, + "learning_rate": 1.900673176559742e-07, + "logits/chosen": 172281819.42857143, + "logits/rejected": 222752184.8888889, + "logps/chosen": -385.70302036830356, + "logps/rejected": -509.89268663194446, + "loss": 0.0659, + "rewards/chosen": 3.0185884748186385, + "rewards/margins": 9.634617578415643, + "rewards/rejected": -6.616029103597005, + "step": 2475 + }, + { + "epoch": 0.9140325780997647, + "grad_norm": 5.15625, + "kl": 0.0, + "learning_rate": 1.8846338305007984e-07, + "logits/chosen": 146735552.0, + "logits/rejected": 201902016.0, + "logps/chosen": -325.06435139973956, + "logps/rejected": -482.7005859375, + "loss": 0.0582, + "rewards/chosen": 3.3490931193033853, + "rewards/margins": 11.643859354654948, + "rewards/rejected": -8.294766235351563, + "step": 2476 + }, + { + "epoch": 0.9144017350376079, + "grad_norm": 7.0625, + "kl": 0.0, + "learning_rate": 1.868661147670381e-07, + "logits/chosen": 206830430.31578946, + "logits/rejected": 180727768.6153846, + "logps/chosen": -321.9956311677632, + "logps/rejected": -283.56285682091345, + "loss": 0.1271, + "rewards/chosen": 2.191573092811986, + "rewards/margins": 6.97588805534579, + "rewards/rejected": -4.784314962533804, + "step": 2477 + }, + { + "epoch": 0.914770891975451, + "grad_norm": 3.5625, + "kl": 0.0, + "learning_rate": 1.852755150198443e-07, + "logits/chosen": 269436086.85714287, + "logits/rejected": 164489315.55555555, + "logps/chosen": -285.0468052455357, + "logps/rejected": -412.69463433159723, + "loss": 0.0635, + "rewards/chosen": 2.947312218802316, + "rewards/margins": 9.00269356984941, + "rewards/rejected": -6.055381351047092, + "step": 2478 + }, + { + "epoch": 0.9151400489132943, + "grad_norm": 6.84375, + "kl": 0.0, + "learning_rate": 1.83691586012254e-07, + "logits/chosen": 273270137.2631579, + "logits/rejected": 211054710.15384614, + "logps/chosen": -443.1322985197368, + "logps/rejected": -453.77047025240387, + "loss": 0.1024, + "rewards/chosen": 2.4624469154759456, + "rewards/margins": 8.106002189852447, + "rewards/rejected": -5.643555274376502, + "step": 2479 + }, + { + "epoch": 0.9155092058511375, + "grad_norm": 5.03125, + "kl": 0.0, + "learning_rate": 1.8211432993878063e-07, + "logits/chosen": 266748359.1111111, + "logits/rejected": 240331830.85714287, + "logps/chosen": -368.33208550347223, + "logps/rejected": -611.1443917410714, + "loss": 0.0833, + "rewards/chosen": 2.8354053497314453, + "rewards/margins": 11.677875791277204, + "rewards/rejected": -8.842470441545759, + "step": 2480 + }, + { + "epoch": 0.9158783627889807, + "grad_norm": 4.96875, + "kl": 1.3389720916748047, + "learning_rate": 1.8054374898469228e-07, + "logits/chosen": 231238083.7647059, + "logits/rejected": 258771524.26666668, + "logps/chosen": -375.3212316176471, + "logps/rejected": -540.7861328125, + "loss": 0.0737, + "rewards/chosen": 3.3010556838091683, + "rewards/margins": 9.771540099499273, + "rewards/rejected": -6.4704844156901045, + "step": 2481 + }, + { + "epoch": 0.9162475197268238, + "grad_norm": 6.5625, + "kl": 0.41602420806884766, + "learning_rate": 1.7897984532600943e-07, + "logits/chosen": 299188163.7647059, + "logits/rejected": 152948206.93333334, + "logps/chosen": -358.26134535845586, + "logps/rejected": -499.5102864583333, + "loss": 0.1203, + "rewards/chosen": 2.0952675763298485, + "rewards/margins": 9.838379758947035, + "rewards/rejected": -7.743112182617187, + "step": 2482 + }, + { + "epoch": 0.9166166766646671, + "grad_norm": 4.09375, + "kl": 1.3164629936218262, + "learning_rate": 1.7742262112950047e-07, + "logits/chosen": 219552984.6153846, + "logits/rejected": 190175447.57894737, + "logps/chosen": -307.33045372596155, + "logps/rejected": -562.8891344572369, + "loss": 0.0751, + "rewards/chosen": 2.5842467087965746, + "rewards/margins": 11.0790964659409, + "rewards/rejected": -8.494849757144326, + "step": 2483 + }, + { + "epoch": 0.9169858336025103, + "grad_norm": 5.15625, + "kl": 0.0, + "learning_rate": 1.7587207855267962e-07, + "logits/chosen": 307949677.71428573, + "logits/rejected": 201826716.44444445, + "logps/chosen": -305.54708426339283, + "logps/rejected": -513.6666124131945, + "loss": 0.0953, + "rewards/chosen": 3.042982646397182, + "rewards/margins": 10.242142056661939, + "rewards/rejected": -7.199159410264757, + "step": 2484 + }, + { + "epoch": 0.9173549905403535, + "grad_norm": 5.5, + "kl": 0.0, + "learning_rate": 1.7432821974380343e-07, + "logits/chosen": 339160764.6315789, + "logits/rejected": 320887729.2307692, + "logps/chosen": -406.35572574013156, + "logps/rejected": -473.59164663461536, + "loss": 0.0928, + "rewards/chosen": 2.308103461014597, + "rewards/margins": 9.737148918120967, + "rewards/rejected": -7.42904545710637, + "step": 2485 + }, + { + "epoch": 0.9177241474781966, + "grad_norm": 7.34375, + "kl": 0.2275218963623047, + "learning_rate": 1.7279104684187032e-07, + "logits/chosen": 214793947.42857143, + "logits/rejected": 249334499.55555555, + "logps/chosen": -441.31117466517856, + "logps/rejected": -378.92694769965277, + "loss": 0.0905, + "rewards/chosen": 2.1280652454921176, + "rewards/margins": 8.108627667502752, + "rewards/rejected": -5.9805624220106335, + "step": 2486 + }, + { + "epoch": 0.9180933044160399, + "grad_norm": 5.53125, + "kl": 0.8228769302368164, + "learning_rate": 1.7126056197661222e-07, + "logits/chosen": 219323270.7368421, + "logits/rejected": 197250185.84615386, + "logps/chosen": -364.93919613486844, + "logps/rejected": -480.10268930288464, + "loss": 0.0968, + "rewards/chosen": 2.6850463465640417, + "rewards/margins": 8.74023151783808, + "rewards/rejected": -6.055185171274038, + "step": 2487 + }, + { + "epoch": 0.9184624613538831, + "grad_norm": 5.125, + "kl": 0.0, + "learning_rate": 1.697367672684963e-07, + "logits/chosen": 210215680.0, + "logits/rejected": 295663904.0, + "logps/chosen": -308.1256408691406, + "logps/rejected": -461.84442138671875, + "loss": 0.1133, + "rewards/chosen": 1.8571511507034302, + "rewards/margins": 9.012763142585754, + "rewards/rejected": -7.155611991882324, + "step": 2488 + }, + { + "epoch": 0.9188316182917262, + "grad_norm": 6.3125, + "kl": 0.33257389068603516, + "learning_rate": 1.6821966482872264e-07, + "logits/chosen": 161329083.73333332, + "logits/rejected": 261522552.47058824, + "logps/chosen": -365.14186197916666, + "logps/rejected": -472.4569738051471, + "loss": 0.0858, + "rewards/chosen": 2.6882115681966146, + "rewards/margins": 10.341460074630438, + "rewards/rejected": -7.653248506433823, + "step": 2489 + }, + { + "epoch": 0.9192007752295694, + "grad_norm": 3.984375, + "kl": 0.0, + "learning_rate": 1.6670925675921545e-07, + "logits/chosen": 294867712.0, + "logits/rejected": 193862092.8, + "logps/chosen": -328.76025390625, + "logps/rejected": -521.62900390625, + "loss": 0.0491, + "rewards/chosen": 2.557485262552897, + "rewards/margins": 9.709625307718913, + "rewards/rejected": -7.152140045166016, + "step": 2490 + }, + { + "epoch": 0.9195699321674127, + "grad_norm": 4.125, + "kl": 0.0, + "learning_rate": 1.652055451526269e-07, + "logits/chosen": 203581170.52631578, + "logits/rejected": 143901371.07692307, + "logps/chosen": -268.4467516447368, + "logps/rejected": -378.1979417067308, + "loss": 0.0694, + "rewards/chosen": 2.9205145584909538, + "rewards/margins": 9.645594608445881, + "rewards/rejected": -6.7250800499549275, + "step": 2491 + }, + { + "epoch": 0.9199390891052559, + "grad_norm": 6.1875, + "kl": 0.7323856353759766, + "learning_rate": 1.637085320923304e-07, + "logits/chosen": 248771584.0, + "logits/rejected": 216967286.15384614, + "logps/chosen": -282.97525185032896, + "logps/rejected": -461.4001277043269, + "loss": 0.1323, + "rewards/chosen": 2.4242136101973686, + "rewards/margins": 8.40703866935452, + "rewards/rejected": -5.982825059157151, + "step": 2492 + }, + { + "epoch": 0.920308246043099, + "grad_norm": 5.53125, + "kl": 0.0, + "learning_rate": 1.6221821965241747e-07, + "logits/chosen": 209417781.89473686, + "logits/rejected": 179714697.84615386, + "logps/chosen": -376.0010536595395, + "logps/rejected": -512.0420673076923, + "loss": 0.0738, + "rewards/chosen": 3.0995894984195105, + "rewards/margins": 9.258066107869631, + "rewards/rejected": -6.15847660945012, + "step": 2493 + }, + { + "epoch": 0.9206774029809422, + "grad_norm": 4.3125, + "kl": 0.07094287872314453, + "learning_rate": 1.6073460989769806e-07, + "logits/chosen": 187224405.33333334, + "logits/rejected": 213513667.7647059, + "logps/chosen": -328.92981770833336, + "logps/rejected": -430.0054285386029, + "loss": 0.0522, + "rewards/chosen": 3.0028541564941404, + "rewards/margins": 10.507458720487707, + "rewards/rejected": -7.504604563993566, + "step": 2494 + }, + { + "epoch": 0.9210465599187855, + "grad_norm": 3.984375, + "kl": 1.1862983703613281, + "learning_rate": 1.5925770488369517e-07, + "logits/chosen": 114667806.11764705, + "logits/rejected": 186374314.66666666, + "logps/chosen": -277.67678653492646, + "logps/rejected": -533.1040364583333, + "loss": 0.0554, + "rewards/chosen": 3.4764139512005974, + "rewards/margins": 10.028305532418045, + "rewards/rejected": -6.551891581217448, + "step": 2495 + }, + { + "epoch": 0.9214157168566287, + "grad_norm": 6.375, + "kl": 0.0, + "learning_rate": 1.577875066566409e-07, + "logits/chosen": 178291013.8181818, + "logits/rejected": 133043328.0, + "logps/chosen": -399.4603160511364, + "logps/rejected": -496.90869140625, + "loss": 0.0848, + "rewards/chosen": 3.464291659268466, + "rewards/margins": 11.19831341830167, + "rewards/rejected": -7.734021759033203, + "step": 2496 + }, + { + "epoch": 0.9217848737944718, + "grad_norm": 5.03125, + "kl": 0.0, + "learning_rate": 1.563240172534758e-07, + "logits/chosen": 170019660.8, + "logits/rejected": 258171461.8181818, + "logps/chosen": -366.040673828125, + "logps/rejected": -404.47709517045456, + "loss": 0.0895, + "rewards/chosen": 2.2501953125, + "rewards/margins": 8.960449634898794, + "rewards/rejected": -6.710254322398793, + "step": 2497 + }, + { + "epoch": 0.922154030732315, + "grad_norm": 5.8125, + "kl": 0.0, + "learning_rate": 1.5486723870184684e-07, + "logits/chosen": 219568867.55555555, + "logits/rejected": 146375387.42857143, + "logps/chosen": -328.74720594618054, + "logps/rejected": -386.15087890625, + "loss": 0.1111, + "rewards/chosen": 2.1522242228190103, + "rewards/margins": 9.170057387579055, + "rewards/rejected": -7.017833164760044, + "step": 2498 + }, + { + "epoch": 0.9225231876701583, + "grad_norm": 5.125, + "kl": 0.14827251434326172, + "learning_rate": 1.5341717302010228e-07, + "logits/chosen": 278368921.6, + "logits/rejected": 171566197.33333334, + "logps/chosen": -343.304833984375, + "logps/rejected": -522.7777913411459, + "loss": 0.0998, + "rewards/chosen": 2.6138141632080076, + "rewards/margins": 9.886755752563477, + "rewards/rejected": -7.272941589355469, + "step": 2499 + }, + { + "epoch": 0.9228923446080015, + "grad_norm": 5.15625, + "kl": 0.0, + "learning_rate": 1.5197382221728896e-07, + "logits/chosen": 192206472.53333333, + "logits/rejected": 111245680.94117647, + "logps/chosen": -349.6763020833333, + "logps/rejected": -364.6771886488971, + "loss": 0.0665, + "rewards/chosen": 3.148207092285156, + "rewards/margins": 9.884545090619255, + "rewards/rejected": -6.7363379983341, + "step": 2500 + }, + { + "epoch": 0.9232615015458446, + "grad_norm": 6.78125, + "kl": 0.25454068183898926, + "learning_rate": 1.505371882931511e-07, + "logits/chosen": 229181354.66666666, + "logits/rejected": 167465618.2857143, + "logps/chosen": -410.17279730902777, + "logps/rejected": -312.86948939732144, + "loss": 0.1206, + "rewards/chosen": 2.139842775132921, + "rewards/margins": 7.589273513309539, + "rewards/rejected": -5.449430738176618, + "step": 2501 + }, + { + "epoch": 0.9236306584836879, + "grad_norm": 6.28125, + "kl": 0.0, + "learning_rate": 1.491072732381277e-07, + "logits/chosen": 263897488.0, + "logits/rejected": 229303776.0, + "logps/chosen": -333.6886291503906, + "logps/rejected": -580.4410400390625, + "loss": 0.0981, + "rewards/chosen": 2.33552885055542, + "rewards/margins": 10.521575450897217, + "rewards/rejected": -8.186046600341797, + "step": 2502 + }, + { + "epoch": 0.9239998154215311, + "grad_norm": 5.40625, + "kl": 1.4952874183654785, + "learning_rate": 1.476840790333467e-07, + "logits/chosen": 221347291.42857143, + "logits/rejected": 183303950.2222222, + "logps/chosen": -417.85634068080356, + "logps/rejected": -426.7963053385417, + "loss": 0.0548, + "rewards/chosen": 3.514153071812221, + "rewards/margins": 9.259254152812655, + "rewards/rejected": -5.745101081000434, + "step": 2503 + }, + { + "epoch": 0.9243689723593743, + "grad_norm": 4.375, + "kl": 0.0, + "learning_rate": 1.4626760765062586e-07, + "logits/chosen": 203039472.94117647, + "logits/rejected": 177233373.86666667, + "logps/chosen": -379.24661075367646, + "logps/rejected": -350.9828776041667, + "loss": 0.0652, + "rewards/chosen": 3.109836129581227, + "rewards/margins": 8.408496916527842, + "rewards/rejected": -5.2986607869466145, + "step": 2504 + }, + { + "epoch": 0.9247381292972174, + "grad_norm": 5.25, + "kl": 1.5598783493041992, + "learning_rate": 1.4485786105246923e-07, + "logits/chosen": 233412321.88235295, + "logits/rejected": 186184499.2, + "logps/chosen": -376.26246553308823, + "logps/rejected": -512.1577473958333, + "loss": 0.1035, + "rewards/chosen": 2.671212589039522, + "rewards/margins": 11.02101931104473, + "rewards/rejected": -8.349806722005209, + "step": 2505 + }, + { + "epoch": 0.9251072862350607, + "grad_norm": 5.03125, + "kl": 0.9501757621765137, + "learning_rate": 1.4345484119206222e-07, + "logits/chosen": 162910256.0, + "logits/rejected": 237954720.0, + "logps/chosen": -326.3510437011719, + "logps/rejected": -395.6703186035156, + "loss": 0.0893, + "rewards/chosen": 2.617800712585449, + "rewards/margins": 9.028350830078125, + "rewards/rejected": -6.410550117492676, + "step": 2506 + }, + { + "epoch": 0.9254764431729039, + "grad_norm": 5.0625, + "kl": 0.0, + "learning_rate": 1.420585500132704e-07, + "logits/chosen": 203411434.66666666, + "logits/rejected": 187268646.4, + "logps/chosen": -445.6465250651042, + "logps/rejected": -548.831591796875, + "loss": 0.0403, + "rewards/chosen": 3.6479015350341797, + "rewards/margins": 12.656631088256836, + "rewards/rejected": -9.008729553222656, + "step": 2507 + }, + { + "epoch": 0.9258456001107471, + "grad_norm": 6.625, + "kl": 0.0570683479309082, + "learning_rate": 1.4066898945063856e-07, + "logits/chosen": 189748152.8888889, + "logits/rejected": 173906944.0, + "logps/chosen": -496.35584852430554, + "logps/rejected": -340.44126674107144, + "loss": 0.0735, + "rewards/chosen": 2.860412173800998, + "rewards/margins": 9.866288442460318, + "rewards/rejected": -7.0058762686593195, + "step": 2508 + }, + { + "epoch": 0.9262147570485902, + "grad_norm": 6.46875, + "kl": 0.0, + "learning_rate": 1.3928616142938445e-07, + "logits/chosen": 172939036.44444445, + "logits/rejected": 183579465.14285713, + "logps/chosen": -347.0238986545139, + "logps/rejected": -444.426513671875, + "loss": 0.0949, + "rewards/chosen": 2.212230258517795, + "rewards/margins": 8.56734587654235, + "rewards/rejected": -6.355115618024554, + "step": 2509 + }, + { + "epoch": 0.9265839139864335, + "grad_norm": 7.5, + "kl": 2.289586067199707, + "learning_rate": 1.379100678653983e-07, + "logits/chosen": 306407228.95238096, + "logits/rejected": 174412357.8181818, + "logps/chosen": -390.3652808779762, + "logps/rejected": -413.3976384943182, + "loss": 0.1452, + "rewards/chosen": 2.2006519862583707, + "rewards/margins": 8.893695781757305, + "rewards/rejected": -6.693043795498935, + "step": 2510 + }, + { + "epoch": 0.9269530709242767, + "grad_norm": 4.5625, + "kl": 1.5611257553100586, + "learning_rate": 1.3654071066524222e-07, + "logits/chosen": 207135533.17647058, + "logits/rejected": 142554487.46666667, + "logps/chosen": -367.2377355238971, + "logps/rejected": -326.3274739583333, + "loss": 0.078, + "rewards/chosen": 3.7618435130399814, + "rewards/margins": 9.012090196796493, + "rewards/rejected": -5.250246683756511, + "step": 2511 + }, + { + "epoch": 0.9273222278621199, + "grad_norm": 6.34375, + "kl": 0.0, + "learning_rate": 1.3517809172614137e-07, + "logits/chosen": 162768263.52941176, + "logits/rejected": 167791138.13333333, + "logps/chosen": -264.63327205882354, + "logps/rejected": -506.78557942708335, + "loss": 0.1242, + "rewards/chosen": 2.125760695513557, + "rewards/margins": 11.13941120820887, + "rewards/rejected": -9.013650512695312, + "step": 2512 + }, + { + "epoch": 0.927691384799963, + "grad_norm": 5.09375, + "kl": 0.3061361312866211, + "learning_rate": 1.3382221293598728e-07, + "logits/chosen": 193368160.0, + "logits/rejected": 147769232.0, + "logps/chosen": -319.63751220703125, + "logps/rejected": -521.109375, + "loss": 0.0693, + "rewards/chosen": 2.673532247543335, + "rewards/margins": 9.645915746688843, + "rewards/rejected": -6.972383499145508, + "step": 2513 + }, + { + "epoch": 0.9280605417378063, + "grad_norm": 4.34375, + "kl": 0.8995282649993896, + "learning_rate": 1.3247307617333283e-07, + "logits/chosen": 173973248.0, + "logits/rejected": 207005824.0, + "logps/chosen": -387.84380425347223, + "logps/rejected": -471.1695033482143, + "loss": 0.0557, + "rewards/chosen": 3.1482457054985895, + "rewards/margins": 9.792141535925488, + "rewards/rejected": -6.643895830426898, + "step": 2514 + }, + { + "epoch": 0.9284296986756495, + "grad_norm": 6.4375, + "kl": 1.9178733825683594, + "learning_rate": 1.3113068330739053e-07, + "logits/chosen": 283002333.8666667, + "logits/rejected": 231071774.11764705, + "logps/chosen": -362.3183268229167, + "logps/rejected": -458.31468290441177, + "loss": 0.1149, + "rewards/chosen": 2.332776641845703, + "rewards/margins": 8.63903938742245, + "rewards/rejected": -6.306262745576746, + "step": 2515 + }, + { + "epoch": 0.9287988556134927, + "grad_norm": 4.84375, + "kl": 0.7875747680664062, + "learning_rate": 1.2979503619802715e-07, + "logits/chosen": 215740950.26086956, + "logits/rejected": 256127345.7777778, + "logps/chosen": -324.60841966711956, + "logps/rejected": -594.2124565972222, + "loss": 0.06, + "rewards/chosen": 3.334974537725034, + "rewards/margins": 10.019549088777552, + "rewards/rejected": -6.684574551052517, + "step": 2516 + }, + { + "epoch": 0.9291680125513359, + "grad_norm": 4.71875, + "kl": 0.0, + "learning_rate": 1.2846613669576678e-07, + "logits/chosen": 176143402.66666666, + "logits/rejected": 164908044.8, + "logps/chosen": -294.7503255208333, + "logps/rejected": -278.9049072265625, + "loss": 0.0702, + "rewards/chosen": 2.467782974243164, + "rewards/margins": 7.7859649658203125, + "rewards/rejected": -5.318181991577148, + "step": 2517 + }, + { + "epoch": 0.9295371694891791, + "grad_norm": 3.6875, + "kl": 1.1585760116577148, + "learning_rate": 1.2714398664178174e-07, + "logits/chosen": 211818360.47058824, + "logits/rejected": 255967556.26666668, + "logps/chosen": -403.8677332261029, + "logps/rejected": -499.62877604166664, + "loss": 0.044, + "rewards/chosen": 3.5969170963062957, + "rewards/margins": 11.030048504997701, + "rewards/rejected": -7.433131408691406, + "step": 2518 + }, + { + "epoch": 0.9299063264270223, + "grad_norm": 6.0, + "kl": 1.1021156311035156, + "learning_rate": 1.2582858786789388e-07, + "logits/chosen": 192227341.47368422, + "logits/rejected": 238446907.07692307, + "logps/chosen": -282.75943153782896, + "logps/rejected": -466.4091796875, + "loss": 0.1422, + "rewards/chosen": 2.3206606413188733, + "rewards/margins": 8.549692717640989, + "rewards/rejected": -6.229032076322115, + "step": 2519 + }, + { + "epoch": 0.9302754833648655, + "grad_norm": 5.5, + "kl": 0.9397463798522949, + "learning_rate": 1.2451994219657203e-07, + "logits/chosen": 194349161.4117647, + "logits/rejected": 162754781.86666667, + "logps/chosen": -353.08576516544116, + "logps/rejected": -414.19208984375, + "loss": 0.1086, + "rewards/chosen": 2.6543888765222885, + "rewards/margins": 7.593877096737132, + "rewards/rejected": -4.939488220214844, + "step": 2520 + }, + { + "epoch": 0.9306446403027087, + "grad_norm": 5.46875, + "kl": 0.05904388427734375, + "learning_rate": 1.2321805144092757e-07, + "logits/chosen": 288757486.93333334, + "logits/rejected": 170540272.94117647, + "logps/chosen": -338.6963216145833, + "logps/rejected": -388.4994255514706, + "loss": 0.1054, + "rewards/chosen": 2.0309768676757813, + "rewards/margins": 7.433984823787913, + "rewards/rejected": -5.403007956112132, + "step": 2521 + }, + { + "epoch": 0.9310137972405519, + "grad_norm": 2.796875, + "kl": 0.0, + "learning_rate": 1.2192291740471373e-07, + "logits/chosen": 254422297.6, + "logits/rejected": 223442005.33333334, + "logps/chosen": -331.3496826171875, + "logps/rejected": -441.2197265625, + "loss": 0.0425, + "rewards/chosen": 3.9395816802978514, + "rewards/margins": 10.54592653910319, + "rewards/rejected": -6.606344858805339, + "step": 2522 + }, + { + "epoch": 0.9313829541783951, + "grad_norm": 5.59375, + "kl": 0.8354625701904297, + "learning_rate": 1.206345418823235e-07, + "logits/chosen": 231373937.7777778, + "logits/rejected": 239811437.7142857, + "logps/chosen": -369.29454210069446, + "logps/rejected": -472.53512137276783, + "loss": 0.086, + "rewards/chosen": 2.8487557305230036, + "rewards/margins": 9.298942323714968, + "rewards/rejected": -6.450186593191964, + "step": 2523 + }, + { + "epoch": 0.9317521111162383, + "grad_norm": 3.765625, + "kl": 2.3602294921875, + "learning_rate": 1.1935292665878283e-07, + "logits/chosen": 225792014.2222222, + "logits/rejected": 210014939.42857143, + "logps/chosen": -383.01323784722223, + "logps/rejected": -433.55336216517856, + "loss": 0.0927, + "rewards/chosen": 3.1570248074001737, + "rewards/margins": 9.962465437631758, + "rewards/rejected": -6.805440630231585, + "step": 2524 + }, + { + "epoch": 0.9321212680540815, + "grad_norm": 2.96875, + "kl": 0.0, + "learning_rate": 1.1807807350975476e-07, + "logits/chosen": 223261659.42857143, + "logits/rejected": 222283861.33333334, + "logps/chosen": -382.40576171875, + "logps/rejected": -532.3900282118055, + "loss": 0.0331, + "rewards/chosen": 3.544316973005022, + "rewards/margins": 12.2103758312407, + "rewards/rejected": -8.666058858235678, + "step": 2525 + }, + { + "epoch": 0.9324904249919247, + "grad_norm": 6.03125, + "kl": 0.8048028945922852, + "learning_rate": 1.1680998420153134e-07, + "logits/chosen": 236914889.14285713, + "logits/rejected": 208551822.2222222, + "logps/chosen": -452.28480747767856, + "logps/rejected": -411.12928602430554, + "loss": 0.0723, + "rewards/chosen": 2.7137350354875838, + "rewards/margins": 8.815661627148826, + "rewards/rejected": -6.1019265916612415, + "step": 2526 + }, + { + "epoch": 0.9328595819297679, + "grad_norm": 4.65625, + "kl": 0.0, + "learning_rate": 1.1554866049103497e-07, + "logits/chosen": 200481600.0, + "logits/rejected": 180349456.0, + "logps/chosen": -348.032470703125, + "logps/rejected": -494.87347412109375, + "loss": 0.087, + "rewards/chosen": 2.9339048862457275, + "rewards/margins": 9.414212465286255, + "rewards/rejected": -6.480307579040527, + "step": 2527 + }, + { + "epoch": 0.933228738867611, + "grad_norm": 5.71875, + "kl": 0.11396121978759766, + "learning_rate": 1.1429410412581277e-07, + "logits/chosen": 264653101.17647058, + "logits/rejected": 189074670.93333334, + "logps/chosen": -418.9276769301471, + "logps/rejected": -352.04010416666665, + "loss": 0.0916, + "rewards/chosen": 2.7814043830422794, + "rewards/margins": 7.556503056544884, + "rewards/rejected": -4.775098673502604, + "step": 2528 + }, + { + "epoch": 0.9335978958054543, + "grad_norm": 4.625, + "kl": 0.0, + "learning_rate": 1.1304631684403711e-07, + "logits/chosen": 272948317.09090906, + "logits/rejected": 160996047.23809522, + "logps/chosen": -395.5738636363636, + "logps/rejected": -412.11504836309524, + "loss": 0.064, + "rewards/chosen": 3.3883167613636362, + "rewards/margins": 9.715744216720779, + "rewards/rejected": -6.327427455357143, + "step": 2529 + }, + { + "epoch": 0.9339670527432975, + "grad_norm": 4.375, + "kl": 0.0, + "learning_rate": 1.1180530037450176e-07, + "logits/chosen": 222622390.85714287, + "logits/rejected": 139409464.8888889, + "logps/chosen": -356.79813058035717, + "logps/rejected": -429.35088433159723, + "loss": 0.0608, + "rewards/chosen": 2.6777986798967635, + "rewards/margins": 8.992801545158265, + "rewards/rejected": -6.315002865261501, + "step": 2530 + }, + { + "epoch": 0.9343362096811407, + "grad_norm": 4.75, + "kl": 0.5928745269775391, + "learning_rate": 1.1057105643661803e-07, + "logits/chosen": 193986695.52941176, + "logits/rejected": 126495863.46666667, + "logps/chosen": -267.75080422794116, + "logps/rejected": -372.1562174479167, + "loss": 0.0803, + "rewards/chosen": 2.8153648376464844, + "rewards/margins": 9.791407521565755, + "rewards/rejected": -6.976042683919271, + "step": 2531 + }, + { + "epoch": 0.9347053666189838, + "grad_norm": 8.1875, + "kl": 0.0, + "learning_rate": 1.0934358674041634e-07, + "logits/chosen": 286462554.35294116, + "logits/rejected": 157198301.86666667, + "logps/chosen": -383.18859145220586, + "logps/rejected": -355.3451822916667, + "loss": 0.1337, + "rewards/chosen": 2.0544743257410385, + "rewards/margins": 7.757956381405101, + "rewards/rejected": -5.703482055664063, + "step": 2532 + }, + { + "epoch": 0.9350745235568271, + "grad_norm": 6.15625, + "kl": 2.8274450302124023, + "learning_rate": 1.0812289298654077e-07, + "logits/chosen": 321818444.8, + "logits/rejected": 204667882.66666666, + "logps/chosen": -393.8140869140625, + "logps/rejected": -345.0425618489583, + "loss": 0.0966, + "rewards/chosen": 3.149086761474609, + "rewards/margins": 8.947825876871745, + "rewards/rejected": -5.798739115397136, + "step": 2533 + }, + { + "epoch": 0.9354436804946703, + "grad_norm": 5.9375, + "kl": 1.0190143585205078, + "learning_rate": 1.0690897686624568e-07, + "logits/chosen": 216423594.66666666, + "logits/rejected": 132557486.54545455, + "logps/chosen": -388.9861653645833, + "logps/rejected": -629.9663529829545, + "loss": 0.083, + "rewards/chosen": 2.7652224586123513, + "rewards/margins": 11.642240780256527, + "rewards/rejected": -8.877018321644176, + "step": 2534 + }, + { + "epoch": 0.9358128374325135, + "grad_norm": 6.34375, + "kl": 0.0, + "learning_rate": 1.0570184006139683e-07, + "logits/chosen": 213358544.0, + "logits/rejected": 283035488.0, + "logps/chosen": -319.33099365234375, + "logps/rejected": -434.29107666015625, + "loss": 0.094, + "rewards/chosen": 2.2309048175811768, + "rewards/margins": 8.800601720809937, + "rewards/rejected": -6.56969690322876, + "step": 2535 + }, + { + "epoch": 0.9361819943703567, + "grad_norm": 4.125, + "kl": 0.0, + "learning_rate": 1.0450148424446749e-07, + "logits/chosen": 189526724.92307693, + "logits/rejected": 227740025.2631579, + "logps/chosen": -312.0839280348558, + "logps/rejected": -402.16372841282896, + "loss": 0.0739, + "rewards/chosen": 3.1903258103590746, + "rewards/margins": 9.244313413797602, + "rewards/rejected": -6.053987603438528, + "step": 2536 + }, + { + "epoch": 0.9365511513081999, + "grad_norm": 5.375, + "kl": 0.0, + "learning_rate": 1.033079110785351e-07, + "logits/chosen": 179429814.85714287, + "logits/rejected": 161470791.1111111, + "logps/chosen": -304.019775390625, + "logps/rejected": -440.8036295572917, + "loss": 0.0918, + "rewards/chosen": 2.6628758566720143, + "rewards/margins": 9.029234295799618, + "rewards/rejected": -6.3663584391276045, + "step": 2537 + }, + { + "epoch": 0.9369203082460431, + "grad_norm": 6.25, + "kl": 0.0, + "learning_rate": 1.0212112221727966e-07, + "logits/chosen": 218431819.29411766, + "logits/rejected": 157077640.53333333, + "logps/chosen": -284.97472426470586, + "logps/rejected": -483.15436197916665, + "loss": 0.1163, + "rewards/chosen": 2.1801616444307217, + "rewards/margins": 8.195087283265357, + "rewards/rejected": -6.014925638834636, + "step": 2538 + }, + { + "epoch": 0.9372894651838863, + "grad_norm": 6.96875, + "kl": 0.0, + "learning_rate": 1.0094111930498307e-07, + "logits/chosen": 239918727.52941176, + "logits/rejected": 157557265.06666666, + "logps/chosen": -373.6116153492647, + "logps/rejected": -561.9298177083333, + "loss": 0.1042, + "rewards/chosen": 2.3699812047621784, + "rewards/margins": 9.850738854501762, + "rewards/rejected": -7.480757649739584, + "step": 2539 + }, + { + "epoch": 0.9376586221217295, + "grad_norm": 5.53125, + "kl": 0.44503211975097656, + "learning_rate": 9.976790397652314e-08, + "logits/chosen": 227396366.2222222, + "logits/rejected": 162604416.0, + "logps/chosen": -366.97797309027777, + "logps/rejected": -438.7374790736607, + "loss": 0.0992, + "rewards/chosen": 2.34330325656467, + "rewards/margins": 8.797454410129124, + "rewards/rejected": -6.454151153564453, + "step": 2540 + }, + { + "epoch": 0.9380277790595727, + "grad_norm": 5.3125, + "kl": 0.0, + "learning_rate": 9.860147785737573e-08, + "logits/chosen": 263882771.69230768, + "logits/rejected": 240768889.2631579, + "logps/chosen": -438.21390474759613, + "logps/rejected": -411.6459189967105, + "loss": 0.0787, + "rewards/chosen": 2.1713509192833533, + "rewards/margins": 8.622666548138206, + "rewards/rejected": -6.451315628854852, + "step": 2541 + }, + { + "epoch": 0.9383969359974159, + "grad_norm": 6.84375, + "kl": 0.3057648539543152, + "learning_rate": 9.744184256360923e-08, + "logits/chosen": 224288646.0952381, + "logits/rejected": 259572782.54545453, + "logps/chosen": -361.7535807291667, + "logps/rejected": -338.3896484375, + "loss": 0.1178, + "rewards/chosen": 2.314620608375186, + "rewards/margins": 7.005497309036585, + "rewards/rejected": -4.690876700661399, + "step": 2542 + }, + { + "epoch": 0.9387660929352591, + "grad_norm": 5.4375, + "kl": 0.0, + "learning_rate": 9.628899970188343e-08, + "logits/chosen": 207270512.0, + "logits/rejected": 227022912.0, + "logps/chosen": -386.76513671875, + "logps/rejected": -419.7522277832031, + "loss": 0.0693, + "rewards/chosen": 2.6922452449798584, + "rewards/margins": 9.148276090621948, + "rewards/rejected": -6.45603084564209, + "step": 2543 + }, + { + "epoch": 0.9391352498731023, + "grad_norm": 5.40625, + "kl": 0.0, + "learning_rate": 9.514295086944736e-08, + "logits/chosen": 162917284.57142857, + "logits/rejected": 130368540.44444445, + "logps/chosen": -310.3811732700893, + "logps/rejected": -460.62245008680554, + "loss": 0.0661, + "rewards/chosen": 2.27313164302281, + "rewards/margins": 11.139862227061437, + "rewards/rejected": -8.866730584038628, + "step": 2544 + }, + { + "epoch": 0.9395044068109455, + "grad_norm": 7.84375, + "kl": 0.8602161407470703, + "learning_rate": 9.400369765413752e-08, + "logits/chosen": 184722517.33333334, + "logits/rejected": 148033152.0, + "logps/chosen": -378.3515857514881, + "logps/rejected": -391.27397017045456, + "loss": 0.1039, + "rewards/chosen": 2.417332422165644, + "rewards/margins": 9.12631402284036, + "rewards/rejected": -6.708981600674716, + "step": 2545 + }, + { + "epoch": 0.9398735637487887, + "grad_norm": 4.34375, + "kl": 0.0, + "learning_rate": 9.287124163437411e-08, + "logits/chosen": 282141824.0, + "logits/rejected": 195957262.2222222, + "logps/chosen": -368.4974888392857, + "logps/rejected": -396.7701822916667, + "loss": 0.0612, + "rewards/chosen": 2.964024407523019, + "rewards/margins": 9.605024095565554, + "rewards/rejected": -6.640999688042535, + "step": 2546 + }, + { + "epoch": 0.940242720686632, + "grad_norm": 4.1875, + "kl": 0.0, + "learning_rate": 9.174558437916148e-08, + "logits/chosen": 167154752.0, + "logits/rejected": 147263184.0, + "logps/chosen": -336.33319091796875, + "logps/rejected": -466.13873291015625, + "loss": 0.0522, + "rewards/chosen": 3.405656099319458, + "rewards/margins": 11.16504168510437, + "rewards/rejected": -7.759385585784912, + "step": 2547 + }, + { + "epoch": 0.9406118776244751, + "grad_norm": 6.4375, + "kl": 1.4682016372680664, + "learning_rate": 9.06267274480832e-08, + "logits/chosen": 257709696.0, + "logits/rejected": 148628416.0, + "logps/chosen": -394.018408203125, + "logps/rejected": -439.5391438802083, + "loss": 0.1018, + "rewards/chosen": 2.640557861328125, + "rewards/margins": 9.736898676554363, + "rewards/rejected": -7.096340815226237, + "step": 2548 + }, + { + "epoch": 0.9409810345623183, + "grad_norm": 3.609375, + "kl": 1.0171823501586914, + "learning_rate": 8.95146723913004e-08, + "logits/chosen": 190898364.63157895, + "logits/rejected": 238637056.0, + "logps/chosen": -374.36986019736844, + "logps/rejected": -622.3138521634615, + "loss": 0.0404, + "rewards/chosen": 3.9699554443359375, + "rewards/margins": 11.644557659442608, + "rewards/rejected": -7.67460221510667, + "step": 2549 + }, + { + "epoch": 0.9413501915001615, + "grad_norm": 4.6875, + "kl": 0.0, + "learning_rate": 8.840942074955172e-08, + "logits/chosen": 226133248.0, + "logits/rejected": 170640021.33333334, + "logps/chosen": -409.281201171875, + "logps/rejected": -396.5823567708333, + "loss": 0.0808, + "rewards/chosen": 3.0727968215942383, + "rewards/margins": 10.463100115458172, + "rewards/rejected": -7.390303293863933, + "step": 2550 + }, + { + "epoch": 0.9417193484380048, + "grad_norm": 6.65625, + "kl": 1.7623577117919922, + "learning_rate": 8.731097405415057e-08, + "logits/chosen": 205406599.52941176, + "logits/rejected": 219034572.8, + "logps/chosen": -401.0227481617647, + "logps/rejected": -601.5032552083334, + "loss": 0.1068, + "rewards/chosen": 2.8504800235523895, + "rewards/margins": 11.332842897901347, + "rewards/rejected": -8.482362874348958, + "step": 2551 + }, + { + "epoch": 0.9420885053758479, + "grad_norm": 7.5625, + "kl": 1.1726341247558594, + "learning_rate": 8.6219333826979e-08, + "logits/chosen": 158923856.0, + "logits/rejected": 192118960.0, + "logps/chosen": -327.73834228515625, + "logps/rejected": -482.0390625, + "loss": 0.1098, + "rewards/chosen": 2.417841911315918, + "rewards/margins": 8.395731925964355, + "rewards/rejected": -5.9778900146484375, + "step": 2552 + }, + { + "epoch": 0.9424576623136911, + "grad_norm": 5.0625, + "kl": 0.0, + "learning_rate": 8.513450158049109e-08, + "logits/chosen": 256437174.85714287, + "logits/rejected": 228065991.1111111, + "logps/chosen": -389.02235630580356, + "logps/rejected": -487.3430989583333, + "loss": 0.0722, + "rewards/chosen": 2.9205845424107144, + "rewards/margins": 9.699283781505766, + "rewards/rejected": -6.778699239095052, + "step": 2553 + }, + { + "epoch": 0.9428268192515343, + "grad_norm": 5.4375, + "kl": 0.0, + "learning_rate": 8.405647881770951e-08, + "logits/chosen": 205489134.93333334, + "logits/rejected": 221912907.29411766, + "logps/chosen": -405.8136393229167, + "logps/rejected": -389.1486385569853, + "loss": 0.0713, + "rewards/chosen": 3.332672882080078, + "rewards/margins": 9.218920404770795, + "rewards/rejected": -5.8862475226907165, + "step": 2554 + }, + { + "epoch": 0.9431959761893776, + "grad_norm": 3.375, + "kl": 0.0, + "learning_rate": 8.298526703221899e-08, + "logits/chosen": 212383857.7777778, + "logits/rejected": 243877064.3478261, + "logps/chosen": -482.59174262152777, + "logps/rejected": -465.8752547554348, + "loss": 0.0396, + "rewards/chosen": 4.066911909315321, + "rewards/margins": 10.546749741558866, + "rewards/rejected": -6.479837832243546, + "step": 2555 + }, + { + "epoch": 0.9435651331272207, + "grad_norm": 5.59375, + "kl": 0.0, + "learning_rate": 8.192086770817176e-08, + "logits/chosen": 231871761.06666666, + "logits/rejected": 255601272.47058824, + "logps/chosen": -397.69573567708335, + "logps/rejected": -430.3524528952206, + "loss": 0.1005, + "rewards/chosen": 1.9750344594319662, + "rewards/margins": 8.073138150981828, + "rewards/rejected": -6.098103691549862, + "step": 2556 + }, + { + "epoch": 0.9439342900650639, + "grad_norm": 5.8125, + "kl": 0.0, + "learning_rate": 8.086328232027874e-08, + "logits/chosen": 290821218.46153843, + "logits/rejected": 246067792.84210527, + "logps/chosen": -265.1450383112981, + "logps/rejected": -488.0546875, + "loss": 0.086, + "rewards/chosen": 2.1038604149451623, + "rewards/margins": 10.286227840161034, + "rewards/rejected": -8.182367425215872, + "step": 2557 + }, + { + "epoch": 0.9443034470029071, + "grad_norm": 6.71875, + "kl": 0.0, + "learning_rate": 7.98125123338106e-08, + "logits/chosen": 209599136.0, + "logits/rejected": 189997136.0, + "logps/chosen": -322.03875732421875, + "logps/rejected": -334.10943603515625, + "loss": 0.0931, + "rewards/chosen": 2.427640438079834, + "rewards/margins": 8.627834796905518, + "rewards/rejected": -6.200194358825684, + "step": 2558 + }, + { + "epoch": 0.9446726039407504, + "grad_norm": 6.53125, + "kl": 0.0, + "learning_rate": 7.876855920459613e-08, + "logits/chosen": 142513828.57142857, + "logits/rejected": 167883975.1111111, + "logps/chosen": -287.42928641183033, + "logps/rejected": -532.2218424479166, + "loss": 0.0822, + "rewards/chosen": 2.9320586068289622, + "rewards/margins": 9.95492432609437, + "rewards/rejected": -7.022865719265408, + "step": 2559 + }, + { + "epoch": 0.9450417608785935, + "grad_norm": 4.03125, + "kl": 0.0, + "learning_rate": 7.773142437902003e-08, + "logits/chosen": 199184718.76923078, + "logits/rejected": 199702339.36842105, + "logps/chosen": -338.2461688701923, + "logps/rejected": -453.17506167763156, + "loss": 0.0523, + "rewards/chosen": 3.617075700026292, + "rewards/margins": 9.699665100468316, + "rewards/rejected": -6.082589400442023, + "step": 2560 + }, + { + "epoch": 0.9454109178164367, + "grad_norm": 5.5, + "kl": 0.0, + "learning_rate": 7.670110929401786e-08, + "logits/chosen": 336353280.0, + "logits/rejected": 205308069.6470588, + "logps/chosen": -362.43720703125, + "logps/rejected": -429.7950654871324, + "loss": 0.0717, + "rewards/chosen": 2.6760627746582033, + "rewards/margins": 8.50232530481675, + "rewards/rejected": -5.826262530158548, + "step": 2561 + }, + { + "epoch": 0.9457800747542799, + "grad_norm": 6.78125, + "kl": 0.4433307647705078, + "learning_rate": 7.56776153770794e-08, + "logits/chosen": 206698989.7142857, + "logits/rejected": 248531456.0, + "logps/chosen": -331.9904087611607, + "logps/rejected": -424.07411024305554, + "loss": 0.1253, + "rewards/chosen": 1.7865419387817383, + "rewards/margins": 8.028216573927137, + "rewards/rejected": -6.241674635145399, + "step": 2562 + }, + { + "epoch": 0.9461492316921231, + "grad_norm": 7.78125, + "kl": 1.7031002044677734, + "learning_rate": 7.466094404624202e-08, + "logits/chosen": 188267170.9090909, + "logits/rejected": 195215539.2, + "logps/chosen": -347.17695756392044, + "logps/rejected": -503.69111328125, + "loss": 0.1343, + "rewards/chosen": 2.536775588989258, + "rewards/margins": 9.91074333190918, + "rewards/rejected": -7.3739677429199215, + "step": 2563 + }, + { + "epoch": 0.9465183886299663, + "grad_norm": 7.3125, + "kl": 0.7088499069213867, + "learning_rate": 7.365109671009119e-08, + "logits/chosen": 178816654.2222222, + "logits/rejected": 218003492.57142857, + "logps/chosen": -241.56976996527777, + "logps/rejected": -413.90931919642856, + "loss": 0.1527, + "rewards/chosen": 1.7104770872328017, + "rewards/margins": 7.536985200548929, + "rewards/rejected": -5.826508113316128, + "step": 2564 + }, + { + "epoch": 0.9468875455678095, + "grad_norm": 5.03125, + "kl": 0.0, + "learning_rate": 7.26480747677566e-08, + "logits/chosen": 240734651.73333332, + "logits/rejected": 183199382.5882353, + "logps/chosen": -340.89124348958336, + "logps/rejected": -391.1717313878676, + "loss": 0.0693, + "rewards/chosen": 2.338909657796224, + "rewards/margins": 9.171736159979128, + "rewards/rejected": -6.832826502182904, + "step": 2565 + }, + { + "epoch": 0.9472567025056527, + "grad_norm": 4.84375, + "kl": 0.0, + "learning_rate": 7.165187960891274e-08, + "logits/chosen": 202177479.1111111, + "logits/rejected": 178093641.14285713, + "logps/chosen": -363.96044921875, + "logps/rejected": -367.60794503348217, + "loss": 0.0766, + "rewards/chosen": 2.8286158243815103, + "rewards/margins": 9.494285946800595, + "rewards/rejected": -6.665670122419085, + "step": 2566 + }, + { + "epoch": 0.9476258594434959, + "grad_norm": 5.0625, + "kl": 0.0, + "learning_rate": 7.066251261377666e-08, + "logits/chosen": 288213760.0, + "logits/rejected": 188524624.0, + "logps/chosen": -331.92864990234375, + "logps/rejected": -481.78045654296875, + "loss": 0.0872, + "rewards/chosen": 2.1370341777801514, + "rewards/margins": 8.84536862373352, + "rewards/rejected": -6.708334445953369, + "step": 2567 + }, + { + "epoch": 0.9479950163813391, + "grad_norm": 4.625, + "kl": 0.0, + "learning_rate": 6.967997515310188e-08, + "logits/chosen": 206370638.76923078, + "logits/rejected": 210731277.47368422, + "logps/chosen": -386.5914963942308, + "logps/rejected": -446.1478721217105, + "loss": 0.0546, + "rewards/chosen": 2.6089049119215746, + "rewards/margins": 8.703053648172602, + "rewards/rejected": -6.094148736251028, + "step": 2568 + }, + { + "epoch": 0.9483641733191823, + "grad_norm": 6.34375, + "kl": 1.2042570114135742, + "learning_rate": 6.87042685881828e-08, + "logits/chosen": 222251212.8, + "logits/rejected": 173655253.33333334, + "logps/chosen": -339.3113525390625, + "logps/rejected": -501.3463541666667, + "loss": 0.0923, + "rewards/chosen": 2.7514217376708983, + "rewards/margins": 10.901518630981446, + "rewards/rejected": -8.150096893310547, + "step": 2569 + }, + { + "epoch": 0.9487333302570256, + "grad_norm": 4.125, + "kl": 0.5715465545654297, + "learning_rate": 6.773539427084808e-08, + "logits/chosen": 251680392.53333333, + "logits/rejected": 192123331.7647059, + "logps/chosen": -324.34479166666665, + "logps/rejected": -414.25603170955884, + "loss": 0.0549, + "rewards/chosen": 3.266532135009766, + "rewards/margins": 9.877102706011604, + "rewards/rejected": -6.610570571001838, + "step": 2570 + }, + { + "epoch": 0.9491024871948687, + "grad_norm": 5.375, + "kl": 0.32050132751464844, + "learning_rate": 6.67733535434606e-08, + "logits/chosen": 218027143.52941176, + "logits/rejected": 276071799.46666664, + "logps/chosen": -407.5953584558824, + "logps/rejected": -538.22431640625, + "loss": 0.0636, + "rewards/chosen": 3.3452799179974724, + "rewards/margins": 11.370722422880284, + "rewards/rejected": -8.025442504882813, + "step": 2571 + }, + { + "epoch": 0.9494716441327119, + "grad_norm": 5.0625, + "kl": 0.49539756774902344, + "learning_rate": 6.581814773891581e-08, + "logits/chosen": 221092539.73333332, + "logits/rejected": 204872643.7647059, + "logps/chosen": -336.72483723958334, + "logps/rejected": -437.5380859375, + "loss": 0.0556, + "rewards/chosen": 2.6086657206217447, + "rewards/margins": 10.469805354698032, + "rewards/rejected": -7.861139634076287, + "step": 2572 + }, + { + "epoch": 0.9498408010705551, + "grad_norm": 6.03125, + "kl": 0.27767717838287354, + "learning_rate": 6.486977818063956e-08, + "logits/chosen": 210301659.42857143, + "logits/rejected": 233717319.1111111, + "logps/chosen": -371.2566615513393, + "logps/rejected": -427.9137369791667, + "loss": 0.0968, + "rewards/chosen": 2.2114392689296176, + "rewards/margins": 8.025899599468897, + "rewards/rejected": -5.81446033053928, + "step": 2573 + }, + { + "epoch": 0.9502099580083984, + "grad_norm": 6.34375, + "kl": 0.0, + "learning_rate": 6.39282461825852e-08, + "logits/chosen": 180669296.0, + "logits/rejected": 126926864.0, + "logps/chosen": -361.2940979003906, + "logps/rejected": -271.91217041015625, + "loss": 0.0947, + "rewards/chosen": 2.9209940433502197, + "rewards/margins": 7.486278772354126, + "rewards/rejected": -4.565284729003906, + "step": 2574 + }, + { + "epoch": 0.9505791149462415, + "grad_norm": 4.6875, + "kl": 0.0, + "learning_rate": 6.299355304923372e-08, + "logits/chosen": 240650459.42857143, + "logits/rejected": 205597525.33333334, + "logps/chosen": -383.72499302455356, + "logps/rejected": -432.34233940972223, + "loss": 0.0783, + "rewards/chosen": 2.641348430088588, + "rewards/margins": 9.86064350794232, + "rewards/rejected": -7.219295077853733, + "step": 2575 + }, + { + "epoch": 0.9509482718840847, + "grad_norm": 3.59375, + "kl": 0.0, + "learning_rate": 6.206570007559032e-08, + "logits/chosen": 333327177.14285713, + "logits/rejected": 186586595.55555555, + "logps/chosen": -306.60452706473217, + "logps/rejected": -512.0807291666666, + "loss": 0.0599, + "rewards/chosen": 2.6728878021240234, + "rewards/margins": 9.361782285902235, + "rewards/rejected": -6.688894483778212, + "step": 2576 + }, + { + "epoch": 0.9513174288219279, + "grad_norm": 5.28125, + "kl": 0.0, + "learning_rate": 6.114468854718337e-08, + "logits/chosen": 142944896.0, + "logits/rejected": 301124850.5263158, + "logps/chosen": -311.6497145432692, + "logps/rejected": -593.6127672697369, + "loss": 0.0723, + "rewards/chosen": 2.3455561124361477, + "rewards/margins": 9.528436714821016, + "rewards/rejected": -7.182880602384868, + "step": 2577 + }, + { + "epoch": 0.9516865857597712, + "grad_norm": 4.21875, + "kl": 0.0, + "learning_rate": 6.023051974006322e-08, + "logits/chosen": 165842038.15384614, + "logits/rejected": 306235984.84210527, + "logps/chosen": -338.1459209735577, + "logps/rejected": -457.90409128289474, + "loss": 0.0962, + "rewards/chosen": 2.4797694866473856, + "rewards/margins": 8.492828600802401, + "rewards/rejected": -6.013059114155016, + "step": 2578 + }, + { + "epoch": 0.9520557426976143, + "grad_norm": 7.65625, + "kl": 1.5634613037109375, + "learning_rate": 5.9323194920798966e-08, + "logits/chosen": 189042281.4117647, + "logits/rejected": 284688520.53333336, + "logps/chosen": -326.6988740808824, + "logps/rejected": -500.1098307291667, + "loss": 0.14, + "rewards/chosen": 2.1137147791245403, + "rewards/margins": 9.376113562490426, + "rewards/rejected": -7.262398783365885, + "step": 2579 + }, + { + "epoch": 0.9524248996354575, + "grad_norm": 3.640625, + "kl": 0.0, + "learning_rate": 5.842271534647726e-08, + "logits/chosen": 181711796.70588234, + "logits/rejected": 177045077.33333334, + "logps/chosen": -296.37109375, + "logps/rejected": -454.341015625, + "loss": 0.0527, + "rewards/chosen": 3.1300688350901886, + "rewards/margins": 10.752499255012063, + "rewards/rejected": -7.622430419921875, + "step": 2580 + }, + { + "epoch": 0.9527940565733007, + "grad_norm": 3.96875, + "kl": 0.0, + "learning_rate": 5.752908226470177e-08, + "logits/chosen": 217942848.0, + "logits/rejected": 223509680.0, + "logps/chosen": -349.7561950683594, + "logps/rejected": -468.71429443359375, + "loss": 0.0887, + "rewards/chosen": 2.6758744716644287, + "rewards/margins": 9.67698884010315, + "rewards/rejected": -7.001114368438721, + "step": 2581 + }, + { + "epoch": 0.953163213511144, + "grad_norm": 5.75, + "kl": 0.42498111724853516, + "learning_rate": 5.6642296913589355e-08, + "logits/chosen": 194775773.86666667, + "logits/rejected": 219250913.88235295, + "logps/chosen": -308.57470703125, + "logps/rejected": -525.4693244485294, + "loss": 0.0848, + "rewards/chosen": 1.9906121571858724, + "rewards/margins": 9.323782879698511, + "rewards/rejected": -7.333170722512638, + "step": 2582 + }, + { + "epoch": 0.9535323704489871, + "grad_norm": 6.09375, + "kl": 2.926565170288086, + "learning_rate": 5.576236052176942e-08, + "logits/chosen": 182342720.0, + "logits/rejected": 183394901.33333334, + "logps/chosen": -388.3177001953125, + "logps/rejected": -358.2976481119792, + "loss": 0.0812, + "rewards/chosen": 3.106239128112793, + "rewards/margins": 8.963395881652833, + "rewards/rejected": -5.857156753540039, + "step": 2583 + }, + { + "epoch": 0.9539015273868303, + "grad_norm": 4.53125, + "kl": 0.0, + "learning_rate": 5.488927430838287e-08, + "logits/chosen": 249707669.33333334, + "logits/rejected": 163694950.4, + "logps/chosen": -367.9226481119792, + "logps/rejected": -404.5811279296875, + "loss": 0.0647, + "rewards/chosen": 2.4169610341389975, + "rewards/margins": 8.757156499226888, + "rewards/rejected": -6.3401954650878904, + "step": 2584 + }, + { + "epoch": 0.9542706843246735, + "grad_norm": 5.71875, + "kl": 0.0, + "learning_rate": 5.402303948308041e-08, + "logits/chosen": 184142471.52941176, + "logits/rejected": 186249864.53333333, + "logps/chosen": -349.1162683823529, + "logps/rejected": -514.44130859375, + "loss": 0.0836, + "rewards/chosen": 2.6030596564797794, + "rewards/margins": 9.94043046539905, + "rewards/rejected": -7.337370808919271, + "step": 2585 + }, + { + "epoch": 0.9546398412625168, + "grad_norm": 4.5, + "kl": 0.20738983154296875, + "learning_rate": 5.316365724601813e-08, + "logits/chosen": 198851031.57894737, + "logits/rejected": 190812612.92307693, + "logps/chosen": -335.8494294819079, + "logps/rejected": -413.3350360576923, + "loss": 0.0572, + "rewards/chosen": 3.290198275917455, + "rewards/margins": 10.686253644194197, + "rewards/rejected": -7.396055368276743, + "step": 2586 + }, + { + "epoch": 0.9550089982003599, + "grad_norm": 4.5625, + "kl": 0.0, + "learning_rate": 5.231112878785971e-08, + "logits/chosen": 149596731.07692307, + "logits/rejected": 204761923.36842105, + "logps/chosen": -331.193359375, + "logps/rejected": -417.58311060855266, + "loss": 0.082, + "rewards/chosen": 2.807288830096905, + "rewards/margins": 10.3019819838798, + "rewards/rejected": -7.494693153782895, + "step": 2587 + }, + { + "epoch": 0.9553781551382031, + "grad_norm": 5.34375, + "kl": 0.196075439453125, + "learning_rate": 5.146545528977309e-08, + "logits/chosen": 275360199.1111111, + "logits/rejected": 100805174.85714285, + "logps/chosen": -375.9438205295139, + "logps/rejected": -329.91357421875, + "loss": 0.0735, + "rewards/chosen": 2.778596454196506, + "rewards/margins": 9.944026250687857, + "rewards/rejected": -7.165429796491351, + "step": 2588 + }, + { + "epoch": 0.9557473120760464, + "grad_norm": 3.078125, + "kl": 0.0, + "learning_rate": 5.06266379234277e-08, + "logits/chosen": 192374385.7777778, + "logits/rejected": 162813974.26086956, + "logps/chosen": -488.04139539930554, + "logps/rejected": -449.9549082880435, + "loss": 0.0296, + "rewards/chosen": 3.391525056627062, + "rewards/margins": 11.149306375623325, + "rewards/rejected": -7.757781318996264, + "step": 2589 + }, + { + "epoch": 0.9561164690138896, + "grad_norm": 5.1875, + "kl": 0.0, + "learning_rate": 4.97946778509939e-08, + "logits/chosen": 167770232.47058824, + "logits/rejected": 217051494.4, + "logps/chosen": -305.85813993566177, + "logps/rejected": -451.09560546875, + "loss": 0.0945, + "rewards/chosen": 2.288930331959444, + "rewards/margins": 10.288823011809704, + "rewards/rejected": -7.9998926798502605, + "step": 2590 + }, + { + "epoch": 0.9564856259517327, + "grad_norm": 3.890625, + "kl": 0.06697273254394531, + "learning_rate": 4.896957622514298e-08, + "logits/chosen": 233994905.6, + "logits/rejected": 166383856.94117647, + "logps/chosen": -317.4534505208333, + "logps/rejected": -381.92259306066177, + "loss": 0.0711, + "rewards/chosen": 2.3292261759440103, + "rewards/margins": 8.539836509554993, + "rewards/rejected": -6.210610333610983, + "step": 2591 + }, + { + "epoch": 0.9568547828895759, + "grad_norm": 5.71875, + "kl": 1.3683958053588867, + "learning_rate": 4.815133418904106e-08, + "logits/chosen": 244577280.0, + "logits/rejected": 354677799.38461536, + "logps/chosen": -386.9598838404605, + "logps/rejected": -498.0181415264423, + "loss": 0.0935, + "rewards/chosen": 2.9945859407123767, + "rewards/margins": 9.317729532959973, + "rewards/rejected": -6.323143592247596, + "step": 2592 + }, + { + "epoch": 0.9572239398274192, + "grad_norm": 6.78125, + "kl": 2.7464704513549805, + "learning_rate": 4.733995287635351e-08, + "logits/chosen": 216710680.3809524, + "logits/rejected": 183308939.63636363, + "logps/chosen": -381.63357979910717, + "logps/rejected": -529.4858842329545, + "loss": 0.1129, + "rewards/chosen": 2.9413108825683594, + "rewards/margins": 11.388303236527877, + "rewards/rejected": -8.446992353959518, + "step": 2593 + }, + { + "epoch": 0.9575930967652624, + "grad_norm": 6.09375, + "kl": 0.0, + "learning_rate": 4.653543341123834e-08, + "logits/chosen": 181372859.73333332, + "logits/rejected": 176695175.52941176, + "logps/chosen": -297.0955078125, + "logps/rejected": -453.7361845128676, + "loss": 0.1183, + "rewards/chosen": 2.2253936767578124, + "rewards/margins": 9.410447333840763, + "rewards/rejected": -7.185053657082951, + "step": 2594 + }, + { + "epoch": 0.9579622537031055, + "grad_norm": 5.6875, + "kl": 1.0915298461914062, + "learning_rate": 4.573777690834669e-08, + "logits/chosen": 254231825.06666666, + "logits/rejected": 150784527.05882353, + "logps/chosen": -389.48916015625, + "logps/rejected": -378.19778262867646, + "loss": 0.1028, + "rewards/chosen": 3.055037180582682, + "rewards/margins": 8.755309684603821, + "rewards/rejected": -5.7002725040211395, + "step": 2595 + }, + { + "epoch": 0.9583314106409487, + "grad_norm": 5.4375, + "kl": 0.0, + "learning_rate": 4.494698447282231e-08, + "logits/chosen": 304432064.0, + "logits/rejected": 214581056.0, + "logps/chosen": -325.9053039550781, + "logps/rejected": -474.87835693359375, + "loss": 0.0675, + "rewards/chosen": 2.8214645385742188, + "rewards/margins": 9.795430183410645, + "rewards/rejected": -6.973965644836426, + "step": 2596 + }, + { + "epoch": 0.958700567578792, + "grad_norm": 6.0, + "kl": 1.2818870544433594, + "learning_rate": 4.4163057200297674e-08, + "logits/chosen": 282163146.1052632, + "logits/rejected": 156828425.84615386, + "logps/chosen": -355.2216539884868, + "logps/rejected": -382.11177884615387, + "loss": 0.1062, + "rewards/chosen": 2.2693969325015417, + "rewards/margins": 8.562554656735315, + "rewards/rejected": -6.293157724233774, + "step": 2597 + }, + { + "epoch": 0.9590697245166351, + "grad_norm": 3.125, + "kl": 0.0, + "learning_rate": 4.338599617689343e-08, + "logits/chosen": 215131296.0, + "logits/rejected": 234178538.66666666, + "logps/chosen": -293.72857666015625, + "logps/rejected": -520.2081705729166, + "loss": 0.0328, + "rewards/chosen": 2.7849183082580566, + "rewards/margins": 10.730755964914959, + "rewards/rejected": -7.945837656656901, + "step": 2598 + }, + { + "epoch": 0.9594388814544783, + "grad_norm": 4.96875, + "kl": 0.0, + "learning_rate": 4.261580247921893e-08, + "logits/chosen": 200928528.0, + "logits/rejected": 190304672.0, + "logps/chosen": -327.37933349609375, + "logps/rejected": -457.3289489746094, + "loss": 0.0587, + "rewards/chosen": 2.625349521636963, + "rewards/margins": 10.088026523590088, + "rewards/rejected": -7.462677001953125, + "step": 2599 + }, + { + "epoch": 0.9598080383923215, + "grad_norm": 5.9375, + "kl": 1.0215413570404053, + "learning_rate": 4.1852477174367244e-08, + "logits/chosen": 200094373.6470588, + "logits/rejected": 124699460.26666667, + "logps/chosen": -366.2913602941176, + "logps/rejected": -290.40419921875, + "loss": 0.1067, + "rewards/chosen": 2.5393490510828354, + "rewards/margins": 8.302327159806794, + "rewards/rejected": -5.762978108723958, + "step": 2600 + }, + { + "epoch": 0.9601771953301648, + "grad_norm": 5.84375, + "kl": 0.4742717742919922, + "learning_rate": 4.109602131991519e-08, + "logits/chosen": 283949624.8888889, + "logits/rejected": 162193188.57142857, + "logps/chosen": -458.68402777777777, + "logps/rejected": -440.8534458705357, + "loss": 0.0751, + "rewards/chosen": 3.058222452799479, + "rewards/margins": 10.556999024890718, + "rewards/rejected": -7.498776572091239, + "step": 2601 + }, + { + "epoch": 0.9605463522680079, + "grad_norm": 3.96875, + "kl": 0.0, + "learning_rate": 4.0346435963923844e-08, + "logits/chosen": 171743914.66666666, + "logits/rejected": 186568357.6470588, + "logps/chosen": -366.75768229166664, + "logps/rejected": -374.2864200367647, + "loss": 0.0484, + "rewards/chosen": 3.3741277058919272, + "rewards/margins": 9.824402902640548, + "rewards/rejected": -6.450275196748621, + "step": 2602 + }, + { + "epoch": 0.9609155092058511, + "grad_norm": 3.625, + "kl": 0.0, + "learning_rate": 3.960372214493358e-08, + "logits/chosen": 197416941.7142857, + "logits/rejected": 172924984.8888889, + "logps/chosen": -327.26485770089283, + "logps/rejected": -316.64032660590277, + "loss": 0.0703, + "rewards/chosen": 2.4352959224155972, + "rewards/margins": 8.073989837888687, + "rewards/rejected": -5.63869391547309, + "step": 2603 + }, + { + "epoch": 0.9612846661436943, + "grad_norm": 5.25, + "kl": 0.0, + "learning_rate": 3.8867880891965136e-08, + "logits/chosen": 240353587.2, + "logits/rejected": 160325496.47058824, + "logps/chosen": -362.4032877604167, + "logps/rejected": -425.6738855698529, + "loss": 0.0729, + "rewards/chosen": 2.4572799682617186, + "rewards/margins": 8.796874192181756, + "rewards/rejected": -6.339594223920037, + "step": 2604 + }, + { + "epoch": 0.9616538230815376, + "grad_norm": 4.0625, + "kl": 0.0, + "learning_rate": 3.8138913224516906e-08, + "logits/chosen": 158185523.2, + "logits/rejected": 176472124.2352941, + "logps/chosen": -356.72975260416666, + "logps/rejected": -461.7041590073529, + "loss": 0.055, + "rewards/chosen": 3.3360186258951825, + "rewards/margins": 10.157640240239163, + "rewards/rejected": -6.82162161434398, + "step": 2605 + }, + { + "epoch": 0.9620229800193807, + "grad_norm": 3.109375, + "kl": 0.0, + "learning_rate": 3.741682015256487e-08, + "logits/chosen": 140246321.23076922, + "logits/rejected": 100892389.05263157, + "logps/chosen": -271.7765925480769, + "logps/rejected": -340.25503700657896, + "loss": 0.0701, + "rewards/chosen": 3.071062234731821, + "rewards/margins": 8.866704485194404, + "rewards/rejected": -5.795642250462582, + "step": 2606 + }, + { + "epoch": 0.9623921369572239, + "grad_norm": 2.421875, + "kl": 0.0, + "learning_rate": 3.6701602676559314e-08, + "logits/chosen": 117566656.0, + "logits/rejected": 170241536.0, + "logps/chosen": -359.1323974609375, + "logps/rejected": -365.60000887784093, + "loss": 0.0211, + "rewards/chosen": 4.522306823730469, + "rewards/margins": 11.265646362304688, + "rewards/rejected": -6.743339538574219, + "step": 2607 + }, + { + "epoch": 0.9627612938950671, + "grad_norm": 6.125, + "kl": 0.0, + "learning_rate": 3.599326178742535e-08, + "logits/chosen": 263165531.42857143, + "logits/rejected": 172895288.8888889, + "logps/chosen": -255.26839773995536, + "logps/rejected": -414.32728407118054, + "loss": 0.1097, + "rewards/chosen": 1.9551258087158203, + "rewards/margins": 8.59212769402398, + "rewards/rejected": -6.63700188530816, + "step": 2608 + }, + { + "epoch": 0.9631304508329104, + "grad_norm": 4.96875, + "kl": 0.6610136032104492, + "learning_rate": 3.5291798466560165e-08, + "logits/chosen": 178272480.0, + "logits/rejected": 122650808.0, + "logps/chosen": -331.9181213378906, + "logps/rejected": -344.41485595703125, + "loss": 0.0827, + "rewards/chosen": 2.608306884765625, + "rewards/margins": 8.720744132995605, + "rewards/rejected": -6.1124372482299805, + "step": 2609 + }, + { + "epoch": 0.9634996077707535, + "grad_norm": 5.75, + "kl": 0.0, + "learning_rate": 3.459721368583191e-08, + "logits/chosen": 153029210.3529412, + "logits/rejected": 259187438.93333334, + "logps/chosen": -383.24767348345586, + "logps/rejected": -574.0638671875, + "loss": 0.0822, + "rewards/chosen": 2.5575148638556984, + "rewards/margins": 11.865307018803616, + "rewards/rejected": -9.307792154947917, + "step": 2610 + }, + { + "epoch": 0.9638687647085967, + "grad_norm": 6.03125, + "kl": 0.0, + "learning_rate": 3.3909508407579674e-08, + "logits/chosen": 245709356.52173913, + "logits/rejected": 185934478.2222222, + "logps/chosen": -332.4220448369565, + "logps/rejected": -319.20887586805554, + "loss": 0.098, + "rewards/chosen": 2.5133793043053667, + "rewards/margins": 9.801628002222033, + "rewards/rejected": -7.288248697916667, + "step": 2611 + }, + { + "epoch": 0.96423792164644, + "grad_norm": 5.21875, + "kl": 0.0, + "learning_rate": 3.322868358460962e-08, + "logits/chosen": 275665664.0, + "logits/rejected": 364823836.4444444, + "logps/chosen": -391.7727748325893, + "logps/rejected": -427.1092122395833, + "loss": 0.061, + "rewards/chosen": 3.25610842023577, + "rewards/margins": 9.823643820626394, + "rewards/rejected": -6.567535400390625, + "step": 2612 + }, + { + "epoch": 0.9646070785842832, + "grad_norm": 4.71875, + "kl": 0.0, + "learning_rate": 3.255474016019666e-08, + "logits/chosen": 255823220.36363637, + "logits/rejected": 247667200.0, + "logps/chosen": -343.85160688920456, + "logps/rejected": -484.783642578125, + "loss": 0.0814, + "rewards/chosen": 2.7666925950483843, + "rewards/margins": 7.978664640946821, + "rewards/rejected": -5.211972045898437, + "step": 2613 + }, + { + "epoch": 0.9649762355221263, + "grad_norm": 4.28125, + "kl": 0.0, + "learning_rate": 3.188767906807999e-08, + "logits/chosen": 300804551.1111111, + "logits/rejected": 231618541.7142857, + "logps/chosen": -311.9482150607639, + "logps/rejected": -428.128662109375, + "loss": 0.0738, + "rewards/chosen": 2.759518517388238, + "rewards/margins": 8.654774559868706, + "rewards/rejected": -5.895256042480469, + "step": 2614 + }, + { + "epoch": 0.9653453924599695, + "grad_norm": 5.34375, + "kl": 0.0, + "learning_rate": 3.122750123246532e-08, + "logits/chosen": 143529696.0, + "logits/rejected": 279520896.0, + "logps/chosen": -405.0261637369792, + "logps/rejected": -568.116064453125, + "loss": 0.0704, + "rewards/chosen": 1.8321544329325359, + "rewards/margins": 9.723033491770426, + "rewards/rejected": -7.890879058837891, + "step": 2615 + }, + { + "epoch": 0.9657145493978128, + "grad_norm": 5.75, + "kl": 0.0, + "learning_rate": 3.0574207568019874e-08, + "logits/chosen": 122324288.0, + "logits/rejected": 206949048.8888889, + "logps/chosen": -312.48960658482144, + "logps/rejected": -480.5216471354167, + "loss": 0.0697, + "rewards/chosen": 2.9914763314383372, + "rewards/margins": 10.569393642364986, + "rewards/rejected": -7.577917310926649, + "step": 2616 + }, + { + "epoch": 0.966083706335656, + "grad_norm": 4.4375, + "kl": 0.0, + "learning_rate": 2.992779897987408e-08, + "logits/chosen": 162659184.0, + "logits/rejected": 240582368.0, + "logps/chosen": -328.0264587402344, + "logps/rejected": -507.3796081542969, + "loss": 0.0566, + "rewards/chosen": 3.387526750564575, + "rewards/margins": 11.653708696365356, + "rewards/rejected": -8.266181945800781, + "step": 2617 + }, + { + "epoch": 0.9664528632734991, + "grad_norm": 5.75, + "kl": 0.0, + "learning_rate": 2.9288276363618194e-08, + "logits/chosen": 166019347.69230768, + "logits/rejected": 180607029.89473686, + "logps/chosen": -387.3922776442308, + "logps/rejected": -424.9291735197368, + "loss": 0.0821, + "rewards/chosen": 2.4837130033052883, + "rewards/margins": 9.582665447281439, + "rewards/rejected": -7.0989524439761515, + "step": 2618 + }, + { + "epoch": 0.9668220202113423, + "grad_norm": 3.453125, + "kl": 0.0, + "learning_rate": 2.865564060530346e-08, + "logits/chosen": 256259723.63636363, + "logits/rejected": 252023442.2857143, + "logps/chosen": -410.80224609375, + "logps/rejected": -538.5095796130952, + "loss": 0.0528, + "rewards/chosen": 2.799414201216264, + "rewards/margins": 10.110466069473333, + "rewards/rejected": -7.311051868257069, + "step": 2619 + }, + { + "epoch": 0.9671911771491856, + "grad_norm": 10.125, + "kl": 0.8172235488891602, + "learning_rate": 2.802989258143818e-08, + "logits/chosen": 194790087.1111111, + "logits/rejected": 239386624.0, + "logps/chosen": -412.6460232204861, + "logps/rejected": -329.2505580357143, + "loss": 0.0885, + "rewards/chosen": 2.409758037990994, + "rewards/margins": 7.126205050756061, + "rewards/rejected": -4.716447012765067, + "step": 2620 + }, + { + "epoch": 0.9675603340870288, + "grad_norm": 4.375, + "kl": 0.7651834487915039, + "learning_rate": 2.74110331589883e-08, + "logits/chosen": 262612167.1111111, + "logits/rejected": 366375497.14285713, + "logps/chosen": -317.4099934895833, + "logps/rejected": -328.610107421875, + "loss": 0.1077, + "rewards/chosen": 2.5823175642225475, + "rewards/margins": 8.261775637429857, + "rewards/rejected": -5.67945807320731, + "step": 2621 + }, + { + "epoch": 0.9679294910248719, + "grad_norm": 5.03125, + "kl": 0.2758359909057617, + "learning_rate": 2.6799063195376286e-08, + "logits/chosen": 222355507.2, + "logits/rejected": 142577504.0, + "logps/chosen": -365.0115234375, + "logps/rejected": -413.0805257161458, + "loss": 0.068, + "rewards/chosen": 3.0413143157958986, + "rewards/margins": 9.621803792317708, + "rewards/rejected": -6.58048947652181, + "step": 2622 + }, + { + "epoch": 0.9682986479627151, + "grad_norm": 5.625, + "kl": 0.13407135009765625, + "learning_rate": 2.619398353847835e-08, + "logits/chosen": 163883541.33333334, + "logits/rejected": 165287744.0, + "logps/chosen": -305.0618489583333, + "logps/rejected": -347.373046875, + "loss": 0.089, + "rewards/chosen": 2.794193903605143, + "rewards/margins": 8.212764231363932, + "rewards/rejected": -5.418570327758789, + "step": 2623 + }, + { + "epoch": 0.9686678049005584, + "grad_norm": 6.625, + "kl": 0.0, + "learning_rate": 2.559579502662446e-08, + "logits/chosen": 142033906.52631578, + "logits/rejected": 173328423.3846154, + "logps/chosen": -301.28292043585526, + "logps/rejected": -321.8585862379808, + "loss": 0.0946, + "rewards/chosen": 2.9937210083007812, + "rewards/margins": 8.669723510742188, + "rewards/rejected": -5.676002502441406, + "step": 2624 + }, + { + "epoch": 0.9690369618384016, + "grad_norm": 6.90625, + "kl": 0.0, + "learning_rate": 2.500449848859776e-08, + "logits/chosen": 407483093.3333333, + "logits/rejected": 167982899.2, + "logps/chosen": -400.0951741536458, + "logps/rejected": -343.703369140625, + "loss": 0.0826, + "rewards/chosen": 1.9224216143290203, + "rewards/margins": 8.00393788019816, + "rewards/rejected": -6.081516265869141, + "step": 2625 + }, + { + "epoch": 0.9694061187762447, + "grad_norm": 5.375, + "kl": 0.0, + "learning_rate": 2.4420094743631274e-08, + "logits/chosen": 240539243.78947368, + "logits/rejected": 176897811.69230768, + "logps/chosen": -336.2643400493421, + "logps/rejected": -440.9201096754808, + "loss": 0.0976, + "rewards/chosen": 2.4729644373843542, + "rewards/margins": 9.365301943018368, + "rewards/rejected": -6.892337505634014, + "step": 2626 + }, + { + "epoch": 0.969775275714088, + "grad_norm": 6.25, + "kl": 0.20464277267456055, + "learning_rate": 2.3842584601409536e-08, + "logits/chosen": 304048453.8181818, + "logits/rejected": 167729280.0, + "logps/chosen": -346.9391424005682, + "logps/rejected": -409.1262939453125, + "loss": 0.1388, + "rewards/chosen": 2.131671905517578, + "rewards/margins": 9.567708587646484, + "rewards/rejected": -7.436036682128906, + "step": 2627 + }, + { + "epoch": 0.9701444326519312, + "grad_norm": 5.15625, + "kl": 0.0, + "learning_rate": 2.3271968862065285e-08, + "logits/chosen": 238950256.0, + "logits/rejected": 239525792.0, + "logps/chosen": -398.45526123046875, + "logps/rejected": -530.006103515625, + "loss": 0.0626, + "rewards/chosen": 3.291792392730713, + "rewards/margins": 10.035017490386963, + "rewards/rejected": -6.74322509765625, + "step": 2628 + }, + { + "epoch": 0.9705135895897744, + "grad_norm": 3.96875, + "kl": 0.5936741828918457, + "learning_rate": 2.270824831617946e-08, + "logits/chosen": 188332074.66666666, + "logits/rejected": 172546322.2857143, + "logps/chosen": -330.1969401041667, + "logps/rejected": -552.4368373325893, + "loss": 0.061, + "rewards/chosen": 2.8999002244737415, + "rewards/margins": 11.640207078721788, + "rewards/rejected": -8.740306854248047, + "step": 2629 + }, + { + "epoch": 0.9708827465276175, + "grad_norm": 3.34375, + "kl": 0.0, + "learning_rate": 2.2151423744780076e-08, + "logits/chosen": 213030549.33333334, + "logits/rejected": 218547318.15384614, + "logps/chosen": -338.1377766927083, + "logps/rejected": -421.71908804086536, + "loss": 0.0419, + "rewards/chosen": 4.1063283284505205, + "rewards/margins": 11.02227763640575, + "rewards/rejected": -6.915949307955229, + "step": 2630 + }, + { + "epoch": 0.9712519034654608, + "grad_norm": 5.9375, + "kl": 2.1153993606567383, + "learning_rate": 2.1601495919340022e-08, + "logits/chosen": 161191664.0, + "logits/rejected": 146213424.0, + "logps/chosen": -283.2368469238281, + "logps/rejected": -525.1863403320312, + "loss": 0.0926, + "rewards/chosen": 2.953106164932251, + "rewards/margins": 9.814185380935669, + "rewards/rejected": -6.861079216003418, + "step": 2631 + }, + { + "epoch": 0.971621060403304, + "grad_norm": 4.59375, + "kl": 0.0, + "learning_rate": 2.10584656017776e-08, + "logits/chosen": 219226598.4, + "logits/rejected": 326662464.0, + "logps/chosen": -298.0913330078125, + "logps/rejected": -512.5728352864584, + "loss": 0.0693, + "rewards/chosen": 3.0341468811035157, + "rewards/margins": 11.292985534667968, + "rewards/rejected": -8.258838653564453, + "step": 2632 + }, + { + "epoch": 0.9719902173411471, + "grad_norm": 5.125, + "kl": 0.0, + "learning_rate": 2.0522333544453764e-08, + "logits/chosen": 141717897.14285713, + "logits/rejected": 202538311.1111111, + "logps/chosen": -273.6224888392857, + "logps/rejected": -411.2685818142361, + "loss": 0.1009, + "rewards/chosen": 2.8708302634102956, + "rewards/margins": 8.00318245660691, + "rewards/rejected": -5.132352193196614, + "step": 2633 + }, + { + "epoch": 0.9723593742789903, + "grad_norm": 4.21875, + "kl": 0.0, + "learning_rate": 1.999310049017378e-08, + "logits/chosen": 219702044.44444445, + "logits/rejected": 177298066.2857143, + "logps/chosen": -378.4152560763889, + "logps/rejected": -375.18722098214283, + "loss": 0.0678, + "rewards/chosen": 2.5914783477783203, + "rewards/margins": 7.993823187691825, + "rewards/rejected": -5.402344839913504, + "step": 2634 + }, + { + "epoch": 0.9727285312168336, + "grad_norm": 5.15625, + "kl": 0.0, + "learning_rate": 1.9470767172182215e-08, + "logits/chosen": 200661383.52941176, + "logits/rejected": 107466393.6, + "logps/chosen": -265.5151941636029, + "logps/rejected": -327.39397786458335, + "loss": 0.0802, + "rewards/chosen": 2.6110301298253678, + "rewards/margins": 8.259497130150889, + "rewards/rejected": -5.648467000325521, + "step": 2635 + }, + { + "epoch": 0.9730976881546768, + "grad_norm": 5.28125, + "kl": 0.0, + "learning_rate": 1.8955334314166298e-08, + "logits/chosen": 263559680.0, + "logits/rejected": 209112736.0, + "logps/chosen": -420.3177795410156, + "logps/rejected": -414.60675048828125, + "loss": 0.0633, + "rewards/chosen": 2.6474671363830566, + "rewards/margins": 9.676474571228027, + "rewards/rejected": -7.029007434844971, + "step": 2636 + }, + { + "epoch": 0.9734668450925199, + "grad_norm": 5.78125, + "kl": 0.0, + "learning_rate": 1.844680263025089e-08, + "logits/chosen": 201229264.0, + "logits/rejected": 172678672.0, + "logps/chosen": -361.4006042480469, + "logps/rejected": -446.73046875, + "loss": 0.0873, + "rewards/chosen": 2.411545753479004, + "rewards/margins": 10.244723320007324, + "rewards/rejected": -7.83317756652832, + "step": 2637 + }, + { + "epoch": 0.9738360020303631, + "grad_norm": 8.4375, + "kl": 0.0, + "learning_rate": 1.794517282500019e-08, + "logits/chosen": 185576345.6, + "logits/rejected": 211588592.94117647, + "logps/chosen": -352.67278645833335, + "logps/rejected": -407.7769990808824, + "loss": 0.0874, + "rewards/chosen": 2.921522013346354, + "rewards/margins": 8.51659055223652, + "rewards/rejected": -5.595068538890166, + "step": 2638 + }, + { + "epoch": 0.9742051589682064, + "grad_norm": 6.5, + "kl": 1.5269889831542969, + "learning_rate": 1.7450445593416576e-08, + "logits/chosen": 200868677.8181818, + "logits/rejected": 189002163.2, + "logps/chosen": -337.91825727982956, + "logps/rejected": -431.5546875, + "loss": 0.1108, + "rewards/chosen": 3.4878051064231177, + "rewards/margins": 9.034653785011985, + "rewards/rejected": -5.5468486785888675, + "step": 2639 + }, + { + "epoch": 0.9745743159060496, + "grad_norm": 5.15625, + "kl": 0.0, + "learning_rate": 1.6962621620937314e-08, + "logits/chosen": 128129596.23529412, + "logits/rejected": 232850158.93333334, + "logps/chosen": -274.9004480698529, + "logps/rejected": -449.2115234375, + "loss": 0.0864, + "rewards/chosen": 3.231280382941751, + "rewards/margins": 10.132462430467793, + "rewards/rejected": -6.901182047526041, + "step": 2640 + }, + { + "epoch": 0.9749434728438927, + "grad_norm": 4.84375, + "kl": 1.846639633178711, + "learning_rate": 1.648170158343787e-08, + "logits/chosen": 316051171.5555556, + "logits/rejected": 143180160.0, + "logps/chosen": -412.8122287326389, + "logps/rejected": -408.52022879464283, + "loss": 0.0598, + "rewards/chosen": 3.045538584391276, + "rewards/margins": 11.246866861979166, + "rewards/rejected": -8.20132827758789, + "step": 2641 + }, + { + "epoch": 0.9753126297817359, + "grad_norm": 8.25, + "kl": 0.0, + "learning_rate": 1.6007686147225254e-08, + "logits/chosen": 267072000.0, + "logits/rejected": 130685326.22222222, + "logps/chosen": -384.4335512907609, + "logps/rejected": -341.6930338541667, + "loss": 0.159, + "rewards/chosen": 1.9208707394807234, + "rewards/margins": 7.64325996067213, + "rewards/rejected": -5.722389221191406, + "step": 2642 + }, + { + "epoch": 0.9756817867195792, + "grad_norm": 5.5625, + "kl": 0.0, + "learning_rate": 1.554057596904246e-08, + "logits/chosen": 188221482.66666666, + "logits/rejected": 189479972.57142857, + "logps/chosen": -395.5631510416667, + "logps/rejected": -320.7162388392857, + "loss": 0.0902, + "rewards/chosen": 2.589420954386393, + "rewards/margins": 9.20777684166318, + "rewards/rejected": -6.618355887276786, + "step": 2643 + }, + { + "epoch": 0.9760509436574224, + "grad_norm": 5.125, + "kl": 0.8676185607910156, + "learning_rate": 1.5080371696065133e-08, + "logits/chosen": 190918855.1111111, + "logits/rejected": 186329965.7142857, + "logps/chosen": -373.30384657118054, + "logps/rejected": -380.05482700892856, + "loss": 0.0568, + "rewards/chosen": 2.976137161254883, + "rewards/margins": 9.305841990879603, + "rewards/rejected": -6.329704829624721, + "step": 2644 + }, + { + "epoch": 0.9764201005952655, + "grad_norm": 4.15625, + "kl": 0.07310056686401367, + "learning_rate": 1.4627073965899907e-08, + "logits/chosen": 199275008.0, + "logits/rejected": 158379414.5882353, + "logps/chosen": -290.82408854166664, + "logps/rejected": -457.04934512867646, + "loss": 0.0766, + "rewards/chosen": 2.7176335652669272, + "rewards/margins": 10.795036016726026, + "rewards/rejected": -8.077402451459099, + "step": 2645 + }, + { + "epoch": 0.9767892575331087, + "grad_norm": 4.75, + "kl": 0.0, + "learning_rate": 1.4180683406584961e-08, + "logits/chosen": 288861468.4444444, + "logits/rejected": 230615369.14285713, + "logps/chosen": -361.348388671875, + "logps/rejected": -439.0869140625, + "loss": 0.0702, + "rewards/chosen": 2.6861646440294056, + "rewards/margins": 9.750556733873156, + "rewards/rejected": -7.06439208984375, + "step": 2646 + }, + { + "epoch": 0.977158414470952, + "grad_norm": 5.0625, + "kl": 0.28074169158935547, + "learning_rate": 1.3741200636589457e-08, + "logits/chosen": 191252451.55555555, + "logits/rejected": 192479542.85714287, + "logps/chosen": -283.07329644097223, + "logps/rejected": -535.0629185267857, + "loss": 0.0977, + "rewards/chosen": 2.363864050971137, + "rewards/margins": 9.988733806307353, + "rewards/rejected": -7.624869755336216, + "step": 2647 + }, + { + "epoch": 0.9775275714087952, + "grad_norm": 5.84375, + "kl": 0.0, + "learning_rate": 1.330862626481022e-08, + "logits/chosen": 138020352.0, + "logits/rejected": 120441368.0, + "logps/chosen": -380.95623779296875, + "logps/rejected": -355.24053955078125, + "loss": 0.0862, + "rewards/chosen": 2.9916343688964844, + "rewards/margins": 8.966382026672363, + "rewards/rejected": -5.974747657775879, + "step": 2648 + }, + { + "epoch": 0.9778967283466383, + "grad_norm": 6.75, + "kl": 0.0, + "learning_rate": 1.2882960890573947e-08, + "logits/chosen": 300118893.71428573, + "logits/rejected": 220335290.1818182, + "logps/chosen": -386.9515671502976, + "logps/rejected": -629.0106977982955, + "loss": 0.0884, + "rewards/chosen": 2.7065847487676713, + "rewards/margins": 9.602670314508083, + "rewards/rejected": -6.896085565740412, + "step": 2649 + }, + { + "epoch": 0.9782658852844816, + "grad_norm": 4.46875, + "kl": 0.586578369140625, + "learning_rate": 1.2464205103634996e-08, + "logits/chosen": 295634135.57894737, + "logits/rejected": 237422966.15384614, + "logps/chosen": -410.0180921052632, + "logps/rejected": -482.36827674278845, + "loss": 0.0528, + "rewards/chosen": 3.134156478078742, + "rewards/margins": 12.02923411010248, + "rewards/rejected": -8.895077632023739, + "step": 2650 + }, + { + "epoch": 0.9786350422223248, + "grad_norm": 4.15625, + "kl": 0.0, + "learning_rate": 1.2052359484173715e-08, + "logits/chosen": 180454604.8, + "logits/rejected": 244948373.33333334, + "logps/chosen": -326.699267578125, + "logps/rejected": -604.3748779296875, + "loss": 0.0575, + "rewards/chosen": 3.0525791168212892, + "rewards/margins": 10.562446467081706, + "rewards/rejected": -7.509867350260417, + "step": 2651 + }, + { + "epoch": 0.979004199160168, + "grad_norm": 4.9375, + "kl": 0.597651481628418, + "learning_rate": 1.1647424602797553e-08, + "logits/chosen": 343560854.5882353, + "logits/rejected": 266725478.4, + "logps/chosen": -385.48974609375, + "logps/rejected": -426.03411458333335, + "loss": 0.0744, + "rewards/chosen": 3.042316436767578, + "rewards/margins": 9.002386728922527, + "rewards/rejected": -5.960070292154948, + "step": 2652 + }, + { + "epoch": 0.9793733560980111, + "grad_norm": 4.5, + "kl": 0.21689605712890625, + "learning_rate": 1.124940102053773e-08, + "logits/chosen": 170361224.53333333, + "logits/rejected": 134338484.70588234, + "logps/chosen": -359.5333984375, + "logps/rejected": -307.4916130514706, + "loss": 0.0802, + "rewards/chosen": 2.9095128377278647, + "rewards/margins": 8.662456437653187, + "rewards/rejected": -5.752943599925322, + "step": 2653 + }, + { + "epoch": 0.9797425130358544, + "grad_norm": 4.09375, + "kl": 0.0, + "learning_rate": 1.0858289288851465e-08, + "logits/chosen": 217891413.33333334, + "logits/rejected": 287492644.5714286, + "logps/chosen": -338.40570746527777, + "logps/rejected": -488.2689732142857, + "loss": 0.059, + "rewards/chosen": 3.0155421362982855, + "rewards/margins": 9.544179068671333, + "rewards/rejected": -6.528636932373047, + "step": 2654 + }, + { + "epoch": 0.9801116699736976, + "grad_norm": 5.1875, + "kl": 0.0, + "learning_rate": 1.0474089949619182e-08, + "logits/chosen": 270042899.6923077, + "logits/rejected": 274176862.31578946, + "logps/chosen": -493.48617788461536, + "logps/rejected": -476.91097861842104, + "loss": 0.066, + "rewards/chosen": 2.5404264009915867, + "rewards/margins": 8.718500083274687, + "rewards/rejected": -6.178073682283101, + "step": 2655 + }, + { + "epoch": 0.9804808269115408, + "grad_norm": 3.71875, + "kl": 0.0, + "learning_rate": 1.0096803535143972e-08, + "logits/chosen": 193737056.0, + "logits/rejected": 205455776.0, + "logps/chosen": -383.5829772949219, + "logps/rejected": -418.6579895019531, + "loss": 0.0458, + "rewards/chosen": 3.553262948989868, + "rewards/margins": 9.701695203781128, + "rewards/rejected": -6.14843225479126, + "step": 2656 + }, + { + "epoch": 0.9808499838493839, + "grad_norm": 4.5625, + "kl": 0.0, + "learning_rate": 9.726430568151036e-09, + "logits/chosen": 126539086.76923077, + "logits/rejected": 140169957.0526316, + "logps/chosen": -266.19482421875, + "logps/rejected": -413.74928042763156, + "loss": 0.0749, + "rewards/chosen": 3.150011796217698, + "rewards/margins": 9.40743158317288, + "rewards/rejected": -6.257419786955181, + "step": 2657 + }, + { + "epoch": 0.9812191407872272, + "grad_norm": 3.609375, + "kl": 0.0, + "learning_rate": 9.36297156178767e-09, + "logits/chosen": 261424790.5882353, + "logits/rejected": 165363302.4, + "logps/chosen": -323.4218175551471, + "logps/rejected": -308.835546875, + "loss": 0.0641, + "rewards/chosen": 3.4757098029641544, + "rewards/margins": 8.6377082974303, + "rewards/rejected": -5.161998494466146, + "step": 2658 + }, + { + "epoch": 0.9815882977250704, + "grad_norm": 2.921875, + "kl": 0.0, + "learning_rate": 9.006427019622177e-09, + "logits/chosen": 220285006.76923078, + "logits/rejected": 212593327.15789473, + "logps/chosen": -306.7985652043269, + "logps/rejected": -494.6294202302632, + "loss": 0.0427, + "rewards/chosen": 3.5119171142578125, + "rewards/margins": 10.272050556383636, + "rewards/rejected": -6.760133442125823, + "step": 2659 + }, + { + "epoch": 0.9819574546629136, + "grad_norm": 6.53125, + "kl": 1.4209842681884766, + "learning_rate": 8.656797435642183e-09, + "logits/chosen": 353982208.0, + "logits/rejected": 171350464.0, + "logps/chosen": -395.5344543457031, + "logps/rejected": -340.62615966796875, + "loss": 0.1068, + "rewards/chosen": 2.2101707458496094, + "rewards/margins": 7.6624369621276855, + "rewards/rejected": -5.452266216278076, + "step": 2660 + }, + { + "epoch": 0.9823266116007567, + "grad_norm": 4.03125, + "kl": 0.0, + "learning_rate": 8.31408329425465e-09, + "logits/chosen": 214484560.0, + "logits/rejected": 145260048.0, + "logps/chosen": -322.7527770996094, + "logps/rejected": -384.4688720703125, + "loss": 0.0562, + "rewards/chosen": 2.6410293579101562, + "rewards/margins": 8.818449020385742, + "rewards/rejected": -6.177419662475586, + "step": 2661 + }, + { + "epoch": 0.9826957685386, + "grad_norm": 6.53125, + "kl": 0.8438739776611328, + "learning_rate": 7.978285070286419e-09, + "logits/chosen": 240346731.78947368, + "logits/rejected": 149239148.30769232, + "logps/chosen": -354.2382298519737, + "logps/rejected": -397.6460712139423, + "loss": 0.0857, + "rewards/chosen": 3.0393309342233756, + "rewards/margins": 9.293565835064722, + "rewards/rejected": -6.254234900841346, + "step": 2662 + }, + { + "epoch": 0.9830649254764432, + "grad_norm": 6.40625, + "kl": 1.9657220840454102, + "learning_rate": 7.649403228980889e-09, + "logits/chosen": 205543646.60869566, + "logits/rejected": 147937308.44444445, + "logps/chosen": -375.5683169157609, + "logps/rejected": -553.2648111979166, + "loss": 0.0859, + "rewards/chosen": 3.373985953952955, + "rewards/margins": 11.219177909519361, + "rewards/rejected": -7.845191955566406, + "step": 2663 + }, + { + "epoch": 0.9834340824142864, + "grad_norm": 4.03125, + "kl": 0.0, + "learning_rate": 7.327438226000239e-09, + "logits/chosen": 267845029.6470588, + "logits/rejected": 159797435.73333332, + "logps/chosen": -316.45335477941177, + "logps/rejected": -382.7223307291667, + "loss": 0.0934, + "rewards/chosen": 2.4273039873908546, + "rewards/margins": 8.402593395756739, + "rewards/rejected": -5.975289408365885, + "step": 2664 + }, + { + "epoch": 0.9838032393521295, + "grad_norm": 7.375, + "kl": 0.0, + "learning_rate": 7.01239050742264e-09, + "logits/chosen": 232607088.0, + "logits/rejected": 254899632.0, + "logps/chosen": -389.7972717285156, + "logps/rejected": -327.1248779296875, + "loss": 0.094, + "rewards/chosen": 2.82781720161438, + "rewards/margins": 7.365445375442505, + "rewards/rejected": -4.537628173828125, + "step": 2665 + }, + { + "epoch": 0.9841723962899728, + "grad_norm": 4.53125, + "kl": 0.5655698776245117, + "learning_rate": 6.704260509742266e-09, + "logits/chosen": 192101524.21052632, + "logits/rejected": 238424260.92307693, + "logps/chosen": -316.34457236842104, + "logps/rejected": -410.6437800480769, + "loss": 0.0988, + "rewards/chosen": 3.0321269788240133, + "rewards/margins": 8.878093611373593, + "rewards/rejected": -5.84596663254958, + "step": 2666 + }, + { + "epoch": 0.984541553227816, + "grad_norm": 3.875, + "kl": 0.0, + "learning_rate": 6.403048659870403e-09, + "logits/chosen": 263730192.0, + "logits/rejected": 174724144.0, + "logps/chosen": -377.292236328125, + "logps/rejected": -389.4289245605469, + "loss": 0.0695, + "rewards/chosen": 3.271562099456787, + "rewards/margins": 10.08257007598877, + "rewards/rejected": -6.811007976531982, + "step": 2667 + }, + { + "epoch": 0.9849107101656592, + "grad_norm": 5.375, + "kl": 0.9567311406135559, + "learning_rate": 6.1087553751310036e-09, + "logits/chosen": 219382029.47368422, + "logits/rejected": 251997892.92307693, + "logps/chosen": -359.00316097861844, + "logps/rejected": -474.1256760817308, + "loss": 0.1019, + "rewards/chosen": 2.831683510228207, + "rewards/margins": 10.31076136291751, + "rewards/rejected": -7.4790778526893025, + "step": 2668 + }, + { + "epoch": 0.9852798671035023, + "grad_norm": 4.84375, + "kl": 0.4573516845703125, + "learning_rate": 5.821381063264575e-09, + "logits/chosen": 348150181.64705884, + "logits/rejected": 205792836.26666668, + "logps/chosen": -397.4961511948529, + "logps/rejected": -468.5356770833333, + "loss": 0.0723, + "rewards/chosen": 2.44788652307847, + "rewards/margins": 8.830223966112325, + "rewards/rejected": -6.382337443033854, + "step": 2669 + }, + { + "epoch": 0.9856490240413456, + "grad_norm": 4.34375, + "kl": 0.6201772689819336, + "learning_rate": 5.540926122424295e-09, + "logits/chosen": 226402144.0, + "logits/rejected": 207264816.0, + "logps/chosen": -370.2931213378906, + "logps/rejected": -521.8320922851562, + "loss": 0.0573, + "rewards/chosen": 3.2798359394073486, + "rewards/margins": 11.04948878288269, + "rewards/rejected": -7.769652843475342, + "step": 2670 + }, + { + "epoch": 0.9860181809791888, + "grad_norm": 6.625, + "kl": 0.0, + "learning_rate": 5.267390941177119e-09, + "logits/chosen": 161346992.0, + "logits/rejected": 197555664.0, + "logps/chosen": -311.25518798828125, + "logps/rejected": -542.585205078125, + "loss": 0.098, + "rewards/chosen": 2.3897619247436523, + "rewards/margins": 10.31139850616455, + "rewards/rejected": -7.921636581420898, + "step": 2671 + }, + { + "epoch": 0.9863873379170319, + "grad_norm": 3.796875, + "kl": 0.0, + "learning_rate": 5.000775898502119e-09, + "logits/chosen": 233874124.8, + "logits/rejected": 224452216.47058824, + "logps/chosen": -410.6220377604167, + "logps/rejected": -407.57789522058823, + "loss": 0.0579, + "rewards/chosen": 3.8371200561523438, + "rewards/margins": 9.44317492316751, + "rewards/rejected": -5.606054867015166, + "step": 2672 + }, + { + "epoch": 0.9867564948548752, + "grad_norm": 8.125, + "kl": 0.0, + "learning_rate": 4.7410813637915885e-09, + "logits/chosen": 301263840.0, + "logits/rejected": 241281152.0, + "logps/chosen": -361.130859375, + "logps/rejected": -480.5243835449219, + "loss": 0.0779, + "rewards/chosen": 2.1503548622131348, + "rewards/margins": 8.9467453956604, + "rewards/rejected": -6.796390533447266, + "step": 2673 + }, + { + "epoch": 0.9871256517927184, + "grad_norm": 4.25, + "kl": 1.3386554718017578, + "learning_rate": 4.4883076968482705e-09, + "logits/chosen": 298059504.9411765, + "logits/rejected": 140899293.86666667, + "logps/chosen": -342.2975643382353, + "logps/rejected": -375.7220052083333, + "loss": 0.0649, + "rewards/chosen": 3.0193930233226105, + "rewards/margins": 10.533318194221048, + "rewards/rejected": -7.513925170898437, + "step": 2674 + }, + { + "epoch": 0.9874948087305616, + "grad_norm": 4.84375, + "kl": 0.0, + "learning_rate": 4.242455247887578e-09, + "logits/chosen": 186108147.80952382, + "logits/rejected": 185160657.45454547, + "logps/chosen": -362.44549851190476, + "logps/rejected": -401.71626420454544, + "loss": 0.0719, + "rewards/chosen": 2.9031584603445872, + "rewards/margins": 9.425307286250128, + "rewards/rejected": -6.52214882590554, + "step": 2675 + }, + { + "epoch": 0.9878639656684047, + "grad_norm": 4.59375, + "kl": 0.0, + "learning_rate": 4.003524357534261e-09, + "logits/chosen": 217600120.47058824, + "logits/rejected": 151104648.53333333, + "logps/chosen": -408.8033088235294, + "logps/rejected": -438.6185546875, + "loss": 0.0771, + "rewards/chosen": 2.7429585176355697, + "rewards/margins": 9.804744406307446, + "rewards/rejected": -7.061785888671875, + "step": 2676 + }, + { + "epoch": 0.988233122606248, + "grad_norm": 5.53125, + "kl": 0.9347105026245117, + "learning_rate": 3.771515356825184e-09, + "logits/chosen": 198812581.6470588, + "logits/rejected": 206519893.33333334, + "logps/chosen": -355.65441176470586, + "logps/rejected": -426.6568359375, + "loss": 0.091, + "rewards/chosen": 2.633149539723116, + "rewards/margins": 8.073731306487439, + "rewards/rejected": -5.440581766764323, + "step": 2677 + }, + { + "epoch": 0.9886022795440912, + "grad_norm": 4.84375, + "kl": 0.0, + "learning_rate": 3.54642856720433e-09, + "logits/chosen": 195956872.53333333, + "logits/rejected": 187436212.70588234, + "logps/chosen": -353.75319010416666, + "logps/rejected": -408.19812729779414, + "loss": 0.0733, + "rewards/chosen": 3.2514828999837238, + "rewards/margins": 9.943398075477749, + "rewards/rejected": -6.691915175494025, + "step": 2678 + }, + { + "epoch": 0.9889714364819344, + "grad_norm": 5.90625, + "kl": 0.0, + "learning_rate": 3.328264300527795e-09, + "logits/chosen": 327227151.0588235, + "logits/rejected": 202711961.6, + "logps/chosen": -372.08777573529414, + "logps/rejected": -388.20185546875, + "loss": 0.0879, + "rewards/chosen": 2.476336310891544, + "rewards/margins": 9.108876187193628, + "rewards/rejected": -6.632539876302084, + "step": 2679 + }, + { + "epoch": 0.9893405934197775, + "grad_norm": 4.25, + "kl": 0.0, + "learning_rate": 3.117022859059349e-09, + "logits/chosen": 185977941.33333334, + "logits/rejected": 249405168.94117647, + "logps/chosen": -283.4002278645833, + "logps/rejected": -561.4203814338235, + "loss": 0.065, + "rewards/chosen": 3.0933204650878907, + "rewards/margins": 10.694055400175207, + "rewards/rejected": -7.600734935087316, + "step": 2680 + }, + { + "epoch": 0.9897097503576208, + "grad_norm": 5.0625, + "kl": 2.8304882049560547, + "learning_rate": 2.9127045354704343e-09, + "logits/chosen": 223179937.68421054, + "logits/rejected": 223597154.46153846, + "logps/chosen": -317.5255705180921, + "logps/rejected": -440.70571664663464, + "loss": 0.1283, + "rewards/chosen": 2.882582614296361, + "rewards/margins": 9.33766778181439, + "rewards/rejected": -6.455085167518029, + "step": 2681 + }, + { + "epoch": 0.990078907295464, + "grad_norm": 4.4375, + "kl": 0.0, + "learning_rate": 2.7153096128423873e-09, + "logits/chosen": 236165536.0, + "logits/rejected": 140196448.0, + "logps/chosen": -322.0392761230469, + "logps/rejected": -379.6275329589844, + "loss": 0.0631, + "rewards/chosen": 3.0420053005218506, + "rewards/margins": 10.514553785324097, + "rewards/rejected": -7.472548484802246, + "step": 2682 + }, + { + "epoch": 0.9904480642333072, + "grad_norm": 4.25, + "kl": 0.0, + "learning_rate": 2.524838364662552e-09, + "logits/chosen": 200746922.66666666, + "logits/rejected": 172019395.7647059, + "logps/chosen": -309.87194010416664, + "logps/rejected": -550.6579733455883, + "loss": 0.0695, + "rewards/chosen": 2.531751505533854, + "rewards/margins": 10.778296736174939, + "rewards/rejected": -8.246545230641084, + "step": 2683 + }, + { + "epoch": 0.9908172211711503, + "grad_norm": 5.65625, + "kl": 0.40712690353393555, + "learning_rate": 2.3412910548270553e-09, + "logits/chosen": 190377574.4, + "logits/rejected": 137010314.66666666, + "logps/chosen": -354.3791748046875, + "logps/rejected": -436.6413167317708, + "loss": 0.094, + "rewards/chosen": 2.932584762573242, + "rewards/margins": 9.598211288452148, + "rewards/rejected": -6.665626525878906, + "step": 2684 + }, + { + "epoch": 0.9911863781089936, + "grad_norm": 5.375, + "kl": 0.0, + "learning_rate": 2.164667937638587e-09, + "logits/chosen": 178549684.70588234, + "logits/rejected": 185298961.06666666, + "logps/chosen": -351.5674689797794, + "logps/rejected": -467.678125, + "loss": 0.0887, + "rewards/chosen": 2.8143384596880745, + "rewards/margins": 9.70662717632219, + "rewards/rejected": -6.8922887166341145, + "step": 2685 + }, + { + "epoch": 0.9915555350468368, + "grad_norm": 4.34375, + "kl": 0.0, + "learning_rate": 1.9949692578058453e-09, + "logits/chosen": 152339797.33333334, + "logits/rejected": 173741555.2, + "logps/chosen": -327.40024820963544, + "logps/rejected": -424.833154296875, + "loss": 0.0474, + "rewards/chosen": 3.53479798634847, + "rewards/margins": 10.344694073994955, + "rewards/rejected": -6.809896087646484, + "step": 2686 + }, + { + "epoch": 0.99192469198468, + "grad_norm": 5.25, + "kl": 1.4009199142456055, + "learning_rate": 1.8321952504435358e-09, + "logits/chosen": 163937501.86666667, + "logits/rejected": 189127529.4117647, + "logps/chosen": -314.00182291666664, + "logps/rejected": -407.9766199448529, + "loss": 0.078, + "rewards/chosen": 3.5978159586588543, + "rewards/margins": 9.307997699812347, + "rewards/rejected": -5.710181741153493, + "step": 2687 + }, + { + "epoch": 0.9922938489225231, + "grad_norm": 4.90625, + "kl": 0.0, + "learning_rate": 1.6763461410740366e-09, + "logits/chosen": 242143780.57142857, + "logits/rejected": 148226673.7777778, + "logps/chosen": -365.70064871651783, + "logps/rejected": -383.93028428819446, + "loss": 0.0729, + "rewards/chosen": 2.932509558541434, + "rewards/margins": 9.565696322728716, + "rewards/rejected": -6.633186764187283, + "step": 2688 + }, + { + "epoch": 0.9926630058603664, + "grad_norm": 6.03125, + "kl": 0.0, + "learning_rate": 1.527422145624069e-09, + "logits/chosen": 187200640.0, + "logits/rejected": 173478080.0, + "logps/chosen": -352.46734619140625, + "logps/rejected": -361.43829345703125, + "loss": 0.1205, + "rewards/chosen": 1.8926060199737549, + "rewards/margins": 8.287647485733032, + "rewards/rejected": -6.395041465759277, + "step": 2689 + }, + { + "epoch": 0.9930321627982096, + "grad_norm": 7.3125, + "kl": 0.0, + "learning_rate": 1.3854234704252512e-09, + "logits/chosen": 177322347.78947368, + "logits/rejected": 186911606.15384614, + "logps/chosen": -308.02652138157896, + "logps/rejected": -469.0584059495192, + "loss": 0.1205, + "rewards/chosen": 2.4124061182925574, + "rewards/margins": 9.482532578441296, + "rewards/rejected": -7.070126460148738, + "step": 2690 + }, + { + "epoch": 0.9934013197360528, + "grad_norm": 4.78125, + "kl": 0.0, + "learning_rate": 1.2503503122157644e-09, + "logits/chosen": 227473644.30769232, + "logits/rejected": 145219341.47368422, + "logps/chosen": -331.5902569110577, + "logps/rejected": -378.23876953125, + "loss": 0.0836, + "rewards/chosen": 2.736270024226262, + "rewards/margins": 9.808570985369354, + "rewards/rejected": -7.0723009611430925, + "step": 2691 + }, + { + "epoch": 0.993770476673896, + "grad_norm": 4.875, + "kl": 0.0, + "learning_rate": 1.1222028581375777e-09, + "logits/chosen": 191002560.0, + "logits/rejected": 196271904.0, + "logps/chosen": -395.0908203125, + "logps/rejected": -406.6105041503906, + "loss": 0.0702, + "rewards/chosen": 2.9048197269439697, + "rewards/margins": 8.365755796432495, + "rewards/rejected": -5.460936069488525, + "step": 2692 + }, + { + "epoch": 0.9941396336117392, + "grad_norm": 5.53125, + "kl": 0.3054323196411133, + "learning_rate": 1.0009812857370016e-09, + "logits/chosen": 212761408.0, + "logits/rejected": 185581424.0, + "logps/chosen": -443.8248596191406, + "logps/rejected": -441.5130615234375, + "loss": 0.0813, + "rewards/chosen": 2.5316667556762695, + "rewards/margins": 9.464821338653564, + "rewards/rejected": -6.933154582977295, + "step": 2693 + }, + { + "epoch": 0.9945087905495824, + "grad_norm": 6.65625, + "kl": 0.0, + "learning_rate": 8.866857629652448e-10, + "logits/chosen": 178873203.2, + "logits/rejected": 211181009.45454547, + "logps/chosen": -446.04384765625, + "logps/rejected": -440.7923029119318, + "loss": 0.1174, + "rewards/chosen": 1.581305980682373, + "rewards/margins": 6.823639826341108, + "rewards/rejected": -5.242333845658735, + "step": 2694 + }, + { + "epoch": 0.9948779474874256, + "grad_norm": 5.6875, + "kl": 1.6925616264343262, + "learning_rate": 7.79316448177303e-10, + "logits/chosen": 238082560.0, + "logits/rejected": 180590896.76190478, + "logps/chosen": -440.6383611505682, + "logps/rejected": -409.15262276785717, + "loss": 0.0936, + "rewards/chosen": 2.3751683668656782, + "rewards/margins": 8.06130161533108, + "rewards/rejected": -5.686133248465402, + "step": 2695 + }, + { + "epoch": 0.9952471044252688, + "grad_norm": 3.0625, + "kl": 0.0, + "learning_rate": 6.788734901319594e-10, + "logits/chosen": 200649045.33333334, + "logits/rejected": 176073928.3478261, + "logps/chosen": -324.51190863715277, + "logps/rejected": -470.8218834918478, + "loss": 0.0324, + "rewards/chosen": 3.2998841603597007, + "rewards/margins": 10.657323947851209, + "rewards/rejected": -7.357439787491508, + "step": 2696 + }, + { + "epoch": 0.995616261363112, + "grad_norm": 5.71875, + "kl": 0.0, + "learning_rate": 5.853570279917842e-10, + "logits/chosen": 178124657.7777778, + "logits/rejected": 176398080.0, + "logps/chosen": -402.51060655381946, + "logps/rejected": -310.20193917410717, + "loss": 0.0844, + "rewards/chosen": 2.7916122012668185, + "rewards/margins": 7.807909132942321, + "rewards/rejected": -5.016296931675503, + "step": 2697 + }, + { + "epoch": 0.9959854183009552, + "grad_norm": 4.53125, + "kl": 0.0, + "learning_rate": 4.9876719132258e-10, + "logits/chosen": 222596835.55555555, + "logits/rejected": 199810889.14285713, + "logps/chosen": -372.24115668402777, + "logps/rejected": -443.3677455357143, + "loss": 0.069, + "rewards/chosen": 2.999546898735894, + "rewards/margins": 9.6392454722571, + "rewards/rejected": -6.639698573521206, + "step": 2698 + }, + { + "epoch": 0.9963545752387984, + "grad_norm": 5.03125, + "kl": 0.0, + "learning_rate": 4.1910410009338155e-10, + "logits/chosen": 296687808.0, + "logits/rejected": 224867264.0, + "logps/chosen": -420.31146240234375, + "logps/rejected": -430.8600769042969, + "loss": 0.0631, + "rewards/chosen": 2.581023931503296, + "rewards/margins": 9.312965154647827, + "rewards/rejected": -6.731941223144531, + "step": 2699 + }, + { + "epoch": 0.9967237321766416, + "grad_norm": 4.15625, + "kl": 0.847437858581543, + "learning_rate": 3.4636786467590057e-10, + "logits/chosen": 214992402.2857143, + "logits/rejected": 162728903.1111111, + "logps/chosen": -329.42159598214283, + "logps/rejected": -410.78355577256946, + "loss": 0.0502, + "rewards/chosen": 3.859933308192662, + "rewards/margins": 10.146801509554424, + "rewards/rejected": -6.286868201361762, + "step": 2700 + }, + { + "epoch": 0.9970928891144848, + "grad_norm": 3.78125, + "kl": 0.0, + "learning_rate": 2.805585858461912e-10, + "logits/chosen": 134169843.2, + "logits/rejected": 176738292.36363637, + "logps/chosen": -329.44814453125, + "logps/rejected": -475.97953657670456, + "loss": 0.0514, + "rewards/chosen": 2.5675447463989256, + "rewards/margins": 9.521568627790971, + "rewards/rejected": -6.954023881392046, + "step": 2701 + }, + { + "epoch": 0.997462046052328, + "grad_norm": 3.984375, + "kl": 0.0, + "learning_rate": 2.2167635478187454e-10, + "logits/chosen": 191720448.0, + "logits/rejected": 158013248.0, + "logps/chosen": -423.46728515625, + "logps/rejected": -367.0566813151042, + "loss": 0.0435, + "rewards/chosen": 2.6309940814971924, + "rewards/margins": 7.924107154210408, + "rewards/rejected": -5.293113072713216, + "step": 2702 + }, + { + "epoch": 0.9978312029901713, + "grad_norm": 7.375, + "kl": 2.1429953575134277, + "learning_rate": 1.697212530632486e-10, + "logits/chosen": 170188556.8, + "logits/rejected": 197579648.0, + "logps/chosen": -301.7594970703125, + "logps/rejected": -321.6800537109375, + "loss": 0.1502, + "rewards/chosen": 2.262918472290039, + "rewards/margins": 6.457033157348633, + "rewards/rejected": -4.194114685058594, + "step": 2703 + }, + { + "epoch": 0.9982003599280144, + "grad_norm": 4.9375, + "kl": 0.0, + "learning_rate": 1.2469335267384364e-10, + "logits/chosen": 200172763.42857143, + "logits/rejected": 225582250.66666666, + "logps/chosen": -354.2722865513393, + "logps/rejected": -456.8372395833333, + "loss": 0.0849, + "rewards/chosen": 2.8553058079310825, + "rewards/margins": 9.384749336848184, + "rewards/rejected": -6.529443528917101, + "step": 2704 + }, + { + "epoch": 0.9985695168658576, + "grad_norm": 4.8125, + "kl": 0.0, + "learning_rate": 8.659271599875673e-11, + "logits/chosen": 193081344.0, + "logits/rejected": 189180416.0, + "logps/chosen": -348.52044677734375, + "logps/rejected": -389.44818115234375, + "loss": 0.0754, + "rewards/chosen": 2.4970619678497314, + "rewards/margins": 8.410218000411987, + "rewards/rejected": -5.913156032562256, + "step": 2705 + }, + { + "epoch": 0.9989386738037008, + "grad_norm": 4.9375, + "kl": 0.0, + "learning_rate": 5.5419395826317166e-11, + "logits/chosen": 243569091.7647059, + "logits/rejected": 213289420.8, + "logps/chosen": -406.9110753676471, + "logps/rejected": -362.3346354166667, + "loss": 0.0758, + "rewards/chosen": 2.3223093818215763, + "rewards/margins": 8.21445873484892, + "rewards/rejected": -5.892149353027344, + "step": 2706 + }, + { + "epoch": 0.999307830741544, + "grad_norm": 4.03125, + "kl": 0.0, + "learning_rate": 3.1173435346976146e-11, + "logits/chosen": 215777696.0, + "logits/rejected": 179367408.0, + "logps/chosen": -341.69036865234375, + "logps/rejected": -407.43377685546875, + "loss": 0.0695, + "rewards/chosen": 2.684880018234253, + "rewards/margins": 9.902527570724487, + "rewards/rejected": -7.217647552490234, + "step": 2707 + }, + { + "epoch": 0.9996769876793872, + "grad_norm": 5.71875, + "kl": 0.0, + "learning_rate": 1.385486815219661e-11, + "logits/chosen": 269502139.73333335, + "logits/rejected": 246536207.05882353, + "logps/chosen": -295.63603515625, + "logps/rejected": -448.16081686580884, + "loss": 0.0857, + "rewards/chosen": 2.3992815653483075, + "rewards/margins": 8.67485305187749, + "rewards/rejected": -6.275571486529182, + "step": 2708 + }, + { + "epoch": 1.0, + "grad_norm": 4.90625, + "kl": 0.0, + "learning_rate": 3.4637182377839086e-12, + "logits/chosen": 192254800.0, + "logits/rejected": 161512000.0, + "logps/chosen": -437.8807678222656, + "logps/rejected": -451.3754475911458, + "loss": 0.0536, + "rewards/chosen": 2.8159048557281494, + "rewards/margins": 9.499688069025677, + "rewards/rejected": -6.683783213297526, + "step": 2709 + } + ], + "logging_steps": 1, + "max_steps": 2709, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}