{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9984301412872841, "eval_steps": 500, "global_step": 477, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.020931449502878074, "grad_norm": 0.3513486981391907, "learning_rate": 3.3333333333333335e-07, "logits/chosen": -0.86328125, "logits/rejected": -0.79296875, "logps/chosen": -290.0, "logps/rejected": -272.0, "loss": 0.2706, "rewards/accuracies": 0.23749999701976776, "rewards/chosen": -0.000316619873046875, "rewards/margins": -0.000270843505859375, "rewards/rejected": -4.5299530029296875e-05, "step": 10 }, { "epoch": 0.04186289900575615, "grad_norm": 0.4020876884460449, "learning_rate": 6.666666666666667e-07, "logits/chosen": -0.80078125, "logits/rejected": -0.7734375, "logps/chosen": -282.0, "logps/rejected": -262.0, "loss": 0.2668, "rewards/accuracies": 0.40312498807907104, "rewards/chosen": -0.0002613067626953125, "rewards/margins": 0.0022735595703125, "rewards/rejected": -0.002532958984375, "step": 20 }, { "epoch": 0.06279434850863422, "grad_norm": 0.3581520617008209, "learning_rate": 7.996537632406924e-07, "logits/chosen": -0.80078125, "logits/rejected": -0.69921875, "logps/chosen": -280.0, "logps/rejected": -262.0, "loss": 0.2625, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.00159454345703125, "rewards/margins": 0.0135498046875, "rewards/rejected": -0.011962890625, "step": 30 }, { "epoch": 0.0837257980115123, "grad_norm": 0.35938382148742676, "learning_rate": 7.975400422088283e-07, "logits/chosen": -0.875, "logits/rejected": -0.80078125, "logps/chosen": -320.0, "logps/rejected": -274.0, "loss": 0.2645, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.002227783203125, "rewards/margins": 0.039306640625, "rewards/rejected": -0.04150390625, "step": 40 }, { "epoch": 0.10465724751439037, "grad_norm": 0.3968264162540436, "learning_rate": 7.9351510351176e-07, "logits/chosen": -0.80078125, "logits/rejected": -0.734375, "logps/chosen": -316.0, "logps/rejected": -276.0, "loss": 0.2515, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.046875, "rewards/margins": 0.06787109375, "rewards/rejected": -0.11474609375, "step": 50 }, { "epoch": 0.12558869701726844, "grad_norm": 0.3821749687194824, "learning_rate": 7.875982974868683e-07, "logits/chosen": -0.81640625, "logits/rejected": -0.73828125, "logps/chosen": -302.0, "logps/rejected": -276.0, "loss": 0.2246, "rewards/accuracies": 0.640625, "rewards/chosen": -0.1103515625, "rewards/margins": 0.076171875, "rewards/rejected": -0.1865234375, "step": 60 }, { "epoch": 0.14652014652014653, "grad_norm": 0.4155704975128174, "learning_rate": 7.798180698326886e-07, "logits/chosen": -0.87890625, "logits/rejected": -0.7421875, "logps/chosen": -312.0, "logps/rejected": -292.0, "loss": 0.2059, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.1865234375, "rewards/margins": 0.0712890625, "rewards/rejected": -0.2578125, "step": 70 }, { "epoch": 0.1674515960230246, "grad_norm": 0.37762466073036194, "learning_rate": 7.702118248530725e-07, "logits/chosen": -0.9296875, "logits/rejected": -0.7890625, "logps/chosen": -304.0, "logps/rejected": -298.0, "loss": 0.2023, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.2109375, "rewards/margins": 0.10888671875, "rewards/rejected": -0.3203125, "step": 80 }, { "epoch": 0.18838304552590268, "grad_norm": 0.4166751205921173, "learning_rate": 7.588257456318656e-07, "logits/chosen": -0.91015625, "logits/rejected": -0.859375, 
"logps/chosen": -324.0, "logps/rejected": -312.0, "loss": 0.1918, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.26171875, "rewards/margins": 0.1142578125, "rewards/rejected": -0.376953125, "step": 90 }, { "epoch": 0.20931449502878074, "grad_norm": 0.4024507999420166, "learning_rate": 7.457145720026305e-07, "logits/chosen": -0.87890625, "logits/rejected": -0.7890625, "logps/chosen": -310.0, "logps/rejected": -302.0, "loss": 0.1728, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.310546875, "rewards/margins": 0.1474609375, "rewards/rejected": -0.45703125, "step": 100 }, { "epoch": 0.2302459445316588, "grad_norm": 0.4183866083621979, "learning_rate": 7.309413373808504e-07, "logits/chosen": -1.0625, "logits/rejected": -0.97265625, "logps/chosen": -326.0, "logps/rejected": -332.0, "loss": 0.1513, "rewards/accuracies": 0.625, "rewards/chosen": -0.451171875, "rewards/margins": 0.1630859375, "rewards/rejected": -0.61328125, "step": 110 }, { "epoch": 0.25117739403453687, "grad_norm": 0.3973161280155182, "learning_rate": 7.145770657238214e-07, "logits/chosen": -1.125, "logits/rejected": -1.078125, "logps/chosen": -370.0, "logps/rejected": -346.0, "loss": 0.1381, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": -0.53515625, "rewards/margins": 0.2109375, "rewards/rejected": -0.74609375, "step": 120 }, { "epoch": 0.272108843537415, "grad_norm": 0.41612762212753296, "learning_rate": 6.967004300751329e-07, "logits/chosen": -1.1640625, "logits/rejected": -1.0078125, "logps/chosen": -360.0, "logps/rejected": -368.0, "loss": 0.1343, "rewards/accuracies": 0.671875, "rewards/chosen": -0.490234375, "rewards/margins": 0.2109375, "rewards/rejected": -0.69921875, "step": 130 }, { "epoch": 0.29304029304029305, "grad_norm": 0.480084091424942, "learning_rate": 6.773973743353193e-07, "logits/chosen": -1.125, "logits/rejected": -1.015625, "logps/chosen": -338.0, "logps/rejected": -328.0, "loss": 0.1305, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.55859375, "rewards/margins": 0.208984375, "rewards/rejected": -0.76953125, "step": 140 }, { "epoch": 0.3139717425431711, "grad_norm": 0.4561936557292938, "learning_rate": 6.567607000770662e-07, "logits/chosen": -1.140625, "logits/rejected": -0.98046875, "logps/chosen": -346.0, "logps/rejected": -354.0, "loss": 0.1246, "rewards/accuracies": 0.625, "rewards/chosen": -0.6171875, "rewards/margins": 0.1904296875, "rewards/rejected": -0.80859375, "step": 150 }, { "epoch": 0.3349031920460492, "grad_norm": 0.41682758927345276, "learning_rate": 6.348896203913977e-07, "logits/chosen": -1.265625, "logits/rejected": -1.1796875, "logps/chosen": -376.0, "logps/rejected": -340.0, "loss": 0.1237, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.57421875, "rewards/margins": 0.28125, "rewards/rejected": -0.8515625, "step": 160 }, { "epoch": 0.35583464154892724, "grad_norm": 0.42994987964630127, "learning_rate": 6.118892829097829e-07, "logits/chosen": -1.3203125, "logits/rejected": -1.2578125, "logps/chosen": -352.0, "logps/rejected": -346.0, "loss": 0.1185, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.5703125, "rewards/margins": 0.2275390625, "rewards/rejected": -0.796875, "step": 170 }, { "epoch": 0.37676609105180536, "grad_norm": 0.33889681100845337, "learning_rate": 5.878702642952824e-07, "logits/chosen": -1.3203125, "logits/rejected": -1.234375, "logps/chosen": -390.0, "logps/rejected": -362.0, "loss": 0.1007, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.7734375, "rewards/margins": 
0.1787109375, "rewards/rejected": -0.94921875, "step": 180 }, { "epoch": 0.3976975405546834, "grad_norm": 0.5241643786430359, "learning_rate": 5.629480386330271e-07, "logits/chosen": -1.3515625, "logits/rejected": -1.234375, "logps/chosen": -360.0, "logps/rejected": -366.0, "loss": 0.095, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.796875, "rewards/margins": 0.27734375, "rewards/rejected": -1.0703125, "step": 190 }, { "epoch": 0.4186289900575615, "grad_norm": 0.4775124788284302, "learning_rate": 5.372424222757983e-07, "logits/chosen": -1.328125, "logits/rejected": -1.3125, "logps/chosen": -356.0, "logps/rejected": -346.0, "loss": 0.0951, "rewards/accuracies": 0.640625, "rewards/chosen": -0.80859375, "rewards/margins": 0.2275390625, "rewards/rejected": -1.0390625, "step": 200 }, { "epoch": 0.43956043956043955, "grad_norm": 0.4539019465446472, "learning_rate": 5.108769978136762e-07, "logits/chosen": -1.359375, "logits/rejected": -1.3125, "logps/chosen": -372.0, "logps/rejected": -372.0, "loss": 0.0954, "rewards/accuracies": 0.703125, "rewards/chosen": -0.734375, "rewards/margins": 0.345703125, "rewards/rejected": -1.078125, "step": 210 }, { "epoch": 0.4604918890633176, "grad_norm": 0.471497118473053, "learning_rate": 4.839785199370884e-07, "logits/chosen": -1.453125, "logits/rejected": -1.3359375, "logps/chosen": -392.0, "logps/rejected": -376.0, "loss": 0.092, "rewards/accuracies": 0.684374988079071, "rewards/chosen": -0.84375, "rewards/margins": 0.328125, "rewards/rejected": -1.171875, "step": 220 }, { "epoch": 0.48142333856619574, "grad_norm": 0.4082787334918976, "learning_rate": 4.5667630604963293e-07, "logits/chosen": -1.40625, "logits/rejected": -1.296875, "logps/chosen": -400.0, "logps/rejected": -402.0, "loss": 0.0913, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.890625, "rewards/margins": 0.3203125, "rewards/rejected": -1.2109375, "step": 230 }, { "epoch": 0.5023547880690737, "grad_norm": 0.380476176738739, "learning_rate": 4.291016145603776e-07, "logits/chosen": -1.296875, "logits/rejected": -1.2109375, "logps/chosen": -404.0, "logps/rejected": -400.0, "loss": 0.0884, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -1.03125, "rewards/margins": 0.2890625, "rewards/rejected": -1.3203125, "step": 240 }, { "epoch": 0.5232862375719518, "grad_norm": 0.4021344482898712, "learning_rate": 4.0138701384455775e-07, "logits/chosen": -1.3515625, "logits/rejected": -1.2578125, "logps/chosen": -386.0, "logps/rejected": -398.0, "loss": 0.0799, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": -1.0859375, "rewards/margins": 0.267578125, "rewards/rejected": -1.3515625, "step": 250 }, { "epoch": 0.54421768707483, "grad_norm": 0.4608613848686218, "learning_rate": 3.736657449064707e-07, "logits/chosen": -1.3359375, "logits/rejected": -1.2578125, "logps/chosen": -412.0, "logps/rejected": -410.0, "loss": 0.0905, "rewards/accuracies": 0.6781250238418579, "rewards/chosen": -0.89453125, "rewards/margins": 0.37109375, "rewards/rejected": -1.265625, "step": 260 }, { "epoch": 0.565149136577708, "grad_norm": 0.4489026367664337, "learning_rate": 3.4607108080862845e-07, "logits/chosen": -1.34375, "logits/rejected": -1.2578125, "logps/chosen": -362.0, "logps/rejected": -374.0, "loss": 0.088, "rewards/accuracies": 0.671875, "rewards/chosen": -0.76171875, "rewards/margins": 0.3359375, "rewards/rejected": -1.1015625, "step": 270 }, { "epoch": 0.5860805860805861, "grad_norm": 0.43133413791656494, "learning_rate": 3.187356859467888e-07, "logits/chosen": 
-1.3203125, "logits/rejected": -1.28125, "logps/chosen": -386.0, "logps/rejected": -388.0, "loss": 0.103, "rewards/accuracies": 0.628125011920929, "rewards/chosen": -0.765625, "rewards/margins": 0.2734375, "rewards/rejected": -1.0390625, "step": 280 }, { "epoch": 0.6070120355834642, "grad_norm": 0.3863625228404999, "learning_rate": 2.917909782512186e-07, "logits/chosen": -1.34375, "logits/rejected": -1.25, "logps/chosen": -366.0, "logps/rejected": -368.0, "loss": 0.0965, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.75390625, "rewards/margins": 0.41796875, "rewards/rejected": -1.171875, "step": 290 }, { "epoch": 0.6279434850863422, "grad_norm": 0.5902645587921143, "learning_rate": 2.6536649738047756e-07, "logits/chosen": -1.2734375, "logits/rejected": -1.2265625, "logps/chosen": -398.0, "logps/rejected": -394.0, "loss": 0.1011, "rewards/accuracies": 0.671875, "rewards/chosen": -0.87109375, "rewards/margins": 0.3046875, "rewards/rejected": -1.1796875, "step": 300 }, { "epoch": 0.6488749345892203, "grad_norm": 0.4010671079158783, "learning_rate": 2.3958928194520076e-07, "logits/chosen": -1.3203125, "logits/rejected": -1.15625, "logps/chosen": -382.0, "logps/rejected": -422.0, "loss": 0.0932, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.88671875, "rewards/margins": 0.36328125, "rewards/rejected": -1.25, "step": 310 }, { "epoch": 0.6698063840920984, "grad_norm": 0.3942067623138428, "learning_rate": 2.145832587559451e-07, "logits/chosen": -1.265625, "logits/rejected": -1.2109375, "logps/chosen": -380.0, "logps/rejected": -406.0, "loss": 0.0855, "rewards/accuracies": 0.671875, "rewards/chosen": -0.91796875, "rewards/margins": 0.375, "rewards/rejected": -1.296875, "step": 320 }, { "epoch": 0.6907378335949764, "grad_norm": 0.47365090250968933, "learning_rate": 1.9046864703135954e-07, "logits/chosen": -1.203125, "logits/rejected": -1.140625, "logps/chosen": -388.0, "logps/rejected": -402.0, "loss": 0.0909, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.984375, "rewards/margins": 0.3515625, "rewards/rejected": -1.3359375, "step": 330 }, { "epoch": 0.7116692830978545, "grad_norm": 0.4232117235660553, "learning_rate": 1.673613804310103e-07, "logits/chosen": -1.234375, "logits/rejected": -1.203125, "logps/chosen": -376.0, "logps/rejected": -390.0, "loss": 0.099, "rewards/accuracies": 0.690625011920929, "rewards/chosen": -0.890625, "rewards/margins": 0.404296875, "rewards/rejected": -1.296875, "step": 340 }, { "epoch": 0.7326007326007326, "grad_norm": 0.45671120285987854, "learning_rate": 1.4537254969150808e-07, "logits/chosen": -1.25, "logits/rejected": -1.1640625, "logps/chosen": -370.0, "logps/rejected": -386.0, "loss": 0.0941, "rewards/accuracies": 0.653124988079071, "rewards/chosen": -0.875, "rewards/margins": 0.314453125, "rewards/rejected": -1.1875, "step": 350 }, { "epoch": 0.7535321821036107, "grad_norm": 0.45585089921951294, "learning_rate": 1.2460786854552e-07, "logits/chosen": -1.3046875, "logits/rejected": -1.3203125, "logps/chosen": -360.0, "logps/rejected": -372.0, "loss": 0.0929, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": -0.8984375, "rewards/margins": 0.310546875, "rewards/rejected": -1.2109375, "step": 360 }, { "epoch": 0.7744636316064888, "grad_norm": 0.4104853570461273, "learning_rate": 1.0516716549132283e-07, "logits/chosen": -1.2734375, "logits/rejected": -1.234375, "logps/chosen": -402.0, "logps/rejected": -386.0, "loss": 0.0939, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.87109375, 
"rewards/margins": 0.359375, "rewards/rejected": -1.2265625, "step": 370 }, { "epoch": 0.7953950811093669, "grad_norm": 0.4091091752052307, "learning_rate": 8.714390385627006e-08, "logits/chosen": -1.25, "logits/rejected": -1.28125, "logps/chosen": -394.0, "logps/rejected": -406.0, "loss": 0.0879, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.92578125, "rewards/margins": 0.328125, "rewards/rejected": -1.25, "step": 380 }, { "epoch": 0.8163265306122449, "grad_norm": 0.45830023288726807, "learning_rate": 7.062473246152229e-08, "logits/chosen": -1.25, "logits/rejected": -1.2265625, "logps/chosen": -388.0, "logps/rejected": -384.0, "loss": 0.0852, "rewards/accuracies": 0.65625, "rewards/chosen": -0.9375, "rewards/margins": 0.326171875, "rewards/rejected": -1.265625, "step": 390 }, { "epoch": 0.837257980115123, "grad_norm": 0.49946463108062744, "learning_rate": 5.568906904826809e-08, "logits/chosen": -1.2890625, "logits/rejected": -1.1953125, "logps/chosen": -400.0, "logps/rejected": -432.0, "loss": 0.0885, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.03125, "rewards/margins": 0.36328125, "rewards/rejected": -1.3984375, "step": 400 }, { "epoch": 0.858189429618001, "grad_norm": 0.3637143671512604, "learning_rate": 4.2408718468160386e-08, "logits/chosen": -1.3125, "logits/rejected": -1.25, "logps/chosen": -398.0, "logps/rejected": -406.0, "loss": 0.084, "rewards/accuracies": 0.7093750238418579, "rewards/chosen": -0.98046875, "rewards/margins": 0.4375, "rewards/rejected": -1.4140625, "step": 410 }, { "epoch": 0.8791208791208791, "grad_norm": 0.4085330069065094, "learning_rate": 3.084752747356116e-08, "logits/chosen": -1.2734375, "logits/rejected": -1.28125, "logps/chosen": -392.0, "logps/rejected": -398.0, "loss": 0.0839, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9453125, "rewards/margins": 0.3828125, "rewards/rejected": -1.328125, "step": 420 }, { "epoch": 0.9000523286237572, "grad_norm": 0.45559161901474, "learning_rate": 2.1061077767226255e-08, "logits/chosen": -1.2578125, "logits/rejected": -1.2421875, "logps/chosen": -398.0, "logps/rejected": -420.0, "loss": 0.0841, "rewards/accuracies": 0.6968749761581421, "rewards/chosen": -1.0, "rewards/margins": 0.380859375, "rewards/rejected": -1.3828125, "step": 430 }, { "epoch": 0.9209837781266352, "grad_norm": 0.5042709112167358, "learning_rate": 1.309641878713048e-08, "logits/chosen": -1.2890625, "logits/rejected": -1.234375, "logps/chosen": -398.0, "logps/rejected": -398.0, "loss": 0.084, "rewards/accuracies": 0.671875, "rewards/chosen": -1.046875, "rewards/margins": 0.416015625, "rewards/rejected": -1.4609375, "step": 440 }, { "epoch": 0.9419152276295133, "grad_norm": 0.4325369894504547, "learning_rate": 6.991841511097174e-09, "logits/chosen": -1.3671875, "logits/rejected": -1.296875, "logps/chosen": -408.0, "logps/rejected": -402.0, "loss": 0.0837, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.921875, "rewards/margins": 0.369140625, "rewards/rejected": -1.2890625, "step": 450 }, { "epoch": 0.9628466771323915, "grad_norm": 0.4439210593700409, "learning_rate": 2.776694368692656e-09, "logits/chosen": -1.328125, "logits/rejected": -1.3046875, "logps/chosen": -382.0, "logps/rejected": -392.0, "loss": 0.0812, "rewards/accuracies": 0.6875, "rewards/chosen": -0.98046875, "rewards/margins": 0.390625, "rewards/rejected": -1.375, "step": 460 }, { "epoch": 0.9837781266352695, "grad_norm": 0.4371497929096222, "learning_rate": 4.712421454093007e-10, "logits/chosen": -1.2109375, "logits/rejected": 
-1.203125, "logps/chosen": -408.0, "logps/rejected": -422.0, "loss": 0.0881, "rewards/accuracies": 0.6781250238418579, "rewards/chosen": -0.984375, "rewards/margins": 0.404296875, "rewards/rejected": -1.390625, "step": 470 } ], "logging_steps": 10, "max_steps": 477, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }