{ "best_metric": null, "best_model_checkpoint": null, "epoch": 34.8421052631579, "eval_steps": 500, "global_step": 140, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 2.4210526315789473, "grad_norm": 0.6825495362281799, "learning_rate": 0.00014285714285714287, "logits/chosen": -0.9514083862304688, "logits/rejected": -1.0533627271652222, "logps/chosen": -39.20041275024414, "logps/rejected": -21.37519073486328, "loss": 0.5664, "rewards/accuracies": 0.7743055820465088, "rewards/chosen": 0.15898703038692474, "rewards/margins": 0.5821112990379333, "rewards/rejected": -0.42312419414520264, "step": 10 }, { "epoch": 4.842105263157895, "grad_norm": 0.08670935779809952, "learning_rate": 0.00019888308262251285, "logits/chosen": -0.13068915903568268, "logits/rejected": -0.16997402906417847, "logps/chosen": -74.40914916992188, "logps/rejected": -151.8779296875, "loss": 0.0471, "rewards/accuracies": 0.9635416865348816, "rewards/chosen": -3.3677978515625, "rewards/margins": 10.11207389831543, "rewards/rejected": -13.479872703552246, "step": 20 }, { "epoch": 7.421052631578947, "grad_norm": 0.0034703314304351807, "learning_rate": 0.00019214762118704076, "logits/chosen": 0.19064301252365112, "logits/rejected": 0.24179647862911224, "logps/chosen": -95.65469360351562, "logps/rejected": -240.3935089111328, "loss": 0.0286, "rewards/accuracies": 0.9664474129676819, "rewards/chosen": -5.480339050292969, "rewards/margins": 16.84355354309082, "rewards/rejected": -22.323888778686523, "step": 30 }, { "epoch": 9.842105263157894, "grad_norm": 0.0037726943846791983, "learning_rate": 0.00017971325072229226, "logits/chosen": 0.08421485126018524, "logits/rejected": 0.1458778977394104, "logps/chosen": -170.75747680664062, "logps/rejected": -437.0244140625, "loss": 0.0258, "rewards/accuracies": 0.9670138955116272, "rewards/chosen": -12.99338150024414, "rewards/margins": 28.99999237060547, "rewards/rejected": -41.993377685546875, "step": 40 }, { "epoch": 12.421052631578947, "grad_norm": 0.0033487407490611076, "learning_rate": 0.00016234898018587337, "logits/chosen": -0.04333849251270294, "logits/rejected": -0.01762447878718376, "logps/chosen": -186.37686157226562, "logps/rejected": -464.90765380859375, "loss": 0.0272, "rewards/accuracies": 0.9671053290367126, "rewards/chosen": -14.553586959838867, "rewards/margins": 30.223119735717773, "rewards/rejected": -44.77670669555664, "step": 50 }, { "epoch": 14.842105263157894, "grad_norm": 0.003097180975601077, "learning_rate": 0.00014112871031306119, "logits/chosen": -0.13475045561790466, "logits/rejected": -0.1398223638534546, "logps/chosen": -184.91830444335938, "logps/rejected": -461.1513977050781, "loss": 0.0244, "rewards/accuracies": 0.96875, "rewards/chosen": -14.403005599975586, "rewards/margins": 29.99542999267578, "rewards/rejected": -44.398433685302734, "step": 60 }, { "epoch": 17.42105263157895, "grad_norm": 0.0024468335323035717, "learning_rate": 0.00011736481776669306, "logits/chosen": -0.1959075629711151, "logits/rejected": -0.22540684044361115, "logps/chosen": -185.68663024902344, "logps/rejected": -458.96826171875, "loss": 0.0261, "rewards/accuracies": 0.9684211015701294, "rewards/chosen": -14.49423885345459, "rewards/margins": 29.691282272338867, "rewards/rejected": -44.185516357421875, "step": 70 }, { "epoch": 19.842105263157894, "grad_norm": 0.002537067048251629, "learning_rate": 9.252699064135758e-05, "logits/chosen": -0.22896860539913177, "logits/rejected": -0.2724004089832306, "logps/chosen": -184.9608154296875, "logps/rejected": -460.61468505859375, "loss": 0.0266, "rewards/accuracies": 0.9659722447395325, "rewards/chosen": -14.424090385437012, "rewards/margins": 29.924333572387695, "rewards/rejected": -44.34842300415039, "step": 80 }, { "epoch": 22.42105263157895, "grad_norm": 0.0032824031077325344, "learning_rate": 6.815133497483157e-05, "logits/chosen": -0.24728278815746307, "logits/rejected": -0.2936950922012329, "logps/chosen": -185.9248046875, "logps/rejected": -467.2992858886719, "loss": 0.0287, "rewards/accuracies": 0.9651316404342651, "rewards/chosen": -14.498348236083984, "rewards/margins": 30.518945693969727, "rewards/rejected": -45.01729202270508, "step": 90 }, { "epoch": 24.842105263157894, "grad_norm": 0.003522921120747924, "learning_rate": 4.574537361342407e-05, "logits/chosen": -0.2618289887905121, "logits/rejected": -0.311085045337677, "logps/chosen": -187.0611572265625, "logps/rejected": -466.8361511230469, "loss": 0.0258, "rewards/accuracies": 0.9670138955116272, "rewards/chosen": -14.643547058105469, "rewards/margins": 30.329221725463867, "rewards/rejected": -44.97277069091797, "step": 100 }, { "epoch": 27.42105263157895, "grad_norm": 0.0024658790789544582, "learning_rate": 2.669481281701739e-05, "logits/chosen": -0.27023741602897644, "logits/rejected": -0.3240560293197632, "logps/chosen": -186.94210815429688, "logps/rejected": -473.1488952636719, "loss": 0.0276, "rewards/accuracies": 0.9664474129676819, "rewards/chosen": -14.606731414794922, "rewards/margins": 30.998502731323242, "rewards/rejected": -45.6052360534668, "step": 110 }, { "epoch": 29.842105263157894, "grad_norm": 0.0034348091576248407, "learning_rate": 1.2177842662977135e-05, "logits/chosen": -0.27230748534202576, "logits/rejected": -0.32499459385871887, "logps/chosen": -185.62405395507812, "logps/rejected": -470.46063232421875, "loss": 0.0252, "rewards/accuracies": 0.9677083492279053, "rewards/chosen": -14.492985725402832, "rewards/margins": 30.83690071105957, "rewards/rejected": -45.329891204833984, "step": 120 }, { "epoch": 32.421052631578945, "grad_norm": 0.0032501835376024246, "learning_rate": 3.092271377092215e-06, "logits/chosen": -0.27767735719680786, "logits/rejected": -0.3315570652484894, "logps/chosen": -187.55230712890625, "logps/rejected": -470.9134521484375, "loss": 0.0287, "rewards/accuracies": 0.9651316404342651, "rewards/chosen": -14.68417739868164, "rewards/margins": 30.694440841674805, "rewards/rejected": -45.37861633300781, "step": 130 }, { "epoch": 34.8421052631579, "grad_norm": 0.003052822547033429, "learning_rate": 0.0, "logits/chosen": -0.27554523944854736, "logits/rejected": -0.32890552282333374, "logps/chosen": -185.5701904296875, "logps/rejected": -472.9700927734375, "loss": 0.0244, "rewards/accuracies": 0.96875, "rewards/chosen": -14.46578311920166, "rewards/margins": 31.117090225219727, "rewards/rejected": -45.58286666870117, "step": 140 }, { "epoch": 34.8421052631579, "step": 140, "total_flos": 3.1072679368851456e+17, "train_loss": 0.06661632827350071, "train_runtime": 6674.5686, "train_samples_per_second": 6.319, "train_steps_per_second": 0.021 } ], "logging_steps": 10, "max_steps": 140, "num_input_tokens_seen": 0, "num_train_epochs": 35, "save_steps": 70, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.1072679368851456e+17, "train_batch_size": 32, "trial_name": null, "trial_params": null }