{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9905956112852664, "eval_steps": 500, "global_step": 79, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "learning_rate": 6.25e-08, "logits/chosen": -2.9087584018707275, "logits/rejected": -2.8338208198547363, "logps/chosen": -352.96319580078125, "logps/pi_response": -77.43819427490234, "logps/ref_response": -77.43819427490234, "logps/rejected": -170.26690673828125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.13, "learning_rate": 4.990217055187362e-07, "logits/chosen": -2.7869324684143066, "logits/rejected": -2.7639520168304443, "logps/chosen": -237.83615112304688, "logps/pi_response": -73.73171997070312, "logps/ref_response": -73.52912139892578, "logps/rejected": -170.54678344726562, "loss": 0.6905, "rewards/accuracies": 0.5486111044883728, "rewards/chosen": 0.005888951942324638, "rewards/margins": 0.004442126490175724, "rewards/rejected": 0.0014468259178102016, "step": 10 }, { "epoch": 0.25, "learning_rate": 4.655786431300069e-07, "logits/chosen": -2.7454209327697754, "logits/rejected": -2.6917433738708496, "logps/chosen": -248.8080596923828, "logps/pi_response": -79.94108581542969, "logps/ref_response": -71.4201431274414, "logps/rejected": -174.76712036132812, "loss": 0.6604, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.04190480336546898, "rewards/margins": 0.08115339279174805, "rewards/rejected": -0.03924858942627907, "step": 20 }, { "epoch": 0.38, "learning_rate": 3.9061232191019517e-07, "logits/chosen": -2.658690929412842, "logits/rejected": -2.623389959335327, "logps/chosen": -234.90585327148438, "logps/pi_response": -105.19673156738281, "logps/ref_response": -68.3199691772461, "logps/rejected": -190.49130249023438, "loss": 0.6266, "rewards/accuracies": 0.7406250238418579, "rewards/chosen": -0.0959092304110527, "rewards/margins": 0.18841782212257385, "rewards/rejected": -0.28432708978652954, "step": 30 }, { "epoch": 0.5, "learning_rate": 2.8856223324132555e-07, "logits/chosen": -2.665231943130493, "logits/rejected": -2.6431384086608887, "logps/chosen": -259.9793395996094, "logps/pi_response": -145.41070556640625, "logps/ref_response": -75.2870864868164, "logps/rejected": -230.671142578125, "loss": 0.5798, "rewards/accuracies": 0.75, "rewards/chosen": -0.22865232825279236, "rewards/margins": 0.35143420100212097, "rewards/rejected": -0.5800865292549133, "step": 40 }, { "epoch": 0.63, "learning_rate": 1.7908455541642582e-07, "logits/chosen": -2.667163133621216, "logits/rejected": -2.633802890777588, "logps/chosen": -287.7627258300781, "logps/pi_response": -154.95919799804688, "logps/ref_response": -74.21197509765625, "logps/rejected": -249.1497039794922, "loss": 0.5501, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.32577013969421387, "rewards/margins": 0.43573999404907227, "rewards/rejected": -0.7615101933479309, "step": 50 }, { "epoch": 0.75, "learning_rate": 8.32661172908373e-08, "logits/chosen": -2.664795398712158, "logits/rejected": -2.6296298503875732, "logps/chosen": -257.57293701171875, "logps/pi_response": -161.33213806152344, "logps/ref_response": -66.34608459472656, "logps/rejected": -266.1256408691406, "loss": 0.5298, "rewards/accuracies": 0.796875, "rewards/chosen": -0.4251323342323303, "rewards/margins": 0.5614072680473328, "rewards/rejected": -0.9865396618843079, "step": 60 }, { "epoch": 0.88, "learning_rate": 1.956279997278043e-08, "logits/chosen": -2.6611087322235107, "logits/rejected": -2.6245808601379395, "logps/chosen": -307.0917053222656, "logps/pi_response": -187.67933654785156, "logps/ref_response": -77.6395034790039, "logps/rejected": -278.4061279296875, "loss": 0.5189, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.49217742681503296, "rewards/margins": 0.6050472259521484, "rewards/rejected": -1.097224473953247, "step": 70 }, { "epoch": 0.99, "step": 79, "total_flos": 0.0, "train_loss": 0.5867953602271744, "train_runtime": 4674.1967, "train_samples_per_second": 4.36, "train_steps_per_second": 0.017 } ], "logging_steps": 10, "max_steps": 79, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }