{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9936305732484076, "eval_steps": 500, "global_step": 78, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.012738853503184714, "grad_norm": 21.624867146821636, "learning_rate": 6.25e-08, "logits/chosen": -2.737076759338379, "logits/rejected": -2.736344814300537, "logps/chosen": -290.1990661621094, "logps/pi_response": -186.79766845703125, "logps/ref_response": -186.79766845703125, "logps/rejected": -404.5589599609375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.12738853503184713, "grad_norm": 19.161383473353002, "learning_rate": 4.989935734988097e-07, "logits/chosen": -2.6904942989349365, "logits/rejected": -2.6532483100891113, "logps/chosen": -229.66769409179688, "logps/pi_response": -143.10076904296875, "logps/ref_response": -144.07577514648438, "logps/rejected": -283.5401916503906, "loss": 0.6834, "rewards/accuracies": 0.5347222089767456, "rewards/chosen": -0.012149970047175884, "rewards/margins": 0.016954369843006134, "rewards/rejected": -0.029104342684149742, "step": 10 }, { "epoch": 0.25477707006369427, "grad_norm": 25.259189818921946, "learning_rate": 4.646121984004665e-07, "logits/chosen": -2.657824754714966, "logits/rejected": -2.625739336013794, "logps/chosen": -244.0951690673828, "logps/pi_response": -130.72300720214844, "logps/ref_response": -137.23435974121094, "logps/rejected": -318.93115234375, "loss": 0.5936, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.24313190579414368, "rewards/margins": 0.3598789572715759, "rewards/rejected": -0.6030109524726868, "step": 20 }, { "epoch": 0.3821656050955414, "grad_norm": 21.327135622241446, "learning_rate": 3.877242453630256e-07, "logits/chosen": -2.723853349685669, "logits/rejected": -2.673682451248169, "logps/chosen": -309.9010925292969, "logps/pi_response": -153.6499481201172, "logps/ref_response": -154.19497680664062, "logps/rejected": -397.6539001464844, "loss": 0.5664, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.5057164430618286, "rewards/margins": 0.7059683799743652, "rewards/rejected": -1.2116848230361938, "step": 30 }, { "epoch": 0.5095541401273885, "grad_norm": 23.49582384200601, "learning_rate": 2.8355831645441387e-07, "logits/chosen": -2.687347650527954, "logits/rejected": -2.6348681449890137, "logps/chosen": -253.1380157470703, "logps/pi_response": -134.76382446289062, "logps/ref_response": -133.91629028320312, "logps/rejected": -381.5281066894531, "loss": 0.54, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.3809475302696228, "rewards/margins": 0.6450485587120056, "rewards/rejected": -1.0259960889816284, "step": 40 }, { "epoch": 0.6369426751592356, "grad_norm": 25.663450934512035, "learning_rate": 1.7274575140626315e-07, "logits/chosen": -2.676809787750244, "logits/rejected": -2.6430976390838623, "logps/chosen": -275.3895568847656, "logps/pi_response": -152.5225830078125, "logps/ref_response": -138.03114318847656, "logps/rejected": -408.5316162109375, "loss": 0.5409, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.5592767000198364, "rewards/margins": 0.6597134470939636, "rewards/rejected": -1.2189903259277344, "step": 50 }, { "epoch": 0.7643312101910829, "grad_norm": 22.272371630156243, "learning_rate": 7.723433775328384e-08, "logits/chosen": -2.6719603538513184, "logits/rejected": -2.6622776985168457, "logps/chosen": -273.9826965332031, "logps/pi_response": -162.14956665039062, "logps/ref_response": -146.4705810546875, "logps/rejected": -414.38818359375, "loss": 0.5308, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.5259829759597778, "rewards/margins": 0.6048363447189331, "rewards/rejected": -1.1308192014694214, "step": 60 }, { "epoch": 0.89171974522293, "grad_norm": 23.694168211855562, "learning_rate": 1.5941282340065697e-08, "logits/chosen": -2.5099849700927734, "logits/rejected": -2.475076198577881, "logps/chosen": -290.50506591796875, "logps/pi_response": -179.0775146484375, "logps/ref_response": -147.26612854003906, "logps/rejected": -441.50421142578125, "loss": 0.5225, "rewards/accuracies": 0.78125, "rewards/chosen": -0.6331243515014648, "rewards/margins": 0.7736427187919617, "rewards/rejected": -1.4067671298980713, "step": 70 }, { "epoch": 0.9936305732484076, "step": 78, "total_flos": 0.0, "train_loss": 0.562170364917853, "train_runtime": 1746.7486, "train_samples_per_second": 5.725, "train_steps_per_second": 0.045 } ], "logging_steps": 10, "max_steps": 78, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }