{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9936305732484076, "eval_steps": 500, "global_step": 78, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.012738853503184714, "grad_norm": 11.358307592517615, "learning_rate": 6.25e-08, "logits/chosen": -2.9414963722229004, "logits/rejected": -2.8714659214019775, "logps/chosen": -311.84521484375, "logps/pi_response": -74.39646911621094, "logps/ref_response": -74.39646911621094, "logps/rejected": -137.14251708984375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.12738853503184713, "grad_norm": 9.998455044790436, "learning_rate": 4.989935734988097e-07, "logits/chosen": -2.8198399543762207, "logits/rejected": -2.7871909141540527, "logps/chosen": -243.38845825195312, "logps/pi_response": -64.48577117919922, "logps/ref_response": -64.41886138916016, "logps/rejected": -162.31851196289062, "loss": 0.6913, "rewards/accuracies": 0.4861111044883728, "rewards/chosen": 0.006698554381728172, "rewards/margins": 0.002677548211067915, "rewards/rejected": 0.00402100570499897, "step": 10 }, { "epoch": 0.25477707006369427, "grad_norm": 8.569494020475565, "learning_rate": 4.646121984004665e-07, "logits/chosen": -2.7100276947021484, "logits/rejected": -2.675428867340088, "logps/chosen": -269.69061279296875, "logps/pi_response": -83.09251403808594, "logps/ref_response": -78.46118927001953, "logps/rejected": -180.74826049804688, "loss": 0.6686, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.06145774573087692, "rewards/margins": 0.06889880448579788, "rewards/rejected": -0.007441061083227396, "step": 20 }, { "epoch": 0.3821656050955414, "grad_norm": 9.060999242111171, "learning_rate": 3.877242453630256e-07, "logits/chosen": -2.6871438026428223, "logits/rejected": -2.6598358154296875, "logps/chosen": -257.09381103515625, "logps/pi_response": -102.15229797363281, "logps/ref_response": -74.38651275634766, "logps/rejected": -186.90274047851562, "loss": 0.6379, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.028406163677573204, "rewards/margins": 0.16392305493354797, "rewards/rejected": -0.19232919812202454, "step": 30 }, { "epoch": 0.5095541401273885, "grad_norm": 9.618564789622928, "learning_rate": 2.8355831645441387e-07, "logits/chosen": -2.63045072555542, "logits/rejected": -2.6070001125335693, "logps/chosen": -235.0849151611328, "logps/pi_response": -111.06126403808594, "logps/ref_response": -67.13166046142578, "logps/rejected": -217.4446563720703, "loss": 0.6102, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.15396903455257416, "rewards/margins": 0.16582393646240234, "rewards/rejected": -0.3197929263114929, "step": 40 }, { "epoch": 0.6369426751592356, "grad_norm": 11.295831887724951, "learning_rate": 1.7274575140626315e-07, "logits/chosen": -2.6650638580322266, "logits/rejected": -2.6218314170837402, "logps/chosen": -272.8438720703125, "logps/pi_response": -133.91879272460938, "logps/ref_response": -70.64263916015625, "logps/rejected": -217.2205352783203, "loss": 0.5676, "rewards/accuracies": 0.75, "rewards/chosen": -0.20129744708538055, "rewards/margins": 0.3560473322868347, "rewards/rejected": -0.5573447346687317, "step": 50 }, { "epoch": 0.7643312101910829, "grad_norm": 11.795692669450544, "learning_rate": 7.723433775328384e-08, "logits/chosen": -2.7076563835144043, "logits/rejected": -2.654421329498291, "logps/chosen": -280.95416259765625, "logps/pi_response": -140.62283325195312, "logps/ref_response": -70.89532470703125, "logps/rejected": -252.49368286132812, "loss": 0.5608, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.2872604727745056, "rewards/margins": 0.3657943606376648, "rewards/rejected": -0.6530548334121704, "step": 60 }, { "epoch": 0.89171974522293, "grad_norm": 13.957707294493547, "learning_rate": 1.5941282340065697e-08, "logits/chosen": -2.670067310333252, "logits/rejected": -2.6604971885681152, "logps/chosen": -266.5218200683594, "logps/pi_response": -149.20486450195312, "logps/ref_response": -73.49095153808594, "logps/rejected": -272.1815185546875, "loss": 0.5576, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.34963458776474, "rewards/margins": 0.4036545753479004, "rewards/rejected": -0.7532891035079956, "step": 70 }, { "epoch": 0.9936305732484076, "step": 78, "total_flos": 0.0, "train_loss": 0.604345291088789, "train_runtime": 1779.6676, "train_samples_per_second": 5.619, "train_steps_per_second": 0.044 } ], "logging_steps": 10, "max_steps": 78, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }