{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9968652037617555, "eval_steps": 500, "global_step": 159, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "learning_rate": 3.125e-08, "logits/chosen": -2.6282713413238525, "logits/rejected": -2.5908496379852295, "logps/chosen": -197.16619873046875, "logps/pi_response": -79.30451965332031, "logps/ref_response": -79.30451965332031, "logps/rejected": -296.1330261230469, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.06, "learning_rate": 3.1249999999999997e-07, "logits/chosen": -2.6781411170959473, "logits/rejected": -2.6469974517822266, "logps/chosen": -346.4255065917969, "logps/pi_response": -118.85271453857422, "logps/ref_response": -117.75043487548828, "logps/rejected": -370.81414794921875, "loss": 0.6897, "rewards/accuracies": 0.4722222089767456, "rewards/chosen": -0.02221786230802536, "rewards/margins": 0.002993849106132984, "rewards/rejected": -0.02521171048283577, "step": 10 }, { "epoch": 0.13, "learning_rate": 4.990353313429303e-07, "logits/chosen": -2.6510815620422363, "logits/rejected": -2.643944501876831, "logps/chosen": -367.3860168457031, "logps/pi_response": -144.99758911132812, "logps/ref_response": -130.18136596679688, "logps/rejected": -418.2279357910156, "loss": 0.6531, "rewards/accuracies": 0.65625, "rewards/chosen": -0.34063997864723206, "rewards/margins": 0.11587037891149521, "rewards/rejected": -0.4565103054046631, "step": 20 }, { "epoch": 0.19, "learning_rate": 4.882681251368548e-07, "logits/chosen": -2.4943203926086426, "logits/rejected": -2.419273853302002, "logps/chosen": -414.4820251464844, "logps/pi_response": -131.7759552001953, "logps/ref_response": -112.0391616821289, "logps/rejected": -451.5794372558594, "loss": 0.6399, "rewards/accuracies": 0.625, "rewards/chosen": -0.7901811003684998, "rewards/margins": 0.368924617767334, "rewards/rejected": -1.1591057777404785, "step": 30 }, { "epoch": 0.25, "learning_rate": 4.6604720940421207e-07, "logits/chosen": -2.362710952758789, "logits/rejected": -2.3332810401916504, "logps/chosen": -440.0569763183594, "logps/pi_response": -165.1667938232422, "logps/ref_response": -109.482421875, "logps/rejected": -516.9385986328125, "loss": 0.6276, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.1469193696975708, "rewards/margins": 0.32388854026794434, "rewards/rejected": -1.4708077907562256, "step": 40 }, { "epoch": 0.31, "learning_rate": 4.3344075855595097e-07, "logits/chosen": -2.3507821559906006, "logits/rejected": -2.292423725128174, "logps/chosen": -402.9019775390625, "logps/pi_response": -151.48883056640625, "logps/ref_response": -116.75199127197266, "logps/rejected": -471.33294677734375, "loss": 0.6044, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.7677600979804993, "rewards/margins": 0.36667126417160034, "rewards/rejected": -1.1344313621520996, "step": 50 }, { "epoch": 0.38, "learning_rate": 3.920161866827889e-07, "logits/chosen": -2.2195403575897217, "logits/rejected": -2.180490255355835, "logps/chosen": -431.85101318359375, "logps/pi_response": -163.049072265625, "logps/ref_response": -107.99072265625, "logps/rejected": -479.72698974609375, "loss": 0.6012, "rewards/accuracies": 0.625, "rewards/chosen": -1.1244434118270874, "rewards/margins": 0.3633125424385071, "rewards/rejected": -1.4877557754516602, "step": 60 }, { "epoch": 0.44, "learning_rate": 3.4376480090239047e-07, "logits/chosen": -2.2236759662628174, "logits/rejected": -2.145097255706787, "logps/chosen": -418.8814392089844, "logps/pi_response": -171.4234619140625, "logps/ref_response": -112.27213287353516, "logps/rejected": -477.97998046875, "loss": 0.5864, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.0089619159698486, "rewards/margins": 0.407695472240448, "rewards/rejected": -1.4166573286056519, "step": 70 }, { "epoch": 0.5, "learning_rate": 2.910060778827554e-07, "logits/chosen": -2.168750047683716, "logits/rejected": -2.08182954788208, "logps/chosen": -450.20745849609375, "logps/pi_response": -176.2133331298828, "logps/ref_response": -100.95310974121094, "logps/rejected": -498.7420349121094, "loss": 0.5775, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.27684485912323, "rewards/margins": 0.3819858431816101, "rewards/rejected": -1.6588308811187744, "step": 80 }, { "epoch": 0.56, "learning_rate": 2.3627616503391812e-07, "logits/chosen": -2.2347817420959473, "logits/rejected": -2.1916308403015137, "logps/chosen": -392.2938537597656, "logps/pi_response": -164.12486267089844, "logps/ref_response": -100.01224517822266, "logps/rejected": -519.1055297851562, "loss": 0.5703, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.00107741355896, "rewards/margins": 0.6102837920188904, "rewards/rejected": -1.6113611459732056, "step": 90 }, { "epoch": 0.63, "learning_rate": 1.8220596619089573e-07, "logits/chosen": -2.1057143211364746, "logits/rejected": -2.0601727962493896, "logps/chosen": -470.12554931640625, "logps/pi_response": -193.02027893066406, "logps/ref_response": -112.04179382324219, "logps/rejected": -601.5619506835938, "loss": 0.5477, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.430586814880371, "rewards/margins": 0.5693450570106506, "rewards/rejected": -1.9999316930770874, "step": 100 }, { "epoch": 0.69, "learning_rate": 1.3139467229135998e-07, "logits/chosen": -2.1225686073303223, "logits/rejected": -2.095275402069092, "logps/chosen": -470.36651611328125, "logps/pi_response": -188.7615509033203, "logps/ref_response": -100.41735076904297, "logps/rejected": -545.0133056640625, "loss": 0.5734, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.5548226833343506, "rewards/margins": 0.403084933757782, "rewards/rejected": -1.9579073190689087, "step": 110 }, { "epoch": 0.75, "learning_rate": 8.628481651367875e-08, "logits/chosen": -2.1088550090789795, "logits/rejected": -2.0325331687927246, "logps/chosen": -483.610595703125, "logps/pi_response": -217.5375518798828, "logps/ref_response": -129.7823944091797, "logps/rejected": -566.143798828125, "loss": 0.542, "rewards/accuracies": 0.71875, "rewards/chosen": -1.4614275693893433, "rewards/margins": 0.5255736112594604, "rewards/rejected": -1.9870010614395142, "step": 120 }, { "epoch": 0.82, "learning_rate": 4.904486005914027e-08, "logits/chosen": -2.1483001708984375, "logits/rejected": -2.087111234664917, "logps/chosen": -477.526123046875, "logps/pi_response": -191.5972900390625, "logps/ref_response": -111.270751953125, "logps/rejected": -556.4031982421875, "loss": 0.561, "rewards/accuracies": 0.75, "rewards/chosen": -1.4053503274917603, "rewards/margins": 0.597406268119812, "rewards/rejected": -2.0027565956115723, "step": 130 }, { "epoch": 0.88, "learning_rate": 2.1464952759020856e-08, "logits/chosen": -2.1172375679016113, "logits/rejected": -2.0181617736816406, "logps/chosen": -475.329345703125, "logps/pi_response": -208.83413696289062, "logps/ref_response": -119.419189453125, "logps/rejected": -562.1836547851562, "loss": 0.5439, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.4041850566864014, "rewards/margins": 0.5541211366653442, "rewards/rejected": -1.9583065509796143, "step": 140 }, { "epoch": 0.94, "learning_rate": 4.8708793644441086e-09, "logits/chosen": -2.0521388053894043, "logits/rejected": -1.9707081317901611, "logps/chosen": -441.0538635253906, "logps/pi_response": -184.4268798828125, "logps/ref_response": -101.12681579589844, "logps/rejected": -518.763671875, "loss": 0.5564, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.354651689529419, "rewards/margins": 0.5879614353179932, "rewards/rejected": -1.9426130056381226, "step": 150 }, { "epoch": 1.0, "step": 159, "total_flos": 0.0, "train_loss": 0.5899531646344647, "train_runtime": 4183.7712, "train_samples_per_second": 4.871, "train_steps_per_second": 0.038 } ], "logging_steps": 10, "max_steps": 159, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }