{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9968652037617555, "eval_steps": 500, "global_step": 159, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.006269592476489028, "grad_norm": 11.162558389320719, "learning_rate": 3.125e-08, "logits/chosen": -2.832691192626953, "logits/rejected": -2.789004325866699, "logps/chosen": -379.2402648925781, "logps/pi_response": -84.25662994384766, "logps/ref_response": -84.25662994384766, "logps/rejected": -192.58773803710938, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.06269592476489028, "grad_norm": 9.199296960060826, "learning_rate": 3.1249999999999997e-07, "logits/chosen": -2.779388427734375, "logits/rejected": -2.744753837585449, "logps/chosen": -243.76174926757812, "logps/pi_response": -68.54817962646484, "logps/ref_response": -68.44412231445312, "logps/rejected": -167.88645935058594, "loss": 0.6927, "rewards/accuracies": 0.4444444477558136, "rewards/chosen": 0.0011128478217869997, "rewards/margins": 0.0005714390426874161, "rewards/rejected": 0.0005414087790995836, "step": 10 }, { "epoch": 0.12539184952978055, "grad_norm": 9.622073702330978, "learning_rate": 4.990353313429303e-07, "logits/chosen": -2.7916176319122314, "logits/rejected": -2.7705063819885254, "logps/chosen": -232.59619140625, "logps/pi_response": -72.38710021972656, "logps/ref_response": -71.28221893310547, "logps/rejected": -165.4346466064453, "loss": 0.6786, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.030751097947359085, "rewards/margins": 0.021662823855876923, "rewards/rejected": 0.009088275022804737, "step": 20 }, { "epoch": 0.18808777429467086, "grad_norm": 8.334770075469805, "learning_rate": 4.882681251368548e-07, "logits/chosen": -2.7269303798675537, "logits/rejected": -2.6703150272369385, "logps/chosen": -238.45944213867188, "logps/pi_response": -80.09962463378906, "logps/ref_response": -64.93635559082031, "logps/rejected": -164.25949096679688, "loss": 0.6524, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.02072182670235634, "rewards/margins": 0.1209292858839035, "rewards/rejected": -0.10020747035741806, "step": 30 }, { "epoch": 0.2507836990595611, "grad_norm": 9.807608483085172, "learning_rate": 4.6604720940421207e-07, "logits/chosen": -2.65498948097229, "logits/rejected": -2.608666181564331, "logps/chosen": -271.5018615722656, "logps/pi_response": -114.51536560058594, "logps/ref_response": -69.87471008300781, "logps/rejected": -211.77072143554688, "loss": 0.6131, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.11142440885305405, "rewards/margins": 0.21302208304405212, "rewards/rejected": -0.32444649934768677, "step": 40 }, { "epoch": 0.31347962382445144, "grad_norm": 14.498018136229003, "learning_rate": 4.3344075855595097e-07, "logits/chosen": -2.6647868156433105, "logits/rejected": -2.6350674629211426, "logps/chosen": -267.47705078125, "logps/pi_response": -130.49932861328125, "logps/ref_response": -67.23551177978516, "logps/rejected": -212.48483276367188, "loss": 0.5801, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.21161659061908722, "rewards/margins": 0.35675129294395447, "rewards/rejected": -0.5683678388595581, "step": 50 }, { "epoch": 0.3761755485893417, "grad_norm": 13.440157501390912, "learning_rate": 3.920161866827889e-07, "logits/chosen": -2.6458613872528076, "logits/rejected": -2.5944952964782715, "logps/chosen": -237.12710571289062, "logps/pi_response": -148.73211669921875, "logps/ref_response": -64.54133605957031, "logps/rejected": -239.72702026367188, "loss": 0.5536, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.36507314443588257, "rewards/margins": 0.4184595048427582, "rewards/rejected": -0.7835326194763184, "step": 60 }, { "epoch": 0.438871473354232, "grad_norm": 15.982610593807811, "learning_rate": 3.4376480090239047e-07, "logits/chosen": -2.680788040161133, "logits/rejected": -2.6525492668151855, "logps/chosen": -269.2456970214844, "logps/pi_response": -195.72213745117188, "logps/ref_response": -69.08720397949219, "logps/rejected": -306.4203186035156, "loss": 0.5003, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.5016717910766602, "rewards/margins": 0.7233041524887085, "rewards/rejected": -1.224975824356079, "step": 70 }, { "epoch": 0.5015673981191222, "grad_norm": 22.558501492450485, "learning_rate": 2.910060778827554e-07, "logits/chosen": -2.7134718894958496, "logits/rejected": -2.6782937049865723, "logps/chosen": -313.21136474609375, "logps/pi_response": -213.4176788330078, "logps/ref_response": -72.85678100585938, "logps/rejected": -303.14056396484375, "loss": 0.5063, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -0.6677788496017456, "rewards/margins": 0.8193937540054321, "rewards/rejected": -1.4871724843978882, "step": 80 }, { "epoch": 0.5642633228840125, "grad_norm": 21.710845718415612, "learning_rate": 2.3627616503391812e-07, "logits/chosen": -2.634028911590576, "logits/rejected": -2.608798027038574, "logps/chosen": -329.37799072265625, "logps/pi_response": -211.5565948486328, "logps/ref_response": -70.06621551513672, "logps/rejected": -340.3314514160156, "loss": 0.4772, "rewards/accuracies": 0.84375, "rewards/chosen": -0.7806206941604614, "rewards/margins": 0.7901986837387085, "rewards/rejected": -1.57081937789917, "step": 90 }, { "epoch": 0.6269592476489029, "grad_norm": 27.570197538418466, "learning_rate": 1.8220596619089573e-07, "logits/chosen": -2.6406660079956055, "logits/rejected": -2.584998369216919, "logps/chosen": -326.03961181640625, "logps/pi_response": -226.829833984375, "logps/ref_response": -68.18948364257812, "logps/rejected": -311.16668701171875, "loss": 0.4812, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.771049976348877, "rewards/margins": 0.8004252314567566, "rewards/rejected": -1.5714751482009888, "step": 100 }, { "epoch": 0.6896551724137931, "grad_norm": 25.855890266739543, "learning_rate": 1.3139467229135998e-07, "logits/chosen": -2.5621211528778076, "logits/rejected": -2.5297341346740723, "logps/chosen": -303.0425720214844, "logps/pi_response": -235.69052124023438, "logps/ref_response": -61.66025924682617, "logps/rejected": -347.60089111328125, "loss": 0.4747, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.0712878704071045, "rewards/margins": 0.6884299516677856, "rewards/rejected": -1.7597179412841797, "step": 110 }, { "epoch": 0.7523510971786834, "grad_norm": 25.63885329215509, "learning_rate": 8.628481651367875e-08, "logits/chosen": -2.5914602279663086, "logits/rejected": -2.5509674549102783, "logps/chosen": -330.34613037109375, "logps/pi_response": -255.2554473876953, "logps/ref_response": -64.22006225585938, "logps/rejected": -360.2675476074219, "loss": 0.4799, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.0213677883148193, "rewards/margins": 1.0237469673156738, "rewards/rejected": -2.045114517211914, "step": 120 }, { "epoch": 0.8150470219435737, "grad_norm": 26.281388822149008, "learning_rate": 4.904486005914027e-08, "logits/chosen": -2.5695884227752686, "logits/rejected": -2.527682065963745, "logps/chosen": -339.78851318359375, "logps/pi_response": -252.6923370361328, "logps/ref_response": -70.67754364013672, "logps/rejected": -348.1340026855469, "loss": 0.4568, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8407719731330872, "rewards/margins": 1.1005146503448486, "rewards/rejected": -1.941286325454712, "step": 130 }, { "epoch": 0.877742946708464, "grad_norm": 21.11421165620455, "learning_rate": 2.1464952759020856e-08, "logits/chosen": -2.5593600273132324, "logits/rejected": -2.521488904953003, "logps/chosen": -350.98797607421875, "logps/pi_response": -244.7444305419922, "logps/ref_response": -70.74293518066406, "logps/rejected": -367.61639404296875, "loss": 0.4479, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.0182468891143799, "rewards/margins": 0.9283415675163269, "rewards/rejected": -1.9465882778167725, "step": 140 }, { "epoch": 0.9404388714733543, "grad_norm": 34.097104034294276, "learning_rate": 4.8708793644441086e-09, "logits/chosen": -2.5964386463165283, "logits/rejected": -2.5334842205047607, "logps/chosen": -350.24432373046875, "logps/pi_response": -242.6420135498047, "logps/ref_response": -74.09484100341797, "logps/rejected": -348.6947021484375, "loss": 0.4642, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.9555439949035645, "rewards/margins": 0.8465269207954407, "rewards/rejected": -1.80207097530365, "step": 150 }, { "epoch": 0.9968652037617555, "step": 159, "total_flos": 0.0, "train_loss": 0.5324955706326466, "train_runtime": 3623.407, "train_samples_per_second": 5.624, "train_steps_per_second": 0.044 } ], "logging_steps": 10, "max_steps": 159, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }