{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.971563981042654, "eval_steps": 500, "global_step": 104, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.018957345971563982, "grad_norm": 71.69449642421675, "learning_rate": 4.545454545454545e-08, "logits/chosen": 228.492431640625, "logits/rejected": 249.21771240234375, "logps/chosen": -447.14471435546875, "logps/rejected": -436.09393310546875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.1895734597156398, "grad_norm": 78.4118299616628, "learning_rate": 4.545454545454545e-07, "logits/chosen": 257.2274475097656, "logits/rejected": 249.07215881347656, "logps/chosen": -381.537353515625, "logps/rejected": -444.5811767578125, "loss": 0.71, "rewards/accuracies": 0.4861111044883728, "rewards/chosen": 0.024299899116158485, "rewards/margins": 0.044586196541786194, "rewards/rejected": -0.020286299288272858, "step": 10 }, { "epoch": 0.3791469194312796, "grad_norm": 99.84300510877705, "learning_rate": 4.885348141000122e-07, "logits/chosen": 248.119384765625, "logits/rejected": 246.07846069335938, "logps/chosen": -386.1551818847656, "logps/rejected": -414.88385009765625, "loss": 0.6619, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.11353431642055511, "rewards/margins": 0.10505084693431854, "rewards/rejected": -0.21858516335487366, "step": 20 }, { "epoch": 0.5687203791469194, "grad_norm": 56.97411905200688, "learning_rate": 4.5025027361734613e-07, "logits/chosen": 247.26358032226562, "logits/rejected": 244.18240356445312, "logps/chosen": -341.32952880859375, "logps/rejected": -402.8668518066406, "loss": 0.5739, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.05143852159380913, "rewards/margins": 0.7519260048866272, "rewards/rejected": -0.8033644556999207, "step": 30 }, { "epoch": 0.7582938388625592, "grad_norm": 57.580210507052286, "learning_rate": 3.893311157806091e-07, "logits/chosen": 247.73263549804688, "logits/rejected": 259.1477966308594, "logps/chosen": -369.2883605957031, "logps/rejected": -386.1142883300781, "loss": 0.5751, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": 0.09320323169231415, "rewards/margins": 0.881219744682312, "rewards/rejected": -0.7880164384841919, "step": 40 }, { "epoch": 0.9478672985781991, "grad_norm": 77.32585608048733, "learning_rate": 3.126631330646801e-07, "logits/chosen": 246.6822052001953, "logits/rejected": 251.89175415039062, "logps/chosen": -397.3800354003906, "logps/rejected": -417.54400634765625, "loss": 0.5055, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.00946036446839571, "rewards/margins": 0.9901891946792603, "rewards/rejected": -0.980728805065155, "step": 50 }, { "epoch": 0.985781990521327, "eval_logits/chosen": 314.65277099609375, "eval_logits/rejected": 314.80328369140625, "eval_logps/chosen": -371.4647521972656, "eval_logps/rejected": -432.01666259765625, "eval_loss": 0.5193939208984375, "eval_rewards/accuracies": 0.7446808218955994, "eval_rewards/chosen": -0.17477014660835266, "eval_rewards/margins": 0.8593912124633789, "eval_rewards/rejected": -1.0341612100601196, "eval_runtime": 91.3038, "eval_samples_per_second": 8.214, "eval_steps_per_second": 0.515, "step": 52 }, { "epoch": 1.1374407582938388, "grad_norm": 32.08842708767771, "learning_rate": 2.2891223348923882e-07, "logits/chosen": 243.8182373046875, "logits/rejected": 242.9388885498047, "logps/chosen": -341.7854919433594, "logps/rejected": -424.074951171875, "loss": 0.3496, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": 0.382510781288147, "rewards/margins": 1.8218187093734741, "rewards/rejected": -1.4393078088760376, "step": 60 }, { "epoch": 1.3270142180094786, "grad_norm": 32.178505554608634, "learning_rate": 1.4754491880085317e-07, "logits/chosen": 250.7259063720703, "logits/rejected": 248.94216918945312, "logps/chosen": -350.2694091796875, "logps/rejected": -398.77850341796875, "loss": 0.2373, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": 0.50797039270401, "rewards/margins": 2.4789700508117676, "rewards/rejected": -1.9709994792938232, "step": 70 }, { "epoch": 1.5165876777251186, "grad_norm": 24.814991686693443, "learning_rate": 7.775827023107834e-08, "logits/chosen": 254.3613739013672, "logits/rejected": 242.96975708007812, "logps/chosen": -387.6805725097656, "logps/rejected": -463.39471435546875, "loss": 0.2168, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.2538264989852905, "rewards/margins": 2.580048084259033, "rewards/rejected": -2.326221227645874, "step": 80 }, { "epoch": 1.7061611374407581, "grad_norm": 34.610883975273474, "learning_rate": 2.7440387297912122e-08, "logits/chosen": 246.24142456054688, "logits/rejected": 237.8177947998047, "logps/chosen": -372.5790100097656, "logps/rejected": -471.82489013671875, "loss": 0.2116, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.15084370970726013, "rewards/margins": 2.670081615447998, "rewards/rejected": -2.519237995147705, "step": 90 }, { "epoch": 1.8957345971563981, "grad_norm": 26.75346235433071, "learning_rate": 2.27878296044029e-09, "logits/chosen": 250.3442840576172, "logits/rejected": 250.7106475830078, "logps/chosen": -373.6303405761719, "logps/rejected": -428.4443359375, "loss": 0.2167, "rewards/accuracies": 0.90625, "rewards/chosen": -0.0009252533200196922, "rewards/margins": 2.5818238258361816, "rewards/rejected": -2.5827488899230957, "step": 100 }, { "epoch": 1.971563981042654, "eval_logits/chosen": 314.4089660644531, "eval_logits/rejected": 314.86077880859375, "eval_logps/chosen": -382.4696350097656, "eval_logps/rejected": -446.88519287109375, "eval_loss": 0.5040754079818726, "eval_rewards/accuracies": 0.7659574747085571, "eval_rewards/chosen": -0.7250128984451294, "eval_rewards/margins": 1.0525743961334229, "eval_rewards/rejected": -1.7775872945785522, "eval_runtime": 86.82, "eval_samples_per_second": 8.639, "eval_steps_per_second": 0.541, "step": 104 }, { "epoch": 1.971563981042654, "step": 104, "total_flos": 0.0, "train_loss": 0.4161795240182143, "train_runtime": 4732.7297, "train_samples_per_second": 2.852, "train_steps_per_second": 0.022 } ], "logging_steps": 10, "max_steps": 104, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }