{ "best_global_step": 50, "best_metric": 0.5247398614883423, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 50, "global_step": 201, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.15037593984962405, "grad_norm": 1994.7098388671875, "learning_rate": 8.571428571428571e-06, "loss": 34.788, "step": 10 }, { "epoch": 0.3007518796992481, "grad_norm": 478.9690246582031, "learning_rate": 1.8095238095238097e-05, "loss": 16.7412, "step": 20 }, { "epoch": 0.45112781954887216, "grad_norm": 247.04168701171875, "learning_rate": 1.9111111111111113e-05, "loss": 10.0861, "step": 30 }, { "epoch": 0.6015037593984962, "grad_norm": 130.6533203125, "learning_rate": 1.8e-05, "loss": 7.6831, "step": 40 }, { "epoch": 0.7518796992481203, "grad_norm": 158.5700225830078, "learning_rate": 1.688888888888889e-05, "loss": 8.4341, "step": 50 }, { "epoch": 0.7518796992481203, "eval_accuracy": 0.71982421875, "eval_loss": 0.5247398614883423, "eval_runtime": 1.3258, "eval_samples_per_second": 565.679, "eval_steps_per_second": 18.102, "step": 50 }, { "epoch": 0.9022556390977443, "grad_norm": 302.4800720214844, "learning_rate": 1.577777777777778e-05, "loss": 8.2982, "step": 60 }, { "epoch": 1.045112781954887, "grad_norm": 147.75057983398438, "learning_rate": 1.4666666666666666e-05, "loss": 8.1503, "step": 70 }, { "epoch": 1.1954887218045114, "grad_norm": 37.97437286376953, "learning_rate": 1.3555555555555557e-05, "loss": 7.4996, "step": 80 }, { "epoch": 1.3458646616541352, "grad_norm": 217.54083251953125, "learning_rate": 1.2444444444444446e-05, "loss": 7.1527, "step": 90 }, { "epoch": 1.4962406015037595, "grad_norm": 198.07823181152344, "learning_rate": 1.1333333333333334e-05, "loss": 6.4484, "step": 100 }, { "epoch": 1.4962406015037595, "eval_accuracy": 0.749609375, "eval_loss": 0.539933979511261, "eval_runtime": 1.3139, "eval_samples_per_second": 570.817, "eval_steps_per_second": 18.266, "step": 100 }, { "epoch": 1.6466165413533833, "grad_norm": 267.6050109863281, "learning_rate": 1.0222222222222223e-05, "loss": 7.155, "step": 110 }, { "epoch": 1.7969924812030076, "grad_norm": 105.75324249267578, "learning_rate": 9.111111111111112e-06, "loss": 6.9728, "step": 120 }, { "epoch": 1.9473684210526314, "grad_norm": 242.2601318359375, "learning_rate": 8.000000000000001e-06, "loss": 6.3463, "step": 130 }, { "epoch": 2.090225563909774, "grad_norm": 29.438447952270508, "learning_rate": 6.88888888888889e-06, "loss": 4.3959, "step": 140 }, { "epoch": 2.2406015037593985, "grad_norm": 170.81954956054688, "learning_rate": 5.777777777777778e-06, "loss": 3.5158, "step": 150 }, { "epoch": 2.2406015037593985, "eval_accuracy": 0.76240234375, "eval_loss": 0.5935441851615906, "eval_runtime": 1.3059, "eval_samples_per_second": 574.323, "eval_steps_per_second": 18.378, "step": 150 }, { "epoch": 2.3909774436090228, "grad_norm": 115.6646957397461, "learning_rate": 4.666666666666667e-06, "loss": 3.3307, "step": 160 }, { "epoch": 2.5413533834586466, "grad_norm": 102.5296630859375, "learning_rate": 3.555555555555556e-06, "loss": 2.8094, "step": 170 }, { "epoch": 2.6917293233082704, "grad_norm": 70.7061767578125, "learning_rate": 2.4444444444444447e-06, "loss": 2.7766, "step": 180 }, { "epoch": 2.8421052631578947, "grad_norm": 146.64231872558594, "learning_rate": 1.3333333333333334e-06, "loss": 3.0117, "step": 190 }, { "epoch": 2.992481203007519, "grad_norm": 122.64640045166016, "learning_rate": 2.2222222222222224e-07, "loss": 2.7322, "step": 200 }, { "epoch": 2.992481203007519, "eval_accuracy": 0.759375, "eval_loss": 0.5873631238937378, "eval_runtime": 1.3022, "eval_samples_per_second": 575.956, "eval_steps_per_second": 18.431, "step": 200 } ], "logging_steps": 10, "max_steps": 201, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.0258179678850253e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }