{ "best_global_step": 2500, "best_metric": 0.2838171720504761, "best_model_checkpoint": "output/checkpoint-2500", "epoch": 3.0, "eval_steps": 500, "global_step": 2646, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.11344299489506524, "grad_norm": 0.6828517317771912, "learning_rate": 7.471698113207547e-05, "loss": 1.2849, "mean_token_accuracy": 0.7202034851908684, "num_tokens": 1109379.0, "step": 100 }, { "epoch": 0.22688598979013047, "grad_norm": 0.7256981134414673, "learning_rate": 0.00015018867924528303, "loss": 1.0128, "mean_token_accuracy": 0.7610476166009903, "num_tokens": 2235350.0, "step": 200 }, { "epoch": 0.3403289846851957, "grad_norm": 0.5375235080718994, "learning_rate": 0.0001998993912029063, "loss": 0.899, "mean_token_accuracy": 0.7830679216980934, "num_tokens": 3357507.0, "step": 300 }, { "epoch": 0.45377197958026094, "grad_norm": 0.5699286460876465, "learning_rate": 0.00019844106172649113, "loss": 0.802, "mean_token_accuracy": 0.8053381592035294, "num_tokens": 4495916.0, "step": 400 }, { "epoch": 0.5672149744753261, "grad_norm": 0.6252707242965698, "learning_rate": 0.00019527142855069377, "loss": 0.6939, "mean_token_accuracy": 0.8303170916438103, "num_tokens": 5638963.0, "step": 500 }, { "epoch": 0.5672149744753261, "eval_loss": 0.6935457587242126, "eval_mean_token_accuracy": 0.8294496539295936, "eval_num_tokens": 5638963.0, "eval_runtime": 276.8401, "eval_samples_per_second": 5.664, "eval_steps_per_second": 0.708, "step": 500 }, { "epoch": 0.6806579693703914, "grad_norm": 0.482766717672348, "learning_rate": 0.00019044559271652598, "loss": 0.6269, "mean_token_accuracy": 0.8460214346647262, "num_tokens": 6771799.0, "step": 600 }, { "epoch": 0.7941009642654566, "grad_norm": 0.6250064373016357, "learning_rate": 0.00018404744676405976, "loss": 0.5745, "mean_token_accuracy": 0.8582470047473908, "num_tokens": 7896784.0, "step": 700 }, { "epoch": 0.9075439591605219, "grad_norm": 0.696171760559082, "learning_rate": 0.00017618821634086138, "loss": 0.5008, "mean_token_accuracy": 0.8766051492094994, "num_tokens": 9036185.0, "step": 800 }, { "epoch": 1.0204197390811118, "grad_norm": 0.2615467309951782, "learning_rate": 0.00016700452665049568, "loss": 0.4528, "mean_token_accuracy": 0.8879575738355742, "num_tokens": 10264626.0, "step": 900 }, { "epoch": 1.1338627339761769, "grad_norm": 0.3127917945384979, "learning_rate": 0.00015665602735404375, "loss": 0.3348, "mean_token_accuracy": 0.9150179827213287, "num_tokens": 11398500.0, "step": 1000 }, { "epoch": 1.1338627339761769, "eval_loss": 0.44535523653030396, "eval_mean_token_accuracy": 0.8932840066904925, "eval_num_tokens": 11398500.0, "eval_runtime": 276.2931, "eval_samples_per_second": 5.675, "eval_steps_per_second": 0.709, "step": 1000 }, { "epoch": 1.2473057288712421, "grad_norm": 0.2831040620803833, "learning_rate": 0.00014532261721344984, "loss": 0.3248, "mean_token_accuracy": 0.9180563706159591, "num_tokens": 12511387.0, "step": 1100 }, { "epoch": 1.3607487237663074, "grad_norm": 0.2554262578487396, "learning_rate": 0.00013320131672361777, "loss": 0.3089, "mean_token_accuracy": 0.9220700481534004, "num_tokens": 13646331.0, "step": 1200 }, { "epoch": 1.4741917186613727, "grad_norm": 0.3375890851020813, "learning_rate": 0.00012050284309955694, "loss": 0.2894, "mean_token_accuracy": 0.9279368036985397, "num_tokens": 14794369.0, "step": 1300 }, { "epoch": 1.587634713556438, "grad_norm": 0.2744344472885132, "learning_rate": 0.00010744794715915005, "loss": 0.2748, "mean_token_accuracy": 0.9307059472799302, "num_tokens": 15925261.0, "step": 1400 }, { "epoch": 1.701077708451503, "grad_norm": 0.27422216534614563, "learning_rate": 9.4263575781332e-05, "loss": 0.2563, "mean_token_accuracy": 0.9356163629889488, "num_tokens": 17065748.0, "step": 1500 }, { "epoch": 1.701077708451503, "eval_loss": 0.3396897614002228, "eval_mean_token_accuracy": 0.9201750569805807, "eval_num_tokens": 17065748.0, "eval_runtime": 275.4276, "eval_samples_per_second": 5.693, "eval_steps_per_second": 0.712, "step": 1500 }, { "epoch": 1.8145207033465685, "grad_norm": 0.2150234878063202, "learning_rate": 8.117892665167406e-05, "loss": 0.238, "mean_token_accuracy": 0.9404442739486695, "num_tokens": 18190015.0, "step": 1600 }, { "epoch": 1.9279636982416335, "grad_norm": 0.2983264625072479, "learning_rate": 6.842146387984734e-05, "loss": 0.219, "mean_token_accuracy": 0.9447547772526741, "num_tokens": 19310575.0, "step": 1700 }, { "epoch": 2.0408394781622237, "grad_norm": 0.33862537145614624, "learning_rate": 5.621296375364546e-05, "loss": 0.2008, "mean_token_accuracy": 0.9498060237822221, "num_tokens": 20446504.0, "step": 1800 }, { "epoch": 2.1542824730572887, "grad_norm": 0.43845516443252563, "learning_rate": 4.4765659370349725e-05, "loss": 0.1256, "mean_token_accuracy": 0.9682124453783035, "num_tokens": 21558096.0, "step": 1900 }, { "epoch": 2.2677254679523537, "grad_norm": 0.4173540472984314, "learning_rate": 3.427855116733367e-05, "loss": 0.1206, "mean_token_accuracy": 0.9689858189225197, "num_tokens": 22703794.0, "step": 2000 }, { "epoch": 2.2677254679523537, "eval_loss": 0.2952539622783661, "eval_mean_token_accuracy": 0.9342006676051081, "eval_num_tokens": 22703794.0, "eval_runtime": 275.9272, "eval_samples_per_second": 5.683, "eval_steps_per_second": 0.71, "step": 2000 }, { "epoch": 2.3811684628474192, "grad_norm": 0.4896949529647827, "learning_rate": 2.493394748980298e-05, "loss": 0.1256, "mean_token_accuracy": 0.9678885874152183, "num_tokens": 23827405.0, "step": 2100 }, { "epoch": 2.4946114577424843, "grad_norm": 0.3831748068332672, "learning_rate": 1.6894295334591592e-05, "loss": 0.1123, "mean_token_accuracy": 0.9710856053233147, "num_tokens": 24940124.0, "step": 2200 }, { "epoch": 2.6080544526375498, "grad_norm": 0.4638579487800598, "learning_rate": 1.0299356364502255e-05, "loss": 0.1083, "mean_token_accuracy": 0.9723807728290558, "num_tokens": 26077789.0, "step": 2300 }, { "epoch": 2.721497447532615, "grad_norm": 0.4337230920791626, "learning_rate": 5.263777285480376e-06, "loss": 0.106, "mean_token_accuracy": 0.9729050415754318, "num_tokens": 27217807.0, "step": 2400 }, { "epoch": 2.83494044242768, "grad_norm": 0.3685455024242401, "learning_rate": 1.8750968232906785e-06, "loss": 0.1088, "mean_token_accuracy": 0.972323155105114, "num_tokens": 28349123.0, "step": 2500 }, { "epoch": 2.83494044242768, "eval_loss": 0.2838171720504761, "eval_mean_token_accuracy": 0.9382532765062488, "eval_num_tokens": 28349123.0, "eval_runtime": 275.4684, "eval_samples_per_second": 5.692, "eval_steps_per_second": 0.712, "step": 2500 }, { "epoch": 2.9483834373227453, "grad_norm": 0.43383386731147766, "learning_rate": 1.9222394650038056e-07, "loss": 0.1001, "mean_token_accuracy": 0.9745477658510208, "num_tokens": 29494601.0, "step": 2600 } ], "logging_steps": 100, "max_steps": 2646, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.3978374276266435e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }