{ "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 500, "global_step": 15950, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.31347962382445144, "grad_norm": 0.9908634424209595, "learning_rate": 1.9840000000000003e-05, "loss": 5.4652, "step": 500 }, { "epoch": 0.6269592476489029, "grad_norm": 0.6833625435829163, "learning_rate": 1.9357928802589e-05, "loss": 1.7034, "step": 1000 }, { "epoch": 0.9404388714733543, "grad_norm": 0.6710793972015381, "learning_rate": 1.8710679611650487e-05, "loss": 1.6778, "step": 1500 }, { "epoch": 1.0, "eval_bleu": 44.355750778466295, "eval_loss": 1.6447367668151855, "eval_runtime": 282.2475, "eval_samples_per_second": 9.977, "eval_steps_per_second": 0.624, "step": 1595 }, { "epoch": 1.2539184952978055, "grad_norm": 0.4725702404975891, "learning_rate": 1.8063430420711977e-05, "loss": 1.6499, "step": 2000 }, { "epoch": 1.567398119122257, "grad_norm": 0.6504935026168823, "learning_rate": 1.7416181229773465e-05, "loss": 1.6434, "step": 2500 }, { "epoch": 1.8808777429467085, "grad_norm": 0.4929894506931305, "learning_rate": 1.6768932038834952e-05, "loss": 1.6406, "step": 3000 }, { "epoch": 2.0, "eval_bleu": 46.92316806208224, "eval_loss": 1.6343189477920532, "eval_runtime": 280.4311, "eval_samples_per_second": 10.042, "eval_steps_per_second": 0.628, "step": 3190 }, { "epoch": 2.19435736677116, "grad_norm": 0.4408179521560669, "learning_rate": 1.612168284789644e-05, "loss": 1.6263, "step": 3500 }, { "epoch": 2.507836990595611, "grad_norm": 0.5890961289405823, "learning_rate": 1.547443365695793e-05, "loss": 1.6184, "step": 4000 }, { "epoch": 2.8213166144200628, "grad_norm": 0.36476999521255493, "learning_rate": 1.482718446601942e-05, "loss": 1.6186, "step": 4500 }, { "epoch": 3.0, "eval_bleu": 48.0812345935905, "eval_loss": 1.6287521123886108, "eval_runtime": 277.8914, "eval_samples_per_second": 10.133, "eval_steps_per_second": 0.633, "step": 4785 }, { "epoch": 3.134796238244514, "grad_norm": 0.5234176516532898, "learning_rate": 1.4179935275080907e-05, "loss": 1.6098, "step": 5000 }, { "epoch": 3.4482758620689653, "grad_norm": 0.5117827653884888, "learning_rate": 1.3532686084142396e-05, "loss": 1.6011, "step": 5500 }, { "epoch": 3.761755485893417, "grad_norm": 0.5140109062194824, "learning_rate": 1.2885436893203884e-05, "loss": 1.6021, "step": 6000 }, { "epoch": 4.0, "eval_bleu": 48.92348524517408, "eval_loss": 1.6266326904296875, "eval_runtime": 270.5269, "eval_samples_per_second": 10.409, "eval_steps_per_second": 0.651, "step": 6380 }, { "epoch": 4.075235109717869, "grad_norm": 0.5262463688850403, "learning_rate": 1.2238187702265374e-05, "loss": 1.5999, "step": 6500 }, { "epoch": 4.38871473354232, "grad_norm": 0.4214998185634613, "learning_rate": 1.1590938511326861e-05, "loss": 1.5893, "step": 7000 }, { "epoch": 4.702194357366771, "grad_norm": 0.4427284300327301, "learning_rate": 1.094368932038835e-05, "loss": 1.59, "step": 7500 }, { "epoch": 5.0, "eval_bleu": 48.82457853670125, "eval_loss": 1.626197338104248, "eval_runtime": 273.0297, "eval_samples_per_second": 10.314, "eval_steps_per_second": 0.645, "step": 7975 }, { "epoch": 5.015673981191223, "grad_norm": 0.4961058497428894, "learning_rate": 1.0296440129449838e-05, "loss": 1.5892, "step": 8000 }, { "epoch": 5.329153605015674, "grad_norm": 0.4459807276725769, "learning_rate": 9.649190938511328e-06, "loss": 1.5793, "step": 8500 }, { "epoch": 5.6426332288401255, "grad_norm": 0.42176157236099243, "learning_rate": 9.001941747572817e-06, "loss": 1.58, "step": 9000 }, { "epoch": 5.956112852664576, "grad_norm": 0.4856497049331665, "learning_rate": 8.354692556634304e-06, "loss": 1.5805, "step": 9500 }, { "epoch": 6.0, "eval_bleu": 49.44534738257411, "eval_loss": 1.6256004571914673, "eval_runtime": 271.5073, "eval_samples_per_second": 10.372, "eval_steps_per_second": 0.648, "step": 9570 }, { "epoch": 6.269592476489028, "grad_norm": 0.34010499715805054, "learning_rate": 7.707443365695793e-06, "loss": 1.5731, "step": 10000 }, { "epoch": 6.58307210031348, "grad_norm": 0.524381160736084, "learning_rate": 7.060194174757282e-06, "loss": 1.5727, "step": 10500 }, { "epoch": 6.896551724137931, "grad_norm": 0.3996853530406952, "learning_rate": 6.412944983818771e-06, "loss": 1.5725, "step": 11000 }, { "epoch": 7.0, "eval_bleu": 49.96969056769707, "eval_loss": 1.6257303953170776, "eval_runtime": 268.9556, "eval_samples_per_second": 10.47, "eval_steps_per_second": 0.654, "step": 11165 }, { "epoch": 7.210031347962382, "grad_norm": 0.5770251750946045, "learning_rate": 5.765695792880259e-06, "loss": 1.5686, "step": 11500 }, { "epoch": 7.523510971786834, "grad_norm": 0.40356528759002686, "learning_rate": 5.118446601941748e-06, "loss": 1.5666, "step": 12000 }, { "epoch": 7.836990595611285, "grad_norm": 0.4087662100791931, "learning_rate": 4.471197411003236e-06, "loss": 1.5663, "step": 12500 }, { "epoch": 8.0, "eval_bleu": 50.12951231046901, "eval_loss": 1.6264179944992065, "eval_runtime": 269.5975, "eval_samples_per_second": 10.445, "eval_steps_per_second": 0.653, "step": 12760 }, { "epoch": 8.150470219435737, "grad_norm": 0.493051677942276, "learning_rate": 3.823948220064725e-06, "loss": 1.5639, "step": 13000 }, { "epoch": 8.463949843260188, "grad_norm": 0.45878908038139343, "learning_rate": 3.176699029126214e-06, "loss": 1.5625, "step": 13500 }, { "epoch": 8.77742946708464, "grad_norm": 0.5698862671852112, "learning_rate": 2.5294498381877025e-06, "loss": 1.5623, "step": 14000 }, { "epoch": 9.0, "eval_bleu": 50.17886329004438, "eval_loss": 1.626705527305603, "eval_runtime": 269.908, "eval_samples_per_second": 10.433, "eval_steps_per_second": 0.652, "step": 14355 }, { "epoch": 9.090909090909092, "grad_norm": 0.4237252175807953, "learning_rate": 1.882200647249191e-06, "loss": 1.561, "step": 14500 }, { "epoch": 9.404388714733543, "grad_norm": 0.4487234354019165, "learning_rate": 1.2349514563106797e-06, "loss": 1.5594, "step": 15000 }, { "epoch": 9.717868338557993, "grad_norm": 0.39791637659072876, "learning_rate": 5.877022653721683e-07, "loss": 1.5593, "step": 15500 }, { "epoch": 10.0, "eval_bleu": 50.10578525423357, "eval_loss": 1.626780390739441, "eval_runtime": 270.1794, "eval_samples_per_second": 10.423, "eval_steps_per_second": 0.651, "step": 15950 } ], "logging_steps": 500, "max_steps": 15950, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.3821793152270336e+17, "train_batch_size": 16, "trial_name": null, "trial_params": null }