{ "best_metric": 0.25283440947532654, "best_model_checkpoint": "tuna6/mistral-saiga-journal-finetune6/checkpoint-1000", "epoch": 0.06540008502011052, "eval_steps": 40, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 0.6344897150993347, "learning_rate": 0.00028828828828828825, "loss": 0.8513, "step": 40 }, { "epoch": 0.0, "eval_loss": 0.6471168398857117, "eval_runtime": 596.3818, "eval_samples_per_second": 12.821, "eval_steps_per_second": 1.603, "step": 40 }, { "epoch": 0.01, "grad_norm": 0.4552938938140869, "learning_rate": 0.00027627627627627627, "loss": 0.5674, "step": 80 }, { "epoch": 0.01, "eval_loss": 0.5346186757087708, "eval_runtime": 595.5393, "eval_samples_per_second": 12.839, "eval_steps_per_second": 1.605, "step": 80 }, { "epoch": 0.01, "grad_norm": 0.505496621131897, "learning_rate": 0.00026426426426426423, "loss": 0.4685, "step": 120 }, { "epoch": 0.01, "eval_loss": 0.45588552951812744, "eval_runtime": 595.3281, "eval_samples_per_second": 12.843, "eval_steps_per_second": 1.606, "step": 120 }, { "epoch": 0.01, "grad_norm": 0.49343597888946533, "learning_rate": 0.00025225225225225225, "loss": 0.4164, "step": 160 }, { "epoch": 0.01, "eval_loss": 0.4150841534137726, "eval_runtime": 595.6775, "eval_samples_per_second": 12.836, "eval_steps_per_second": 1.605, "step": 160 }, { "epoch": 0.01, "grad_norm": 0.7332279682159424, "learning_rate": 0.00024024024024024023, "loss": 0.3707, "step": 200 }, { "epoch": 0.01, "eval_loss": 0.38310083746910095, "eval_runtime": 595.6271, "eval_samples_per_second": 12.837, "eval_steps_per_second": 1.605, "step": 200 }, { "epoch": 0.02, "grad_norm": 0.30062803626060486, "learning_rate": 0.0002282282282282282, "loss": 0.3497, "step": 240 }, { "epoch": 0.02, "eval_loss": 0.3645000457763672, "eval_runtime": 596.1295, "eval_samples_per_second": 12.826, "eval_steps_per_second": 1.604, "step": 240 }, { "epoch": 0.02, "grad_norm": 0.3287501633167267, "learning_rate": 0.0002162162162162162, "loss": 0.296, "step": 280 }, { "epoch": 0.02, "eval_loss": 0.3538116216659546, "eval_runtime": 597.2837, "eval_samples_per_second": 12.801, "eval_steps_per_second": 1.601, "step": 280 }, { "epoch": 0.02, "grad_norm": 0.5396926999092102, "learning_rate": 0.00020420420420420418, "loss": 0.2976, "step": 320 }, { "epoch": 0.02, "eval_loss": 0.3392348885536194, "eval_runtime": 596.5398, "eval_samples_per_second": 12.817, "eval_steps_per_second": 1.603, "step": 320 }, { "epoch": 0.02, "grad_norm": 1.1267449855804443, "learning_rate": 0.00019219219219219217, "loss": 0.3018, "step": 360 }, { "epoch": 0.02, "eval_loss": 0.3298206925392151, "eval_runtime": 596.9832, "eval_samples_per_second": 12.808, "eval_steps_per_second": 1.601, "step": 360 }, { "epoch": 0.03, "grad_norm": 0.375235378742218, "learning_rate": 0.00018018018018018016, "loss": 0.3019, "step": 400 }, { "epoch": 0.03, "eval_loss": 0.3226415514945984, "eval_runtime": 597.4868, "eval_samples_per_second": 12.797, "eval_steps_per_second": 1.6, "step": 400 }, { "epoch": 0.03, "grad_norm": 0.4212028980255127, "learning_rate": 0.00016816816816816817, "loss": 0.323, "step": 440 }, { "epoch": 0.03, "eval_loss": 0.31242895126342773, "eval_runtime": 596.6515, "eval_samples_per_second": 12.815, "eval_steps_per_second": 1.602, "step": 440 }, { "epoch": 0.03, "grad_norm": 0.3362777531147003, "learning_rate": 0.00015615615615615616, "loss": 0.299, "step": 480 }, { "epoch": 0.03, "eval_loss": 0.3068313002586365, "eval_runtime": 596.6534, "eval_samples_per_second": 12.815, "eval_steps_per_second": 1.602, "step": 480 }, { "epoch": 0.03, "grad_norm": 0.29624176025390625, "learning_rate": 0.00014414414414414412, "loss": 0.2671, "step": 520 }, { "epoch": 0.03, "eval_loss": 0.30176782608032227, "eval_runtime": 596.3445, "eval_samples_per_second": 12.821, "eval_steps_per_second": 1.603, "step": 520 }, { "epoch": 0.04, "grad_norm": 0.3254912197589874, "learning_rate": 0.00013213213213213211, "loss": 0.2799, "step": 560 }, { "epoch": 0.04, "eval_loss": 0.2952657639980316, "eval_runtime": 596.6573, "eval_samples_per_second": 12.815, "eval_steps_per_second": 1.602, "step": 560 }, { "epoch": 0.04, "grad_norm": 0.3114880919456482, "learning_rate": 0.00012012012012012012, "loss": 0.2674, "step": 600 }, { "epoch": 0.04, "eval_loss": 0.29019972681999207, "eval_runtime": 596.4197, "eval_samples_per_second": 12.82, "eval_steps_per_second": 1.603, "step": 600 }, { "epoch": 0.04, "grad_norm": 0.2812901437282562, "learning_rate": 0.0001081081081081081, "loss": 0.2395, "step": 640 }, { "epoch": 0.04, "eval_loss": 0.2831648588180542, "eval_runtime": 596.2022, "eval_samples_per_second": 12.825, "eval_steps_per_second": 1.603, "step": 640 }, { "epoch": 0.04, "grad_norm": 0.553597092628479, "learning_rate": 9.609609609609608e-05, "loss": 0.2436, "step": 680 }, { "epoch": 0.04, "eval_loss": 0.27831366658210754, "eval_runtime": 595.5956, "eval_samples_per_second": 12.838, "eval_steps_per_second": 1.605, "step": 680 }, { "epoch": 0.05, "grad_norm": 0.46639448404312134, "learning_rate": 8.408408408408409e-05, "loss": 0.2537, "step": 720 }, { "epoch": 0.05, "eval_loss": 0.27261102199554443, "eval_runtime": 595.7096, "eval_samples_per_second": 12.835, "eval_steps_per_second": 1.605, "step": 720 }, { "epoch": 0.05, "grad_norm": 0.3072221279144287, "learning_rate": 7.207207207207206e-05, "loss": 0.238, "step": 760 }, { "epoch": 0.05, "eval_loss": 0.2690950334072113, "eval_runtime": 595.7471, "eval_samples_per_second": 12.834, "eval_steps_per_second": 1.605, "step": 760 }, { "epoch": 0.05, "grad_norm": 0.3728131055831909, "learning_rate": 6.006006006006006e-05, "loss": 0.2583, "step": 800 }, { "epoch": 0.05, "eval_loss": 0.2644895017147064, "eval_runtime": 595.5546, "eval_samples_per_second": 12.838, "eval_steps_per_second": 1.605, "step": 800 }, { "epoch": 0.05, "grad_norm": 0.4398050010204315, "learning_rate": 4.804804804804804e-05, "loss": 0.2274, "step": 840 }, { "epoch": 0.05, "eval_loss": 0.26183322072029114, "eval_runtime": 595.4684, "eval_samples_per_second": 12.84, "eval_steps_per_second": 1.605, "step": 840 }, { "epoch": 0.06, "grad_norm": 0.37743285298347473, "learning_rate": 3.603603603603603e-05, "loss": 0.2279, "step": 880 }, { "epoch": 0.06, "eval_loss": 0.258406400680542, "eval_runtime": 595.383, "eval_samples_per_second": 12.842, "eval_steps_per_second": 1.606, "step": 880 }, { "epoch": 0.06, "grad_norm": 0.25567543506622314, "learning_rate": 2.402402402402402e-05, "loss": 0.2497, "step": 920 }, { "epoch": 0.06, "eval_loss": 0.25550028681755066, "eval_runtime": 595.2117, "eval_samples_per_second": 12.846, "eval_steps_per_second": 1.606, "step": 920 }, { "epoch": 0.06, "grad_norm": 0.379567414522171, "learning_rate": 1.201201201201201e-05, "loss": 0.2599, "step": 960 }, { "epoch": 0.06, "eval_loss": 0.25368040800094604, "eval_runtime": 595.2349, "eval_samples_per_second": 12.845, "eval_steps_per_second": 1.606, "step": 960 }, { "epoch": 0.07, "grad_norm": 0.3067691922187805, "learning_rate": 0.0, "loss": 0.2332, "step": 1000 }, { "epoch": 0.07, "eval_loss": 0.25283440947532654, "eval_runtime": 595.1198, "eval_samples_per_second": 12.848, "eval_steps_per_second": 1.606, "step": 1000 } ], "logging_steps": 40, "max_steps": 1000, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 40, "total_flos": 3.1676545818624e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }