{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.998272884283247, "eval_steps": 500, "global_step": 289, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 1.46875, "learning_rate": 6.896551724137932e-06, "loss": 1.9268, "step": 1 }, { "epoch": 0.02, "grad_norm": 2.078125, "learning_rate": 3.4482758620689657e-05, "loss": 1.9116, "step": 5 }, { "epoch": 0.03, "grad_norm": 1.390625, "learning_rate": 6.896551724137931e-05, "loss": 1.8301, "step": 10 }, { "epoch": 0.05, "grad_norm": 1.421875, "learning_rate": 0.00010344827586206898, "loss": 1.5573, "step": 15 }, { "epoch": 0.07, "grad_norm": 1.21875, "learning_rate": 0.00013793103448275863, "loss": 1.4594, "step": 20 }, { "epoch": 0.09, "grad_norm": 1.2734375, "learning_rate": 0.00017241379310344826, "loss": 1.2235, "step": 25 }, { "epoch": 0.1, "grad_norm": 1.4140625, "learning_rate": 0.00019999270008556108, "loss": 1.2842, "step": 30 }, { "epoch": 0.12, "grad_norm": 1.03125, "learning_rate": 0.00019973731496914914, "loss": 1.3727, "step": 35 }, { "epoch": 0.14, "grad_norm": 1.328125, "learning_rate": 0.00019911799920659093, "loss": 1.3235, "step": 40 }, { "epoch": 0.16, "grad_norm": 1.2421875, "learning_rate": 0.00019813701261394136, "loss": 1.3308, "step": 45 }, { "epoch": 0.17, "grad_norm": 1.0, "learning_rate": 0.00019679793470489228, "loss": 1.3697, "step": 50 }, { "epoch": 0.19, "grad_norm": 1.1328125, "learning_rate": 0.00019510565162951537, "loss": 1.323, "step": 55 }, { "epoch": 0.21, "grad_norm": 1.1796875, "learning_rate": 0.00019306633834523024, "loss": 1.3125, "step": 60 }, { "epoch": 0.22, "grad_norm": 0.98046875, "learning_rate": 0.00019068743608505455, "loss": 1.3512, "step": 65 }, { "epoch": 0.24, "grad_norm": 1.0, "learning_rate": 0.00018797762520535177, "loss": 1.334, "step": 70 }, { "epoch": 0.26, "grad_norm": 1.1171875, "learning_rate": 0.0001849467935121521, "loss": 1.0914, "step": 75 }, { "epoch": 0.28, "grad_norm": 1.1015625, "learning_rate": 0.0001816060001816205, "loss": 1.2314, "step": 80 }, { "epoch": 0.29, "grad_norm": 1.015625, "learning_rate": 0.00017796743540632223, "loss": 1.1808, "step": 85 }, { "epoch": 0.31, "grad_norm": 1.09375, "learning_rate": 0.00017404437591453235, "loss": 1.2491, "step": 90 }, { "epoch": 0.33, "grad_norm": 1.1796875, "learning_rate": 0.00016985113652489374, "loss": 1.2451, "step": 95 }, { "epoch": 0.35, "grad_norm": 1.3515625, "learning_rate": 0.00016540301791319645, "loss": 1.1333, "step": 100 }, { "epoch": 0.36, "grad_norm": 1.484375, "learning_rate": 0.00016071625078187114, "loss": 1.1685, "step": 105 }, { "epoch": 0.38, "grad_norm": 1.1171875, "learning_rate": 0.00015580793663591585, "loss": 1.1954, "step": 110 }, { "epoch": 0.4, "grad_norm": 1.015625, "learning_rate": 0.00015069598538135906, "loss": 1.2303, "step": 115 }, { "epoch": 0.41, "grad_norm": 1.1171875, "learning_rate": 0.00014539904997395468, "loss": 1.3181, "step": 120 }, { "epoch": 0.43, "grad_norm": 1.125, "learning_rate": 0.00013993645835656953, "loss": 1.1347, "step": 125 }, { "epoch": 0.45, "grad_norm": 1.140625, "learning_rate": 0.00013432814293361584, "loss": 1.2001, "step": 130 }, { "epoch": 0.47, "grad_norm": 0.97265625, "learning_rate": 0.00012859456783986893, "loss": 1.2733, "step": 135 }, { "epoch": 0.48, "grad_norm": 1.1015625, "learning_rate": 0.000122756654269059, "loss": 1.2653, "step": 140 }, { "epoch": 0.5, "grad_norm": 1.1015625, "learning_rate": 0.00011683570413470383, "loss": 1.2328, "step": 145 }, { "epoch": 0.52, "grad_norm": 0.98046875, "learning_rate": 0.00011085332234173664, "loss": 1.1522, "step": 150 }, { "epoch": 0.54, "grad_norm": 1.2265625, "learning_rate": 0.00010483133795255071, "loss": 1.1529, "step": 155 }, { "epoch": 0.55, "grad_norm": 1.34375, "learning_rate": 9.879172453511827e-05, "loss": 1.0412, "step": 160 }, { "epoch": 0.57, "grad_norm": 1.2421875, "learning_rate": 9.275651998382377e-05, "loss": 1.1489, "step": 165 }, { "epoch": 0.59, "grad_norm": 1.015625, "learning_rate": 8.674774610557728e-05, "loss": 1.2629, "step": 170 }, { "epoch": 0.6, "grad_norm": 1.2421875, "learning_rate": 8.078732826462915e-05, "loss": 1.0591, "step": 175 }, { "epoch": 0.62, "grad_norm": 1.421875, "learning_rate": 7.489701537929384e-05, "loss": 1.2508, "step": 180 }, { "epoch": 0.64, "grad_norm": 1.2734375, "learning_rate": 6.909830056250527e-05, "loss": 1.2073, "step": 185 }, { "epoch": 0.66, "grad_norm": 1.375, "learning_rate": 6.341234269577879e-05, "loss": 1.09, "step": 190 }, { "epoch": 0.67, "grad_norm": 1.4921875, "learning_rate": 5.785988922274711e-05, "loss": 1.1575, "step": 195 }, { "epoch": 0.69, "grad_norm": 1.34375, "learning_rate": 5.246120044398839e-05, "loss": 1.1168, "step": 200 }, { "epoch": 0.71, "grad_norm": 1.2421875, "learning_rate": 4.723597558938672e-05, "loss": 1.1412, "step": 205 }, { "epoch": 0.73, "grad_norm": 1.1875, "learning_rate": 4.220328093777851e-05, "loss": 1.1943, "step": 210 }, { "epoch": 0.74, "grad_norm": 1.1171875, "learning_rate": 3.738148024616863e-05, "loss": 1.1862, "step": 215 }, { "epoch": 0.76, "grad_norm": 1.0859375, "learning_rate": 3.2788167742372725e-05, "loss": 1.0806, "step": 220 }, { "epoch": 0.78, "grad_norm": 1.203125, "learning_rate": 2.84401039255879e-05, "loss": 1.2544, "step": 225 }, { "epoch": 0.79, "grad_norm": 1.3984375, "learning_rate": 2.4353154409148637e-05, "loss": 1.1946, "step": 230 }, { "epoch": 0.81, "grad_norm": 1.3984375, "learning_rate": 2.0542232028624586e-05, "loss": 1.1809, "step": 235 }, { "epoch": 0.83, "grad_norm": 1.3515625, "learning_rate": 1.7021242426500493e-05, "loss": 1.2461, "step": 240 }, { "epoch": 0.85, "grad_norm": 1.203125, "learning_rate": 1.3803033311995072e-05, "loss": 0.9889, "step": 245 }, { "epoch": 0.86, "grad_norm": 1.2109375, "learning_rate": 1.0899347581163221e-05, "loss": 1.2301, "step": 250 }, { "epoch": 0.88, "grad_norm": 1.5078125, "learning_rate": 8.32078046834176e-06, "loss": 1.0813, "step": 255 }, { "epoch": 0.9, "grad_norm": 1.1953125, "learning_rate": 6.076740885288479e-06, "loss": 1.0638, "step": 260 }, { "epoch": 0.92, "grad_norm": 1.3359375, "learning_rate": 4.175417089083378e-06, "loss": 1.1524, "step": 265 }, { "epoch": 0.93, "grad_norm": 1.0859375, "learning_rate": 2.6237468040666512e-06, "loss": 1.2414, "step": 270 }, { "epoch": 0.95, "grad_norm": 1.3359375, "learning_rate": 1.4273919068349184e-06, "loss": 1.1618, "step": 275 }, { "epoch": 0.97, "grad_norm": 1.421875, "learning_rate": 5.907177666674812e-07, "loss": 1.1204, "step": 280 }, { "epoch": 0.98, "grad_norm": 1.2421875, "learning_rate": 1.1677731676733584e-07, "loss": 1.0432, "step": 285 }, { "epoch": 1.0, "eval_loss": 1.1134740114212036, "eval_runtime": 21.1932, "eval_samples_per_second": 13.636, "eval_steps_per_second": 1.746, "step": 289 }, { "epoch": 1.0, "step": 289, "total_flos": 1.887089628230451e+16, "train_loss": 1.2325792506491848, "train_runtime": 253.7515, "train_samples_per_second": 4.56, "train_steps_per_second": 1.139 } ], "logging_steps": 5, "max_steps": 289, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 1.887089628230451e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }