{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.96, "eval_steps": 500, "global_step": 60, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.016, "grad_norm": 0.6046149134635925, "learning_rate": 0.0, "loss": 2.0633, "step": 1 }, { "epoch": 0.032, "grad_norm": 0.5796938538551331, "learning_rate": 2e-05, "loss": 2.034, "step": 2 }, { "epoch": 0.048, "grad_norm": 0.6046639084815979, "learning_rate": 4e-05, "loss": 2.0676, "step": 3 }, { "epoch": 0.064, "grad_norm": 0.6537912487983704, "learning_rate": 6e-05, "loss": 2.0168, "step": 4 }, { "epoch": 0.08, "grad_norm": 0.7197220325469971, "learning_rate": 8e-05, "loss": 1.9651, "step": 5 }, { "epoch": 0.096, "grad_norm": 0.7753214240074158, "learning_rate": 0.0001, "loss": 1.9283, "step": 6 }, { "epoch": 0.112, "grad_norm": 0.7212746143341064, "learning_rate": 0.00012, "loss": 1.7437, "step": 7 }, { "epoch": 0.128, "grad_norm": 0.7059862017631531, "learning_rate": 0.00014, "loss": 1.7018, "step": 8 }, { "epoch": 0.144, "grad_norm": 0.7816401720046997, "learning_rate": 0.00016, "loss": 1.5029, "step": 9 }, { "epoch": 0.16, "grad_norm": 0.8176925778388977, "learning_rate": 0.00018, "loss": 1.3778, "step": 10 }, { "epoch": 0.176, "grad_norm": 0.8414409160614014, "learning_rate": 0.0002, "loss": 1.3103, "step": 11 }, { "epoch": 0.192, "grad_norm": 0.8573537468910217, "learning_rate": 0.000196, "loss": 1.1423, "step": 12 }, { "epoch": 0.208, "grad_norm": 0.7942666411399841, "learning_rate": 0.000192, "loss": 0.9792, "step": 13 }, { "epoch": 0.224, "grad_norm": 1.1660468578338623, "learning_rate": 0.000188, "loss": 0.8941, "step": 14 }, { "epoch": 0.24, "grad_norm": 0.6082952618598938, "learning_rate": 0.00018400000000000003, "loss": 0.8822, "step": 15 }, { "epoch": 0.256, "grad_norm": 0.6617238521575928, "learning_rate": 0.00018, "loss": 0.9272, "step": 16 }, { "epoch": 0.272, "grad_norm": 2.0397157669067383, "learning_rate": 0.00017600000000000002, "loss": 0.8929, "step": 17 }, { "epoch": 0.288, "grad_norm": 0.5073072910308838, "learning_rate": 0.000172, "loss": 0.8766, "step": 18 }, { "epoch": 0.304, "grad_norm": 0.536578357219696, "learning_rate": 0.000168, "loss": 0.8599, "step": 19 }, { "epoch": 0.32, "grad_norm": 0.4412708282470703, "learning_rate": 0.000164, "loss": 0.737, "step": 20 }, { "epoch": 0.336, "grad_norm": 0.41358307003974915, "learning_rate": 0.00016, "loss": 0.8145, "step": 21 }, { "epoch": 0.352, "grad_norm": 0.38526788353919983, "learning_rate": 0.00015600000000000002, "loss": 0.802, "step": 22 }, { "epoch": 0.368, "grad_norm": 0.34247690439224243, "learning_rate": 0.000152, "loss": 0.7904, "step": 23 }, { "epoch": 0.384, "grad_norm": 0.38441890478134155, "learning_rate": 0.000148, "loss": 0.7924, "step": 24 }, { "epoch": 0.4, "grad_norm": 0.3929769992828369, "learning_rate": 0.000144, "loss": 0.8502, "step": 25 }, { "epoch": 0.416, "grad_norm": 0.3987599313259125, "learning_rate": 0.00014, "loss": 0.7837, "step": 26 }, { "epoch": 0.432, "grad_norm": 0.36407092213630676, "learning_rate": 0.00013600000000000003, "loss": 0.7029, "step": 27 }, { "epoch": 0.448, "grad_norm": 0.47272396087646484, "learning_rate": 0.000132, "loss": 0.7621, "step": 28 }, { "epoch": 0.464, "grad_norm": 0.3687835931777954, "learning_rate": 0.00012800000000000002, "loss": 0.767, "step": 29 }, { "epoch": 0.48, "grad_norm": 0.38991373777389526, "learning_rate": 0.000124, "loss": 0.7381, "step": 30 }, { "epoch": 0.496, "grad_norm": 0.31758126616477966, "learning_rate": 0.00012, "loss": 0.7326, "step": 31 }, { "epoch": 0.512, "grad_norm": 0.3498470187187195, "learning_rate": 0.000116, "loss": 0.7858, "step": 32 }, { "epoch": 0.528, "grad_norm": 0.339616984128952, "learning_rate": 0.00011200000000000001, "loss": 0.7636, "step": 33 }, { "epoch": 0.544, "grad_norm": 0.3355540335178375, "learning_rate": 0.00010800000000000001, "loss": 0.7011, "step": 34 }, { "epoch": 0.56, "grad_norm": 0.38637426495552063, "learning_rate": 0.00010400000000000001, "loss": 0.7206, "step": 35 }, { "epoch": 0.576, "grad_norm": 0.39599090814590454, "learning_rate": 0.0001, "loss": 0.7352, "step": 36 }, { "epoch": 0.592, "grad_norm": 0.34623557329177856, "learning_rate": 9.6e-05, "loss": 0.6788, "step": 37 }, { "epoch": 0.608, "grad_norm": 0.3236479163169861, "learning_rate": 9.200000000000001e-05, "loss": 0.6575, "step": 38 }, { "epoch": 0.624, "grad_norm": 0.3357284367084503, "learning_rate": 8.800000000000001e-05, "loss": 0.7061, "step": 39 }, { "epoch": 0.64, "grad_norm": 0.3321351408958435, "learning_rate": 8.4e-05, "loss": 0.6878, "step": 40 }, { "epoch": 0.656, "grad_norm": 0.30003687739372253, "learning_rate": 8e-05, "loss": 0.6769, "step": 41 }, { "epoch": 0.672, "grad_norm": 0.3306218385696411, "learning_rate": 7.6e-05, "loss": 0.7436, "step": 42 }, { "epoch": 0.688, "grad_norm": 0.3028322756290436, "learning_rate": 7.2e-05, "loss": 0.678, "step": 43 }, { "epoch": 0.704, "grad_norm": 0.31864282488822937, "learning_rate": 6.800000000000001e-05, "loss": 0.6989, "step": 44 }, { "epoch": 0.72, "grad_norm": 0.3383725881576538, "learning_rate": 6.400000000000001e-05, "loss": 0.7458, "step": 45 }, { "epoch": 0.736, "grad_norm": 0.35683438181877136, "learning_rate": 6e-05, "loss": 0.7546, "step": 46 }, { "epoch": 0.752, "grad_norm": 0.3352407217025757, "learning_rate": 5.6000000000000006e-05, "loss": 0.7256, "step": 47 }, { "epoch": 0.768, "grad_norm": 0.34041017293930054, "learning_rate": 5.2000000000000004e-05, "loss": 0.7068, "step": 48 }, { "epoch": 0.784, "grad_norm": 0.339663565158844, "learning_rate": 4.8e-05, "loss": 0.6839, "step": 49 }, { "epoch": 0.8, "grad_norm": 0.29457777738571167, "learning_rate": 4.4000000000000006e-05, "loss": 0.6694, "step": 50 }, { "epoch": 0.816, "grad_norm": 0.3308852016925812, "learning_rate": 4e-05, "loss": 0.7289, "step": 51 }, { "epoch": 0.832, "grad_norm": 0.36052626371383667, "learning_rate": 3.6e-05, "loss": 0.6802, "step": 52 }, { "epoch": 0.848, "grad_norm": 0.3117854595184326, "learning_rate": 3.2000000000000005e-05, "loss": 0.6572, "step": 53 }, { "epoch": 0.864, "grad_norm": 0.3308817446231842, "learning_rate": 2.8000000000000003e-05, "loss": 0.6628, "step": 54 }, { "epoch": 0.88, "grad_norm": 0.3344241976737976, "learning_rate": 2.4e-05, "loss": 0.7066, "step": 55 }, { "epoch": 0.896, "grad_norm": 0.3327839970588684, "learning_rate": 2e-05, "loss": 0.7066, "step": 56 }, { "epoch": 0.912, "grad_norm": 0.36788785457611084, "learning_rate": 1.6000000000000003e-05, "loss": 0.6946, "step": 57 }, { "epoch": 0.928, "grad_norm": 0.38319170475006104, "learning_rate": 1.2e-05, "loss": 0.7339, "step": 58 }, { "epoch": 0.944, "grad_norm": 0.31625112891197205, "learning_rate": 8.000000000000001e-06, "loss": 0.7204, "step": 59 }, { "epoch": 0.96, "grad_norm": 0.323129802942276, "learning_rate": 4.000000000000001e-06, "loss": 0.6615, "step": 60 } ], "logging_steps": 1, "max_steps": 60, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3241268608206720.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }