{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 4.0, "eval_steps": 500, "global_step": 72, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.05555555555555555, "grad_norm": 10.875, "learning_rate": 0.0, "loss": 2.6157, "step": 1 }, { "epoch": 0.1111111111111111, "grad_norm": 9.8125, "learning_rate": 1.6666666666666667e-05, "loss": 2.4889, "step": 2 }, { "epoch": 0.16666666666666666, "grad_norm": 7.78125, "learning_rate": 3.3333333333333335e-05, "loss": 2.3111, "step": 3 }, { "epoch": 0.2222222222222222, "grad_norm": 6.9375, "learning_rate": 5e-05, "loss": 2.1313, "step": 4 }, { "epoch": 0.2777777777777778, "grad_norm": 6.3125, "learning_rate": 4.9974091841168195e-05, "loss": 2.0006, "step": 5 }, { "epoch": 0.3333333333333333, "grad_norm": 2.828125, "learning_rate": 4.9896421063288286e-05, "loss": 1.9674, "step": 6 }, { "epoch": 0.3888888888888889, "grad_norm": 3.296875, "learning_rate": 4.976714865090827e-05, "loss": 1.8848, "step": 7 }, { "epoch": 0.4444444444444444, "grad_norm": 2.140625, "learning_rate": 4.958654254084355e-05, "loss": 1.8648, "step": 8 }, { "epoch": 0.5, "grad_norm": 2.21875, "learning_rate": 4.9354977066836986e-05, "loss": 1.8344, "step": 9 }, { "epoch": 0.5555555555555556, "grad_norm": 2.21875, "learning_rate": 4.907293218369499e-05, "loss": 1.7973, "step": 10 }, { "epoch": 0.6111111111111112, "grad_norm": 1.984375, "learning_rate": 4.874099247250798e-05, "loss": 1.7639, "step": 11 }, { "epoch": 0.6666666666666666, "grad_norm": 1.890625, "learning_rate": 4.835984592901678e-05, "loss": 1.7202, "step": 12 }, { "epoch": 0.7222222222222222, "grad_norm": 1.953125, "learning_rate": 4.793028253763633e-05, "loss": 1.8029, "step": 13 }, { "epoch": 0.7777777777777778, "grad_norm": 2.015625, "learning_rate": 4.74531926340924e-05, "loss": 1.7007, "step": 14 }, { "epoch": 0.8333333333333334, "grad_norm": 2.0, "learning_rate": 4.6929565060064864e-05, "loss": 1.7268, "step": 15 }, { "epoch": 0.8888888888888888, "grad_norm": 1.9375, "learning_rate": 4.6360485113662216e-05, "loss": 1.7007, "step": 16 }, { "epoch": 0.9444444444444444, "grad_norm": 1.9609375, "learning_rate": 4.574713229997563e-05, "loss": 1.679, "step": 17 }, { "epoch": 1.0, "grad_norm": 3.4375, "learning_rate": 4.509077788637446e-05, "loss": 1.6594, "step": 18 }, { "epoch": 1.0555555555555556, "grad_norm": 2.546875, "learning_rate": 4.43927822676105e-05, "loss": 1.4736, "step": 19 }, { "epoch": 1.1111111111111112, "grad_norm": 2.078125, "learning_rate": 4.365459214619214e-05, "loss": 1.338, "step": 20 }, { "epoch": 1.1666666666666667, "grad_norm": 2.015625, "learning_rate": 4.2877737533872485e-05, "loss": 1.3932, "step": 21 }, { "epoch": 1.2222222222222223, "grad_norm": 2.015625, "learning_rate": 4.206382858046636e-05, "loss": 1.3238, "step": 22 }, { "epoch": 1.2777777777777777, "grad_norm": 2.1875, "learning_rate": 4.12145522365689e-05, "loss": 1.3945, "step": 23 }, { "epoch": 1.3333333333333333, "grad_norm": 2.203125, "learning_rate": 4.033166875709291e-05, "loss": 1.2647, "step": 24 }, { "epoch": 1.3888888888888888, "grad_norm": 2.171875, "learning_rate": 3.941700805287168e-05, "loss": 1.3274, "step": 25 }, { "epoch": 1.4444444444444444, "grad_norm": 2.296875, "learning_rate": 3.8472465897889394e-05, "loss": 1.3414, "step": 26 }, { "epoch": 1.5, "grad_norm": 2.078125, "learning_rate": 3.7500000000000003e-05, "loss": 1.2683, "step": 27 }, { "epoch": 1.5555555555555556, "grad_norm": 1.984375, "learning_rate": 3.6501625943278805e-05, "loss": 1.3127, "step": 28 }, { "epoch": 1.6111111111111112, "grad_norm": 2.0625, "learning_rate": 3.547941301041661e-05, "loss": 1.2775, "step": 29 }, { "epoch": 1.6666666666666665, "grad_norm": 2.09375, "learning_rate": 3.443547989381536e-05, "loss": 1.3265, "step": 30 }, { "epoch": 1.7222222222222223, "grad_norm": 2.078125, "learning_rate": 3.3371990304274656e-05, "loss": 1.2939, "step": 31 }, { "epoch": 1.7777777777777777, "grad_norm": 2.046875, "learning_rate": 3.2291148486370626e-05, "loss": 1.2552, "step": 32 }, { "epoch": 1.8333333333333335, "grad_norm": 2.0625, "learning_rate": 3.11951946498225e-05, "loss": 1.2849, "step": 33 }, { "epoch": 1.8888888888888888, "grad_norm": 2.09375, "learning_rate": 3.008640032631585e-05, "loss": 1.2628, "step": 34 }, { "epoch": 1.9444444444444444, "grad_norm": 2.15625, "learning_rate": 2.8967063661406285e-05, "loss": 1.2341, "step": 35 }, { "epoch": 2.0, "grad_norm": 4.15625, "learning_rate": 2.7839504651261872e-05, "loss": 1.1865, "step": 36 }, { "epoch": 2.0555555555555554, "grad_norm": 2.03125, "learning_rate": 2.6706060334116777e-05, "loss": 1.1127, "step": 37 }, { "epoch": 2.111111111111111, "grad_norm": 2.1875, "learning_rate": 2.556907994640264e-05, "loss": 1.0453, "step": 38 }, { "epoch": 2.1666666666666665, "grad_norm": 2.140625, "learning_rate": 2.4430920053597356e-05, "loss": 1.0773, "step": 39 }, { "epoch": 2.2222222222222223, "grad_norm": 2.109375, "learning_rate": 2.329393966588323e-05, "loss": 1.0607, "step": 40 }, { "epoch": 2.2777777777777777, "grad_norm": 2.09375, "learning_rate": 2.2160495348738123e-05, "loss": 1.0648, "step": 41 }, { "epoch": 2.3333333333333335, "grad_norm": 2.078125, "learning_rate": 2.1032936338593718e-05, "loss": 1.0237, "step": 42 }, { "epoch": 2.388888888888889, "grad_norm": 2.015625, "learning_rate": 1.991359967368416e-05, "loss": 1.0412, "step": 43 }, { "epoch": 2.4444444444444446, "grad_norm": 2.078125, "learning_rate": 1.8804805350177505e-05, "loss": 1.0411, "step": 44 }, { "epoch": 2.5, "grad_norm": 2.09375, "learning_rate": 1.7708851513629377e-05, "loss": 1.1015, "step": 45 }, { "epoch": 2.5555555555555554, "grad_norm": 2.1875, "learning_rate": 1.6628009695725346e-05, "loss": 1.0604, "step": 46 }, { "epoch": 2.611111111111111, "grad_norm": 2.3125, "learning_rate": 1.5564520106184644e-05, "loss": 1.0349, "step": 47 }, { "epoch": 2.6666666666666665, "grad_norm": 2.265625, "learning_rate": 1.4520586989583406e-05, "loss": 1.0091, "step": 48 }, { "epoch": 2.7222222222222223, "grad_norm": 2.25, "learning_rate": 1.3498374056721197e-05, "loss": 1.054, "step": 49 }, { "epoch": 2.7777777777777777, "grad_norm": 2.25, "learning_rate": 1.2500000000000006e-05, "loss": 1.0377, "step": 50 }, { "epoch": 2.8333333333333335, "grad_norm": 2.34375, "learning_rate": 1.1527534102110612e-05, "loss": 1.0076, "step": 51 }, { "epoch": 2.888888888888889, "grad_norm": 2.140625, "learning_rate": 1.0582991947128324e-05, "loss": 1.0104, "step": 52 }, { "epoch": 2.9444444444444446, "grad_norm": 2.375, "learning_rate": 9.668331242907089e-06, "loss": 1.0321, "step": 53 }, { "epoch": 3.0, "grad_norm": 4.0625, "learning_rate": 8.785447763431101e-06, "loss": 0.9897, "step": 54 }, { "epoch": 3.0555555555555554, "grad_norm": 2.09375, "learning_rate": 7.936171419533653e-06, "loss": 0.9734, "step": 55 }, { "epoch": 3.111111111111111, "grad_norm": 2.109375, "learning_rate": 7.122262466127514e-06, "loss": 0.9602, "step": 56 }, { "epoch": 3.1666666666666665, "grad_norm": 2.0, "learning_rate": 6.3454078538078635e-06, "loss": 0.9248, "step": 57 }, { "epoch": 3.2222222222222223, "grad_norm": 2.15625, "learning_rate": 5.607217732389503e-06, "loss": 0.9942, "step": 58 }, { "epoch": 3.2777777777777777, "grad_norm": 2.0625, "learning_rate": 4.9092221136255444e-06, "loss": 0.9528, "step": 59 }, { "epoch": 3.3333333333333335, "grad_norm": 2.03125, "learning_rate": 4.252867700024374e-06, "loss": 0.934, "step": 60 }, { "epoch": 3.388888888888889, "grad_norm": 2.125, "learning_rate": 3.6395148863377858e-06, "loss": 0.9574, "step": 61 }, { "epoch": 3.4444444444444446, "grad_norm": 2.171875, "learning_rate": 3.0704349399351435e-06, "loss": 0.9671, "step": 62 }, { "epoch": 3.5, "grad_norm": 2.125, "learning_rate": 2.5468073659076e-06, "loss": 0.9966, "step": 63 }, { "epoch": 3.5555555555555554, "grad_norm": 2.265625, "learning_rate": 2.0697174623636794e-06, "loss": 1.0019, "step": 64 }, { "epoch": 3.611111111111111, "grad_norm": 2.125, "learning_rate": 1.6401540709832242e-06, "loss": 0.9645, "step": 65 }, { "epoch": 3.6666666666666665, "grad_norm": 2.1875, "learning_rate": 1.2590075274920205e-06, "loss": 1.0193, "step": 66 }, { "epoch": 3.7222222222222223, "grad_norm": 2.015625, "learning_rate": 9.270678163050217e-07, "loss": 0.9931, "step": 67 }, { "epoch": 3.7777777777777777, "grad_norm": 2.109375, "learning_rate": 6.450229331630253e-07, "loss": 1.0124, "step": 68 }, { "epoch": 3.8333333333333335, "grad_norm": 2.296875, "learning_rate": 4.134574591564494e-07, "loss": 0.9898, "step": 69 }, { "epoch": 3.888888888888889, "grad_norm": 2.03125, "learning_rate": 2.3285134909173112e-07, "loss": 0.9708, "step": 70 }, { "epoch": 3.9444444444444446, "grad_norm": 2.09375, "learning_rate": 1.0357893671171792e-07, "loss": 0.9579, "step": 71 }, { "epoch": 4.0, "grad_norm": 3.484375, "learning_rate": 2.590815883181108e-08, "loss": 0.8729, "step": 72 } ], "logging_steps": 1, "max_steps": 72, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 18, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.652699873804288e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }