|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 540.0, |
|
"global_step": 540, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.003703703703703704, |
|
"grad_norm": 9.5, |
|
"learning_rate": 2.787878787878788e-07, |
|
"loss": 0.7464, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.007407407407407408, |
|
"grad_norm": 9.6875, |
|
"learning_rate": 5.575757575757576e-07, |
|
"loss": 0.7497, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.011111111111111112, |
|
"grad_norm": 9.6875, |
|
"learning_rate": 8.363636363636364e-07, |
|
"loss": 0.7544, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.014814814814814815, |
|
"grad_norm": 9.375, |
|
"learning_rate": 1.1151515151515153e-06, |
|
"loss": 0.7549, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.018518518518518517, |
|
"grad_norm": 9.6875, |
|
"learning_rate": 1.393939393939394e-06, |
|
"loss": 0.7816, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.022222222222222223, |
|
"grad_norm": 9.5, |
|
"learning_rate": 1.6727272727272728e-06, |
|
"loss": 0.7629, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.025925925925925925, |
|
"grad_norm": 9.375, |
|
"learning_rate": 1.9515151515151518e-06, |
|
"loss": 0.777, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.02962962962962963, |
|
"grad_norm": 9.625, |
|
"learning_rate": 2.2303030303030305e-06, |
|
"loss": 0.7604, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.03333333333333333, |
|
"grad_norm": 9.3125, |
|
"learning_rate": 2.509090909090909e-06, |
|
"loss": 0.7431, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.037037037037037035, |
|
"grad_norm": 9.125, |
|
"learning_rate": 2.787878787878788e-06, |
|
"loss": 0.7333, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.040740740740740744, |
|
"grad_norm": 9.875, |
|
"learning_rate": 3.0666666666666664e-06, |
|
"loss": 0.7854, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.044444444444444446, |
|
"grad_norm": 9.5625, |
|
"learning_rate": 3.3454545454545456e-06, |
|
"loss": 0.7264, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.04814814814814815, |
|
"grad_norm": 9.5625, |
|
"learning_rate": 3.624242424242424e-06, |
|
"loss": 0.7634, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.05185185185185185, |
|
"grad_norm": 9.4375, |
|
"learning_rate": 3.9030303030303035e-06, |
|
"loss": 0.7677, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.05555555555555555, |
|
"grad_norm": 9.5, |
|
"learning_rate": 4.1818181818181814e-06, |
|
"loss": 0.76, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.05925925925925926, |
|
"grad_norm": 9.625, |
|
"learning_rate": 4.460606060606061e-06, |
|
"loss": 0.7615, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.06296296296296296, |
|
"grad_norm": 9.5, |
|
"learning_rate": 4.59998964610166e-06, |
|
"loss": 0.7493, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.06666666666666667, |
|
"grad_norm": 9.4375, |
|
"learning_rate": 4.599906815474259e-06, |
|
"loss": 0.7541, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.07037037037037037, |
|
"grad_norm": 9.5625, |
|
"learning_rate": 4.599741157202478e-06, |
|
"loss": 0.7721, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.07407407407407407, |
|
"grad_norm": 9.125, |
|
"learning_rate": 4.599492677252244e-06, |
|
"loss": 0.7385, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.07777777777777778, |
|
"grad_norm": 9.375, |
|
"learning_rate": 4.599161384572187e-06, |
|
"loss": 0.7255, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.08148148148148149, |
|
"grad_norm": 9.375, |
|
"learning_rate": 4.5987472910933085e-06, |
|
"loss": 0.7548, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.08518518518518518, |
|
"grad_norm": 9.3125, |
|
"learning_rate": 4.598250411728554e-06, |
|
"loss": 0.747, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.08888888888888889, |
|
"grad_norm": 9.3125, |
|
"learning_rate": 4.597670764372279e-06, |
|
"loss": 0.7368, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.09259259259259259, |
|
"grad_norm": 9.25, |
|
"learning_rate": 4.5970083698996e-06, |
|
"loss": 0.7536, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.0962962962962963, |
|
"grad_norm": 9.3125, |
|
"learning_rate": 4.596263252165647e-06, |
|
"loss": 0.7242, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 9.375, |
|
"learning_rate": 4.595435438004701e-06, |
|
"loss": 0.7248, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.1037037037037037, |
|
"grad_norm": 9.1875, |
|
"learning_rate": 4.594524957229229e-06, |
|
"loss": 0.7113, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.10740740740740741, |
|
"grad_norm": 9.375, |
|
"learning_rate": 4.593531842628811e-06, |
|
"loss": 0.7352, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.1111111111111111, |
|
"grad_norm": 8.875, |
|
"learning_rate": 4.592456129968958e-06, |
|
"loss": 0.6914, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.11481481481481481, |
|
"grad_norm": 9.125, |
|
"learning_rate": 4.591297857989825e-06, |
|
"loss": 0.6962, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.11851851851851852, |
|
"grad_norm": 8.9375, |
|
"learning_rate": 4.590057068404815e-06, |
|
"loss": 0.7002, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.12222222222222222, |
|
"grad_norm": 9.1875, |
|
"learning_rate": 4.588733805899076e-06, |
|
"loss": 0.7196, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.1259259259259259, |
|
"grad_norm": 9.5625, |
|
"learning_rate": 4.587328118127894e-06, |
|
"loss": 0.7366, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.12962962962962962, |
|
"grad_norm": 9.0625, |
|
"learning_rate": 4.585840055714976e-06, |
|
"loss": 0.7042, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.13333333333333333, |
|
"grad_norm": 9.1875, |
|
"learning_rate": 4.584269672250625e-06, |
|
"loss": 0.6914, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.13703703703703704, |
|
"grad_norm": 9.1875, |
|
"learning_rate": 4.582617024289811e-06, |
|
"loss": 0.7045, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.14074074074074075, |
|
"grad_norm": 9.1875, |
|
"learning_rate": 4.5808821713501374e-06, |
|
"loss": 0.7275, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.14444444444444443, |
|
"grad_norm": 9.125, |
|
"learning_rate": 4.579065175909692e-06, |
|
"loss": 0.6943, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.14814814814814814, |
|
"grad_norm": 8.9375, |
|
"learning_rate": 4.577166103404801e-06, |
|
"loss": 0.6792, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.15185185185185185, |
|
"grad_norm": 9.1875, |
|
"learning_rate": 4.5751850222276705e-06, |
|
"loss": 0.6854, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.15555555555555556, |
|
"grad_norm": 8.875, |
|
"learning_rate": 4.573122003723925e-06, |
|
"loss": 0.6675, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.15925925925925927, |
|
"grad_norm": 9.625, |
|
"learning_rate": 4.5709771221900375e-06, |
|
"loss": 0.6882, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.16296296296296298, |
|
"grad_norm": 8.875, |
|
"learning_rate": 4.568750454870651e-06, |
|
"loss": 0.675, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.16666666666666666, |
|
"grad_norm": 9.0625, |
|
"learning_rate": 4.5664420819558035e-06, |
|
"loss": 0.6996, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.17037037037037037, |
|
"grad_norm": 9.375, |
|
"learning_rate": 4.564052086578031e-06, |
|
"loss": 0.7039, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.17407407407407408, |
|
"grad_norm": 8.8125, |
|
"learning_rate": 4.561580554809381e-06, |
|
"loss": 0.6587, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.17777777777777778, |
|
"grad_norm": 9.0, |
|
"learning_rate": 4.559027575658309e-06, |
|
"loss": 0.6556, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.1814814814814815, |
|
"grad_norm": 8.6875, |
|
"learning_rate": 4.556393241066476e-06, |
|
"loss": 0.6617, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.18518518518518517, |
|
"grad_norm": 9.375, |
|
"learning_rate": 4.553677645905431e-06, |
|
"loss": 0.6828, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.18888888888888888, |
|
"grad_norm": 8.625, |
|
"learning_rate": 4.550880887973204e-06, |
|
"loss": 0.6371, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.1925925925925926, |
|
"grad_norm": 8.8125, |
|
"learning_rate": 4.548003067990776e-06, |
|
"loss": 0.65, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.1962962962962963, |
|
"grad_norm": 8.875, |
|
"learning_rate": 4.545044289598454e-06, |
|
"loss": 0.6718, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 9.0625, |
|
"learning_rate": 4.54200465935214e-06, |
|
"loss": 0.6834, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.2037037037037037, |
|
"grad_norm": 8.9375, |
|
"learning_rate": 4.5388842867194925e-06, |
|
"loss": 0.6696, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.2074074074074074, |
|
"grad_norm": 8.8125, |
|
"learning_rate": 4.535683284075984e-06, |
|
"loss": 0.6811, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.2111111111111111, |
|
"grad_norm": 8.875, |
|
"learning_rate": 4.5324017667008535e-06, |
|
"loss": 0.666, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.21481481481481482, |
|
"grad_norm": 9.125, |
|
"learning_rate": 4.529039852772956e-06, |
|
"loss": 0.6534, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.21851851851851853, |
|
"grad_norm": 9.0, |
|
"learning_rate": 4.525597663366506e-06, |
|
"loss": 0.66, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.2222222222222222, |
|
"grad_norm": 8.6875, |
|
"learning_rate": 4.522075322446718e-06, |
|
"loss": 0.6418, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.22592592592592592, |
|
"grad_norm": 9.125, |
|
"learning_rate": 4.51847295686534e-06, |
|
"loss": 0.6477, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.22962962962962963, |
|
"grad_norm": 8.75, |
|
"learning_rate": 4.514790696356086e-06, |
|
"loss": 0.6517, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.23333333333333334, |
|
"grad_norm": 9.125, |
|
"learning_rate": 4.511028673529967e-06, |
|
"loss": 0.6605, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.23703703703703705, |
|
"grad_norm": 8.75, |
|
"learning_rate": 4.507187023870511e-06, |
|
"loss": 0.6302, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.24074074074074073, |
|
"grad_norm": 8.75, |
|
"learning_rate": 4.503265885728883e-06, |
|
"loss": 0.6274, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.24444444444444444, |
|
"grad_norm": 8.625, |
|
"learning_rate": 4.499265400318908e-06, |
|
"loss": 0.6427, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.24814814814814815, |
|
"grad_norm": 8.5, |
|
"learning_rate": 4.49518571171198e-06, |
|
"loss": 0.6336, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.2518518518518518, |
|
"grad_norm": 9.0, |
|
"learning_rate": 4.4910269668318775e-06, |
|
"loss": 0.6704, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.25555555555555554, |
|
"grad_norm": 8.8125, |
|
"learning_rate": 4.486789315449469e-06, |
|
"loss": 0.6276, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.25925925925925924, |
|
"grad_norm": 8.875, |
|
"learning_rate": 4.4824729101773205e-06, |
|
"loss": 0.6456, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.26296296296296295, |
|
"grad_norm": 8.875, |
|
"learning_rate": 4.4780779064642e-06, |
|
"loss": 0.6438, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.26666666666666666, |
|
"grad_norm": 8.875, |
|
"learning_rate": 4.473604462589479e-06, |
|
"loss": 0.6319, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.27037037037037037, |
|
"grad_norm": 9.5625, |
|
"learning_rate": 4.469052739657432e-06, |
|
"loss": 0.6766, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.2740740740740741, |
|
"grad_norm": 8.875, |
|
"learning_rate": 4.464422901591434e-06, |
|
"loss": 0.6316, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.2777777777777778, |
|
"grad_norm": 8.6875, |
|
"learning_rate": 4.459715115128058e-06, |
|
"loss": 0.629, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.2814814814814815, |
|
"grad_norm": 8.9375, |
|
"learning_rate": 4.454929549811071e-06, |
|
"loss": 0.6362, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.2851851851851852, |
|
"grad_norm": 8.75, |
|
"learning_rate": 4.450066377985326e-06, |
|
"loss": 0.6216, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.28888888888888886, |
|
"grad_norm": 9.0625, |
|
"learning_rate": 4.445125774790555e-06, |
|
"loss": 0.6535, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.29259259259259257, |
|
"grad_norm": 8.8125, |
|
"learning_rate": 4.440107918155065e-06, |
|
"loss": 0.6278, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.2962962962962963, |
|
"grad_norm": 8.8125, |
|
"learning_rate": 4.435012988789327e-06, |
|
"loss": 0.6178, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 8.4375, |
|
"learning_rate": 4.429841170179471e-06, |
|
"loss": 0.6243, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.3037037037037037, |
|
"grad_norm": 8.875, |
|
"learning_rate": 4.4245926485806745e-06, |
|
"loss": 0.6201, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.3074074074074074, |
|
"grad_norm": 8.75, |
|
"learning_rate": 4.419267613010454e-06, |
|
"loss": 0.6267, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.3111111111111111, |
|
"grad_norm": 8.75, |
|
"learning_rate": 4.413866255241867e-06, |
|
"loss": 0.6474, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.3148148148148148, |
|
"grad_norm": 8.4375, |
|
"learning_rate": 4.4083887697965915e-06, |
|
"loss": 0.6021, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.31851851851851853, |
|
"grad_norm": 8.5625, |
|
"learning_rate": 4.402835353937933e-06, |
|
"loss": 0.6044, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.32222222222222224, |
|
"grad_norm": 8.9375, |
|
"learning_rate": 4.397206207663713e-06, |
|
"loss": 0.6293, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.32592592592592595, |
|
"grad_norm": 8.6875, |
|
"learning_rate": 4.391501533699068e-06, |
|
"loss": 0.6039, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.3296296296296296, |
|
"grad_norm": 8.875, |
|
"learning_rate": 4.385721537489152e-06, |
|
"loss": 0.6116, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.3333333333333333, |
|
"grad_norm": 8.4375, |
|
"learning_rate": 4.379866427191734e-06, |
|
"loss": 0.5855, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.337037037037037, |
|
"grad_norm": 8.625, |
|
"learning_rate": 4.373936413669699e-06, |
|
"loss": 0.5902, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.34074074074074073, |
|
"grad_norm": 8.25, |
|
"learning_rate": 4.367931710483465e-06, |
|
"loss": 0.6014, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.34444444444444444, |
|
"grad_norm": 8.625, |
|
"learning_rate": 4.361852533883278e-06, |
|
"loss": 0.6035, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.34814814814814815, |
|
"grad_norm": 8.3125, |
|
"learning_rate": 4.355699102801434e-06, |
|
"loss": 0.5791, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.35185185185185186, |
|
"grad_norm": 8.6875, |
|
"learning_rate": 4.349471638844388e-06, |
|
"loss": 0.5952, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.35555555555555557, |
|
"grad_norm": 8.9375, |
|
"learning_rate": 4.3431703662847814e-06, |
|
"loss": 0.6405, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.3592592592592593, |
|
"grad_norm": 8.375, |
|
"learning_rate": 4.336795512053356e-06, |
|
"loss": 0.5953, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.362962962962963, |
|
"grad_norm": 8.6875, |
|
"learning_rate": 4.330347305730786e-06, |
|
"loss": 0.6133, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.36666666666666664, |
|
"grad_norm": 8.6875, |
|
"learning_rate": 4.323825979539413e-06, |
|
"loss": 0.6232, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.37037037037037035, |
|
"grad_norm": 8.625, |
|
"learning_rate": 4.317231768334875e-06, |
|
"loss": 0.6017, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.37407407407407406, |
|
"grad_norm": 8.4375, |
|
"learning_rate": 4.310564909597654e-06, |
|
"loss": 0.5907, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 0.37777777777777777, |
|
"grad_norm": 8.4375, |
|
"learning_rate": 4.303825643424525e-06, |
|
"loss": 0.62, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.3814814814814815, |
|
"grad_norm": 8.375, |
|
"learning_rate": 4.297014212519903e-06, |
|
"loss": 0.5838, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 0.3851851851851852, |
|
"grad_norm": 8.25, |
|
"learning_rate": 4.290130862187108e-06, |
|
"loss": 0.5953, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.3888888888888889, |
|
"grad_norm": 9.0, |
|
"learning_rate": 4.283175840319529e-06, |
|
"loss": 0.6231, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.3925925925925926, |
|
"grad_norm": 8.375, |
|
"learning_rate": 4.276149397391696e-06, |
|
"loss": 0.6054, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.3962962962962963, |
|
"grad_norm": 8.4375, |
|
"learning_rate": 4.269051786450258e-06, |
|
"loss": 0.5715, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 8.1875, |
|
"learning_rate": 4.261883263104874e-06, |
|
"loss": 0.5643, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.40370370370370373, |
|
"grad_norm": 8.375, |
|
"learning_rate": 4.2546440855190055e-06, |
|
"loss": 0.5686, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 0.4074074074074074, |
|
"grad_norm": 8.875, |
|
"learning_rate": 4.2473345144006165e-06, |
|
"loss": 0.595, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.4111111111111111, |
|
"grad_norm": 8.8125, |
|
"learning_rate": 4.239954812992789e-06, |
|
"loss": 0.6095, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.4148148148148148, |
|
"grad_norm": 8.3125, |
|
"learning_rate": 4.23250524706424e-06, |
|
"loss": 0.554, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.4185185185185185, |
|
"grad_norm": 8.875, |
|
"learning_rate": 4.224986084899751e-06, |
|
"loss": 0.6128, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 0.4222222222222222, |
|
"grad_norm": 8.4375, |
|
"learning_rate": 4.217397597290506e-06, |
|
"loss": 0.5895, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.42592592592592593, |
|
"grad_norm": 8.6875, |
|
"learning_rate": 4.20974005752434e-06, |
|
"loss": 0.613, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.42962962962962964, |
|
"grad_norm": 8.625, |
|
"learning_rate": 4.202013741375896e-06, |
|
"loss": 0.5693, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.43333333333333335, |
|
"grad_norm": 8.0625, |
|
"learning_rate": 4.194218927096692e-06, |
|
"loss": 0.5706, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.43703703703703706, |
|
"grad_norm": 8.75, |
|
"learning_rate": 4.186355895405106e-06, |
|
"loss": 0.5816, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.44074074074074077, |
|
"grad_norm": 8.6875, |
|
"learning_rate": 4.1784249294762585e-06, |
|
"loss": 0.5983, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.4444444444444444, |
|
"grad_norm": 8.5625, |
|
"learning_rate": 4.170426314931819e-06, |
|
"loss": 0.5774, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.44814814814814813, |
|
"grad_norm": 8.8125, |
|
"learning_rate": 4.16236033982972e-06, |
|
"loss": 0.5933, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 0.45185185185185184, |
|
"grad_norm": 8.8125, |
|
"learning_rate": 4.154227294653782e-06, |
|
"loss": 0.5751, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.45555555555555555, |
|
"grad_norm": 8.0625, |
|
"learning_rate": 4.146027472303251e-06, |
|
"loss": 0.5488, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 0.45925925925925926, |
|
"grad_norm": 8.375, |
|
"learning_rate": 4.137761168082251e-06, |
|
"loss": 0.5881, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.46296296296296297, |
|
"grad_norm": 8.5625, |
|
"learning_rate": 4.129428679689153e-06, |
|
"loss": 0.5714, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.4666666666666667, |
|
"grad_norm": 8.4375, |
|
"learning_rate": 4.121030307205846e-06, |
|
"loss": 0.5852, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.4703703703703704, |
|
"grad_norm": 8.1875, |
|
"learning_rate": 4.112566353086935e-06, |
|
"loss": 0.5773, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 0.4740740740740741, |
|
"grad_norm": 8.3125, |
|
"learning_rate": 4.1040371221488506e-06, |
|
"loss": 0.5557, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.4777777777777778, |
|
"grad_norm": 8.5, |
|
"learning_rate": 4.0954429215588655e-06, |
|
"loss": 0.5793, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 0.48148148148148145, |
|
"grad_norm": 8.125, |
|
"learning_rate": 4.086784060824037e-06, |
|
"loss": 0.5691, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.48518518518518516, |
|
"grad_norm": 8.5, |
|
"learning_rate": 4.078060851780059e-06, |
|
"loss": 0.569, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 0.4888888888888889, |
|
"grad_norm": 8.5, |
|
"learning_rate": 4.069273608580033e-06, |
|
"loss": 0.5831, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.4925925925925926, |
|
"grad_norm": 8.8125, |
|
"learning_rate": 4.060422647683151e-06, |
|
"loss": 0.5827, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 0.4962962962962963, |
|
"grad_norm": 8.1875, |
|
"learning_rate": 4.051508287843302e-06, |
|
"loss": 0.5563, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 8.75, |
|
"learning_rate": 4.042530850097591e-06, |
|
"loss": 0.5807, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.5037037037037037, |
|
"grad_norm": 8.4375, |
|
"learning_rate": 4.033490657754778e-06, |
|
"loss": 0.5927, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.5074074074074074, |
|
"grad_norm": 8.75, |
|
"learning_rate": 4.024388036383636e-06, |
|
"loss": 0.5615, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 0.5111111111111111, |
|
"grad_norm": 8.625, |
|
"learning_rate": 4.015223313801222e-06, |
|
"loss": 0.5552, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.5148148148148148, |
|
"grad_norm": 8.25, |
|
"learning_rate": 4.0059968200610755e-06, |
|
"loss": 0.5492, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 0.5185185185185185, |
|
"grad_norm": 8.375, |
|
"learning_rate": 3.996708887441328e-06, |
|
"loss": 0.5802, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.5222222222222223, |
|
"grad_norm": 8.375, |
|
"learning_rate": 3.98735985043274e-06, |
|
"loss": 0.5961, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 0.5259259259259259, |
|
"grad_norm": 8.6875, |
|
"learning_rate": 3.977950045726656e-06, |
|
"loss": 0.587, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 0.5296296296296297, |
|
"grad_norm": 8.125, |
|
"learning_rate": 3.968479812202871e-06, |
|
"loss": 0.5425, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 0.5333333333333333, |
|
"grad_norm": 8.5625, |
|
"learning_rate": 3.9589494909174376e-06, |
|
"loss": 0.5566, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.5370370370370371, |
|
"grad_norm": 8.5625, |
|
"learning_rate": 3.949359425090375e-06, |
|
"loss": 0.5746, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.5407407407407407, |
|
"grad_norm": 8.5625, |
|
"learning_rate": 3.939709960093312e-06, |
|
"loss": 0.5679, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 0.5444444444444444, |
|
"grad_norm": 8.125, |
|
"learning_rate": 3.930001443437046e-06, |
|
"loss": 0.5553, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 0.5481481481481482, |
|
"grad_norm": 8.75, |
|
"learning_rate": 3.920234224759034e-06, |
|
"loss": 0.578, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 0.5518518518518518, |
|
"grad_norm": 8.625, |
|
"learning_rate": 3.910408655810793e-06, |
|
"loss": 0.5866, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 0.5555555555555556, |
|
"grad_norm": 8.625, |
|
"learning_rate": 3.900525090445238e-06, |
|
"loss": 0.5767, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.5592592592592592, |
|
"grad_norm": 8.5, |
|
"learning_rate": 3.890583884603937e-06, |
|
"loss": 0.5557, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 0.562962962962963, |
|
"grad_norm": 8.625, |
|
"learning_rate": 3.880585396304293e-06, |
|
"loss": 0.5618, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 0.5666666666666667, |
|
"grad_norm": 8.0625, |
|
"learning_rate": 3.870529985626646e-06, |
|
"loss": 0.5323, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 0.5703703703703704, |
|
"grad_norm": 8.8125, |
|
"learning_rate": 3.860418014701313e-06, |
|
"loss": 0.5737, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 0.5740740740740741, |
|
"grad_norm": 8.125, |
|
"learning_rate": 3.850249847695538e-06, |
|
"loss": 0.5486, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.5777777777777777, |
|
"grad_norm": 8.125, |
|
"learning_rate": 3.840025850800386e-06, |
|
"loss": 0.5394, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 0.5814814814814815, |
|
"grad_norm": 8.125, |
|
"learning_rate": 3.8297463922175465e-06, |
|
"loss": 0.5573, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 0.5851851851851851, |
|
"grad_norm": 8.5625, |
|
"learning_rate": 3.81941184214608e-06, |
|
"loss": 0.5522, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 0.5888888888888889, |
|
"grad_norm": 8.625, |
|
"learning_rate": 3.8090225727690826e-06, |
|
"loss": 0.5727, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 0.5925925925925926, |
|
"grad_norm": 8.125, |
|
"learning_rate": 3.798578958240281e-06, |
|
"loss": 0.5549, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.5962962962962963, |
|
"grad_norm": 8.1875, |
|
"learning_rate": 3.7880813746705614e-06, |
|
"loss": 0.5452, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 8.0625, |
|
"learning_rate": 3.7775302001144237e-06, |
|
"loss": 0.5246, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 0.6037037037037037, |
|
"grad_norm": 8.6875, |
|
"learning_rate": 3.7669258145563636e-06, |
|
"loss": 0.5316, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 0.6074074074074074, |
|
"grad_norm": 8.5625, |
|
"learning_rate": 3.756268599897193e-06, |
|
"loss": 0.5523, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 0.6111111111111112, |
|
"grad_norm": 8.375, |
|
"learning_rate": 3.74555893994028e-06, |
|
"loss": 0.5412, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.6148148148148148, |
|
"grad_norm": 8.4375, |
|
"learning_rate": 3.7347972203777317e-06, |
|
"loss": 0.5598, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 0.6185185185185185, |
|
"grad_norm": 7.96875, |
|
"learning_rate": 3.7239838287765044e-06, |
|
"loss": 0.5334, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 0.6222222222222222, |
|
"grad_norm": 8.1875, |
|
"learning_rate": 3.7131191545644415e-06, |
|
"loss": 0.5353, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 0.6259259259259259, |
|
"grad_norm": 8.375, |
|
"learning_rate": 3.702203589016253e-06, |
|
"loss": 0.5547, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 0.6296296296296297, |
|
"grad_norm": 8.625, |
|
"learning_rate": 3.691237525239424e-06, |
|
"loss": 0.5703, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.6333333333333333, |
|
"grad_norm": 8.5, |
|
"learning_rate": 3.6802213581600538e-06, |
|
"loss": 0.5669, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 0.6370370370370371, |
|
"grad_norm": 8.6875, |
|
"learning_rate": 3.669155484508639e-06, |
|
"loss": 0.5535, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 0.6407407407407407, |
|
"grad_norm": 8.125, |
|
"learning_rate": 3.6580403028057785e-06, |
|
"loss": 0.544, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 0.6444444444444445, |
|
"grad_norm": 8.375, |
|
"learning_rate": 3.6468762133478317e-06, |
|
"loss": 0.5295, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 0.6481481481481481, |
|
"grad_norm": 8.625, |
|
"learning_rate": 3.6356636181924892e-06, |
|
"loss": 0.5698, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.6518518518518519, |
|
"grad_norm": 7.9375, |
|
"learning_rate": 3.6244029211443076e-06, |
|
"loss": 0.5422, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 0.6555555555555556, |
|
"grad_norm": 8.6875, |
|
"learning_rate": 3.613094527740155e-06, |
|
"loss": 0.5365, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 0.6592592592592592, |
|
"grad_norm": 8.5625, |
|
"learning_rate": 3.601738845234613e-06, |
|
"loss": 0.5389, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 0.662962962962963, |
|
"grad_norm": 8.625, |
|
"learning_rate": 3.5903362825853077e-06, |
|
"loss": 0.5571, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 0.6666666666666666, |
|
"grad_norm": 8.0625, |
|
"learning_rate": 3.5788872504381836e-06, |
|
"loss": 0.5259, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.6703703703703704, |
|
"grad_norm": 8.375, |
|
"learning_rate": 3.5673921611127115e-06, |
|
"loss": 0.557, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 0.674074074074074, |
|
"grad_norm": 8.0625, |
|
"learning_rate": 3.5558514285870426e-06, |
|
"loss": 0.5442, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 0.6777777777777778, |
|
"grad_norm": 8.0625, |
|
"learning_rate": 3.5442654684830982e-06, |
|
"loss": 0.5267, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 0.6814814814814815, |
|
"grad_norm": 8.5, |
|
"learning_rate": 3.5326346980516022e-06, |
|
"loss": 0.5572, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 0.6851851851851852, |
|
"grad_norm": 8.375, |
|
"learning_rate": 3.520959536157054e-06, |
|
"loss": 0.5482, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.6888888888888889, |
|
"grad_norm": 8.4375, |
|
"learning_rate": 3.5092404032626437e-06, |
|
"loss": 0.5537, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 0.6925925925925925, |
|
"grad_norm": 8.0625, |
|
"learning_rate": 3.4974777214151117e-06, |
|
"loss": 0.5388, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 0.6962962962962963, |
|
"grad_norm": 8.5, |
|
"learning_rate": 3.4856719142295446e-06, |
|
"loss": 0.5352, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 8.375, |
|
"learning_rate": 3.4738234068741254e-06, |
|
"loss": 0.5161, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 0.7037037037037037, |
|
"grad_norm": 7.96875, |
|
"learning_rate": 3.4619326260548185e-06, |
|
"loss": 0.544, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.7074074074074074, |
|
"grad_norm": 7.9375, |
|
"learning_rate": 3.45e-06, |
|
"loss": 0.5386, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 0.7111111111111111, |
|
"grad_norm": 8.375, |
|
"learning_rate": 3.438025958445042e-06, |
|
"loss": 0.5254, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 0.7148148148148148, |
|
"grad_norm": 8.375, |
|
"learning_rate": 3.4260109326168295e-06, |
|
"loss": 0.5373, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 0.7185185185185186, |
|
"grad_norm": 8.0625, |
|
"learning_rate": 3.413955355218237e-06, |
|
"loss": 0.5188, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 0.7222222222222222, |
|
"grad_norm": 8.0625, |
|
"learning_rate": 3.40185966041254e-06, |
|
"loss": 0.5286, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.725925925925926, |
|
"grad_norm": 8.3125, |
|
"learning_rate": 3.3897242838077815e-06, |
|
"loss": 0.5281, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 0.7296296296296296, |
|
"grad_norm": 8.5, |
|
"learning_rate": 3.3775496624410846e-06, |
|
"loss": 0.5568, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 0.7333333333333333, |
|
"grad_norm": 7.90625, |
|
"learning_rate": 3.365336234762914e-06, |
|
"loss": 0.5025, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 0.737037037037037, |
|
"grad_norm": 7.9375, |
|
"learning_rate": 3.3530844406212813e-06, |
|
"loss": 0.5306, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 0.7407407407407407, |
|
"grad_norm": 8.125, |
|
"learning_rate": 3.340794721245911e-06, |
|
"loss": 0.5232, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.7444444444444445, |
|
"grad_norm": 8.4375, |
|
"learning_rate": 3.3284675192323466e-06, |
|
"loss": 0.5402, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 0.7481481481481481, |
|
"grad_norm": 8.625, |
|
"learning_rate": 3.3161032785260114e-06, |
|
"loss": 0.5587, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 0.7518518518518519, |
|
"grad_norm": 7.875, |
|
"learning_rate": 3.30370244440622e-06, |
|
"loss": 0.5217, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 0.7555555555555555, |
|
"grad_norm": 8.125, |
|
"learning_rate": 3.291265463470143e-06, |
|
"loss": 0.5097, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 0.7592592592592593, |
|
"grad_norm": 7.90625, |
|
"learning_rate": 3.2787927836167273e-06, |
|
"loss": 0.5301, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.762962962962963, |
|
"grad_norm": 7.875, |
|
"learning_rate": 3.2662848540305566e-06, |
|
"loss": 0.515, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 0.7666666666666667, |
|
"grad_norm": 8.1875, |
|
"learning_rate": 3.253742125165684e-06, |
|
"loss": 0.5769, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 0.7703703703703704, |
|
"grad_norm": 8.0625, |
|
"learning_rate": 3.241165048729404e-06, |
|
"loss": 0.5125, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 0.774074074074074, |
|
"grad_norm": 7.96875, |
|
"learning_rate": 3.2285540776659865e-06, |
|
"loss": 0.5187, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 0.7777777777777778, |
|
"grad_norm": 7.96875, |
|
"learning_rate": 3.215909666140367e-06, |
|
"loss": 0.5163, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.7814814814814814, |
|
"grad_norm": 8.4375, |
|
"learning_rate": 3.2032322695217835e-06, |
|
"loss": 0.5378, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 0.7851851851851852, |
|
"grad_norm": 8.1875, |
|
"learning_rate": 3.1905223443673878e-06, |
|
"loss": 0.5254, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 0.7888888888888889, |
|
"grad_norm": 8.1875, |
|
"learning_rate": 3.1777803484057937e-06, |
|
"loss": 0.5277, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 0.7925925925925926, |
|
"grad_norm": 7.90625, |
|
"learning_rate": 3.165006740520598e-06, |
|
"loss": 0.514, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 0.7962962962962963, |
|
"grad_norm": 8.0625, |
|
"learning_rate": 3.1522019807338508e-06, |
|
"loss": 0.5231, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 8.25, |
|
"learning_rate": 3.1393665301894926e-06, |
|
"loss": 0.5135, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 0.8037037037037037, |
|
"grad_norm": 8.0, |
|
"learning_rate": 3.126500851136745e-06, |
|
"loss": 0.5319, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 0.8074074074074075, |
|
"grad_norm": 8.0625, |
|
"learning_rate": 3.1136054069134607e-06, |
|
"loss": 0.53, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 0.8111111111111111, |
|
"grad_norm": 8.5, |
|
"learning_rate": 3.1006806619294428e-06, |
|
"loss": 0.5229, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 0.8148148148148148, |
|
"grad_norm": 8.5625, |
|
"learning_rate": 3.087727081649715e-06, |
|
"loss": 0.5429, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.8185185185185185, |
|
"grad_norm": 8.0625, |
|
"learning_rate": 3.0747451325777605e-06, |
|
"loss": 0.5103, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 0.8222222222222222, |
|
"grad_norm": 8.5625, |
|
"learning_rate": 3.061735282238722e-06, |
|
"loss": 0.5541, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 0.825925925925926, |
|
"grad_norm": 7.90625, |
|
"learning_rate": 3.0486979991625627e-06, |
|
"loss": 0.5284, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 0.8296296296296296, |
|
"grad_norm": 8.375, |
|
"learning_rate": 3.035633752867196e-06, |
|
"loss": 0.5209, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 0.8333333333333334, |
|
"grad_norm": 8.375, |
|
"learning_rate": 3.022543013841572e-06, |
|
"loss": 0.53, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.837037037037037, |
|
"grad_norm": 7.8125, |
|
"learning_rate": 3.0094262535287385e-06, |
|
"loss": 0.5104, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 0.8407407407407408, |
|
"grad_norm": 7.78125, |
|
"learning_rate": 2.99628394430886e-06, |
|
"loss": 0.4987, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 0.8444444444444444, |
|
"grad_norm": 7.875, |
|
"learning_rate": 2.9831165594822035e-06, |
|
"loss": 0.5181, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 0.8481481481481481, |
|
"grad_norm": 7.90625, |
|
"learning_rate": 2.9699245732521005e-06, |
|
"loss": 0.5204, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 0.8518518518518519, |
|
"grad_norm": 8.3125, |
|
"learning_rate": 2.95670846070786e-06, |
|
"loss": 0.5254, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.8555555555555555, |
|
"grad_norm": 8.5, |
|
"learning_rate": 2.943468697807666e-06, |
|
"loss": 0.5233, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 0.8592592592592593, |
|
"grad_norm": 8.25, |
|
"learning_rate": 2.930205761361434e-06, |
|
"loss": 0.5238, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 0.8629629629629629, |
|
"grad_norm": 8.5, |
|
"learning_rate": 2.9169201290136377e-06, |
|
"loss": 0.5196, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 0.8666666666666667, |
|
"grad_norm": 8.3125, |
|
"learning_rate": 2.903612279226112e-06, |
|
"loss": 0.5359, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 0.8703703703703703, |
|
"grad_norm": 7.6875, |
|
"learning_rate": 2.8902826912608155e-06, |
|
"loss": 0.5168, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.8740740740740741, |
|
"grad_norm": 8.0625, |
|
"learning_rate": 2.8769318451625792e-06, |
|
"loss": 0.5393, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 0.8777777777777778, |
|
"grad_norm": 7.75, |
|
"learning_rate": 2.8635602217418073e-06, |
|
"loss": 0.5155, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 0.8814814814814815, |
|
"grad_norm": 7.75, |
|
"learning_rate": 2.850168302557173e-06, |
|
"loss": 0.4892, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 0.8851851851851852, |
|
"grad_norm": 7.8125, |
|
"learning_rate": 2.8367565698982674e-06, |
|
"loss": 0.5184, |
|
"step": 478 |
|
}, |
|
{ |
|
"epoch": 0.8888888888888888, |
|
"grad_norm": 8.4375, |
|
"learning_rate": 2.8233255067682357e-06, |
|
"loss": 0.526, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.8925925925925926, |
|
"grad_norm": 8.4375, |
|
"learning_rate": 2.8098755968663775e-06, |
|
"loss": 0.5202, |
|
"step": 482 |
|
}, |
|
{ |
|
"epoch": 0.8962962962962963, |
|
"grad_norm": 8.25, |
|
"learning_rate": 2.7964073245707345e-06, |
|
"loss": 0.5124, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 8.375, |
|
"learning_rate": 2.7829211749206393e-06, |
|
"loss": 0.5341, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 0.9037037037037037, |
|
"grad_norm": 8.25, |
|
"learning_rate": 2.76941763359925e-06, |
|
"loss": 0.5118, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 0.9074074074074074, |
|
"grad_norm": 8.3125, |
|
"learning_rate": 2.7558971869160605e-06, |
|
"loss": 0.5252, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.9111111111111111, |
|
"grad_norm": 7.6875, |
|
"learning_rate": 2.7423603217893853e-06, |
|
"loss": 0.5061, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 0.9148148148148149, |
|
"grad_norm": 8.375, |
|
"learning_rate": 2.7288075257288237e-06, |
|
"loss": 0.5002, |
|
"step": 494 |
|
}, |
|
{ |
|
"epoch": 0.9185185185185185, |
|
"grad_norm": 8.5625, |
|
"learning_rate": 2.7152392868177043e-06, |
|
"loss": 0.542, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 0.9222222222222223, |
|
"grad_norm": 7.78125, |
|
"learning_rate": 2.7016560936955053e-06, |
|
"loss": 0.5133, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 0.9259259259259259, |
|
"grad_norm": 8.0, |
|
"learning_rate": 2.6880584355402586e-06, |
|
"loss": 0.524, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.9296296296296296, |
|
"grad_norm": 7.8125, |
|
"learning_rate": 2.6744468020509324e-06, |
|
"loss": 0.5031, |
|
"step": 502 |
|
}, |
|
{ |
|
"epoch": 0.9333333333333333, |
|
"grad_norm": 8.375, |
|
"learning_rate": 2.6608216834297947e-06, |
|
"loss": 0.5137, |
|
"step": 504 |
|
}, |
|
{ |
|
"epoch": 0.937037037037037, |
|
"grad_norm": 7.8125, |
|
"learning_rate": 2.647183570364761e-06, |
|
"loss": 0.5095, |
|
"step": 506 |
|
}, |
|
{ |
|
"epoch": 0.9407407407407408, |
|
"grad_norm": 8.25, |
|
"learning_rate": 2.633532954011721e-06, |
|
"loss": 0.5195, |
|
"step": 508 |
|
}, |
|
{ |
|
"epoch": 0.9444444444444444, |
|
"grad_norm": 8.375, |
|
"learning_rate": 2.6198703259768517e-06, |
|
"loss": 0.5133, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.9481481481481482, |
|
"grad_norm": 7.84375, |
|
"learning_rate": 2.606196178298913e-06, |
|
"loss": 0.4951, |
|
"step": 512 |
|
}, |
|
{ |
|
"epoch": 0.9518518518518518, |
|
"grad_norm": 8.5, |
|
"learning_rate": 2.592511003431526e-06, |
|
"loss": 0.5214, |
|
"step": 514 |
|
}, |
|
{ |
|
"epoch": 0.9555555555555556, |
|
"grad_norm": 8.5, |
|
"learning_rate": 2.5788152942254395e-06, |
|
"loss": 0.5309, |
|
"step": 516 |
|
}, |
|
{ |
|
"epoch": 0.9592592592592593, |
|
"grad_norm": 7.78125, |
|
"learning_rate": 2.5651095439107826e-06, |
|
"loss": 0.5009, |
|
"step": 518 |
|
}, |
|
{ |
|
"epoch": 0.9629629629629629, |
|
"grad_norm": 8.375, |
|
"learning_rate": 2.5513942460792966e-06, |
|
"loss": 0.503, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.9666666666666667, |
|
"grad_norm": 8.25, |
|
"learning_rate": 2.5376698946665634e-06, |
|
"loss": 0.5223, |
|
"step": 522 |
|
}, |
|
{ |
|
"epoch": 0.9703703703703703, |
|
"grad_norm": 8.4375, |
|
"learning_rate": 2.523936983934217e-06, |
|
"loss": 0.5341, |
|
"step": 524 |
|
}, |
|
{ |
|
"epoch": 0.9740740740740741, |
|
"grad_norm": 8.375, |
|
"learning_rate": 2.5101960084521407e-06, |
|
"loss": 0.5507, |
|
"step": 526 |
|
}, |
|
{ |
|
"epoch": 0.9777777777777777, |
|
"grad_norm": 8.3125, |
|
"learning_rate": 2.4964474630806573e-06, |
|
"loss": 0.5107, |
|
"step": 528 |
|
}, |
|
{ |
|
"epoch": 0.9814814814814815, |
|
"grad_norm": 8.25, |
|
"learning_rate": 2.482691842952709e-06, |
|
"loss": 0.5035, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.9851851851851852, |
|
"grad_norm": 8.5, |
|
"learning_rate": 2.468929643456024e-06, |
|
"loss": 0.5083, |
|
"step": 532 |
|
}, |
|
{ |
|
"epoch": 0.9888888888888889, |
|
"grad_norm": 8.25, |
|
"learning_rate": 2.4551613602152758e-06, |
|
"loss": 0.5121, |
|
"step": 534 |
|
}, |
|
{ |
|
"epoch": 0.9925925925925926, |
|
"grad_norm": 8.3125, |
|
"learning_rate": 2.4413874890742364e-06, |
|
"loss": 0.535, |
|
"step": 536 |
|
}, |
|
{ |
|
"epoch": 0.9962962962962963, |
|
"grad_norm": 8.1875, |
|
"learning_rate": 2.427608526077915e-06, |
|
"loss": 0.5184, |
|
"step": 538 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 7.6875, |
|
"learning_rate": 2.413824967454698e-06, |
|
"loss": 0.4873, |
|
"step": 540 |
|
} |
|
], |
|
"logging_steps": 2, |
|
"max_steps": 1080, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 540, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 8.66563154509824e+16, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|