{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 500,
  "global_step": 2000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.005,
      "grad_norm": 181.0,
      "learning_rate": 2.2500000000000002e-07,
      "loss": 0.5867,
      "step": 10
    },
    {
      "epoch": 0.01,
      "grad_norm": 178.0,
      "learning_rate": 4.7500000000000006e-07,
      "loss": 0.5369,
      "step": 20
    },
    {
      "epoch": 0.015,
      "grad_norm": 185.0,
      "learning_rate": 7.25e-07,
      "loss": 0.6025,
      "step": 30
    },
    {
      "epoch": 0.02,
      "grad_norm": 159.0,
      "learning_rate": 9.750000000000002e-07,
      "loss": 0.5992,
      "step": 40
    },
    {
      "epoch": 0.025,
      "grad_norm": 145.0,
      "learning_rate": 1.2250000000000001e-06,
      "loss": 0.6375,
      "step": 50
    },
    {
      "epoch": 0.03,
      "grad_norm": 7.96875,
      "learning_rate": 1.475e-06,
      "loss": 0.6512,
      "step": 60
    },
    {
      "epoch": 0.035,
      "grad_norm": 5.9375,
      "learning_rate": 1.725e-06,
      "loss": 0.5709,
      "step": 70
    },
    {
      "epoch": 0.04,
      "grad_norm": 37.75,
      "learning_rate": 1.975e-06,
      "loss": 0.5432,
      "step": 80
    },
    {
      "epoch": 0.045,
      "grad_norm": 4.03125,
      "learning_rate": 2.2250000000000003e-06,
      "loss": 0.5684,
      "step": 90
    },
    {
      "epoch": 0.05,
      "grad_norm": 81.5,
      "learning_rate": 2.475e-06,
      "loss": 0.5439,
      "step": 100
    },
    {
      "epoch": 0.055,
      "grad_norm": 4.125,
      "learning_rate": 2.7250000000000006e-06,
      "loss": 0.5053,
      "step": 110
    },
    {
      "epoch": 0.06,
      "grad_norm": 4.03125,
      "learning_rate": 2.9750000000000003e-06,
      "loss": 0.5178,
      "step": 120
    },
    {
      "epoch": 0.065,
      "grad_norm": 2.296875,
      "learning_rate": 3.2250000000000005e-06,
      "loss": 0.5037,
      "step": 130
    },
    {
      "epoch": 0.07,
      "grad_norm": 31.125,
      "learning_rate": 3.475e-06,
      "loss": 0.4973,
      "step": 140
    },
    {
      "epoch": 0.075,
      "grad_norm": 4.09375,
      "learning_rate": 3.7250000000000003e-06,
      "loss": 0.5008,
      "step": 150
    },
    {
      "epoch": 0.08,
      "grad_norm": 2.890625,
      "learning_rate": 3.975000000000001e-06,
      "loss": 0.5391,
      "step": 160
    },
    {
      "epoch": 0.085,
      "grad_norm": 2.984375,
      "learning_rate": 4.225e-06,
      "loss": 0.4914,
      "step": 170
    },
    {
      "epoch": 0.09,
      "grad_norm": 2.609375,
      "learning_rate": 4.475e-06,
      "loss": 0.5141,
      "step": 180
    },
    {
      "epoch": 0.095,
      "grad_norm": 3.25,
      "learning_rate": 4.7250000000000005e-06,
      "loss": 0.4637,
      "step": 190
    },
    {
      "epoch": 0.1,
      "grad_norm": 11.625,
      "learning_rate": 4.975000000000001e-06,
      "loss": 0.4551,
      "step": 200
    },
    {
      "epoch": 0.105,
      "grad_norm": 2.21875,
      "learning_rate": 4.9996915812041515e-06,
      "loss": 0.4729,
      "step": 210
    },
    {
      "epoch": 0.11,
      "grad_norm": 2.09375,
      "learning_rate": 4.998625539854394e-06,
      "loss": 0.4775,
      "step": 220
    },
    {
      "epoch": 0.115,
      "grad_norm": 2.46875,
      "learning_rate": 4.996798392960466e-06,
      "loss": 0.4682,
      "step": 230
    },
    {
      "epoch": 0.12,
      "grad_norm": 1.9375,
      "learning_rate": 4.9942106970890136e-06,
      "loss": 0.4684,
      "step": 240
    },
    {
      "epoch": 0.125,
      "grad_norm": 2.375,
      "learning_rate": 4.990863240477266e-06,
      "loss": 0.4549,
      "step": 250
    },
    {
      "epoch": 0.13,
      "grad_norm": 2.21875,
      "learning_rate": 4.9867570427929356e-06,
      "loss": 0.4758,
      "step": 260
    },
    {
      "epoch": 0.135,
      "grad_norm": 2.328125,
      "learning_rate": 4.981893354823614e-06,
      "loss": 0.4531,
      "step": 270
    },
    {
      "epoch": 0.14,
      "grad_norm": 2.1875,
      "learning_rate": 4.976273658095772e-06,
      "loss": 0.4609,
      "step": 280
    },
    {
      "epoch": 0.145,
      "grad_norm": 1.984375,
      "learning_rate": 4.969899664423473e-06,
      "loss": 0.4629,
      "step": 290
    },
    {
      "epoch": 0.15,
      "grad_norm": 1.96875,
      "learning_rate": 4.962773315386935e-06,
      "loss": 0.4469,
      "step": 300
    },
    {
      "epoch": 0.155,
      "grad_norm": 2.359375,
      "learning_rate": 4.95489678174111e-06,
      "loss": 0.5066,
      "step": 310
    },
    {
      "epoch": 0.16,
      "grad_norm": 1.9375,
      "learning_rate": 4.946272462754447e-06,
      "loss": 0.4465,
      "step": 320
    },
    {
      "epoch": 0.165,
      "grad_norm": 2.0625,
      "learning_rate": 4.936902985478055e-06,
      "loss": 0.4424,
      "step": 330
    },
    {
      "epoch": 0.17,
      "grad_norm": 1.8671875,
      "learning_rate": 4.926791203945477e-06,
      "loss": 0.4582,
      "step": 340
    },
    {
      "epoch": 0.175,
      "grad_norm": 2.921875,
      "learning_rate": 4.915940198303324e-06,
      "loss": 0.4988,
      "step": 350
    },
    {
      "epoch": 0.18,
      "grad_norm": 2.0625,
      "learning_rate": 4.904353273873029e-06,
      "loss": 0.458,
      "step": 360
    },
    {
      "epoch": 0.185,
      "grad_norm": 2.03125,
      "learning_rate": 4.89203396014402e-06,
      "loss": 0.4227,
      "step": 370
    },
    {
      "epoch": 0.19,
      "grad_norm": 2.046875,
      "learning_rate": 4.878986009698596e-06,
      "loss": 0.4311,
      "step": 380
    },
    {
      "epoch": 0.195,
      "grad_norm": 2.0,
      "learning_rate": 4.865213397068864e-06,
      "loss": 0.4676,
      "step": 390
    },
    {
      "epoch": 0.2,
      "grad_norm": 1.8984375,
      "learning_rate": 4.850720317526047e-06,
      "loss": 0.4613,
      "step": 400
    },
    {
      "epoch": 0.205,
      "grad_norm": 2.359375,
      "learning_rate": 4.835511185802574e-06,
      "loss": 0.4557,
      "step": 410
    },
    {
      "epoch": 0.21,
      "grad_norm": 2.375,
      "learning_rate": 4.8195906347473e-06,
      "loss": 0.4549,
      "step": 420
    },
    {
      "epoch": 0.215,
      "grad_norm": 2.0,
      "learning_rate": 4.802963513914304e-06,
      "loss": 0.4551,
      "step": 430
    },
    {
      "epoch": 0.22,
      "grad_norm": 1.984375,
      "learning_rate": 4.7856348880856595e-06,
      "loss": 0.4965,
      "step": 440
    },
    {
      "epoch": 0.225,
      "grad_norm": 1.8046875,
      "learning_rate": 4.767610035728663e-06,
      "loss": 0.4322,
      "step": 450
    },
    {
      "epoch": 0.23,
      "grad_norm": 1.84375,
      "learning_rate": 4.7488944473879515e-06,
      "loss": 0.4396,
      "step": 460
    },
    {
      "epoch": 0.235,
      "grad_norm": 2.484375,
      "learning_rate": 4.729493824013036e-06,
      "loss": 0.4568,
      "step": 470
    },
    {
      "epoch": 0.24,
      "grad_norm": 1.6328125,
      "learning_rate": 4.709414075221734e-06,
      "loss": 0.4641,
      "step": 480
    },
    {
      "epoch": 0.245,
      "grad_norm": 2.890625,
      "learning_rate": 4.688661317500045e-06,
      "loss": 0.4416,
      "step": 490
    },
    {
      "epoch": 0.25,
      "grad_norm": 1.875,
      "learning_rate": 4.667241872339007e-06,
      "loss": 0.4379,
      "step": 500
    },
    {
      "epoch": 0.255,
      "grad_norm": 1.9140625,
      "learning_rate": 4.645162264309112e-06,
      "loss": 0.4637,
      "step": 510
    },
    {
      "epoch": 0.26,
      "grad_norm": 2.0625,
      "learning_rate": 4.622429219072854e-06,
      "loss": 0.4652,
      "step": 520
    },
    {
      "epoch": 0.265,
      "grad_norm": 1.59375,
      "learning_rate": 4.599049661336033e-06,
      "loss": 0.4172,
      "step": 530
    },
    {
      "epoch": 0.27,
      "grad_norm": 1.625,
      "learning_rate": 4.5750307127384194e-06,
      "loss": 0.4557,
      "step": 540
    },
    {
      "epoch": 0.275,
      "grad_norm": 2.109375,
      "learning_rate": 4.550379689684431e-06,
      "loss": 0.46,
      "step": 550
    },
    {
      "epoch": 0.28,
      "grad_norm": 2.296875,
      "learning_rate": 4.5251041011144905e-06,
      "loss": 0.4445,
      "step": 560
    },
    {
      "epoch": 0.285,
      "grad_norm": 2.203125,
      "learning_rate": 4.4992116462177274e-06,
      "loss": 0.4826,
      "step": 570
    },
    {
      "epoch": 0.29,
      "grad_norm": 1.765625,
      "learning_rate": 4.4727102120867274e-06,
      "loss": 0.4227,
      "step": 580
    },
    {
      "epoch": 0.295,
      "grad_norm": 1.5546875,
      "learning_rate": 4.445607871315053e-06,
      "loss": 0.4412,
      "step": 590
    },
    {
      "epoch": 0.3,
      "grad_norm": 2.46875,
      "learning_rate": 4.41791287953825e-06,
      "loss": 0.457,
      "step": 600
    },
    {
      "epoch": 0.305,
      "grad_norm": 1.8515625,
      "learning_rate": 4.389633672919099e-06,
      "loss": 0.4262,
      "step": 610
    },
    {
      "epoch": 0.31,
      "grad_norm": 2.3125,
      "learning_rate": 4.360778865577885e-06,
      "loss": 0.466,
      "step": 620
    },
    {
      "epoch": 0.315,
      "grad_norm": 1.65625,
      "learning_rate": 4.331357246968447e-06,
      "loss": 0.4111,
      "step": 630
    },
    {
      "epoch": 0.32,
      "grad_norm": 2.15625,
      "learning_rate": 4.301377779200826e-06,
      "loss": 0.476,
      "step": 640
    },
    {
      "epoch": 0.325,
      "grad_norm": 2.234375,
      "learning_rate": 4.270849594311323e-06,
      "loss": 0.4396,
      "step": 650
    },
    {
      "epoch": 0.33,
      "grad_norm": 2.71875,
      "learning_rate": 4.239781991480786e-06,
      "loss": 0.4193,
      "step": 660
    },
    {
      "epoch": 0.335,
      "grad_norm": 2.640625,
      "learning_rate": 4.208184434201999e-06,
      "loss": 0.4551,
      "step": 670
    },
    {
      "epoch": 0.34,
      "grad_norm": 2.234375,
      "learning_rate": 4.176066547396998e-06,
      "loss": 0.4352,
      "step": 680
    },
    {
      "epoch": 0.345,
      "grad_norm": 2.203125,
      "learning_rate": 4.14343811448524e-06,
      "loss": 0.4195,
      "step": 690
    },
    {
      "epoch": 0.35,
      "grad_norm": 2.1875,
      "learning_rate": 4.110309074403467e-06,
      "loss": 0.4426,
      "step": 700
    },
    {
      "epoch": 0.355,
      "grad_norm": 2.9375,
      "learning_rate": 4.076689518578217e-06,
      "loss": 0.434,
      "step": 710
    },
    {
      "epoch": 0.36,
      "grad_norm": 2.0625,
      "learning_rate": 4.0425896878518725e-06,
      "loss": 0.4283,
      "step": 720
    },
    {
      "epoch": 0.365,
      "grad_norm": 1.90625,
      "learning_rate": 4.008019969363206e-06,
      "loss": 0.4023,
      "step": 730
    },
    {
      "epoch": 0.37,
      "grad_norm": 1.859375,
      "learning_rate": 3.972990893383356e-06,
      "loss": 0.3877,
      "step": 740
    },
    {
      "epoch": 0.375,
      "grad_norm": 1.9140625,
      "learning_rate": 3.9375131301081974e-06,
      "loss": 0.4492,
      "step": 750
    },
    {
      "epoch": 0.38,
      "grad_norm": 2.296875,
      "learning_rate": 3.901597486408105e-06,
      "loss": 0.4469,
      "step": 760
    },
    {
      "epoch": 0.385,
      "grad_norm": 2.140625,
      "learning_rate": 3.865254902536073e-06,
      "loss": 0.4564,
      "step": 770
    },
    {
      "epoch": 0.39,
      "grad_norm": 1.7109375,
      "learning_rate": 3.828496448795208e-06,
      "loss": 0.416,
      "step": 780
    },
    {
      "epoch": 0.395,
      "grad_norm": 2.59375,
      "learning_rate": 3.791333322166605e-06,
      "loss": 0.4373,
      "step": 790
    },
    {
      "epoch": 0.4,
      "grad_norm": 2.4375,
      "learning_rate": 3.753776842898644e-06,
      "loss": 0.4203,
      "step": 800
    },
    {
      "epoch": 0.405,
      "grad_norm": 4.6875,
      "learning_rate": 3.7158384510587264e-06,
      "loss": 0.4467,
      "step": 810
    },
    {
      "epoch": 0.41,
      "grad_norm": 2.078125,
      "learning_rate": 3.677529703048525e-06,
      "loss": 0.4469,
      "step": 820
    },
    {
      "epoch": 0.415,
      "grad_norm": 1.7265625,
      "learning_rate": 3.6388622680837893e-06,
      "loss": 0.4213,
      "step": 830
    },
    {
      "epoch": 0.42,
      "grad_norm": 2.1875,
      "learning_rate": 3.599847924639788e-06,
      "loss": 0.4578,
      "step": 840
    },
    {
      "epoch": 0.425,
      "grad_norm": 2.375,
      "learning_rate": 3.5604985568634754e-06,
      "loss": 0.4281,
      "step": 850
    },
    {
      "epoch": 0.43,
      "grad_norm": 2.171875,
      "learning_rate": 3.5208261509534627e-06,
      "loss": 0.4145,
      "step": 860
    },
    {
      "epoch": 0.435,
      "grad_norm": 2.125,
      "learning_rate": 3.480842791508904e-06,
      "loss": 0.468,
      "step": 870
    },
    {
      "epoch": 0.44,
      "grad_norm": 1.8828125,
      "learning_rate": 3.440560657848414e-06,
      "loss": 0.4195,
      "step": 880
    },
    {
      "epoch": 0.445,
      "grad_norm": 1.8984375,
      "learning_rate": 3.3999920203001287e-06,
      "loss": 0.4541,
      "step": 890
    },
    {
      "epoch": 0.45,
      "grad_norm": 1.78125,
      "learning_rate": 3.359149236464041e-06,
      "loss": 0.4076,
      "step": 900
    },
    {
      "epoch": 0.455,
      "grad_norm": 2.828125,
      "learning_rate": 3.31804474744776e-06,
      "loss": 0.4088,
      "step": 910
    },
    {
      "epoch": 0.46,
      "grad_norm": 1.7109375,
      "learning_rate": 3.27669107407683e-06,
      "loss": 0.4096,
      "step": 920
    },
    {
      "epoch": 0.465,
      "grad_norm": 2.84375,
      "learning_rate": 3.23510081308076e-06,
      "loss": 0.4359,
      "step": 930
    },
    {
      "epoch": 0.47,
      "grad_norm": 2.59375,
      "learning_rate": 3.1932866332559455e-06,
      "loss": 0.418,
      "step": 940
    },
    {
      "epoch": 0.475,
      "grad_norm": 2.34375,
      "learning_rate": 3.1512612716066217e-06,
      "loss": 0.4348,
      "step": 950
    },
    {
      "epoch": 0.48,
      "grad_norm": 1.8359375,
      "learning_rate": 3.1090375294650565e-06,
      "loss": 0.4035,
      "step": 960
    },
    {
      "epoch": 0.485,
      "grad_norm": 2.0,
      "learning_rate": 3.066628268592138e-06,
      "loss": 0.4248,
      "step": 970
    },
    {
      "epoch": 0.49,
      "grad_norm": 1.984375,
      "learning_rate": 3.0240464072595547e-06,
      "loss": 0.4254,
      "step": 980
    },
    {
      "epoch": 0.495,
      "grad_norm": 2.203125,
      "learning_rate": 2.981304916314769e-06,
      "loss": 0.4197,
      "step": 990
    },
    {
      "epoch": 0.5,
      "grad_norm": 1.6328125,
      "learning_rate": 2.938416815229968e-06,
      "loss": 0.4479,
      "step": 1000
    },
    {
      "epoch": 0.505,
      "grad_norm": 1.765625,
      "learning_rate": 2.8953951681362098e-06,
      "loss": 0.4039,
      "step": 1010
    },
    {
      "epoch": 0.51,
      "grad_norm": 2.578125,
      "learning_rate": 2.852253079843957e-06,
      "loss": 0.4723,
      "step": 1020
    },
    {
      "epoch": 0.515,
      "grad_norm": 2.6875,
      "learning_rate": 2.809003691851232e-06,
      "loss": 0.423,
      "step": 1030
    },
    {
      "epoch": 0.52,
      "grad_norm": 1.7578125,
      "learning_rate": 2.7656601783405833e-06,
      "loss": 0.4002,
      "step": 1040
    },
    {
      "epoch": 0.525,
      "grad_norm": 2.34375,
      "learning_rate": 2.7222357421661042e-06,
      "loss": 0.426,
      "step": 1050
    },
    {
      "epoch": 0.53,
      "grad_norm": 1.6875,
      "learning_rate": 2.678743610831715e-06,
      "loss": 0.4447,
      "step": 1060
    },
    {
      "epoch": 0.535,
      "grad_norm": 1.8203125,
      "learning_rate": 2.635197032461939e-06,
      "loss": 0.4324,
      "step": 1070
    },
    {
      "epoch": 0.54,
      "grad_norm": 2.03125,
      "learning_rate": 2.591609271766391e-06,
      "loss": 0.3877,
      "step": 1080
    },
    {
      "epoch": 0.545,
      "grad_norm": 1.8125,
      "learning_rate": 2.547993605999225e-06,
      "loss": 0.3949,
      "step": 1090
    },
    {
      "epoch": 0.55,
      "grad_norm": 2.0625,
      "learning_rate": 2.504363320914746e-06,
      "loss": 0.4428,
      "step": 1100
    },
    {
      "epoch": 0.555,
      "grad_norm": 2.03125,
      "learning_rate": 2.460731706720449e-06,
      "loss": 0.4383,
      "step": 1110
    },
    {
      "epoch": 0.56,
      "grad_norm": 2.140625,
      "learning_rate": 2.4171120540286848e-06,
      "loss": 0.4244,
      "step": 1120
    },
    {
      "epoch": 0.565,
      "grad_norm": 2.890625,
      "learning_rate": 2.373517649808217e-06,
      "loss": 0.4281,
      "step": 1130
    },
    {
      "epoch": 0.57,
      "grad_norm": 2.109375,
      "learning_rate": 2.3299617733368805e-06,
      "loss": 0.4238,
      "step": 1140
    },
    {
      "epoch": 0.575,
      "grad_norm": 2.015625,
      "learning_rate": 2.2864576921565816e-06,
      "loss": 0.45,
      "step": 1150
    },
    {
      "epoch": 0.58,
      "grad_norm": 1.9296875,
      "learning_rate": 2.2430186580318833e-06,
      "loss": 0.4137,
      "step": 1160
    },
    {
      "epoch": 0.585,
      "grad_norm": 1.96875,
      "learning_rate": 2.1996579029133826e-06,
      "loss": 0.402,
      "step": 1170
    },
    {
      "epoch": 0.59,
      "grad_norm": 2.453125,
      "learning_rate": 2.156388634907134e-06,
      "loss": 0.4365,
      "step": 1180
    },
    {
      "epoch": 0.595,
      "grad_norm": 1.7421875,
      "learning_rate": 2.1132240342513304e-06,
      "loss": 0.4355,
      "step": 1190
    },
    {
      "epoch": 0.6,
      "grad_norm": 1.6875,
      "learning_rate": 2.070177249301476e-06,
      "loss": 0.4039,
      "step": 1200
    },
    {
      "epoch": 0.605,
      "grad_norm": 1.734375,
      "learning_rate": 2.0272613925252716e-06,
      "loss": 0.426,
      "step": 1210
    },
    {
      "epoch": 0.61,
      "grad_norm": 1.96875,
      "learning_rate": 1.9844895365084264e-06,
      "loss": 0.4178,
      "step": 1220
    },
    {
      "epoch": 0.615,
      "grad_norm": 2.0625,
      "learning_rate": 1.941874709972622e-06,
      "loss": 0.4367,
      "step": 1230
    },
    {
      "epoch": 0.62,
      "grad_norm": 1.9921875,
      "learning_rate": 1.899429893806841e-06,
      "loss": 0.4021,
      "step": 1240
    },
    {
      "epoch": 0.625,
      "grad_norm": 2.046875,
      "learning_rate": 1.8571680171132603e-06,
      "loss": 0.4404,
      "step": 1250
    },
    {
      "epoch": 0.63,
      "grad_norm": 2.578125,
      "learning_rate": 1.815101953268919e-06,
      "loss": 0.4143,
      "step": 1260
    },
    {
      "epoch": 0.635,
      "grad_norm": 2.359375,
      "learning_rate": 1.7732445160043687e-06,
      "loss": 0.4301,
      "step": 1270
    },
    {
      "epoch": 0.64,
      "grad_norm": 1.984375,
      "learning_rate": 1.7316084555004825e-06,
      "loss": 0.4666,
      "step": 1280
    },
    {
      "epoch": 0.645,
      "grad_norm": 2.46875,
      "learning_rate": 1.6902064545046271e-06,
      "loss": 0.4102,
      "step": 1290
    },
    {
      "epoch": 0.65,
      "grad_norm": 2.546875,
      "learning_rate": 1.6490511244673752e-06,
      "loss": 0.4322,
      "step": 1300
    },
    {
      "epoch": 0.655,
      "grad_norm": 1.8046875,
      "learning_rate": 1.6081550017009368e-06,
      "loss": 0.4273,
      "step": 1310
    },
    {
      "epoch": 0.66,
      "grad_norm": 2.421875,
      "learning_rate": 1.5675305435604776e-06,
      "loss": 0.424,
      "step": 1320
    },
    {
      "epoch": 0.665,
      "grad_norm": 2.140625,
      "learning_rate": 1.5271901246494847e-06,
      "loss": 0.4307,
      "step": 1330
    },
    {
      "epoch": 0.67,
      "grad_norm": 2.109375,
      "learning_rate": 1.487146033050344e-06,
      "loss": 0.4164,
      "step": 1340
    },
    {
      "epoch": 0.675,
      "grad_norm": 2.375,
      "learning_rate": 1.4474104665812727e-06,
      "loss": 0.4736,
      "step": 1350
    },
    {
      "epoch": 0.68,
      "grad_norm": 2.0,
      "learning_rate": 1.4079955290807452e-06,
      "loss": 0.4191,
      "step": 1360
    },
    {
      "epoch": 0.685,
      "grad_norm": 2.359375,
      "learning_rate": 1.3689132267205432e-06,
      "loss": 0.4023,
      "step": 1370
    },
    {
      "epoch": 0.69,
      "grad_norm": 2.21875,
      "learning_rate": 1.3301754643485671e-06,
      "loss": 0.4332,
      "step": 1380
    },
    {
      "epoch": 0.695,
      "grad_norm": 2.265625,
      "learning_rate": 1.2917940418624942e-06,
      "loss": 0.427,
      "step": 1390
    },
    {
      "epoch": 0.7,
      "grad_norm": 2.09375,
      "learning_rate": 1.2537806506154246e-06,
      "loss": 0.4205,
      "step": 1400
    },
    {
      "epoch": 0.705,
      "grad_norm": 2.53125,
      "learning_rate": 1.2161468698545755e-06,
      "loss": 0.4221,
      "step": 1410
    },
    {
      "epoch": 0.71,
      "grad_norm": 2.421875,
      "learning_rate": 1.1789041631941326e-06,
      "loss": 0.4084,
      "step": 1420
    },
    {
      "epoch": 0.715,
      "grad_norm": 1.90625,
      "learning_rate": 1.142063875123323e-06,
      "loss": 0.4139,
      "step": 1430
    },
    {
      "epoch": 0.72,
      "grad_norm": 1.796875,
      "learning_rate": 1.1056372275507748e-06,
      "loss": 0.3836,
      "step": 1440
    },
    {
      "epoch": 0.725,
      "grad_norm": 1.6953125,
      "learning_rate": 1.06963531638621e-06,
      "loss": 0.3801,
      "step": 1450
    },
    {
      "epoch": 0.73,
      "grad_norm": 1.8515625,
      "learning_rate": 1.0340691081605267e-06,
      "loss": 0.4252,
      "step": 1460
    },
    {
      "epoch": 0.735,
      "grad_norm": 2.765625,
      "learning_rate": 9.989494366852904e-07,
      "loss": 0.4455,
      "step": 1470
    },
    {
      "epoch": 0.74,
      "grad_norm": 1.8203125,
      "learning_rate": 9.64286999752642e-07,
      "loss": 0.4498,
      "step": 1480
    },
    {
      "epoch": 0.745,
      "grad_norm": 1.6171875,
      "learning_rate": 9.300923558766556e-07,
      "loss": 0.3969,
      "step": 1490
    },
    {
      "epoch": 0.75,
      "grad_norm": 1.7578125,
      "learning_rate": 8.963759210771053e-07,
      "loss": 0.423,
      "step": 1500
    },
    {
      "epoch": 0.755,
      "grad_norm": 2.125,
      "learning_rate": 8.631479657066508e-07,
      "loss": 0.393,
      "step": 1510
    },
    {
      "epoch": 0.76,
      "grad_norm": 1.984375,
      "learning_rate": 8.304186113223839e-07,
      "loss": 0.4291,
      "step": 1520
    },
    {
      "epoch": 0.765,
      "grad_norm": 2.5,
      "learning_rate": 7.981978276027055e-07,
      "loss": 0.4395,
      "step": 1530
    },
    {
      "epoch": 0.77,
      "grad_norm": 1.8671875,
      "learning_rate": 7.664954293104674e-07,
      "loss": 0.4242,
      "step": 1540
    },
    {
      "epoch": 0.775,
      "grad_norm": 2.3125,
      "learning_rate": 7.353210733032976e-07,
      "loss": 0.4441,
      "step": 1550
    },
    {
      "epoch": 0.78,
      "grad_norm": 3.234375,
      "learning_rate": 7.046842555920283e-07,
      "loss": 0.4115,
      "step": 1560
    },
    {
      "epoch": 0.785,
      "grad_norm": 1.6171875,
      "learning_rate": 6.74594308448119e-07,
      "loss": 0.4098,
      "step": 1570
    },
    {
      "epoch": 0.79,
      "grad_norm": 2.40625,
      "learning_rate": 6.450603975609593e-07,
      "loss": 0.4529,
      "step": 1580
    },
    {
      "epoch": 0.795,
      "grad_norm": 2.265625,
      "learning_rate": 6.160915192459058e-07,
      "loss": 0.4076,
      "step": 1590
    },
    {
      "epoch": 0.8,
      "grad_norm": 2.140625,
      "learning_rate": 5.876964977039207e-07,
      "loss": 0.4283,
      "step": 1600
    },
    {
      "epoch": 0.805,
      "grad_norm": 1.796875,
      "learning_rate": 5.598839823336349e-07,
      "loss": 0.4104,
      "step": 1610
    },
    {
      "epoch": 0.81,
      "grad_norm": 2.015625,
      "learning_rate": 5.32662445096657e-07,
      "loss": 0.3916,
      "step": 1620
    },
    {
      "epoch": 0.815,
      "grad_norm": 2.03125,
      "learning_rate": 5.060401779369292e-07,
      "loss": 0.4215,
      "step": 1630
    },
    {
      "epoch": 0.82,
      "grad_norm": 1.7578125,
      "learning_rate": 4.800252902549243e-07,
      "loss": 0.4039,
      "step": 1640
    },
    {
      "epoch": 0.825,
      "grad_norm": 3.0,
      "learning_rate": 4.54625706437441e-07,
      "loss": 0.4115,
      "step": 1650
    },
    {
      "epoch": 0.83,
      "grad_norm": 1.5703125,
      "learning_rate": 4.2984916344376404e-07,
      "loss": 0.4311,
      "step": 1660
    },
    {
      "epoch": 0.835,
      "grad_norm": 2.03125,
      "learning_rate": 4.057032084489032e-07,
      "loss": 0.3963,
      "step": 1670
    },
    {
      "epoch": 0.84,
      "grad_norm": 1.7734375,
      "learning_rate": 3.821951965446577e-07,
      "loss": 0.4145,
      "step": 1680
    },
    {
      "epoch": 0.845,
      "grad_norm": 1.828125,
      "learning_rate": 3.5933228849917956e-07,
      "loss": 0.4207,
      "step": 1690
    },
    {
      "epoch": 0.85,
      "grad_norm": 2.03125,
      "learning_rate": 3.371214485757393e-07,
      "loss": 0.4111,
      "step": 1700
    },
    {
      "epoch": 0.855,
      "grad_norm": 1.890625,
      "learning_rate": 3.1556944241133704e-07,
      "loss": 0.4266,
      "step": 1710
    },
    {
      "epoch": 0.86,
      "grad_norm": 2.15625,
      "learning_rate": 2.946828349558309e-07,
      "loss": 0.4121,
      "step": 1720
    },
    {
      "epoch": 0.865,
      "grad_norm": 3.34375,
      "learning_rate": 2.7446798847218376e-07,
      "loss": 0.4457,
      "step": 1730
    },
    {
      "epoch": 0.87,
      "grad_norm": 1.796875,
      "learning_rate": 2.549310605984612e-07,
      "loss": 0.4207,
      "step": 1740
    },
    {
      "epoch": 0.875,
      "grad_norm": 1.7109375,
      "learning_rate": 2.360780024721515e-07,
      "loss": 0.4021,
      "step": 1750
    },
    {
      "epoch": 0.88,
      "grad_norm": 2.0625,
      "learning_rate": 2.1791455691739323e-07,
      "loss": 0.4209,
      "step": 1760
    },
    {
      "epoch": 0.885,
      "grad_norm": 2.140625,
      "learning_rate": 2.0044625669565582e-07,
      "loss": 0.448,
      "step": 1770
    },
    {
      "epoch": 0.89,
      "grad_norm": 1.546875,
      "learning_rate": 1.8367842282040692e-07,
      "loss": 0.4229,
      "step": 1780
    },
    {
      "epoch": 0.895,
      "grad_norm": 2.640625,
      "learning_rate": 1.676161629362777e-07,
      "loss": 0.3854,
      "step": 1790
    },
    {
      "epoch": 0.9,
      "grad_norm": 1.65625,
      "learning_rate": 1.5226436976322728e-07,
      "loss": 0.3898,
      "step": 1800
    },
    {
      "epoch": 0.905,
      "grad_norm": 1.765625,
      "learning_rate": 1.3762771960617315e-07,
      "loss": 0.4404,
      "step": 1810
    },
    {
      "epoch": 0.91,
      "grad_norm": 2.265625,
      "learning_rate": 1.237106709305408e-07,
      "loss": 0.4217,
      "step": 1820
    },
    {
      "epoch": 0.915,
      "grad_norm": 1.8984375,
      "learning_rate": 1.105174630041747e-07,
      "loss": 0.4096,
      "step": 1830
    },
    {
      "epoch": 0.92,
      "grad_norm": 2.3125,
      "learning_rate": 9.805211460601455e-08,
      "loss": 0.434,
      "step": 1840
    },
    {
      "epoch": 0.925,
      "grad_norm": 1.6640625,
      "learning_rate": 8.631842280193759e-08,
      "loss": 0.4104,
      "step": 1850
    },
    {
      "epoch": 0.93,
      "grad_norm": 3.359375,
      "learning_rate": 7.531996178813311e-08,
      "loss": 0.4564,
      "step": 1860
    },
    {
      "epoch": 0.935,
      "grad_norm": 1.734375,
      "learning_rate": 6.506008180237111e-08,
      "loss": 0.4031,
      "step": 1870
    },
    {
      "epoch": 0.94,
      "grad_norm": 2.03125,
      "learning_rate": 5.554190810348442e-08,
      "loss": 0.4025,
      "step": 1880
    },
    {
      "epoch": 0.945,
      "grad_norm": 2.265625,
      "learning_rate": 4.676834001938718e-08,
      "loss": 0.4193,
      "step": 1890
    },
    {
      "epoch": 0.95,
      "grad_norm": 1.7421875,
      "learning_rate": 3.874205006390852e-08,
      "loss": 0.4361,
      "step": 1900
    },
    {
      "epoch": 0.955,
      "grad_norm": 2.125,
      "learning_rate": 3.146548312272152e-08,
      "loss": 0.4219,
      "step": 1910
    },
    {
      "epoch": 0.96,
      "grad_norm": 2.609375,
      "learning_rate": 2.494085570860616e-08,
      "loss": 0.4297,
      "step": 1920
    },
    {
      "epoch": 0.965,
      "grad_norm": 2.28125,
      "learning_rate": 1.91701552862783e-08,
      "loss": 0.4148,
      "step": 1930
    },
    {
      "epoch": 0.97,
      "grad_norm": 1.734375,
      "learning_rate": 1.4155139666988393e-08,
      "loss": 0.4313,
      "step": 1940
    },
    {
      "epoch": 0.975,
      "grad_norm": 1.671875,
      "learning_rate": 9.897336473076168e-09,
      "loss": 0.4064,
      "step": 1950
    },
    {
      "epoch": 0.98,
      "grad_norm": 2.34375,
      "learning_rate": 6.398042672640104e-09,
      "loss": 0.4498,
      "step": 1960
    },
    {
      "epoch": 0.985,
      "grad_norm": 1.78125,
      "learning_rate": 3.6583241844706517e-09,
      "loss": 0.4023,
      "step": 1970
    },
    {
      "epoch": 0.99,
      "grad_norm": 2.421875,
      "learning_rate": 1.6790155533594198e-09,
      "loss": 0.4588,
      "step": 1980
    },
    {
      "epoch": 0.995,
      "grad_norm": 1.7734375,
      "learning_rate": 4.6071969588945555e-10,
      "loss": 0.4533,
      "step": 1990
    },
    {
      "epoch": 1.0,
      "grad_norm": 1.890625,
      "learning_rate": 3.807716780768189e-12,
      "loss": 0.4279,
      "step": 2000
    }
  ],
  "logging_steps": 10,
  "max_steps": 2000,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 9223372036854775807,
  "save_steps": 4000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.91328310788096e+17,
  "train_batch_size": 32,
  "trial_name": null,
  "trial_params": null
}