|
{ |
|
"best_metric": 1.0273813009262085, |
|
"best_model_checkpoint": "./model_fine-tune/glot/xlm-r/dan-Latn/checkpoint-98500", |
|
"epoch": 10.286131996658312, |
|
"eval_steps": 500, |
|
"global_step": 98500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.052213868003341685, |
|
"grad_norm": 5.384862422943115, |
|
"learning_rate": 9.95e-05, |
|
"loss": 1.4559, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.052213868003341685, |
|
"eval_accuracy": 0.7296120738961336, |
|
"eval_loss": 1.6221407651901245, |
|
"eval_runtime": 620.8615, |
|
"eval_samples_per_second": 102.185, |
|
"eval_steps_per_second": 3.194, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.10442773600668337, |
|
"grad_norm": 4.473601818084717, |
|
"learning_rate": 9.900000000000001e-05, |
|
"loss": 1.4355, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.10442773600668337, |
|
"eval_accuracy": 0.7314203520980975, |
|
"eval_loss": 1.563644289970398, |
|
"eval_runtime": 631.0902, |
|
"eval_samples_per_second": 100.529, |
|
"eval_steps_per_second": 3.142, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.15664160401002505, |
|
"grad_norm": 4.604930877685547, |
|
"learning_rate": 9.850000000000001e-05, |
|
"loss": 1.4326, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.15664160401002505, |
|
"eval_accuracy": 0.732858138551035, |
|
"eval_loss": 1.538730502128601, |
|
"eval_runtime": 627.1459, |
|
"eval_samples_per_second": 101.161, |
|
"eval_steps_per_second": 3.162, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.20885547201336674, |
|
"grad_norm": 5.0758867263793945, |
|
"learning_rate": 9.8e-05, |
|
"loss": 1.4082, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.20885547201336674, |
|
"eval_accuracy": 0.7337186853777712, |
|
"eval_loss": 1.514768123626709, |
|
"eval_runtime": 623.7693, |
|
"eval_samples_per_second": 101.709, |
|
"eval_steps_per_second": 3.179, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.26106934001670845, |
|
"grad_norm": 4.955592155456543, |
|
"learning_rate": 9.75e-05, |
|
"loss": 1.4066, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.26106934001670845, |
|
"eval_accuracy": 0.7373838924395351, |
|
"eval_loss": 1.4860447645187378, |
|
"eval_runtime": 626.6763, |
|
"eval_samples_per_second": 101.237, |
|
"eval_steps_per_second": 3.164, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.3132832080200501, |
|
"grad_norm": 4.4695611000061035, |
|
"learning_rate": 9.7e-05, |
|
"loss": 1.3965, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.3132832080200501, |
|
"eval_accuracy": 0.7386060798818527, |
|
"eval_loss": 1.4720885753631592, |
|
"eval_runtime": 626.8807, |
|
"eval_samples_per_second": 101.204, |
|
"eval_steps_per_second": 3.163, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.3654970760233918, |
|
"grad_norm": 4.595192909240723, |
|
"learning_rate": 9.65e-05, |
|
"loss": 1.3674, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.3654970760233918, |
|
"eval_accuracy": 0.7389515252696706, |
|
"eval_loss": 1.466160774230957, |
|
"eval_runtime": 626.2854, |
|
"eval_samples_per_second": 101.3, |
|
"eval_steps_per_second": 3.166, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.4177109440267335, |
|
"grad_norm": 4.489499092102051, |
|
"learning_rate": 9.6e-05, |
|
"loss": 1.3789, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.4177109440267335, |
|
"eval_accuracy": 0.7408971720895672, |
|
"eval_loss": 1.4427233934402466, |
|
"eval_runtime": 628.8309, |
|
"eval_samples_per_second": 100.89, |
|
"eval_steps_per_second": 3.153, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.4699248120300752, |
|
"grad_norm": 4.887514114379883, |
|
"learning_rate": 9.55e-05, |
|
"loss": 1.3677, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.4699248120300752, |
|
"eval_accuracy": 0.7418176415206207, |
|
"eval_loss": NaN, |
|
"eval_runtime": 626.3841, |
|
"eval_samples_per_second": 101.285, |
|
"eval_steps_per_second": 3.166, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.5221386800334169, |
|
"grad_norm": 4.255401611328125, |
|
"learning_rate": 9.5e-05, |
|
"loss": 1.3591, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.5221386800334169, |
|
"eval_accuracy": 0.7420801988701987, |
|
"eval_loss": 1.4378492832183838, |
|
"eval_runtime": 628.3584, |
|
"eval_samples_per_second": 100.966, |
|
"eval_steps_per_second": 3.156, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.5743525480367586, |
|
"grad_norm": 3.9552218914031982, |
|
"learning_rate": 9.449999999999999e-05, |
|
"loss": 1.3528, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.5743525480367586, |
|
"eval_accuracy": 0.7430785963224286, |
|
"eval_loss": 1.4063905477523804, |
|
"eval_runtime": 630.6125, |
|
"eval_samples_per_second": 100.605, |
|
"eval_steps_per_second": 3.145, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.6265664160401002, |
|
"grad_norm": 4.043545246124268, |
|
"learning_rate": 9.4e-05, |
|
"loss": 1.3467, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.6265664160401002, |
|
"eval_accuracy": 0.7439164901309249, |
|
"eval_loss": 1.4179232120513916, |
|
"eval_runtime": 627.0519, |
|
"eval_samples_per_second": 101.177, |
|
"eval_steps_per_second": 3.162, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.6787802840434419, |
|
"grad_norm": 65.49678802490234, |
|
"learning_rate": 9.350000000000001e-05, |
|
"loss": 1.3578, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.6787802840434419, |
|
"eval_accuracy": 0.7436562018580148, |
|
"eval_loss": 1.4101091623306274, |
|
"eval_runtime": 631.9563, |
|
"eval_samples_per_second": 100.391, |
|
"eval_steps_per_second": 3.138, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.7309941520467836, |
|
"grad_norm": 3.9676353931427, |
|
"learning_rate": 9.300000000000001e-05, |
|
"loss": 1.3377, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.7309941520467836, |
|
"eval_accuracy": 0.7452273902409787, |
|
"eval_loss": 1.3957942724227905, |
|
"eval_runtime": 625.612, |
|
"eval_samples_per_second": 101.41, |
|
"eval_steps_per_second": 3.17, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.7832080200501254, |
|
"grad_norm": 3.6173226833343506, |
|
"learning_rate": 9.250000000000001e-05, |
|
"loss": 1.3303, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.7832080200501254, |
|
"eval_accuracy": 0.7475488951341268, |
|
"eval_loss": 1.3813687562942505, |
|
"eval_runtime": 627.0109, |
|
"eval_samples_per_second": 101.183, |
|
"eval_steps_per_second": 3.163, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.835421888053467, |
|
"grad_norm": 3.980341911315918, |
|
"learning_rate": 9.200000000000001e-05, |
|
"loss": 1.3221, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.835421888053467, |
|
"eval_accuracy": 0.7475162804135862, |
|
"eval_loss": 1.3788014650344849, |
|
"eval_runtime": 625.2826, |
|
"eval_samples_per_second": 101.463, |
|
"eval_steps_per_second": 3.171, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.8876357560568087, |
|
"grad_norm": 3.367668390274048, |
|
"learning_rate": 9.15e-05, |
|
"loss": 1.3273, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 0.8876357560568087, |
|
"eval_accuracy": 0.7486955192165756, |
|
"eval_loss": 1.3720810413360596, |
|
"eval_runtime": 626.1965, |
|
"eval_samples_per_second": 101.315, |
|
"eval_steps_per_second": 3.167, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 0.9398496240601504, |
|
"grad_norm": 3.82468843460083, |
|
"learning_rate": 9.1e-05, |
|
"loss": 1.3104, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.9398496240601504, |
|
"eval_accuracy": 0.7470319869853385, |
|
"eval_loss": 1.3732112646102905, |
|
"eval_runtime": 630.8001, |
|
"eval_samples_per_second": 100.575, |
|
"eval_steps_per_second": 3.144, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.9920634920634921, |
|
"grad_norm": 3.946758270263672, |
|
"learning_rate": 9.05e-05, |
|
"loss": 1.3158, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 0.9920634920634921, |
|
"eval_accuracy": 0.7499583551090239, |
|
"eval_loss": 1.3603472709655762, |
|
"eval_runtime": 652.6391, |
|
"eval_samples_per_second": 97.21, |
|
"eval_steps_per_second": 3.038, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 1.0442773600668338, |
|
"grad_norm": 3.916361093521118, |
|
"learning_rate": 9e-05, |
|
"loss": 1.2953, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 1.0442773600668338, |
|
"eval_accuracy": 0.7500646093431366, |
|
"eval_loss": 1.3458328247070312, |
|
"eval_runtime": 626.4497, |
|
"eval_samples_per_second": 101.274, |
|
"eval_steps_per_second": 3.165, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 1.0964912280701755, |
|
"grad_norm": 3.9930801391601562, |
|
"learning_rate": 8.950000000000001e-05, |
|
"loss": 1.2932, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 1.0964912280701755, |
|
"eval_accuracy": 0.7504904150215282, |
|
"eval_loss": 1.3514128923416138, |
|
"eval_runtime": 630.1314, |
|
"eval_samples_per_second": 100.682, |
|
"eval_steps_per_second": 3.147, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 1.1487050960735172, |
|
"grad_norm": 4.74911642074585, |
|
"learning_rate": 8.900000000000001e-05, |
|
"loss": 1.2944, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 1.1487050960735172, |
|
"eval_accuracy": 0.7512302865503326, |
|
"eval_loss": 1.3358726501464844, |
|
"eval_runtime": 631.5659, |
|
"eval_samples_per_second": 100.453, |
|
"eval_steps_per_second": 3.14, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 1.2009189640768587, |
|
"grad_norm": 3.7969741821289062, |
|
"learning_rate": 8.850000000000001e-05, |
|
"loss": 1.2853, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 1.2009189640768587, |
|
"eval_accuracy": 0.7516760587886216, |
|
"eval_loss": 1.3410056829452515, |
|
"eval_runtime": 631.8461, |
|
"eval_samples_per_second": 100.409, |
|
"eval_steps_per_second": 3.138, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 1.2531328320802004, |
|
"grad_norm": 4.402622699737549, |
|
"learning_rate": 8.800000000000001e-05, |
|
"loss": 1.2904, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 1.2531328320802004, |
|
"eval_accuracy": 0.7529186511157855, |
|
"eval_loss": 1.3296958208084106, |
|
"eval_runtime": 625.8019, |
|
"eval_samples_per_second": 101.379, |
|
"eval_steps_per_second": 3.169, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 1.3053467000835421, |
|
"grad_norm": 4.738062858581543, |
|
"learning_rate": 8.75e-05, |
|
"loss": 1.2768, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 1.3053467000835421, |
|
"eval_accuracy": 0.7535759135206747, |
|
"eval_loss": 1.323601484298706, |
|
"eval_runtime": 625.8627, |
|
"eval_samples_per_second": 101.369, |
|
"eval_steps_per_second": 3.168, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 1.3575605680868839, |
|
"grad_norm": 3.4203574657440186, |
|
"learning_rate": 8.7e-05, |
|
"loss": 1.2702, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 1.3575605680868839, |
|
"eval_accuracy": 0.7543550936371498, |
|
"eval_loss": 1.311880350112915, |
|
"eval_runtime": 632.5695, |
|
"eval_samples_per_second": 100.294, |
|
"eval_steps_per_second": 3.135, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 1.4097744360902256, |
|
"grad_norm": 4.80487060546875, |
|
"learning_rate": 8.65e-05, |
|
"loss": 1.2686, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 1.4097744360902256, |
|
"eval_accuracy": 0.7551145942334886, |
|
"eval_loss": 1.3116217851638794, |
|
"eval_runtime": 622.4069, |
|
"eval_samples_per_second": 101.932, |
|
"eval_steps_per_second": 3.186, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 1.4619883040935673, |
|
"grad_norm": 3.3763253688812256, |
|
"learning_rate": 8.6e-05, |
|
"loss": 1.256, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 1.4619883040935673, |
|
"eval_accuracy": 0.7550032122033714, |
|
"eval_loss": NaN, |
|
"eval_runtime": 623.0788, |
|
"eval_samples_per_second": 101.822, |
|
"eval_steps_per_second": 3.183, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 1.514202172096909, |
|
"grad_norm": 3.4600539207458496, |
|
"learning_rate": 8.55e-05, |
|
"loss": 1.2689, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 1.514202172096909, |
|
"eval_accuracy": 0.7562542149486124, |
|
"eval_loss": 1.2966333627700806, |
|
"eval_runtime": 630.2089, |
|
"eval_samples_per_second": 100.67, |
|
"eval_steps_per_second": 3.147, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 1.5664160401002505, |
|
"grad_norm": 3.6656503677368164, |
|
"learning_rate": 8.5e-05, |
|
"loss": 1.2536, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 1.5664160401002505, |
|
"eval_accuracy": 0.7558193144413351, |
|
"eval_loss": 1.3074419498443604, |
|
"eval_runtime": 633.7511, |
|
"eval_samples_per_second": 100.107, |
|
"eval_steps_per_second": 3.129, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 1.6186299081035922, |
|
"grad_norm": 3.7582554817199707, |
|
"learning_rate": 8.450000000000001e-05, |
|
"loss": 1.262, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 1.6186299081035922, |
|
"eval_accuracy": 0.755967622936909, |
|
"eval_loss": 1.2850651741027832, |
|
"eval_runtime": 626.4542, |
|
"eval_samples_per_second": 101.273, |
|
"eval_steps_per_second": 3.165, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 1.670843776106934, |
|
"grad_norm": 3.8284225463867188, |
|
"learning_rate": 8.4e-05, |
|
"loss": 1.2427, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 1.670843776106934, |
|
"eval_accuracy": 0.7575149255490681, |
|
"eval_loss": 1.2897659540176392, |
|
"eval_runtime": 624.5044, |
|
"eval_samples_per_second": 101.589, |
|
"eval_steps_per_second": 3.175, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 1.7230576441102756, |
|
"grad_norm": 3.8263819217681885, |
|
"learning_rate": 8.35e-05, |
|
"loss": 1.2389, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 1.7230576441102756, |
|
"eval_accuracy": 0.7580776326367493, |
|
"eval_loss": 1.2844797372817993, |
|
"eval_runtime": 627.1556, |
|
"eval_samples_per_second": 101.16, |
|
"eval_steps_per_second": 3.162, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 1.7752715121136173, |
|
"grad_norm": 3.7820005416870117, |
|
"learning_rate": 8.3e-05, |
|
"loss": 1.2368, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 1.7752715121136173, |
|
"eval_accuracy": 0.7585746778078989, |
|
"eval_loss": 1.2964411973953247, |
|
"eval_runtime": 635.3019, |
|
"eval_samples_per_second": 99.863, |
|
"eval_steps_per_second": 3.121, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 1.827485380116959, |
|
"grad_norm": 3.492671012878418, |
|
"learning_rate": 8.25e-05, |
|
"loss": 1.2432, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 1.827485380116959, |
|
"eval_accuracy": 0.7584533967429662, |
|
"eval_loss": NaN, |
|
"eval_runtime": 623.9705, |
|
"eval_samples_per_second": 101.676, |
|
"eval_steps_per_second": 3.178, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 1.8796992481203008, |
|
"grad_norm": 3.714395761489868, |
|
"learning_rate": 8.2e-05, |
|
"loss": 1.2375, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 1.8796992481203008, |
|
"eval_accuracy": 0.7593634528855384, |
|
"eval_loss": NaN, |
|
"eval_runtime": 623.2531, |
|
"eval_samples_per_second": 101.793, |
|
"eval_steps_per_second": 3.182, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 1.9319131161236425, |
|
"grad_norm": 3.700199604034424, |
|
"learning_rate": 8.15e-05, |
|
"loss": 1.2308, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 1.9319131161236425, |
|
"eval_accuracy": 0.7597829603112618, |
|
"eval_loss": 1.2819814682006836, |
|
"eval_runtime": 636.6684, |
|
"eval_samples_per_second": 99.648, |
|
"eval_steps_per_second": 3.115, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 1.9841269841269842, |
|
"grad_norm": 3.390286922454834, |
|
"learning_rate": 8.1e-05, |
|
"loss": 1.2374, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 1.9841269841269842, |
|
"eval_accuracy": 0.7603004154522052, |
|
"eval_loss": 1.2687220573425293, |
|
"eval_runtime": 630.1502, |
|
"eval_samples_per_second": 100.679, |
|
"eval_steps_per_second": 3.147, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 2.036340852130326, |
|
"grad_norm": 4.251028060913086, |
|
"learning_rate": 8.05e-05, |
|
"loss": 1.218, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 2.036340852130326, |
|
"eval_accuracy": 0.760354434691265, |
|
"eval_loss": 1.2784004211425781, |
|
"eval_runtime": 629.6482, |
|
"eval_samples_per_second": 100.759, |
|
"eval_steps_per_second": 3.149, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 2.0885547201336676, |
|
"grad_norm": 3.714747905731201, |
|
"learning_rate": 8e-05, |
|
"loss": 1.2052, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 2.0885547201336676, |
|
"eval_accuracy": 0.7609532576337851, |
|
"eval_loss": 1.2718734741210938, |
|
"eval_runtime": 644.9629, |
|
"eval_samples_per_second": 98.367, |
|
"eval_steps_per_second": 3.075, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 2.1407685881370093, |
|
"grad_norm": 3.934018611907959, |
|
"learning_rate": 7.950000000000001e-05, |
|
"loss": 1.2133, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 2.1407685881370093, |
|
"eval_accuracy": 0.7613764467527375, |
|
"eval_loss": 1.2647244930267334, |
|
"eval_runtime": 631.2806, |
|
"eval_samples_per_second": 100.499, |
|
"eval_steps_per_second": 3.141, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 2.192982456140351, |
|
"grad_norm": 3.6540703773498535, |
|
"learning_rate": 7.900000000000001e-05, |
|
"loss": 1.2072, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 2.192982456140351, |
|
"eval_accuracy": 0.7618574927985354, |
|
"eval_loss": 1.2702383995056152, |
|
"eval_runtime": 628.7442, |
|
"eval_samples_per_second": 100.904, |
|
"eval_steps_per_second": 3.154, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 2.2451963241436927, |
|
"grad_norm": 3.0740270614624023, |
|
"learning_rate": 7.850000000000001e-05, |
|
"loss": 1.2051, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 2.2451963241436927, |
|
"eval_accuracy": 0.7627010896236089, |
|
"eval_loss": 1.272460699081421, |
|
"eval_runtime": 628.6858, |
|
"eval_samples_per_second": 100.914, |
|
"eval_steps_per_second": 3.154, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 2.2974101921470345, |
|
"grad_norm": 4.153682708740234, |
|
"learning_rate": 7.800000000000001e-05, |
|
"loss": 1.1978, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 2.2974101921470345, |
|
"eval_accuracy": 0.7618345683871698, |
|
"eval_loss": 1.2531063556671143, |
|
"eval_runtime": 646.652, |
|
"eval_samples_per_second": 98.11, |
|
"eval_steps_per_second": 3.067, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 2.3496240601503757, |
|
"grad_norm": 3.2469732761383057, |
|
"learning_rate": 7.75e-05, |
|
"loss": 1.2017, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 2.3496240601503757, |
|
"eval_accuracy": 0.7624854045335798, |
|
"eval_loss": NaN, |
|
"eval_runtime": 639.4743, |
|
"eval_samples_per_second": 99.211, |
|
"eval_steps_per_second": 3.101, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 2.4018379281537174, |
|
"grad_norm": 3.4860455989837646, |
|
"learning_rate": 7.7e-05, |
|
"loss": 1.2036, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 2.4018379281537174, |
|
"eval_accuracy": 0.7637592883681051, |
|
"eval_loss": 1.2505515813827515, |
|
"eval_runtime": 626.8199, |
|
"eval_samples_per_second": 101.214, |
|
"eval_steps_per_second": 3.164, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 2.454051796157059, |
|
"grad_norm": 3.0207018852233887, |
|
"learning_rate": 7.65e-05, |
|
"loss": 1.2039, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 2.454051796157059, |
|
"eval_accuracy": 0.7636972843433327, |
|
"eval_loss": 1.2614232301712036, |
|
"eval_runtime": 626.1594, |
|
"eval_samples_per_second": 101.321, |
|
"eval_steps_per_second": 3.167, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 2.506265664160401, |
|
"grad_norm": 2.820202589035034, |
|
"learning_rate": 7.6e-05, |
|
"loss": 1.1931, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 2.506265664160401, |
|
"eval_accuracy": 0.7648100527367832, |
|
"eval_loss": 1.2478718757629395, |
|
"eval_runtime": 654.189, |
|
"eval_samples_per_second": 96.98, |
|
"eval_steps_per_second": 3.031, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 2.5584795321637426, |
|
"grad_norm": 4.114190101623535, |
|
"learning_rate": 7.55e-05, |
|
"loss": 1.1814, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 2.5584795321637426, |
|
"eval_accuracy": 0.7652067630736408, |
|
"eval_loss": 1.2430387735366821, |
|
"eval_runtime": 630.6204, |
|
"eval_samples_per_second": 100.604, |
|
"eval_steps_per_second": 3.145, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 2.6106934001670843, |
|
"grad_norm": 3.205641746520996, |
|
"learning_rate": 7.500000000000001e-05, |
|
"loss": 1.1934, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 2.6106934001670843, |
|
"eval_accuracy": 0.7653472241088651, |
|
"eval_loss": 1.2560120820999146, |
|
"eval_runtime": 633.6789, |
|
"eval_samples_per_second": 100.119, |
|
"eval_steps_per_second": 3.129, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 2.662907268170426, |
|
"grad_norm": 3.286635637283325, |
|
"learning_rate": 7.450000000000001e-05, |
|
"loss": 1.1913, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 2.662907268170426, |
|
"eval_accuracy": 0.7653709204327143, |
|
"eval_loss": 1.2623989582061768, |
|
"eval_runtime": 636.0478, |
|
"eval_samples_per_second": 99.746, |
|
"eval_steps_per_second": 3.118, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 2.7151211361737677, |
|
"grad_norm": 3.4579050540924072, |
|
"learning_rate": 7.4e-05, |
|
"loss": 1.1835, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 2.7151211361737677, |
|
"eval_accuracy": 0.766565753562764, |
|
"eval_loss": 1.2262078523635864, |
|
"eval_runtime": 628.0716, |
|
"eval_samples_per_second": 101.012, |
|
"eval_steps_per_second": 3.157, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 2.7673350041771094, |
|
"grad_norm": 3.3125433921813965, |
|
"learning_rate": 7.35e-05, |
|
"loss": 1.1821, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 2.7673350041771094, |
|
"eval_accuracy": 0.7665199419874659, |
|
"eval_loss": 1.249574065208435, |
|
"eval_runtime": 626.5772, |
|
"eval_samples_per_second": 101.253, |
|
"eval_steps_per_second": 3.165, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 2.819548872180451, |
|
"grad_norm": 3.5980172157287598, |
|
"learning_rate": 7.3e-05, |
|
"loss": 1.1879, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 2.819548872180451, |
|
"eval_accuracy": 0.7665586654611716, |
|
"eval_loss": 1.2500699758529663, |
|
"eval_runtime": 626.2963, |
|
"eval_samples_per_second": 101.299, |
|
"eval_steps_per_second": 3.166, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 2.871762740183793, |
|
"grad_norm": 3.683032512664795, |
|
"learning_rate": 7.25e-05, |
|
"loss": 1.1865, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 2.871762740183793, |
|
"eval_accuracy": 0.767375596206993, |
|
"eval_loss": 1.2403781414031982, |
|
"eval_runtime": 647.9501, |
|
"eval_samples_per_second": 97.913, |
|
"eval_steps_per_second": 3.06, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 2.9239766081871346, |
|
"grad_norm": 3.2120792865753174, |
|
"learning_rate": 7.2e-05, |
|
"loss": 1.1811, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 2.9239766081871346, |
|
"eval_accuracy": 0.7678975328660842, |
|
"eval_loss": 1.2281544208526611, |
|
"eval_runtime": 641.5519, |
|
"eval_samples_per_second": 98.89, |
|
"eval_steps_per_second": 3.091, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 2.9761904761904763, |
|
"grad_norm": 3.5708224773406982, |
|
"learning_rate": 7.15e-05, |
|
"loss": 1.1771, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 2.9761904761904763, |
|
"eval_accuracy": 0.7670722352455037, |
|
"eval_loss": 1.2305808067321777, |
|
"eval_runtime": 629.6736, |
|
"eval_samples_per_second": 100.755, |
|
"eval_steps_per_second": 3.149, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 3.028404344193818, |
|
"grad_norm": 3.624859094619751, |
|
"learning_rate": 7.1e-05, |
|
"loss": 1.1719, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 3.028404344193818, |
|
"eval_accuracy": 0.7678675260574238, |
|
"eval_loss": NaN, |
|
"eval_runtime": 653.9665, |
|
"eval_samples_per_second": 97.013, |
|
"eval_steps_per_second": 3.032, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 3.0806182121971597, |
|
"grad_norm": 3.066574811935425, |
|
"learning_rate": 7.05e-05, |
|
"loss": 1.1625, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 3.0806182121971597, |
|
"eval_accuracy": 0.7687440934734672, |
|
"eval_loss": 1.2233607769012451, |
|
"eval_runtime": 641.0412, |
|
"eval_samples_per_second": 98.969, |
|
"eval_steps_per_second": 3.093, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 3.1328320802005014, |
|
"grad_norm": 2.951267719268799, |
|
"learning_rate": 7e-05, |
|
"loss": 1.1667, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 3.1328320802005014, |
|
"eval_accuracy": 0.7692466135515155, |
|
"eval_loss": 1.2097790241241455, |
|
"eval_runtime": 634.5544, |
|
"eval_samples_per_second": 99.98, |
|
"eval_steps_per_second": 3.125, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 3.185045948203843, |
|
"grad_norm": 3.134652614593506, |
|
"learning_rate": 6.95e-05, |
|
"loss": 1.161, |
|
"step": 30500 |
|
}, |
|
{ |
|
"epoch": 3.185045948203843, |
|
"eval_accuracy": 0.7697799908068673, |
|
"eval_loss": 1.2196942567825317, |
|
"eval_runtime": 639.2853, |
|
"eval_samples_per_second": 99.241, |
|
"eval_steps_per_second": 3.102, |
|
"step": 30500 |
|
}, |
|
{ |
|
"epoch": 3.2372598162071844, |
|
"grad_norm": 3.199575662612915, |
|
"learning_rate": 6.9e-05, |
|
"loss": 1.1591, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 3.2372598162071844, |
|
"eval_accuracy": 0.7695065031514984, |
|
"eval_loss": 1.2124276161193848, |
|
"eval_runtime": 682.4327, |
|
"eval_samples_per_second": 92.966, |
|
"eval_steps_per_second": 2.906, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 3.2894736842105265, |
|
"grad_norm": 2.954310178756714, |
|
"learning_rate": 6.850000000000001e-05, |
|
"loss": 1.1587, |
|
"step": 31500 |
|
}, |
|
{ |
|
"epoch": 3.2894736842105265, |
|
"eval_accuracy": 0.7700210273355362, |
|
"eval_loss": 1.2054550647735596, |
|
"eval_runtime": 639.9415, |
|
"eval_samples_per_second": 99.139, |
|
"eval_steps_per_second": 3.099, |
|
"step": 31500 |
|
}, |
|
{ |
|
"epoch": 3.341687552213868, |
|
"grad_norm": 2.8550124168395996, |
|
"learning_rate": 6.800000000000001e-05, |
|
"loss": 1.1575, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 3.341687552213868, |
|
"eval_accuracy": 0.7699819172054071, |
|
"eval_loss": 1.214800238609314, |
|
"eval_runtime": 638.9546, |
|
"eval_samples_per_second": 99.292, |
|
"eval_steps_per_second": 3.104, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 3.3939014202172095, |
|
"grad_norm": 2.92110013961792, |
|
"learning_rate": 6.750000000000001e-05, |
|
"loss": 1.1523, |
|
"step": 32500 |
|
}, |
|
{ |
|
"epoch": 3.3939014202172095, |
|
"eval_accuracy": 0.769908237207266, |
|
"eval_loss": 1.2140088081359863, |
|
"eval_runtime": 644.5498, |
|
"eval_samples_per_second": 98.43, |
|
"eval_steps_per_second": 3.077, |
|
"step": 32500 |
|
}, |
|
{ |
|
"epoch": 3.4461152882205512, |
|
"grad_norm": 3.346374988555908, |
|
"learning_rate": 6.7e-05, |
|
"loss": 1.1572, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 3.4461152882205512, |
|
"eval_accuracy": 0.7708846584546473, |
|
"eval_loss": 1.2036925554275513, |
|
"eval_runtime": 640.768, |
|
"eval_samples_per_second": 99.011, |
|
"eval_steps_per_second": 3.095, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 3.498329156223893, |
|
"grad_norm": 3.251553773880005, |
|
"learning_rate": 6.65e-05, |
|
"loss": 1.1435, |
|
"step": 33500 |
|
}, |
|
{ |
|
"epoch": 3.498329156223893, |
|
"eval_accuracy": 0.7710189865489193, |
|
"eval_loss": 1.2105178833007812, |
|
"eval_runtime": 649.4058, |
|
"eval_samples_per_second": 97.694, |
|
"eval_steps_per_second": 3.054, |
|
"step": 33500 |
|
}, |
|
{ |
|
"epoch": 3.5505430242272347, |
|
"grad_norm": 3.583970785140991, |
|
"learning_rate": 6.6e-05, |
|
"loss": 1.1377, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 3.5505430242272347, |
|
"eval_accuracy": 0.7721632980503305, |
|
"eval_loss": 1.2023558616638184, |
|
"eval_runtime": 638.8527, |
|
"eval_samples_per_second": 99.308, |
|
"eval_steps_per_second": 3.104, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 3.6027568922305764, |
|
"grad_norm": 3.588223934173584, |
|
"learning_rate": 6.55e-05, |
|
"loss": 1.1369, |
|
"step": 34500 |
|
}, |
|
{ |
|
"epoch": 3.6027568922305764, |
|
"eval_accuracy": 0.7718440341835643, |
|
"eval_loss": 1.196567416191101, |
|
"eval_runtime": 659.4538, |
|
"eval_samples_per_second": 96.205, |
|
"eval_steps_per_second": 3.007, |
|
"step": 34500 |
|
}, |
|
{ |
|
"epoch": 3.654970760233918, |
|
"grad_norm": 3.4205751419067383, |
|
"learning_rate": 6.500000000000001e-05, |
|
"loss": 1.1417, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 3.654970760233918, |
|
"eval_accuracy": 0.7722229829571152, |
|
"eval_loss": 1.1913775205612183, |
|
"eval_runtime": 632.4781, |
|
"eval_samples_per_second": 100.309, |
|
"eval_steps_per_second": 3.135, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 3.70718462823726, |
|
"grad_norm": 3.300046920776367, |
|
"learning_rate": 6.450000000000001e-05, |
|
"loss": 1.1464, |
|
"step": 35500 |
|
}, |
|
{ |
|
"epoch": 3.70718462823726, |
|
"eval_accuracy": 0.7726826510494151, |
|
"eval_loss": 1.1917306184768677, |
|
"eval_runtime": 632.4563, |
|
"eval_samples_per_second": 100.312, |
|
"eval_steps_per_second": 3.135, |
|
"step": 35500 |
|
}, |
|
{ |
|
"epoch": 3.7593984962406015, |
|
"grad_norm": 3.469228982925415, |
|
"learning_rate": 6.400000000000001e-05, |
|
"loss": 1.1413, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 3.7593984962406015, |
|
"eval_accuracy": 0.7728226251433048, |
|
"eval_loss": 1.1853009462356567, |
|
"eval_runtime": 674.7454, |
|
"eval_samples_per_second": 94.025, |
|
"eval_steps_per_second": 2.939, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 3.8116123642439432, |
|
"grad_norm": 3.0387423038482666, |
|
"learning_rate": 6.35e-05, |
|
"loss": 1.1437, |
|
"step": 36500 |
|
}, |
|
{ |
|
"epoch": 3.8116123642439432, |
|
"eval_accuracy": 0.7730040846904885, |
|
"eval_loss": 1.197380542755127, |
|
"eval_runtime": 642.657, |
|
"eval_samples_per_second": 98.72, |
|
"eval_steps_per_second": 3.086, |
|
"step": 36500 |
|
}, |
|
{ |
|
"epoch": 3.863826232247285, |
|
"grad_norm": 3.7750790119171143, |
|
"learning_rate": 6.3e-05, |
|
"loss": 1.1381, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 3.863826232247285, |
|
"eval_accuracy": 0.7732420751700277, |
|
"eval_loss": 1.1929783821105957, |
|
"eval_runtime": 635.6748, |
|
"eval_samples_per_second": 99.804, |
|
"eval_steps_per_second": 3.12, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 3.9160401002506267, |
|
"grad_norm": 2.7454237937927246, |
|
"learning_rate": 6.25e-05, |
|
"loss": 1.1319, |
|
"step": 37500 |
|
}, |
|
{ |
|
"epoch": 3.9160401002506267, |
|
"eval_accuracy": 0.7740359547829445, |
|
"eval_loss": 1.1876792907714844, |
|
"eval_runtime": 646.5493, |
|
"eval_samples_per_second": 98.126, |
|
"eval_steps_per_second": 3.067, |
|
"step": 37500 |
|
}, |
|
{ |
|
"epoch": 3.9682539682539684, |
|
"grad_norm": 3.1448304653167725, |
|
"learning_rate": 6.2e-05, |
|
"loss": 1.1217, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 3.9682539682539684, |
|
"eval_accuracy": 0.7746280010171029, |
|
"eval_loss": 1.1808606386184692, |
|
"eval_runtime": 651.1984, |
|
"eval_samples_per_second": 97.425, |
|
"eval_steps_per_second": 3.045, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 4.02046783625731, |
|
"grad_norm": 3.3443033695220947, |
|
"learning_rate": 6.15e-05, |
|
"loss": 1.1297, |
|
"step": 38500 |
|
}, |
|
{ |
|
"epoch": 4.02046783625731, |
|
"eval_accuracy": 0.7744960875837769, |
|
"eval_loss": 1.1759783029556274, |
|
"eval_runtime": 633.4471, |
|
"eval_samples_per_second": 100.155, |
|
"eval_steps_per_second": 3.13, |
|
"step": 38500 |
|
}, |
|
{ |
|
"epoch": 4.072681704260652, |
|
"grad_norm": 3.656834602355957, |
|
"learning_rate": 6.1e-05, |
|
"loss": 1.116, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 4.072681704260652, |
|
"eval_accuracy": 0.7753487577160052, |
|
"eval_loss": 1.171563744544983, |
|
"eval_runtime": 636.4311, |
|
"eval_samples_per_second": 99.686, |
|
"eval_steps_per_second": 3.116, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 4.124895572263993, |
|
"grad_norm": 3.2146315574645996, |
|
"learning_rate": 6.05e-05, |
|
"loss": 1.1262, |
|
"step": 39500 |
|
}, |
|
{ |
|
"epoch": 4.124895572263993, |
|
"eval_accuracy": 0.7748815741788464, |
|
"eval_loss": 1.1831566095352173, |
|
"eval_runtime": 626.1408, |
|
"eval_samples_per_second": 101.324, |
|
"eval_steps_per_second": 3.167, |
|
"step": 39500 |
|
}, |
|
{ |
|
"epoch": 4.177109440267335, |
|
"grad_norm": 3.5292935371398926, |
|
"learning_rate": 6e-05, |
|
"loss": 1.1038, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 4.177109440267335, |
|
"eval_accuracy": 0.775054087211447, |
|
"eval_loss": 1.1849807500839233, |
|
"eval_runtime": 625.7328, |
|
"eval_samples_per_second": 101.39, |
|
"eval_steps_per_second": 3.169, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 4.2293233082706765, |
|
"grad_norm": 3.17421817779541, |
|
"learning_rate": 5.95e-05, |
|
"loss": 1.1144, |
|
"step": 40500 |
|
}, |
|
{ |
|
"epoch": 4.2293233082706765, |
|
"eval_accuracy": 0.7753948446129992, |
|
"eval_loss": 1.176754117012024, |
|
"eval_runtime": 642.2309, |
|
"eval_samples_per_second": 98.785, |
|
"eval_steps_per_second": 3.088, |
|
"step": 40500 |
|
}, |
|
{ |
|
"epoch": 4.281537176274019, |
|
"grad_norm": 3.185692071914673, |
|
"learning_rate": 5.9e-05, |
|
"loss": 1.1068, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 4.281537176274019, |
|
"eval_accuracy": 0.7759707359303026, |
|
"eval_loss": 1.1789684295654297, |
|
"eval_runtime": 627.5525, |
|
"eval_samples_per_second": 101.096, |
|
"eval_steps_per_second": 3.16, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 4.33375104427736, |
|
"grad_norm": 3.033123731613159, |
|
"learning_rate": 5.85e-05, |
|
"loss": 1.1088, |
|
"step": 41500 |
|
}, |
|
{ |
|
"epoch": 4.33375104427736, |
|
"eval_accuracy": 0.7761587182255788, |
|
"eval_loss": 1.1775932312011719, |
|
"eval_runtime": 629.8113, |
|
"eval_samples_per_second": 100.733, |
|
"eval_steps_per_second": 3.149, |
|
"step": 41500 |
|
}, |
|
{ |
|
"epoch": 4.385964912280702, |
|
"grad_norm": 3.112304210662842, |
|
"learning_rate": 5.8e-05, |
|
"loss": 1.1243, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 4.385964912280702, |
|
"eval_accuracy": 0.7764218763015075, |
|
"eval_loss": 1.173019528388977, |
|
"eval_runtime": 655.1515, |
|
"eval_samples_per_second": 96.837, |
|
"eval_steps_per_second": 3.027, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 4.438178780284043, |
|
"grad_norm": 3.216050624847412, |
|
"learning_rate": 5.7499999999999995e-05, |
|
"loss": 1.1196, |
|
"step": 42500 |
|
}, |
|
{ |
|
"epoch": 4.438178780284043, |
|
"eval_accuracy": 0.7775044227327209, |
|
"eval_loss": 1.1682883501052856, |
|
"eval_runtime": 627.0953, |
|
"eval_samples_per_second": 101.17, |
|
"eval_steps_per_second": 3.162, |
|
"step": 42500 |
|
}, |
|
{ |
|
"epoch": 4.4903926482873855, |
|
"grad_norm": 3.4211995601654053, |
|
"learning_rate": 5.6999999999999996e-05, |
|
"loss": 1.0985, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 4.4903926482873855, |
|
"eval_accuracy": 0.7770348312966414, |
|
"eval_loss": 1.1709975004196167, |
|
"eval_runtime": 626.8729, |
|
"eval_samples_per_second": 101.206, |
|
"eval_steps_per_second": 3.163, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 4.542606516290727, |
|
"grad_norm": 3.0796425342559814, |
|
"learning_rate": 5.65e-05, |
|
"loss": 1.0994, |
|
"step": 43500 |
|
}, |
|
{ |
|
"epoch": 4.542606516290727, |
|
"eval_accuracy": 0.7772564904147641, |
|
"eval_loss": 1.1671358346939087, |
|
"eval_runtime": 626.7232, |
|
"eval_samples_per_second": 101.23, |
|
"eval_steps_per_second": 3.164, |
|
"step": 43500 |
|
}, |
|
{ |
|
"epoch": 4.594820384294069, |
|
"grad_norm": 3.7232813835144043, |
|
"learning_rate": 5.6000000000000006e-05, |
|
"loss": 1.1017, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 4.594820384294069, |
|
"eval_accuracy": 0.7776795841513819, |
|
"eval_loss": 1.1589510440826416, |
|
"eval_runtime": 639.6154, |
|
"eval_samples_per_second": 99.189, |
|
"eval_steps_per_second": 3.1, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 4.64703425229741, |
|
"grad_norm": 3.0733375549316406, |
|
"learning_rate": 5.550000000000001e-05, |
|
"loss": 1.1071, |
|
"step": 44500 |
|
}, |
|
{ |
|
"epoch": 4.64703425229741, |
|
"eval_accuracy": 0.7780140462141538, |
|
"eval_loss": 1.1811403036117554, |
|
"eval_runtime": 637.3252, |
|
"eval_samples_per_second": 99.546, |
|
"eval_steps_per_second": 3.111, |
|
"step": 44500 |
|
}, |
|
{ |
|
"epoch": 4.6992481203007515, |
|
"grad_norm": 3.2747066020965576, |
|
"learning_rate": 5.500000000000001e-05, |
|
"loss": 1.0993, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 4.6992481203007515, |
|
"eval_accuracy": 0.7782215810670943, |
|
"eval_loss": 1.153876781463623, |
|
"eval_runtime": 625.6185, |
|
"eval_samples_per_second": 101.408, |
|
"eval_steps_per_second": 3.17, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 4.751461988304094, |
|
"grad_norm": 3.6057417392730713, |
|
"learning_rate": 5.45e-05, |
|
"loss": 1.0998, |
|
"step": 45500 |
|
}, |
|
{ |
|
"epoch": 4.751461988304094, |
|
"eval_accuracy": 0.7785066434739322, |
|
"eval_loss": 1.151453971862793, |
|
"eval_runtime": 626.7653, |
|
"eval_samples_per_second": 101.223, |
|
"eval_steps_per_second": 3.164, |
|
"step": 45500 |
|
}, |
|
{ |
|
"epoch": 4.803675856307435, |
|
"grad_norm": 2.923815965652466, |
|
"learning_rate": 5.4000000000000005e-05, |
|
"loss": 1.0966, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 4.803675856307435, |
|
"eval_accuracy": 0.7786641899675826, |
|
"eval_loss": 1.1550520658493042, |
|
"eval_runtime": 626.8592, |
|
"eval_samples_per_second": 101.208, |
|
"eval_steps_per_second": 3.163, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 4.855889724310777, |
|
"grad_norm": 2.843076229095459, |
|
"learning_rate": 5.3500000000000006e-05, |
|
"loss": 1.0977, |
|
"step": 46500 |
|
}, |
|
{ |
|
"epoch": 4.855889724310777, |
|
"eval_accuracy": 0.7794697444838651, |
|
"eval_loss": 1.151402473449707, |
|
"eval_runtime": 625.4097, |
|
"eval_samples_per_second": 101.442, |
|
"eval_steps_per_second": 3.171, |
|
"step": 46500 |
|
}, |
|
{ |
|
"epoch": 4.908103592314118, |
|
"grad_norm": 3.2491214275360107, |
|
"learning_rate": 5.300000000000001e-05, |
|
"loss": 1.0962, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 4.908103592314118, |
|
"eval_accuracy": 0.7797745180661694, |
|
"eval_loss": 1.1584906578063965, |
|
"eval_runtime": 624.1222, |
|
"eval_samples_per_second": 101.652, |
|
"eval_steps_per_second": 3.177, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 4.9603174603174605, |
|
"grad_norm": 2.8548121452331543, |
|
"learning_rate": 5.25e-05, |
|
"loss": 1.0933, |
|
"step": 47500 |
|
}, |
|
{ |
|
"epoch": 4.9603174603174605, |
|
"eval_accuracy": 0.7794554886066851, |
|
"eval_loss": 1.1526641845703125, |
|
"eval_runtime": 644.9796, |
|
"eval_samples_per_second": 98.364, |
|
"eval_steps_per_second": 3.075, |
|
"step": 47500 |
|
}, |
|
{ |
|
"epoch": 5.012531328320802, |
|
"grad_norm": 3.3320724964141846, |
|
"learning_rate": 5.2000000000000004e-05, |
|
"loss": 1.0897, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 5.012531328320802, |
|
"eval_accuracy": 0.7798054689379954, |
|
"eval_loss": 1.153532862663269, |
|
"eval_runtime": 631.6644, |
|
"eval_samples_per_second": 100.438, |
|
"eval_steps_per_second": 3.139, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 5.064745196324144, |
|
"grad_norm": 3.1418938636779785, |
|
"learning_rate": 5.1500000000000005e-05, |
|
"loss": 1.0727, |
|
"step": 48500 |
|
}, |
|
{ |
|
"epoch": 5.064745196324144, |
|
"eval_accuracy": 0.7807417083407959, |
|
"eval_loss": 1.151798963546753, |
|
"eval_runtime": 628.4763, |
|
"eval_samples_per_second": 100.947, |
|
"eval_steps_per_second": 3.155, |
|
"step": 48500 |
|
}, |
|
{ |
|
"epoch": 5.116959064327485, |
|
"grad_norm": 3.242654800415039, |
|
"learning_rate": 5.1000000000000006e-05, |
|
"loss": 1.0846, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 5.116959064327485, |
|
"eval_accuracy": 0.7803700732682441, |
|
"eval_loss": 1.143381953239441, |
|
"eval_runtime": 626.0556, |
|
"eval_samples_per_second": 101.338, |
|
"eval_steps_per_second": 3.167, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 5.169172932330827, |
|
"grad_norm": 3.165130615234375, |
|
"learning_rate": 5.05e-05, |
|
"loss": 1.0817, |
|
"step": 49500 |
|
}, |
|
{ |
|
"epoch": 5.169172932330827, |
|
"eval_accuracy": 0.7805262215474418, |
|
"eval_loss": 1.1557620763778687, |
|
"eval_runtime": 638.2573, |
|
"eval_samples_per_second": 99.4, |
|
"eval_steps_per_second": 3.107, |
|
"step": 49500 |
|
}, |
|
{ |
|
"epoch": 5.221386800334169, |
|
"grad_norm": 3.074112892150879, |
|
"learning_rate": 5e-05, |
|
"loss": 1.0725, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 5.221386800334169, |
|
"eval_accuracy": 0.7803664543536556, |
|
"eval_loss": 1.1497862339019775, |
|
"eval_runtime": 635.6514, |
|
"eval_samples_per_second": 99.808, |
|
"eval_steps_per_second": 3.12, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 5.273600668337511, |
|
"grad_norm": 3.5100748538970947, |
|
"learning_rate": 4.9500000000000004e-05, |
|
"loss": 1.076, |
|
"step": 50500 |
|
}, |
|
{ |
|
"epoch": 5.273600668337511, |
|
"eval_accuracy": 0.7819452223531255, |
|
"eval_loss": 1.1426007747650146, |
|
"eval_runtime": 627.3013, |
|
"eval_samples_per_second": 101.136, |
|
"eval_steps_per_second": 3.161, |
|
"step": 50500 |
|
}, |
|
{ |
|
"epoch": 5.325814536340852, |
|
"grad_norm": 2.9352290630340576, |
|
"learning_rate": 4.9e-05, |
|
"loss": 1.0786, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 5.325814536340852, |
|
"eval_accuracy": 0.7809871489797384, |
|
"eval_loss": 1.1469109058380127, |
|
"eval_runtime": 651.0651, |
|
"eval_samples_per_second": 97.445, |
|
"eval_steps_per_second": 3.046, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 5.378028404344194, |
|
"grad_norm": 3.215942859649658, |
|
"learning_rate": 4.85e-05, |
|
"loss": 1.0756, |
|
"step": 51500 |
|
}, |
|
{ |
|
"epoch": 5.378028404344194, |
|
"eval_accuracy": 0.7814986150426574, |
|
"eval_loss": 1.1447529792785645, |
|
"eval_runtime": 637.8608, |
|
"eval_samples_per_second": 99.462, |
|
"eval_steps_per_second": 3.109, |
|
"step": 51500 |
|
}, |
|
{ |
|
"epoch": 5.430242272347535, |
|
"grad_norm": 3.225491523742676, |
|
"learning_rate": 4.8e-05, |
|
"loss": 1.0782, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 5.430242272347535, |
|
"eval_accuracy": 0.781914062234545, |
|
"eval_loss": 1.1327252388000488, |
|
"eval_runtime": 636.3098, |
|
"eval_samples_per_second": 99.705, |
|
"eval_steps_per_second": 3.116, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 5.482456140350878, |
|
"grad_norm": 3.2323813438415527, |
|
"learning_rate": 4.75e-05, |
|
"loss": 1.0718, |
|
"step": 52500 |
|
}, |
|
{ |
|
"epoch": 5.482456140350878, |
|
"eval_accuracy": 0.7824720975904875, |
|
"eval_loss": 1.1324148178100586, |
|
"eval_runtime": 634.3817, |
|
"eval_samples_per_second": 100.008, |
|
"eval_steps_per_second": 3.126, |
|
"step": 52500 |
|
}, |
|
{ |
|
"epoch": 5.534670008354219, |
|
"grad_norm": 2.8218557834625244, |
|
"learning_rate": 4.7e-05, |
|
"loss": 1.0646, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 5.534670008354219, |
|
"eval_accuracy": 0.7827635872108434, |
|
"eval_loss": 1.139945387840271, |
|
"eval_runtime": 685.8615, |
|
"eval_samples_per_second": 92.501, |
|
"eval_steps_per_second": 2.891, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 5.586883876357561, |
|
"grad_norm": 3.848472833633423, |
|
"learning_rate": 4.6500000000000005e-05, |
|
"loss": 1.0692, |
|
"step": 53500 |
|
}, |
|
{ |
|
"epoch": 5.586883876357561, |
|
"eval_accuracy": 0.7828145845856118, |
|
"eval_loss": 1.1383306980133057, |
|
"eval_runtime": 640.36, |
|
"eval_samples_per_second": 99.074, |
|
"eval_steps_per_second": 3.097, |
|
"step": 53500 |
|
}, |
|
{ |
|
"epoch": 5.639097744360902, |
|
"grad_norm": 3.0220718383789062, |
|
"learning_rate": 4.600000000000001e-05, |
|
"loss": 1.0605, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 5.639097744360902, |
|
"eval_accuracy": 0.7828910126440776, |
|
"eval_loss": 1.1487065553665161, |
|
"eval_runtime": 639.7494, |
|
"eval_samples_per_second": 99.169, |
|
"eval_steps_per_second": 3.1, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 5.6913116123642435, |
|
"grad_norm": 3.226999044418335, |
|
"learning_rate": 4.55e-05, |
|
"loss": 1.0678, |
|
"step": 54500 |
|
}, |
|
{ |
|
"epoch": 5.6913116123642435, |
|
"eval_accuracy": 0.783646746192109, |
|
"eval_loss": 1.1216100454330444, |
|
"eval_runtime": 637.7267, |
|
"eval_samples_per_second": 99.483, |
|
"eval_steps_per_second": 3.109, |
|
"step": 54500 |
|
}, |
|
{ |
|
"epoch": 5.743525480367586, |
|
"grad_norm": 3.0716323852539062, |
|
"learning_rate": 4.5e-05, |
|
"loss": 1.068, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 5.743525480367586, |
|
"eval_accuracy": 0.7840726797935829, |
|
"eval_loss": 1.1208263635635376, |
|
"eval_runtime": 661.1104, |
|
"eval_samples_per_second": 95.964, |
|
"eval_steps_per_second": 2.999, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 5.795739348370927, |
|
"grad_norm": 3.032036542892456, |
|
"learning_rate": 4.4500000000000004e-05, |
|
"loss": 1.0588, |
|
"step": 55500 |
|
}, |
|
{ |
|
"epoch": 5.795739348370927, |
|
"eval_accuracy": 0.7842573639982784, |
|
"eval_loss": 1.1288572549819946, |
|
"eval_runtime": 644.8754, |
|
"eval_samples_per_second": 98.38, |
|
"eval_steps_per_second": 3.075, |
|
"step": 55500 |
|
}, |
|
{ |
|
"epoch": 5.847953216374269, |
|
"grad_norm": 2.896811008453369, |
|
"learning_rate": 4.4000000000000006e-05, |
|
"loss": 1.0723, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 5.847953216374269, |
|
"eval_accuracy": 0.7839231768143949, |
|
"eval_loss": 1.11968195438385, |
|
"eval_runtime": 643.1751, |
|
"eval_samples_per_second": 98.64, |
|
"eval_steps_per_second": 3.083, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 5.90016708437761, |
|
"grad_norm": 3.429722785949707, |
|
"learning_rate": 4.35e-05, |
|
"loss": 1.057, |
|
"step": 56500 |
|
}, |
|
{ |
|
"epoch": 5.90016708437761, |
|
"eval_accuracy": 0.7850116259552448, |
|
"eval_loss": 1.1247141361236572, |
|
"eval_runtime": 639.1011, |
|
"eval_samples_per_second": 99.269, |
|
"eval_steps_per_second": 3.103, |
|
"step": 56500 |
|
}, |
|
{ |
|
"epoch": 5.9523809523809526, |
|
"grad_norm": 2.920243740081787, |
|
"learning_rate": 4.3e-05, |
|
"loss": 1.0628, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 5.9523809523809526, |
|
"eval_accuracy": 0.7846882337268002, |
|
"eval_loss": null, |
|
"eval_runtime": 629.1614, |
|
"eval_samples_per_second": 100.837, |
|
"eval_steps_per_second": 3.152, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 6.004594820384294, |
|
"grad_norm": 2.937222957611084, |
|
"learning_rate": 4.25e-05, |
|
"loss": 1.0619, |
|
"step": 57500 |
|
}, |
|
{ |
|
"epoch": 6.004594820384294, |
|
"eval_accuracy": 0.7845797476831528, |
|
"eval_loss": null, |
|
"eval_runtime": 628.6009, |
|
"eval_samples_per_second": 100.927, |
|
"eval_steps_per_second": 3.155, |
|
"step": 57500 |
|
}, |
|
{ |
|
"epoch": 6.056808688387636, |
|
"grad_norm": 2.854961395263672, |
|
"learning_rate": 4.2e-05, |
|
"loss": 1.0386, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 6.056808688387636, |
|
"eval_accuracy": 0.7856027081868581, |
|
"eval_loss": 1.1315497159957886, |
|
"eval_runtime": 626.9314, |
|
"eval_samples_per_second": 101.196, |
|
"eval_steps_per_second": 3.163, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 6.109022556390977, |
|
"grad_norm": 3.4655332565307617, |
|
"learning_rate": 4.15e-05, |
|
"loss": 1.0446, |
|
"step": 58500 |
|
}, |
|
{ |
|
"epoch": 6.109022556390977, |
|
"eval_accuracy": 0.785224256670098, |
|
"eval_loss": null, |
|
"eval_runtime": 674.346, |
|
"eval_samples_per_second": 94.081, |
|
"eval_steps_per_second": 2.941, |
|
"step": 58500 |
|
}, |
|
{ |
|
"epoch": 6.161236424394319, |
|
"grad_norm": 2.93487811088562, |
|
"learning_rate": 4.1e-05, |
|
"loss": 1.0532, |
|
"step": 59000 |
|
}, |
|
{ |
|
"epoch": 6.161236424394319, |
|
"eval_accuracy": 0.7855367902419893, |
|
"eval_loss": 1.1223821640014648, |
|
"eval_runtime": 630.6681, |
|
"eval_samples_per_second": 100.596, |
|
"eval_steps_per_second": 3.144, |
|
"step": 59000 |
|
}, |
|
{ |
|
"epoch": 6.213450292397661, |
|
"grad_norm": 3.50022554397583, |
|
"learning_rate": 4.05e-05, |
|
"loss": 1.0465, |
|
"step": 59500 |
|
}, |
|
{ |
|
"epoch": 6.213450292397661, |
|
"eval_accuracy": 0.7857459334681388, |
|
"eval_loss": 1.1025224924087524, |
|
"eval_runtime": 625.4125, |
|
"eval_samples_per_second": 101.442, |
|
"eval_steps_per_second": 3.171, |
|
"step": 59500 |
|
}, |
|
{ |
|
"epoch": 6.265664160401003, |
|
"grad_norm": 3.2295687198638916, |
|
"learning_rate": 4e-05, |
|
"loss": 1.0448, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 6.265664160401003, |
|
"eval_accuracy": 0.7862344900229137, |
|
"eval_loss": 1.104607105255127, |
|
"eval_runtime": 668.9552, |
|
"eval_samples_per_second": 94.839, |
|
"eval_steps_per_second": 2.964, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 6.317878028404344, |
|
"grad_norm": 3.1386146545410156, |
|
"learning_rate": 3.9500000000000005e-05, |
|
"loss": 1.0304, |
|
"step": 60500 |
|
}, |
|
{ |
|
"epoch": 6.317878028404344, |
|
"eval_accuracy": 0.7869643708248876, |
|
"eval_loss": 1.1088374853134155, |
|
"eval_runtime": 633.3446, |
|
"eval_samples_per_second": 100.171, |
|
"eval_steps_per_second": 3.131, |
|
"step": 60500 |
|
}, |
|
{ |
|
"epoch": 6.370091896407686, |
|
"grad_norm": 3.1814932823181152, |
|
"learning_rate": 3.9000000000000006e-05, |
|
"loss": 1.0354, |
|
"step": 61000 |
|
}, |
|
{ |
|
"epoch": 6.370091896407686, |
|
"eval_accuracy": 0.7865247818397344, |
|
"eval_loss": 1.108305811882019, |
|
"eval_runtime": 636.3927, |
|
"eval_samples_per_second": 99.692, |
|
"eval_steps_per_second": 3.116, |
|
"step": 61000 |
|
}, |
|
{ |
|
"epoch": 6.4223057644110275, |
|
"grad_norm": 3.155350685119629, |
|
"learning_rate": 3.85e-05, |
|
"loss": 1.0437, |
|
"step": 61500 |
|
}, |
|
{ |
|
"epoch": 6.4223057644110275, |
|
"eval_accuracy": 0.7868538533540672, |
|
"eval_loss": null, |
|
"eval_runtime": 630.5912, |
|
"eval_samples_per_second": 100.609, |
|
"eval_steps_per_second": 3.145, |
|
"step": 61500 |
|
}, |
|
{ |
|
"epoch": 6.474519632414369, |
|
"grad_norm": 2.9944610595703125, |
|
"learning_rate": 3.8e-05, |
|
"loss": 1.0347, |
|
"step": 62000 |
|
}, |
|
{ |
|
"epoch": 6.474519632414369, |
|
"eval_accuracy": 0.7876532991281543, |
|
"eval_loss": 1.0992841720581055, |
|
"eval_runtime": 649.6208, |
|
"eval_samples_per_second": 97.662, |
|
"eval_steps_per_second": 3.053, |
|
"step": 62000 |
|
}, |
|
{ |
|
"epoch": 6.526733500417711, |
|
"grad_norm": 2.89630126953125, |
|
"learning_rate": 3.7500000000000003e-05, |
|
"loss": 1.041, |
|
"step": 62500 |
|
}, |
|
{ |
|
"epoch": 6.526733500417711, |
|
"eval_accuracy": 0.7875077762136071, |
|
"eval_loss": 1.1099497079849243, |
|
"eval_runtime": 631.7247, |
|
"eval_samples_per_second": 100.428, |
|
"eval_steps_per_second": 3.139, |
|
"step": 62500 |
|
}, |
|
{ |
|
"epoch": 6.578947368421053, |
|
"grad_norm": 3.2351603507995605, |
|
"learning_rate": 3.7e-05, |
|
"loss": 1.0388, |
|
"step": 63000 |
|
}, |
|
{ |
|
"epoch": 6.578947368421053, |
|
"eval_accuracy": 0.78800361330496, |
|
"eval_loss": 1.11760413646698, |
|
"eval_runtime": 629.3158, |
|
"eval_samples_per_second": 100.813, |
|
"eval_steps_per_second": 3.151, |
|
"step": 63000 |
|
}, |
|
{ |
|
"epoch": 6.631161236424394, |
|
"grad_norm": 3.2929039001464844, |
|
"learning_rate": 3.65e-05, |
|
"loss": 1.0383, |
|
"step": 63500 |
|
}, |
|
{ |
|
"epoch": 6.631161236424394, |
|
"eval_accuracy": 0.7879278944035151, |
|
"eval_loss": 1.1053088903427124, |
|
"eval_runtime": 627.1228, |
|
"eval_samples_per_second": 101.165, |
|
"eval_steps_per_second": 3.162, |
|
"step": 63500 |
|
}, |
|
{ |
|
"epoch": 6.683375104427736, |
|
"grad_norm": 2.9945790767669678, |
|
"learning_rate": 3.6e-05, |
|
"loss": 1.0373, |
|
"step": 64000 |
|
}, |
|
{ |
|
"epoch": 6.683375104427736, |
|
"eval_accuracy": 0.7881556360589377, |
|
"eval_loss": 1.1067023277282715, |
|
"eval_runtime": 664.2963, |
|
"eval_samples_per_second": 95.504, |
|
"eval_steps_per_second": 2.985, |
|
"step": 64000 |
|
}, |
|
{ |
|
"epoch": 6.735588972431078, |
|
"grad_norm": 3.4380545616149902, |
|
"learning_rate": 3.55e-05, |
|
"loss": 1.0382, |
|
"step": 64500 |
|
}, |
|
{ |
|
"epoch": 6.735588972431078, |
|
"eval_accuracy": 0.7882482565200516, |
|
"eval_loss": 1.116519570350647, |
|
"eval_runtime": 629.5022, |
|
"eval_samples_per_second": 100.783, |
|
"eval_steps_per_second": 3.15, |
|
"step": 64500 |
|
}, |
|
{ |
|
"epoch": 6.787802840434419, |
|
"grad_norm": 3.368530750274658, |
|
"learning_rate": 3.5e-05, |
|
"loss": 1.0277, |
|
"step": 65000 |
|
}, |
|
{ |
|
"epoch": 6.787802840434419, |
|
"eval_accuracy": 0.7891739950636399, |
|
"eval_loss": 1.0915908813476562, |
|
"eval_runtime": 632.7041, |
|
"eval_samples_per_second": 100.273, |
|
"eval_steps_per_second": 3.134, |
|
"step": 65000 |
|
}, |
|
{ |
|
"epoch": 6.840016708437761, |
|
"grad_norm": 3.3649165630340576, |
|
"learning_rate": 3.45e-05, |
|
"loss": 1.0337, |
|
"step": 65500 |
|
}, |
|
{ |
|
"epoch": 6.840016708437761, |
|
"eval_accuracy": 0.7883815172332914, |
|
"eval_loss": 1.1139589548110962, |
|
"eval_runtime": 640.7807, |
|
"eval_samples_per_second": 99.009, |
|
"eval_steps_per_second": 3.095, |
|
"step": 65500 |
|
}, |
|
{ |
|
"epoch": 6.8922305764411025, |
|
"grad_norm": 3.242825746536255, |
|
"learning_rate": 3.4000000000000007e-05, |
|
"loss": 1.033, |
|
"step": 66000 |
|
}, |
|
{ |
|
"epoch": 6.8922305764411025, |
|
"eval_accuracy": 0.789335987558676, |
|
"eval_loss": 1.0902156829833984, |
|
"eval_runtime": 634.5325, |
|
"eval_samples_per_second": 99.984, |
|
"eval_steps_per_second": 3.125, |
|
"step": 66000 |
|
}, |
|
{ |
|
"epoch": 6.944444444444445, |
|
"grad_norm": 2.7917425632476807, |
|
"learning_rate": 3.35e-05, |
|
"loss": 1.0298, |
|
"step": 66500 |
|
}, |
|
{ |
|
"epoch": 6.944444444444445, |
|
"eval_accuracy": 0.7897472088960492, |
|
"eval_loss": 1.0988086462020874, |
|
"eval_runtime": 636.5285, |
|
"eval_samples_per_second": 99.67, |
|
"eval_steps_per_second": 3.115, |
|
"step": 66500 |
|
}, |
|
{ |
|
"epoch": 6.996658312447786, |
|
"grad_norm": 2.9491727352142334, |
|
"learning_rate": 3.3e-05, |
|
"loss": 1.0325, |
|
"step": 67000 |
|
}, |
|
{ |
|
"epoch": 6.996658312447786, |
|
"eval_accuracy": 0.7894358436331047, |
|
"eval_loss": 1.097759485244751, |
|
"eval_runtime": 631.2152, |
|
"eval_samples_per_second": 100.509, |
|
"eval_steps_per_second": 3.142, |
|
"step": 67000 |
|
}, |
|
{ |
|
"epoch": 7.048872180451128, |
|
"grad_norm": 3.81508469581604, |
|
"learning_rate": 3.2500000000000004e-05, |
|
"loss": 1.013, |
|
"step": 67500 |
|
}, |
|
{ |
|
"epoch": 7.048872180451128, |
|
"eval_accuracy": 0.7897459597854384, |
|
"eval_loss": 1.1025677919387817, |
|
"eval_runtime": 667.978, |
|
"eval_samples_per_second": 94.978, |
|
"eval_steps_per_second": 2.969, |
|
"step": 67500 |
|
}, |
|
{ |
|
"epoch": 7.101086048454469, |
|
"grad_norm": 2.6366662979125977, |
|
"learning_rate": 3.2000000000000005e-05, |
|
"loss": 1.0199, |
|
"step": 68000 |
|
}, |
|
{ |
|
"epoch": 7.101086048454469, |
|
"eval_accuracy": 0.7899999141985645, |
|
"eval_loss": 1.1025761365890503, |
|
"eval_runtime": 630.4778, |
|
"eval_samples_per_second": 100.627, |
|
"eval_steps_per_second": 3.145, |
|
"step": 68000 |
|
}, |
|
{ |
|
"epoch": 7.1532999164578115, |
|
"grad_norm": 3.155102014541626, |
|
"learning_rate": 3.15e-05, |
|
"loss": 1.0157, |
|
"step": 68500 |
|
}, |
|
{ |
|
"epoch": 7.1532999164578115, |
|
"eval_accuracy": 0.7898970170010094, |
|
"eval_loss": 1.1064562797546387, |
|
"eval_runtime": 625.4866, |
|
"eval_samples_per_second": 101.43, |
|
"eval_steps_per_second": 3.17, |
|
"step": 68500 |
|
}, |
|
{ |
|
"epoch": 7.205513784461153, |
|
"grad_norm": 2.9615111351013184, |
|
"learning_rate": 3.1e-05, |
|
"loss": 1.0137, |
|
"step": 69000 |
|
}, |
|
{ |
|
"epoch": 7.205513784461153, |
|
"eval_accuracy": 0.7904352669747394, |
|
"eval_loss": 1.0876559019088745, |
|
"eval_runtime": 646.3487, |
|
"eval_samples_per_second": 98.156, |
|
"eval_steps_per_second": 3.068, |
|
"step": 69000 |
|
}, |
|
{ |
|
"epoch": 7.257727652464495, |
|
"grad_norm": 3.723841905593872, |
|
"learning_rate": 3.05e-05, |
|
"loss": 1.0137, |
|
"step": 69500 |
|
}, |
|
{ |
|
"epoch": 7.257727652464495, |
|
"eval_accuracy": 0.7899743837210673, |
|
"eval_loss": 1.0912542343139648, |
|
"eval_runtime": 652.0884, |
|
"eval_samples_per_second": 97.292, |
|
"eval_steps_per_second": 3.041, |
|
"step": 69500 |
|
}, |
|
{ |
|
"epoch": 7.309941520467836, |
|
"grad_norm": 3.6348705291748047, |
|
"learning_rate": 3e-05, |
|
"loss": 1.0167, |
|
"step": 70000 |
|
}, |
|
{ |
|
"epoch": 7.309941520467836, |
|
"eval_accuracy": 0.7903721199263885, |
|
"eval_loss": null, |
|
"eval_runtime": 630.9965, |
|
"eval_samples_per_second": 100.544, |
|
"eval_steps_per_second": 3.143, |
|
"step": 70000 |
|
}, |
|
{ |
|
"epoch": 7.362155388471178, |
|
"grad_norm": 3.2423222064971924, |
|
"learning_rate": 2.95e-05, |
|
"loss": 1.0089, |
|
"step": 70500 |
|
}, |
|
{ |
|
"epoch": 7.362155388471178, |
|
"eval_accuracy": 0.7906133078105837, |
|
"eval_loss": 1.0956330299377441, |
|
"eval_runtime": 633.5776, |
|
"eval_samples_per_second": 100.135, |
|
"eval_steps_per_second": 3.13, |
|
"step": 70500 |
|
}, |
|
{ |
|
"epoch": 7.41436925647452, |
|
"grad_norm": 3.15704083442688, |
|
"learning_rate": 2.9e-05, |
|
"loss": 1.0055, |
|
"step": 71000 |
|
}, |
|
{ |
|
"epoch": 7.41436925647452, |
|
"eval_accuracy": 0.7912404700602101, |
|
"eval_loss": null, |
|
"eval_runtime": 649.7229, |
|
"eval_samples_per_second": 97.646, |
|
"eval_steps_per_second": 3.052, |
|
"step": 71000 |
|
}, |
|
{ |
|
"epoch": 7.466583124477861, |
|
"grad_norm": 3.387059211730957, |
|
"learning_rate": 2.8499999999999998e-05, |
|
"loss": 1.0135, |
|
"step": 71500 |
|
}, |
|
{ |
|
"epoch": 7.466583124477861, |
|
"eval_accuracy": 0.7914904478088925, |
|
"eval_loss": 1.0922181606292725, |
|
"eval_runtime": 633.8807, |
|
"eval_samples_per_second": 100.087, |
|
"eval_steps_per_second": 3.128, |
|
"step": 71500 |
|
}, |
|
{ |
|
"epoch": 7.518796992481203, |
|
"grad_norm": 3.3285696506500244, |
|
"learning_rate": 2.8000000000000003e-05, |
|
"loss": 1.0041, |
|
"step": 72000 |
|
}, |
|
{ |
|
"epoch": 7.518796992481203, |
|
"eval_accuracy": 0.7919370824790591, |
|
"eval_loss": 1.0766669511795044, |
|
"eval_runtime": 640.2628, |
|
"eval_samples_per_second": 99.089, |
|
"eval_steps_per_second": 3.097, |
|
"step": 72000 |
|
}, |
|
{ |
|
"epoch": 7.571010860484545, |
|
"grad_norm": 3.0725443363189697, |
|
"learning_rate": 2.7500000000000004e-05, |
|
"loss": 0.9982, |
|
"step": 72500 |
|
}, |
|
{ |
|
"epoch": 7.571010860484545, |
|
"eval_accuracy": 0.7914295287916221, |
|
"eval_loss": 1.0892815589904785, |
|
"eval_runtime": 636.4517, |
|
"eval_samples_per_second": 99.682, |
|
"eval_steps_per_second": 3.116, |
|
"step": 72500 |
|
}, |
|
{ |
|
"epoch": 7.6232247284878865, |
|
"grad_norm": 3.2176389694213867, |
|
"learning_rate": 2.7000000000000002e-05, |
|
"loss": 1.0066, |
|
"step": 73000 |
|
}, |
|
{ |
|
"epoch": 7.6232247284878865, |
|
"eval_accuracy": 0.7917736483647045, |
|
"eval_loss": 1.092371940612793, |
|
"eval_runtime": 621.243, |
|
"eval_samples_per_second": 102.123, |
|
"eval_steps_per_second": 3.192, |
|
"step": 73000 |
|
}, |
|
{ |
|
"epoch": 7.675438596491228, |
|
"grad_norm": 2.7997543811798096, |
|
"learning_rate": 2.6500000000000004e-05, |
|
"loss": 1.0073, |
|
"step": 73500 |
|
}, |
|
{ |
|
"epoch": 7.675438596491228, |
|
"eval_accuracy": 0.7922002806270909, |
|
"eval_loss": 1.0904649496078491, |
|
"eval_runtime": 620.7253, |
|
"eval_samples_per_second": 102.208, |
|
"eval_steps_per_second": 3.195, |
|
"step": 73500 |
|
}, |
|
{ |
|
"epoch": 7.72765246449457, |
|
"grad_norm": 3.626774549484253, |
|
"learning_rate": 2.6000000000000002e-05, |
|
"loss": 1.0015, |
|
"step": 74000 |
|
}, |
|
{ |
|
"epoch": 7.72765246449457, |
|
"eval_accuracy": 0.7917596219688585, |
|
"eval_loss": 1.0867533683776855, |
|
"eval_runtime": 618.3607, |
|
"eval_samples_per_second": 102.599, |
|
"eval_steps_per_second": 3.207, |
|
"step": 74000 |
|
}, |
|
{ |
|
"epoch": 7.779866332497911, |
|
"grad_norm": 2.8419744968414307, |
|
"learning_rate": 2.5500000000000003e-05, |
|
"loss": 1.0049, |
|
"step": 74500 |
|
}, |
|
{ |
|
"epoch": 7.779866332497911, |
|
"eval_accuracy": 0.7928055223543873, |
|
"eval_loss": 1.0817667245864868, |
|
"eval_runtime": 617.1968, |
|
"eval_samples_per_second": 102.792, |
|
"eval_steps_per_second": 3.213, |
|
"step": 74500 |
|
}, |
|
{ |
|
"epoch": 7.832080200501253, |
|
"grad_norm": 2.822751760482788, |
|
"learning_rate": 2.5e-05, |
|
"loss": 0.9957, |
|
"step": 75000 |
|
}, |
|
{ |
|
"epoch": 7.832080200501253, |
|
"eval_accuracy": 0.7932603950321594, |
|
"eval_loss": 1.0652581453323364, |
|
"eval_runtime": 616.9484, |
|
"eval_samples_per_second": 102.834, |
|
"eval_steps_per_second": 3.214, |
|
"step": 75000 |
|
}, |
|
{ |
|
"epoch": 7.884294068504595, |
|
"grad_norm": 2.9094507694244385, |
|
"learning_rate": 2.45e-05, |
|
"loss": 1.0042, |
|
"step": 75500 |
|
}, |
|
{ |
|
"epoch": 7.884294068504595, |
|
"eval_accuracy": 0.7934020652494004, |
|
"eval_loss": 1.0694156885147095, |
|
"eval_runtime": 617.9112, |
|
"eval_samples_per_second": 102.673, |
|
"eval_steps_per_second": 3.209, |
|
"step": 75500 |
|
}, |
|
{ |
|
"epoch": 7.936507936507937, |
|
"grad_norm": 2.5249078273773193, |
|
"learning_rate": 2.4e-05, |
|
"loss": 1.0031, |
|
"step": 76000 |
|
}, |
|
{ |
|
"epoch": 7.936507936507937, |
|
"eval_accuracy": 0.7935896894661485, |
|
"eval_loss": 1.0808907747268677, |
|
"eval_runtime": 618.1232, |
|
"eval_samples_per_second": 102.638, |
|
"eval_steps_per_second": 3.208, |
|
"step": 76000 |
|
}, |
|
{ |
|
"epoch": 7.988721804511278, |
|
"grad_norm": 3.253228187561035, |
|
"learning_rate": 2.35e-05, |
|
"loss": 0.9979, |
|
"step": 76500 |
|
}, |
|
{ |
|
"epoch": 7.988721804511278, |
|
"eval_accuracy": 0.7933455565489351, |
|
"eval_loss": 1.0679088830947876, |
|
"eval_runtime": 618.0488, |
|
"eval_samples_per_second": 102.65, |
|
"eval_steps_per_second": 3.208, |
|
"step": 76500 |
|
}, |
|
{ |
|
"epoch": 8.04093567251462, |
|
"grad_norm": 2.6395580768585205, |
|
"learning_rate": 2.3000000000000003e-05, |
|
"loss": 0.9825, |
|
"step": 77000 |
|
}, |
|
{ |
|
"epoch": 8.04093567251462, |
|
"eval_accuracy": 0.7935253800154058, |
|
"eval_loss": 1.07496976852417, |
|
"eval_runtime": 617.9799, |
|
"eval_samples_per_second": 102.662, |
|
"eval_steps_per_second": 3.209, |
|
"step": 77000 |
|
}, |
|
{ |
|
"epoch": 8.093149540517961, |
|
"grad_norm": 3.0883655548095703, |
|
"learning_rate": 2.25e-05, |
|
"loss": 0.9926, |
|
"step": 77500 |
|
}, |
|
{ |
|
"epoch": 8.093149540517961, |
|
"eval_accuracy": 0.7942360815200339, |
|
"eval_loss": 1.0814706087112427, |
|
"eval_runtime": 618.6019, |
|
"eval_samples_per_second": 102.559, |
|
"eval_steps_per_second": 3.206, |
|
"step": 77500 |
|
}, |
|
{ |
|
"epoch": 8.145363408521304, |
|
"grad_norm": 3.356086015701294, |
|
"learning_rate": 2.2000000000000003e-05, |
|
"loss": 0.9897, |
|
"step": 78000 |
|
}, |
|
{ |
|
"epoch": 8.145363408521304, |
|
"eval_accuracy": 0.7933161062503574, |
|
"eval_loss": null, |
|
"eval_runtime": 617.0688, |
|
"eval_samples_per_second": 102.813, |
|
"eval_steps_per_second": 3.214, |
|
"step": 78000 |
|
}, |
|
{ |
|
"epoch": 8.197577276524646, |
|
"grad_norm": 3.31772780418396, |
|
"learning_rate": 2.15e-05, |
|
"loss": 0.9871, |
|
"step": 78500 |
|
}, |
|
{ |
|
"epoch": 8.197577276524646, |
|
"eval_accuracy": 0.7943132034313805, |
|
"eval_loss": 1.0676552057266235, |
|
"eval_runtime": 623.639, |
|
"eval_samples_per_second": 101.73, |
|
"eval_steps_per_second": 3.18, |
|
"step": 78500 |
|
}, |
|
{ |
|
"epoch": 8.249791144527986, |
|
"grad_norm": 3.543698310852051, |
|
"learning_rate": 2.1e-05, |
|
"loss": 1.0001, |
|
"step": 79000 |
|
}, |
|
{ |
|
"epoch": 8.249791144527986, |
|
"eval_accuracy": 0.7938355754922207, |
|
"eval_loss": 1.0734678506851196, |
|
"eval_runtime": 617.8987, |
|
"eval_samples_per_second": 102.675, |
|
"eval_steps_per_second": 3.209, |
|
"step": 79000 |
|
}, |
|
{ |
|
"epoch": 8.302005012531328, |
|
"grad_norm": 2.846734046936035, |
|
"learning_rate": 2.05e-05, |
|
"loss": 0.987, |
|
"step": 79500 |
|
}, |
|
{ |
|
"epoch": 8.302005012531328, |
|
"eval_accuracy": 0.7944878324075739, |
|
"eval_loss": 1.0765976905822754, |
|
"eval_runtime": 618.672, |
|
"eval_samples_per_second": 102.547, |
|
"eval_steps_per_second": 3.205, |
|
"step": 79500 |
|
}, |
|
{ |
|
"epoch": 8.35421888053467, |
|
"grad_norm": 2.871553897857666, |
|
"learning_rate": 2e-05, |
|
"loss": 0.9806, |
|
"step": 80000 |
|
}, |
|
{ |
|
"epoch": 8.35421888053467, |
|
"eval_accuracy": 0.794915350004184, |
|
"eval_loss": 1.0597549676895142, |
|
"eval_runtime": 618.1763, |
|
"eval_samples_per_second": 102.629, |
|
"eval_steps_per_second": 3.208, |
|
"step": 80000 |
|
}, |
|
{ |
|
"epoch": 8.406432748538013, |
|
"grad_norm": 2.959845542907715, |
|
"learning_rate": 1.9500000000000003e-05, |
|
"loss": 0.984, |
|
"step": 80500 |
|
}, |
|
{ |
|
"epoch": 8.406432748538013, |
|
"eval_accuracy": 0.7946179832675856, |
|
"eval_loss": 1.0595872402191162, |
|
"eval_runtime": 617.7039, |
|
"eval_samples_per_second": 102.708, |
|
"eval_steps_per_second": 3.21, |
|
"step": 80500 |
|
}, |
|
{ |
|
"epoch": 8.458646616541353, |
|
"grad_norm": 2.7344448566436768, |
|
"learning_rate": 1.9e-05, |
|
"loss": 0.9822, |
|
"step": 81000 |
|
}, |
|
{ |
|
"epoch": 8.458646616541353, |
|
"eval_accuracy": 0.7951521505243792, |
|
"eval_loss": 1.0736610889434814, |
|
"eval_runtime": 618.3207, |
|
"eval_samples_per_second": 102.605, |
|
"eval_steps_per_second": 3.207, |
|
"step": 81000 |
|
}, |
|
{ |
|
"epoch": 8.510860484544695, |
|
"grad_norm": 3.7596890926361084, |
|
"learning_rate": 1.85e-05, |
|
"loss": 0.9791, |
|
"step": 81500 |
|
}, |
|
{ |
|
"epoch": 8.510860484544695, |
|
"eval_accuracy": 0.795253175296364, |
|
"eval_loss": 1.0666213035583496, |
|
"eval_runtime": 620.245, |
|
"eval_samples_per_second": 102.287, |
|
"eval_steps_per_second": 3.197, |
|
"step": 81500 |
|
}, |
|
{ |
|
"epoch": 8.563074352548037, |
|
"grad_norm": 2.727389335632324, |
|
"learning_rate": 1.8e-05, |
|
"loss": 0.9804, |
|
"step": 82000 |
|
}, |
|
{ |
|
"epoch": 8.563074352548037, |
|
"eval_accuracy": 0.7953146030482228, |
|
"eval_loss": 1.0600041151046753, |
|
"eval_runtime": 618.8007, |
|
"eval_samples_per_second": 102.526, |
|
"eval_steps_per_second": 3.205, |
|
"step": 82000 |
|
}, |
|
{ |
|
"epoch": 8.615288220551378, |
|
"grad_norm": 2.9671452045440674, |
|
"learning_rate": 1.75e-05, |
|
"loss": 0.9703, |
|
"step": 82500 |
|
}, |
|
{ |
|
"epoch": 8.615288220551378, |
|
"eval_accuracy": 0.7963827692254926, |
|
"eval_loss": 1.0601171255111694, |
|
"eval_runtime": 616.1581, |
|
"eval_samples_per_second": 102.965, |
|
"eval_steps_per_second": 3.218, |
|
"step": 82500 |
|
}, |
|
{ |
|
"epoch": 8.66750208855472, |
|
"grad_norm": 2.9628329277038574, |
|
"learning_rate": 1.7000000000000003e-05, |
|
"loss": 0.9768, |
|
"step": 83000 |
|
}, |
|
{ |
|
"epoch": 8.66750208855472, |
|
"eval_accuracy": 0.7952524450397661, |
|
"eval_loss": 1.0768334865570068, |
|
"eval_runtime": 616.7531, |
|
"eval_samples_per_second": 102.866, |
|
"eval_steps_per_second": 3.215, |
|
"step": 83000 |
|
}, |
|
{ |
|
"epoch": 8.719715956558062, |
|
"grad_norm": 3.6838812828063965, |
|
"learning_rate": 1.65e-05, |
|
"loss": 0.9722, |
|
"step": 83500 |
|
}, |
|
{ |
|
"epoch": 8.719715956558062, |
|
"eval_accuracy": 0.796164760278738, |
|
"eval_loss": null, |
|
"eval_runtime": 616.0678, |
|
"eval_samples_per_second": 102.981, |
|
"eval_steps_per_second": 3.219, |
|
"step": 83500 |
|
}, |
|
{ |
|
"epoch": 8.771929824561404, |
|
"grad_norm": 3.7852931022644043, |
|
"learning_rate": 1.6000000000000003e-05, |
|
"loss": 0.9813, |
|
"step": 84000 |
|
}, |
|
{ |
|
"epoch": 8.771929824561404, |
|
"eval_accuracy": 0.7964795153494032, |
|
"eval_loss": null, |
|
"eval_runtime": 616.9446, |
|
"eval_samples_per_second": 102.834, |
|
"eval_steps_per_second": 3.214, |
|
"step": 84000 |
|
}, |
|
{ |
|
"epoch": 8.824143692564745, |
|
"grad_norm": 2.85229229927063, |
|
"learning_rate": 1.55e-05, |
|
"loss": 0.9732, |
|
"step": 84500 |
|
}, |
|
{ |
|
"epoch": 8.824143692564745, |
|
"eval_accuracy": 0.7960116027401912, |
|
"eval_loss": 1.061353325843811, |
|
"eval_runtime": 624.2524, |
|
"eval_samples_per_second": 101.63, |
|
"eval_steps_per_second": 3.177, |
|
"step": 84500 |
|
}, |
|
{ |
|
"epoch": 8.876357560568087, |
|
"grad_norm": 3.463848114013672, |
|
"learning_rate": 1.5e-05, |
|
"loss": 0.9764, |
|
"step": 85000 |
|
}, |
|
{ |
|
"epoch": 8.876357560568087, |
|
"eval_accuracy": 0.7963621203122637, |
|
"eval_loss": 1.049773097038269, |
|
"eval_runtime": 618.458, |
|
"eval_samples_per_second": 102.583, |
|
"eval_steps_per_second": 3.206, |
|
"step": 85000 |
|
}, |
|
{ |
|
"epoch": 8.928571428571429, |
|
"grad_norm": 2.7122750282287598, |
|
"learning_rate": 1.45e-05, |
|
"loss": 0.9829, |
|
"step": 85500 |
|
}, |
|
{ |
|
"epoch": 8.928571428571429, |
|
"eval_accuracy": 0.7965627753528177, |
|
"eval_loss": 1.069191813468933, |
|
"eval_runtime": 617.5416, |
|
"eval_samples_per_second": 102.735, |
|
"eval_steps_per_second": 3.211, |
|
"step": 85500 |
|
}, |
|
{ |
|
"epoch": 8.980785296574771, |
|
"grad_norm": 2.976637840270996, |
|
"learning_rate": 1.4000000000000001e-05, |
|
"loss": 0.9741, |
|
"step": 86000 |
|
}, |
|
{ |
|
"epoch": 8.980785296574771, |
|
"eval_accuracy": 0.7967220469069121, |
|
"eval_loss": 1.055444598197937, |
|
"eval_runtime": 614.0894, |
|
"eval_samples_per_second": 103.312, |
|
"eval_steps_per_second": 3.229, |
|
"step": 86000 |
|
}, |
|
{ |
|
"epoch": 9.032999164578111, |
|
"grad_norm": 2.89384126663208, |
|
"learning_rate": 1.3500000000000001e-05, |
|
"loss": 0.9648, |
|
"step": 86500 |
|
}, |
|
{ |
|
"epoch": 9.032999164578111, |
|
"eval_accuracy": 0.797160834758222, |
|
"eval_loss": 1.058738350868225, |
|
"eval_runtime": 615.784, |
|
"eval_samples_per_second": 103.028, |
|
"eval_steps_per_second": 3.22, |
|
"step": 86500 |
|
}, |
|
{ |
|
"epoch": 9.085213032581454, |
|
"grad_norm": 3.009979248046875, |
|
"learning_rate": 1.3000000000000001e-05, |
|
"loss": 0.9606, |
|
"step": 87000 |
|
}, |
|
{ |
|
"epoch": 9.085213032581454, |
|
"eval_accuracy": 0.7971404883516657, |
|
"eval_loss": 1.0621484518051147, |
|
"eval_runtime": 618.535, |
|
"eval_samples_per_second": 102.57, |
|
"eval_steps_per_second": 3.206, |
|
"step": 87000 |
|
}, |
|
{ |
|
"epoch": 9.137426900584796, |
|
"grad_norm": 3.301671266555786, |
|
"learning_rate": 1.25e-05, |
|
"loss": 0.9682, |
|
"step": 87500 |
|
}, |
|
{ |
|
"epoch": 9.137426900584796, |
|
"eval_accuracy": 0.7973926994154387, |
|
"eval_loss": 1.0569199323654175, |
|
"eval_runtime": 616.4943, |
|
"eval_samples_per_second": 102.909, |
|
"eval_steps_per_second": 3.217, |
|
"step": 87500 |
|
}, |
|
{ |
|
"epoch": 9.189640768588138, |
|
"grad_norm": 2.932385206222534, |
|
"learning_rate": 1.2e-05, |
|
"loss": 0.9644, |
|
"step": 88000 |
|
}, |
|
{ |
|
"epoch": 9.189640768588138, |
|
"eval_accuracy": 0.7974666942085554, |
|
"eval_loss": 1.0533905029296875, |
|
"eval_runtime": 620.5893, |
|
"eval_samples_per_second": 102.23, |
|
"eval_steps_per_second": 3.195, |
|
"step": 88000 |
|
}, |
|
{ |
|
"epoch": 9.241854636591478, |
|
"grad_norm": 3.1355013847351074, |
|
"learning_rate": 1.1500000000000002e-05, |
|
"loss": 0.9658, |
|
"step": 88500 |
|
}, |
|
{ |
|
"epoch": 9.241854636591478, |
|
"eval_accuracy": 0.7977740520228777, |
|
"eval_loss": 1.0476280450820923, |
|
"eval_runtime": 618.7726, |
|
"eval_samples_per_second": 102.53, |
|
"eval_steps_per_second": 3.205, |
|
"step": 88500 |
|
}, |
|
{ |
|
"epoch": 9.29406850459482, |
|
"grad_norm": 3.353086233139038, |
|
"learning_rate": 1.1000000000000001e-05, |
|
"loss": 0.9641, |
|
"step": 89000 |
|
}, |
|
{ |
|
"epoch": 9.29406850459482, |
|
"eval_accuracy": 0.7973864228195986, |
|
"eval_loss": 1.041751742362976, |
|
"eval_runtime": 614.3896, |
|
"eval_samples_per_second": 103.262, |
|
"eval_steps_per_second": 3.228, |
|
"step": 89000 |
|
}, |
|
{ |
|
"epoch": 9.346282372598163, |
|
"grad_norm": 2.8681247234344482, |
|
"learning_rate": 1.05e-05, |
|
"loss": 0.963, |
|
"step": 89500 |
|
}, |
|
{ |
|
"epoch": 9.346282372598163, |
|
"eval_accuracy": 0.798292731668655, |
|
"eval_loss": 1.0465270280838013, |
|
"eval_runtime": 615.1098, |
|
"eval_samples_per_second": 103.141, |
|
"eval_steps_per_second": 3.224, |
|
"step": 89500 |
|
}, |
|
{ |
|
"epoch": 9.398496240601503, |
|
"grad_norm": 3.2927052974700928, |
|
"learning_rate": 1e-05, |
|
"loss": 0.9562, |
|
"step": 90000 |
|
}, |
|
{ |
|
"epoch": 9.398496240601503, |
|
"eval_accuracy": 0.798140924991145, |
|
"eval_loss": 1.0476934909820557, |
|
"eval_runtime": 616.723, |
|
"eval_samples_per_second": 102.871, |
|
"eval_steps_per_second": 3.215, |
|
"step": 90000 |
|
}, |
|
{ |
|
"epoch": 9.450710108604845, |
|
"grad_norm": 3.1475863456726074, |
|
"learning_rate": 9.5e-06, |
|
"loss": 0.9605, |
|
"step": 90500 |
|
}, |
|
{ |
|
"epoch": 9.450710108604845, |
|
"eval_accuracy": 0.7977010212122122, |
|
"eval_loss": 1.0535070896148682, |
|
"eval_runtime": 615.5187, |
|
"eval_samples_per_second": 103.072, |
|
"eval_steps_per_second": 3.222, |
|
"step": 90500 |
|
}, |
|
{ |
|
"epoch": 9.502923976608187, |
|
"grad_norm": 3.662548780441284, |
|
"learning_rate": 9e-06, |
|
"loss": 0.9692, |
|
"step": 91000 |
|
}, |
|
{ |
|
"epoch": 9.502923976608187, |
|
"eval_accuracy": 0.7981552799718717, |
|
"eval_loss": 1.0515964031219482, |
|
"eval_runtime": 615.1273, |
|
"eval_samples_per_second": 103.138, |
|
"eval_steps_per_second": 3.224, |
|
"step": 91000 |
|
}, |
|
{ |
|
"epoch": 9.55513784461153, |
|
"grad_norm": 3.5501346588134766, |
|
"learning_rate": 8.500000000000002e-06, |
|
"loss": 0.966, |
|
"step": 91500 |
|
}, |
|
{ |
|
"epoch": 9.55513784461153, |
|
"eval_accuracy": 0.798176074856451, |
|
"eval_loss": 1.0500941276550293, |
|
"eval_runtime": 616.9498, |
|
"eval_samples_per_second": 102.833, |
|
"eval_steps_per_second": 3.214, |
|
"step": 91500 |
|
}, |
|
{ |
|
"epoch": 9.60735171261487, |
|
"grad_norm": 3.102790117263794, |
|
"learning_rate": 8.000000000000001e-06, |
|
"loss": 0.9614, |
|
"step": 92000 |
|
}, |
|
{ |
|
"epoch": 9.60735171261487, |
|
"eval_accuracy": 0.7986090909415797, |
|
"eval_loss": 1.0444718599319458, |
|
"eval_runtime": 614.407, |
|
"eval_samples_per_second": 103.259, |
|
"eval_steps_per_second": 3.228, |
|
"step": 92000 |
|
}, |
|
{ |
|
"epoch": 9.659565580618212, |
|
"grad_norm": 3.504926919937134, |
|
"learning_rate": 7.5e-06, |
|
"loss": 0.9606, |
|
"step": 92500 |
|
}, |
|
{ |
|
"epoch": 9.659565580618212, |
|
"eval_accuracy": 0.7988812850380725, |
|
"eval_loss": 1.0497493743896484, |
|
"eval_runtime": 615.4873, |
|
"eval_samples_per_second": 103.078, |
|
"eval_steps_per_second": 3.222, |
|
"step": 92500 |
|
}, |
|
{ |
|
"epoch": 9.711779448621554, |
|
"grad_norm": 3.648078203201294, |
|
"learning_rate": 7.000000000000001e-06, |
|
"loss": 0.9507, |
|
"step": 93000 |
|
}, |
|
{ |
|
"epoch": 9.711779448621554, |
|
"eval_accuracy": 0.798754673907639, |
|
"eval_loss": 1.0451751947402954, |
|
"eval_runtime": 615.9945, |
|
"eval_samples_per_second": 102.993, |
|
"eval_steps_per_second": 3.219, |
|
"step": 93000 |
|
}, |
|
{ |
|
"epoch": 9.763993316624896, |
|
"grad_norm": 3.316922903060913, |
|
"learning_rate": 6.5000000000000004e-06, |
|
"loss": 0.9584, |
|
"step": 93500 |
|
}, |
|
{ |
|
"epoch": 9.763993316624896, |
|
"eval_accuracy": 0.7987269314893306, |
|
"eval_loss": 1.0408653020858765, |
|
"eval_runtime": 618.0487, |
|
"eval_samples_per_second": 102.65, |
|
"eval_steps_per_second": 3.208, |
|
"step": 93500 |
|
}, |
|
{ |
|
"epoch": 9.816207184628237, |
|
"grad_norm": 2.943169116973877, |
|
"learning_rate": 6e-06, |
|
"loss": 0.9572, |
|
"step": 94000 |
|
}, |
|
{ |
|
"epoch": 9.816207184628237, |
|
"eval_accuracy": 0.7989658711747047, |
|
"eval_loss": 1.0431112051010132, |
|
"eval_runtime": 619.8767, |
|
"eval_samples_per_second": 102.348, |
|
"eval_steps_per_second": 3.199, |
|
"step": 94000 |
|
}, |
|
{ |
|
"epoch": 9.868421052631579, |
|
"grad_norm": 2.713733196258545, |
|
"learning_rate": 5.500000000000001e-06, |
|
"loss": 0.9552, |
|
"step": 94500 |
|
}, |
|
{ |
|
"epoch": 9.868421052631579, |
|
"eval_accuracy": 0.7991049545929755, |
|
"eval_loss": 1.0465214252471924, |
|
"eval_runtime": 615.6457, |
|
"eval_samples_per_second": 103.051, |
|
"eval_steps_per_second": 3.221, |
|
"step": 94500 |
|
}, |
|
{ |
|
"epoch": 9.920634920634921, |
|
"grad_norm": 3.5131709575653076, |
|
"learning_rate": 5e-06, |
|
"loss": 0.9542, |
|
"step": 95000 |
|
}, |
|
{ |
|
"epoch": 9.920634920634921, |
|
"eval_accuracy": 0.7993720778709359, |
|
"eval_loss": 1.043885350227356, |
|
"eval_runtime": 616.5178, |
|
"eval_samples_per_second": 102.905, |
|
"eval_steps_per_second": 3.216, |
|
"step": 95000 |
|
}, |
|
{ |
|
"epoch": 9.972848788638263, |
|
"grad_norm": 3.0255749225616455, |
|
"learning_rate": 4.5e-06, |
|
"loss": 0.9505, |
|
"step": 95500 |
|
}, |
|
{ |
|
"epoch": 9.972848788638263, |
|
"eval_accuracy": 0.7994996190810016, |
|
"eval_loss": 1.0470497608184814, |
|
"eval_runtime": 616.7354, |
|
"eval_samples_per_second": 102.869, |
|
"eval_steps_per_second": 3.215, |
|
"step": 95500 |
|
}, |
|
{ |
|
"epoch": 10.025062656641603, |
|
"grad_norm": 4.134832382202148, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 0.9472, |
|
"step": 96000 |
|
}, |
|
{ |
|
"epoch": 10.025062656641603, |
|
"eval_accuracy": 0.7994244357414679, |
|
"eval_loss": 1.0445414781570435, |
|
"eval_runtime": 617.5631, |
|
"eval_samples_per_second": 102.731, |
|
"eval_steps_per_second": 3.211, |
|
"step": 96000 |
|
}, |
|
{ |
|
"epoch": 10.077276524644946, |
|
"grad_norm": 3.3011510372161865, |
|
"learning_rate": 3.5000000000000004e-06, |
|
"loss": 0.9467, |
|
"step": 96500 |
|
}, |
|
{ |
|
"epoch": 10.077276524644946, |
|
"eval_accuracy": 0.7997884398890616, |
|
"eval_loss": 1.0422955751419067, |
|
"eval_runtime": 617.5269, |
|
"eval_samples_per_second": 102.737, |
|
"eval_steps_per_second": 3.211, |
|
"step": 96500 |
|
}, |
|
{ |
|
"epoch": 10.129490392648288, |
|
"grad_norm": 3.65120792388916, |
|
"learning_rate": 3e-06, |
|
"loss": 0.9497, |
|
"step": 97000 |
|
}, |
|
{ |
|
"epoch": 10.129490392648288, |
|
"eval_accuracy": 0.7994027266856298, |
|
"eval_loss": 1.0433976650238037, |
|
"eval_runtime": 615.8756, |
|
"eval_samples_per_second": 103.013, |
|
"eval_steps_per_second": 3.22, |
|
"step": 97000 |
|
}, |
|
{ |
|
"epoch": 10.18170426065163, |
|
"grad_norm": 3.1329991817474365, |
|
"learning_rate": 2.5e-06, |
|
"loss": 0.9575, |
|
"step": 97500 |
|
}, |
|
{ |
|
"epoch": 10.18170426065163, |
|
"eval_accuracy": 0.8000353846923677, |
|
"eval_loss": 1.0379133224487305, |
|
"eval_runtime": 615.8872, |
|
"eval_samples_per_second": 103.011, |
|
"eval_steps_per_second": 3.22, |
|
"step": 97500 |
|
}, |
|
{ |
|
"epoch": 10.23391812865497, |
|
"grad_norm": 3.3150088787078857, |
|
"learning_rate": 2.0000000000000003e-06, |
|
"loss": 0.9478, |
|
"step": 98000 |
|
}, |
|
{ |
|
"epoch": 10.23391812865497, |
|
"eval_accuracy": 0.7997029578657322, |
|
"eval_loss": 1.039129614830017, |
|
"eval_runtime": 615.7144, |
|
"eval_samples_per_second": 103.04, |
|
"eval_steps_per_second": 3.221, |
|
"step": 98000 |
|
}, |
|
{ |
|
"epoch": 10.286131996658312, |
|
"grad_norm": 2.9896209239959717, |
|
"learning_rate": 1.5e-06, |
|
"loss": 0.9428, |
|
"step": 98500 |
|
}, |
|
{ |
|
"epoch": 10.286131996658312, |
|
"eval_accuracy": 0.799764992788252, |
|
"eval_loss": 1.0273813009262085, |
|
"eval_runtime": 615.1763, |
|
"eval_samples_per_second": 103.13, |
|
"eval_steps_per_second": 3.223, |
|
"step": 98500 |
|
} |
|
], |
|
"logging_steps": 500, |
|
"max_steps": 100000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 11, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 8.317352524927468e+17, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|