xlm-r_dan-latn / trainer_state.json
DGurgurov's picture
Uploading checkpoint-98500 for xlm-r - dan-latn
b92ec8f verified
{
"best_metric": 1.0273813009262085,
"best_model_checkpoint": "./model_fine-tune/glot/xlm-r/dan-Latn/checkpoint-98500",
"epoch": 10.286131996658312,
"eval_steps": 500,
"global_step": 98500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.052213868003341685,
"grad_norm": 5.384862422943115,
"learning_rate": 9.95e-05,
"loss": 1.4559,
"step": 500
},
{
"epoch": 0.052213868003341685,
"eval_accuracy": 0.7296120738961336,
"eval_loss": 1.6221407651901245,
"eval_runtime": 620.8615,
"eval_samples_per_second": 102.185,
"eval_steps_per_second": 3.194,
"step": 500
},
{
"epoch": 0.10442773600668337,
"grad_norm": 4.473601818084717,
"learning_rate": 9.900000000000001e-05,
"loss": 1.4355,
"step": 1000
},
{
"epoch": 0.10442773600668337,
"eval_accuracy": 0.7314203520980975,
"eval_loss": 1.563644289970398,
"eval_runtime": 631.0902,
"eval_samples_per_second": 100.529,
"eval_steps_per_second": 3.142,
"step": 1000
},
{
"epoch": 0.15664160401002505,
"grad_norm": 4.604930877685547,
"learning_rate": 9.850000000000001e-05,
"loss": 1.4326,
"step": 1500
},
{
"epoch": 0.15664160401002505,
"eval_accuracy": 0.732858138551035,
"eval_loss": 1.538730502128601,
"eval_runtime": 627.1459,
"eval_samples_per_second": 101.161,
"eval_steps_per_second": 3.162,
"step": 1500
},
{
"epoch": 0.20885547201336674,
"grad_norm": 5.0758867263793945,
"learning_rate": 9.8e-05,
"loss": 1.4082,
"step": 2000
},
{
"epoch": 0.20885547201336674,
"eval_accuracy": 0.7337186853777712,
"eval_loss": 1.514768123626709,
"eval_runtime": 623.7693,
"eval_samples_per_second": 101.709,
"eval_steps_per_second": 3.179,
"step": 2000
},
{
"epoch": 0.26106934001670845,
"grad_norm": 4.955592155456543,
"learning_rate": 9.75e-05,
"loss": 1.4066,
"step": 2500
},
{
"epoch": 0.26106934001670845,
"eval_accuracy": 0.7373838924395351,
"eval_loss": 1.4860447645187378,
"eval_runtime": 626.6763,
"eval_samples_per_second": 101.237,
"eval_steps_per_second": 3.164,
"step": 2500
},
{
"epoch": 0.3132832080200501,
"grad_norm": 4.4695611000061035,
"learning_rate": 9.7e-05,
"loss": 1.3965,
"step": 3000
},
{
"epoch": 0.3132832080200501,
"eval_accuracy": 0.7386060798818527,
"eval_loss": 1.4720885753631592,
"eval_runtime": 626.8807,
"eval_samples_per_second": 101.204,
"eval_steps_per_second": 3.163,
"step": 3000
},
{
"epoch": 0.3654970760233918,
"grad_norm": 4.595192909240723,
"learning_rate": 9.65e-05,
"loss": 1.3674,
"step": 3500
},
{
"epoch": 0.3654970760233918,
"eval_accuracy": 0.7389515252696706,
"eval_loss": 1.466160774230957,
"eval_runtime": 626.2854,
"eval_samples_per_second": 101.3,
"eval_steps_per_second": 3.166,
"step": 3500
},
{
"epoch": 0.4177109440267335,
"grad_norm": 4.489499092102051,
"learning_rate": 9.6e-05,
"loss": 1.3789,
"step": 4000
},
{
"epoch": 0.4177109440267335,
"eval_accuracy": 0.7408971720895672,
"eval_loss": 1.4427233934402466,
"eval_runtime": 628.8309,
"eval_samples_per_second": 100.89,
"eval_steps_per_second": 3.153,
"step": 4000
},
{
"epoch": 0.4699248120300752,
"grad_norm": 4.887514114379883,
"learning_rate": 9.55e-05,
"loss": 1.3677,
"step": 4500
},
{
"epoch": 0.4699248120300752,
"eval_accuracy": 0.7418176415206207,
"eval_loss": null,
"eval_runtime": 626.3841,
"eval_samples_per_second": 101.285,
"eval_steps_per_second": 3.166,
"step": 4500
},
{
"epoch": 0.5221386800334169,
"grad_norm": 4.255401611328125,
"learning_rate": 9.5e-05,
"loss": 1.3591,
"step": 5000
},
{
"epoch": 0.5221386800334169,
"eval_accuracy": 0.7420801988701987,
"eval_loss": 1.4378492832183838,
"eval_runtime": 628.3584,
"eval_samples_per_second": 100.966,
"eval_steps_per_second": 3.156,
"step": 5000
},
{
"epoch": 0.5743525480367586,
"grad_norm": 3.9552218914031982,
"learning_rate": 9.449999999999999e-05,
"loss": 1.3528,
"step": 5500
},
{
"epoch": 0.5743525480367586,
"eval_accuracy": 0.7430785963224286,
"eval_loss": 1.4063905477523804,
"eval_runtime": 630.6125,
"eval_samples_per_second": 100.605,
"eval_steps_per_second": 3.145,
"step": 5500
},
{
"epoch": 0.6265664160401002,
"grad_norm": 4.043545246124268,
"learning_rate": 9.4e-05,
"loss": 1.3467,
"step": 6000
},
{
"epoch": 0.6265664160401002,
"eval_accuracy": 0.7439164901309249,
"eval_loss": 1.4179232120513916,
"eval_runtime": 627.0519,
"eval_samples_per_second": 101.177,
"eval_steps_per_second": 3.162,
"step": 6000
},
{
"epoch": 0.6787802840434419,
"grad_norm": 65.49678802490234,
"learning_rate": 9.350000000000001e-05,
"loss": 1.3578,
"step": 6500
},
{
"epoch": 0.6787802840434419,
"eval_accuracy": 0.7436562018580148,
"eval_loss": 1.4101091623306274,
"eval_runtime": 631.9563,
"eval_samples_per_second": 100.391,
"eval_steps_per_second": 3.138,
"step": 6500
},
{
"epoch": 0.7309941520467836,
"grad_norm": 3.9676353931427,
"learning_rate": 9.300000000000001e-05,
"loss": 1.3377,
"step": 7000
},
{
"epoch": 0.7309941520467836,
"eval_accuracy": 0.7452273902409787,
"eval_loss": 1.3957942724227905,
"eval_runtime": 625.612,
"eval_samples_per_second": 101.41,
"eval_steps_per_second": 3.17,
"step": 7000
},
{
"epoch": 0.7832080200501254,
"grad_norm": 3.6173226833343506,
"learning_rate": 9.250000000000001e-05,
"loss": 1.3303,
"step": 7500
},
{
"epoch": 0.7832080200501254,
"eval_accuracy": 0.7475488951341268,
"eval_loss": 1.3813687562942505,
"eval_runtime": 627.0109,
"eval_samples_per_second": 101.183,
"eval_steps_per_second": 3.163,
"step": 7500
},
{
"epoch": 0.835421888053467,
"grad_norm": 3.980341911315918,
"learning_rate": 9.200000000000001e-05,
"loss": 1.3221,
"step": 8000
},
{
"epoch": 0.835421888053467,
"eval_accuracy": 0.7475162804135862,
"eval_loss": 1.3788014650344849,
"eval_runtime": 625.2826,
"eval_samples_per_second": 101.463,
"eval_steps_per_second": 3.171,
"step": 8000
},
{
"epoch": 0.8876357560568087,
"grad_norm": 3.367668390274048,
"learning_rate": 9.15e-05,
"loss": 1.3273,
"step": 8500
},
{
"epoch": 0.8876357560568087,
"eval_accuracy": 0.7486955192165756,
"eval_loss": 1.3720810413360596,
"eval_runtime": 626.1965,
"eval_samples_per_second": 101.315,
"eval_steps_per_second": 3.167,
"step": 8500
},
{
"epoch": 0.9398496240601504,
"grad_norm": 3.82468843460083,
"learning_rate": 9.1e-05,
"loss": 1.3104,
"step": 9000
},
{
"epoch": 0.9398496240601504,
"eval_accuracy": 0.7470319869853385,
"eval_loss": 1.3732112646102905,
"eval_runtime": 630.8001,
"eval_samples_per_second": 100.575,
"eval_steps_per_second": 3.144,
"step": 9000
},
{
"epoch": 0.9920634920634921,
"grad_norm": 3.946758270263672,
"learning_rate": 9.05e-05,
"loss": 1.3158,
"step": 9500
},
{
"epoch": 0.9920634920634921,
"eval_accuracy": 0.7499583551090239,
"eval_loss": 1.3603472709655762,
"eval_runtime": 652.6391,
"eval_samples_per_second": 97.21,
"eval_steps_per_second": 3.038,
"step": 9500
},
{
"epoch": 1.0442773600668338,
"grad_norm": 3.916361093521118,
"learning_rate": 9e-05,
"loss": 1.2953,
"step": 10000
},
{
"epoch": 1.0442773600668338,
"eval_accuracy": 0.7500646093431366,
"eval_loss": 1.3458328247070312,
"eval_runtime": 626.4497,
"eval_samples_per_second": 101.274,
"eval_steps_per_second": 3.165,
"step": 10000
},
{
"epoch": 1.0964912280701755,
"grad_norm": 3.9930801391601562,
"learning_rate": 8.950000000000001e-05,
"loss": 1.2932,
"step": 10500
},
{
"epoch": 1.0964912280701755,
"eval_accuracy": 0.7504904150215282,
"eval_loss": 1.3514128923416138,
"eval_runtime": 630.1314,
"eval_samples_per_second": 100.682,
"eval_steps_per_second": 3.147,
"step": 10500
},
{
"epoch": 1.1487050960735172,
"grad_norm": 4.74911642074585,
"learning_rate": 8.900000000000001e-05,
"loss": 1.2944,
"step": 11000
},
{
"epoch": 1.1487050960735172,
"eval_accuracy": 0.7512302865503326,
"eval_loss": 1.3358726501464844,
"eval_runtime": 631.5659,
"eval_samples_per_second": 100.453,
"eval_steps_per_second": 3.14,
"step": 11000
},
{
"epoch": 1.2009189640768587,
"grad_norm": 3.7969741821289062,
"learning_rate": 8.850000000000001e-05,
"loss": 1.2853,
"step": 11500
},
{
"epoch": 1.2009189640768587,
"eval_accuracy": 0.7516760587886216,
"eval_loss": 1.3410056829452515,
"eval_runtime": 631.8461,
"eval_samples_per_second": 100.409,
"eval_steps_per_second": 3.138,
"step": 11500
},
{
"epoch": 1.2531328320802004,
"grad_norm": 4.402622699737549,
"learning_rate": 8.800000000000001e-05,
"loss": 1.2904,
"step": 12000
},
{
"epoch": 1.2531328320802004,
"eval_accuracy": 0.7529186511157855,
"eval_loss": 1.3296958208084106,
"eval_runtime": 625.8019,
"eval_samples_per_second": 101.379,
"eval_steps_per_second": 3.169,
"step": 12000
},
{
"epoch": 1.3053467000835421,
"grad_norm": 4.738062858581543,
"learning_rate": 8.75e-05,
"loss": 1.2768,
"step": 12500
},
{
"epoch": 1.3053467000835421,
"eval_accuracy": 0.7535759135206747,
"eval_loss": 1.323601484298706,
"eval_runtime": 625.8627,
"eval_samples_per_second": 101.369,
"eval_steps_per_second": 3.168,
"step": 12500
},
{
"epoch": 1.3575605680868839,
"grad_norm": 3.4203574657440186,
"learning_rate": 8.7e-05,
"loss": 1.2702,
"step": 13000
},
{
"epoch": 1.3575605680868839,
"eval_accuracy": 0.7543550936371498,
"eval_loss": 1.311880350112915,
"eval_runtime": 632.5695,
"eval_samples_per_second": 100.294,
"eval_steps_per_second": 3.135,
"step": 13000
},
{
"epoch": 1.4097744360902256,
"grad_norm": 4.80487060546875,
"learning_rate": 8.65e-05,
"loss": 1.2686,
"step": 13500
},
{
"epoch": 1.4097744360902256,
"eval_accuracy": 0.7551145942334886,
"eval_loss": 1.3116217851638794,
"eval_runtime": 622.4069,
"eval_samples_per_second": 101.932,
"eval_steps_per_second": 3.186,
"step": 13500
},
{
"epoch": 1.4619883040935673,
"grad_norm": 3.3763253688812256,
"learning_rate": 8.6e-05,
"loss": 1.256,
"step": 14000
},
{
"epoch": 1.4619883040935673,
"eval_accuracy": 0.7550032122033714,
"eval_loss": null,
"eval_runtime": 623.0788,
"eval_samples_per_second": 101.822,
"eval_steps_per_second": 3.183,
"step": 14000
},
{
"epoch": 1.514202172096909,
"grad_norm": 3.4600539207458496,
"learning_rate": 8.55e-05,
"loss": 1.2689,
"step": 14500
},
{
"epoch": 1.514202172096909,
"eval_accuracy": 0.7562542149486124,
"eval_loss": 1.2966333627700806,
"eval_runtime": 630.2089,
"eval_samples_per_second": 100.67,
"eval_steps_per_second": 3.147,
"step": 14500
},
{
"epoch": 1.5664160401002505,
"grad_norm": 3.6656503677368164,
"learning_rate": 8.5e-05,
"loss": 1.2536,
"step": 15000
},
{
"epoch": 1.5664160401002505,
"eval_accuracy": 0.7558193144413351,
"eval_loss": 1.3074419498443604,
"eval_runtime": 633.7511,
"eval_samples_per_second": 100.107,
"eval_steps_per_second": 3.129,
"step": 15000
},
{
"epoch": 1.6186299081035922,
"grad_norm": 3.7582554817199707,
"learning_rate": 8.450000000000001e-05,
"loss": 1.262,
"step": 15500
},
{
"epoch": 1.6186299081035922,
"eval_accuracy": 0.755967622936909,
"eval_loss": 1.2850651741027832,
"eval_runtime": 626.4542,
"eval_samples_per_second": 101.273,
"eval_steps_per_second": 3.165,
"step": 15500
},
{
"epoch": 1.670843776106934,
"grad_norm": 3.8284225463867188,
"learning_rate": 8.4e-05,
"loss": 1.2427,
"step": 16000
},
{
"epoch": 1.670843776106934,
"eval_accuracy": 0.7575149255490681,
"eval_loss": 1.2897659540176392,
"eval_runtime": 624.5044,
"eval_samples_per_second": 101.589,
"eval_steps_per_second": 3.175,
"step": 16000
},
{
"epoch": 1.7230576441102756,
"grad_norm": 3.8263819217681885,
"learning_rate": 8.35e-05,
"loss": 1.2389,
"step": 16500
},
{
"epoch": 1.7230576441102756,
"eval_accuracy": 0.7580776326367493,
"eval_loss": 1.2844797372817993,
"eval_runtime": 627.1556,
"eval_samples_per_second": 101.16,
"eval_steps_per_second": 3.162,
"step": 16500
},
{
"epoch": 1.7752715121136173,
"grad_norm": 3.7820005416870117,
"learning_rate": 8.3e-05,
"loss": 1.2368,
"step": 17000
},
{
"epoch": 1.7752715121136173,
"eval_accuracy": 0.7585746778078989,
"eval_loss": 1.2964411973953247,
"eval_runtime": 635.3019,
"eval_samples_per_second": 99.863,
"eval_steps_per_second": 3.121,
"step": 17000
},
{
"epoch": 1.827485380116959,
"grad_norm": 3.492671012878418,
"learning_rate": 8.25e-05,
"loss": 1.2432,
"step": 17500
},
{
"epoch": 1.827485380116959,
"eval_accuracy": 0.7584533967429662,
"eval_loss": null,
"eval_runtime": 623.9705,
"eval_samples_per_second": 101.676,
"eval_steps_per_second": 3.178,
"step": 17500
},
{
"epoch": 1.8796992481203008,
"grad_norm": 3.714395761489868,
"learning_rate": 8.2e-05,
"loss": 1.2375,
"step": 18000
},
{
"epoch": 1.8796992481203008,
"eval_accuracy": 0.7593634528855384,
"eval_loss": null,
"eval_runtime": 623.2531,
"eval_samples_per_second": 101.793,
"eval_steps_per_second": 3.182,
"step": 18000
},
{
"epoch": 1.9319131161236425,
"grad_norm": 3.700199604034424,
"learning_rate": 8.15e-05,
"loss": 1.2308,
"step": 18500
},
{
"epoch": 1.9319131161236425,
"eval_accuracy": 0.7597829603112618,
"eval_loss": 1.2819814682006836,
"eval_runtime": 636.6684,
"eval_samples_per_second": 99.648,
"eval_steps_per_second": 3.115,
"step": 18500
},
{
"epoch": 1.9841269841269842,
"grad_norm": 3.390286922454834,
"learning_rate": 8.1e-05,
"loss": 1.2374,
"step": 19000
},
{
"epoch": 1.9841269841269842,
"eval_accuracy": 0.7603004154522052,
"eval_loss": 1.2687220573425293,
"eval_runtime": 630.1502,
"eval_samples_per_second": 100.679,
"eval_steps_per_second": 3.147,
"step": 19000
},
{
"epoch": 2.036340852130326,
"grad_norm": 4.251028060913086,
"learning_rate": 8.05e-05,
"loss": 1.218,
"step": 19500
},
{
"epoch": 2.036340852130326,
"eval_accuracy": 0.760354434691265,
"eval_loss": 1.2784004211425781,
"eval_runtime": 629.6482,
"eval_samples_per_second": 100.759,
"eval_steps_per_second": 3.149,
"step": 19500
},
{
"epoch": 2.0885547201336676,
"grad_norm": 3.714747905731201,
"learning_rate": 8e-05,
"loss": 1.2052,
"step": 20000
},
{
"epoch": 2.0885547201336676,
"eval_accuracy": 0.7609532576337851,
"eval_loss": 1.2718734741210938,
"eval_runtime": 644.9629,
"eval_samples_per_second": 98.367,
"eval_steps_per_second": 3.075,
"step": 20000
},
{
"epoch": 2.1407685881370093,
"grad_norm": 3.934018611907959,
"learning_rate": 7.950000000000001e-05,
"loss": 1.2133,
"step": 20500
},
{
"epoch": 2.1407685881370093,
"eval_accuracy": 0.7613764467527375,
"eval_loss": 1.2647244930267334,
"eval_runtime": 631.2806,
"eval_samples_per_second": 100.499,
"eval_steps_per_second": 3.141,
"step": 20500
},
{
"epoch": 2.192982456140351,
"grad_norm": 3.6540703773498535,
"learning_rate": 7.900000000000001e-05,
"loss": 1.2072,
"step": 21000
},
{
"epoch": 2.192982456140351,
"eval_accuracy": 0.7618574927985354,
"eval_loss": 1.2702383995056152,
"eval_runtime": 628.7442,
"eval_samples_per_second": 100.904,
"eval_steps_per_second": 3.154,
"step": 21000
},
{
"epoch": 2.2451963241436927,
"grad_norm": 3.0740270614624023,
"learning_rate": 7.850000000000001e-05,
"loss": 1.2051,
"step": 21500
},
{
"epoch": 2.2451963241436927,
"eval_accuracy": 0.7627010896236089,
"eval_loss": 1.272460699081421,
"eval_runtime": 628.6858,
"eval_samples_per_second": 100.914,
"eval_steps_per_second": 3.154,
"step": 21500
},
{
"epoch": 2.2974101921470345,
"grad_norm": 4.153682708740234,
"learning_rate": 7.800000000000001e-05,
"loss": 1.1978,
"step": 22000
},
{
"epoch": 2.2974101921470345,
"eval_accuracy": 0.7618345683871698,
"eval_loss": 1.2531063556671143,
"eval_runtime": 646.652,
"eval_samples_per_second": 98.11,
"eval_steps_per_second": 3.067,
"step": 22000
},
{
"epoch": 2.3496240601503757,
"grad_norm": 3.2469732761383057,
"learning_rate": 7.75e-05,
"loss": 1.2017,
"step": 22500
},
{
"epoch": 2.3496240601503757,
"eval_accuracy": 0.7624854045335798,
"eval_loss": null,
"eval_runtime": 639.4743,
"eval_samples_per_second": 99.211,
"eval_steps_per_second": 3.101,
"step": 22500
},
{
"epoch": 2.4018379281537174,
"grad_norm": 3.4860455989837646,
"learning_rate": 7.7e-05,
"loss": 1.2036,
"step": 23000
},
{
"epoch": 2.4018379281537174,
"eval_accuracy": 0.7637592883681051,
"eval_loss": 1.2505515813827515,
"eval_runtime": 626.8199,
"eval_samples_per_second": 101.214,
"eval_steps_per_second": 3.164,
"step": 23000
},
{
"epoch": 2.454051796157059,
"grad_norm": 3.0207018852233887,
"learning_rate": 7.65e-05,
"loss": 1.2039,
"step": 23500
},
{
"epoch": 2.454051796157059,
"eval_accuracy": 0.7636972843433327,
"eval_loss": 1.2614232301712036,
"eval_runtime": 626.1594,
"eval_samples_per_second": 101.321,
"eval_steps_per_second": 3.167,
"step": 23500
},
{
"epoch": 2.506265664160401,
"grad_norm": 2.820202589035034,
"learning_rate": 7.6e-05,
"loss": 1.1931,
"step": 24000
},
{
"epoch": 2.506265664160401,
"eval_accuracy": 0.7648100527367832,
"eval_loss": 1.2478718757629395,
"eval_runtime": 654.189,
"eval_samples_per_second": 96.98,
"eval_steps_per_second": 3.031,
"step": 24000
},
{
"epoch": 2.5584795321637426,
"grad_norm": 4.114190101623535,
"learning_rate": 7.55e-05,
"loss": 1.1814,
"step": 24500
},
{
"epoch": 2.5584795321637426,
"eval_accuracy": 0.7652067630736408,
"eval_loss": 1.2430387735366821,
"eval_runtime": 630.6204,
"eval_samples_per_second": 100.604,
"eval_steps_per_second": 3.145,
"step": 24500
},
{
"epoch": 2.6106934001670843,
"grad_norm": 3.205641746520996,
"learning_rate": 7.500000000000001e-05,
"loss": 1.1934,
"step": 25000
},
{
"epoch": 2.6106934001670843,
"eval_accuracy": 0.7653472241088651,
"eval_loss": 1.2560120820999146,
"eval_runtime": 633.6789,
"eval_samples_per_second": 100.119,
"eval_steps_per_second": 3.129,
"step": 25000
},
{
"epoch": 2.662907268170426,
"grad_norm": 3.286635637283325,
"learning_rate": 7.450000000000001e-05,
"loss": 1.1913,
"step": 25500
},
{
"epoch": 2.662907268170426,
"eval_accuracy": 0.7653709204327143,
"eval_loss": 1.2623989582061768,
"eval_runtime": 636.0478,
"eval_samples_per_second": 99.746,
"eval_steps_per_second": 3.118,
"step": 25500
},
{
"epoch": 2.7151211361737677,
"grad_norm": 3.4579050540924072,
"learning_rate": 7.4e-05,
"loss": 1.1835,
"step": 26000
},
{
"epoch": 2.7151211361737677,
"eval_accuracy": 0.766565753562764,
"eval_loss": 1.2262078523635864,
"eval_runtime": 628.0716,
"eval_samples_per_second": 101.012,
"eval_steps_per_second": 3.157,
"step": 26000
},
{
"epoch": 2.7673350041771094,
"grad_norm": 3.3125433921813965,
"learning_rate": 7.35e-05,
"loss": 1.1821,
"step": 26500
},
{
"epoch": 2.7673350041771094,
"eval_accuracy": 0.7665199419874659,
"eval_loss": 1.249574065208435,
"eval_runtime": 626.5772,
"eval_samples_per_second": 101.253,
"eval_steps_per_second": 3.165,
"step": 26500
},
{
"epoch": 2.819548872180451,
"grad_norm": 3.5980172157287598,
"learning_rate": 7.3e-05,
"loss": 1.1879,
"step": 27000
},
{
"epoch": 2.819548872180451,
"eval_accuracy": 0.7665586654611716,
"eval_loss": 1.2500699758529663,
"eval_runtime": 626.2963,
"eval_samples_per_second": 101.299,
"eval_steps_per_second": 3.166,
"step": 27000
},
{
"epoch": 2.871762740183793,
"grad_norm": 3.683032512664795,
"learning_rate": 7.25e-05,
"loss": 1.1865,
"step": 27500
},
{
"epoch": 2.871762740183793,
"eval_accuracy": 0.767375596206993,
"eval_loss": 1.2403781414031982,
"eval_runtime": 647.9501,
"eval_samples_per_second": 97.913,
"eval_steps_per_second": 3.06,
"step": 27500
},
{
"epoch": 2.9239766081871346,
"grad_norm": 3.2120792865753174,
"learning_rate": 7.2e-05,
"loss": 1.1811,
"step": 28000
},
{
"epoch": 2.9239766081871346,
"eval_accuracy": 0.7678975328660842,
"eval_loss": 1.2281544208526611,
"eval_runtime": 641.5519,
"eval_samples_per_second": 98.89,
"eval_steps_per_second": 3.091,
"step": 28000
},
{
"epoch": 2.9761904761904763,
"grad_norm": 3.5708224773406982,
"learning_rate": 7.15e-05,
"loss": 1.1771,
"step": 28500
},
{
"epoch": 2.9761904761904763,
"eval_accuracy": 0.7670722352455037,
"eval_loss": 1.2305808067321777,
"eval_runtime": 629.6736,
"eval_samples_per_second": 100.755,
"eval_steps_per_second": 3.149,
"step": 28500
},
{
"epoch": 3.028404344193818,
"grad_norm": 3.624859094619751,
"learning_rate": 7.1e-05,
"loss": 1.1719,
"step": 29000
},
{
"epoch": 3.028404344193818,
"eval_accuracy": 0.7678675260574238,
"eval_loss": null,
"eval_runtime": 653.9665,
"eval_samples_per_second": 97.013,
"eval_steps_per_second": 3.032,
"step": 29000
},
{
"epoch": 3.0806182121971597,
"grad_norm": 3.066574811935425,
"learning_rate": 7.05e-05,
"loss": 1.1625,
"step": 29500
},
{
"epoch": 3.0806182121971597,
"eval_accuracy": 0.7687440934734672,
"eval_loss": 1.2233607769012451,
"eval_runtime": 641.0412,
"eval_samples_per_second": 98.969,
"eval_steps_per_second": 3.093,
"step": 29500
},
{
"epoch": 3.1328320802005014,
"grad_norm": 2.951267719268799,
"learning_rate": 7e-05,
"loss": 1.1667,
"step": 30000
},
{
"epoch": 3.1328320802005014,
"eval_accuracy": 0.7692466135515155,
"eval_loss": 1.2097790241241455,
"eval_runtime": 634.5544,
"eval_samples_per_second": 99.98,
"eval_steps_per_second": 3.125,
"step": 30000
},
{
"epoch": 3.185045948203843,
"grad_norm": 3.134652614593506,
"learning_rate": 6.95e-05,
"loss": 1.161,
"step": 30500
},
{
"epoch": 3.185045948203843,
"eval_accuracy": 0.7697799908068673,
"eval_loss": 1.2196942567825317,
"eval_runtime": 639.2853,
"eval_samples_per_second": 99.241,
"eval_steps_per_second": 3.102,
"step": 30500
},
{
"epoch": 3.2372598162071844,
"grad_norm": 3.199575662612915,
"learning_rate": 6.9e-05,
"loss": 1.1591,
"step": 31000
},
{
"epoch": 3.2372598162071844,
"eval_accuracy": 0.7695065031514984,
"eval_loss": 1.2124276161193848,
"eval_runtime": 682.4327,
"eval_samples_per_second": 92.966,
"eval_steps_per_second": 2.906,
"step": 31000
},
{
"epoch": 3.2894736842105265,
"grad_norm": 2.954310178756714,
"learning_rate": 6.850000000000001e-05,
"loss": 1.1587,
"step": 31500
},
{
"epoch": 3.2894736842105265,
"eval_accuracy": 0.7700210273355362,
"eval_loss": 1.2054550647735596,
"eval_runtime": 639.9415,
"eval_samples_per_second": 99.139,
"eval_steps_per_second": 3.099,
"step": 31500
},
{
"epoch": 3.341687552213868,
"grad_norm": 2.8550124168395996,
"learning_rate": 6.800000000000001e-05,
"loss": 1.1575,
"step": 32000
},
{
"epoch": 3.341687552213868,
"eval_accuracy": 0.7699819172054071,
"eval_loss": 1.214800238609314,
"eval_runtime": 638.9546,
"eval_samples_per_second": 99.292,
"eval_steps_per_second": 3.104,
"step": 32000
},
{
"epoch": 3.3939014202172095,
"grad_norm": 2.92110013961792,
"learning_rate": 6.750000000000001e-05,
"loss": 1.1523,
"step": 32500
},
{
"epoch": 3.3939014202172095,
"eval_accuracy": 0.769908237207266,
"eval_loss": 1.2140088081359863,
"eval_runtime": 644.5498,
"eval_samples_per_second": 98.43,
"eval_steps_per_second": 3.077,
"step": 32500
},
{
"epoch": 3.4461152882205512,
"grad_norm": 3.346374988555908,
"learning_rate": 6.7e-05,
"loss": 1.1572,
"step": 33000
},
{
"epoch": 3.4461152882205512,
"eval_accuracy": 0.7708846584546473,
"eval_loss": 1.2036925554275513,
"eval_runtime": 640.768,
"eval_samples_per_second": 99.011,
"eval_steps_per_second": 3.095,
"step": 33000
},
{
"epoch": 3.498329156223893,
"grad_norm": 3.251553773880005,
"learning_rate": 6.65e-05,
"loss": 1.1435,
"step": 33500
},
{
"epoch": 3.498329156223893,
"eval_accuracy": 0.7710189865489193,
"eval_loss": 1.2105178833007812,
"eval_runtime": 649.4058,
"eval_samples_per_second": 97.694,
"eval_steps_per_second": 3.054,
"step": 33500
},
{
"epoch": 3.5505430242272347,
"grad_norm": 3.583970785140991,
"learning_rate": 6.6e-05,
"loss": 1.1377,
"step": 34000
},
{
"epoch": 3.5505430242272347,
"eval_accuracy": 0.7721632980503305,
"eval_loss": 1.2023558616638184,
"eval_runtime": 638.8527,
"eval_samples_per_second": 99.308,
"eval_steps_per_second": 3.104,
"step": 34000
},
{
"epoch": 3.6027568922305764,
"grad_norm": 3.588223934173584,
"learning_rate": 6.55e-05,
"loss": 1.1369,
"step": 34500
},
{
"epoch": 3.6027568922305764,
"eval_accuracy": 0.7718440341835643,
"eval_loss": 1.196567416191101,
"eval_runtime": 659.4538,
"eval_samples_per_second": 96.205,
"eval_steps_per_second": 3.007,
"step": 34500
},
{
"epoch": 3.654970760233918,
"grad_norm": 3.4205751419067383,
"learning_rate": 6.500000000000001e-05,
"loss": 1.1417,
"step": 35000
},
{
"epoch": 3.654970760233918,
"eval_accuracy": 0.7722229829571152,
"eval_loss": 1.1913775205612183,
"eval_runtime": 632.4781,
"eval_samples_per_second": 100.309,
"eval_steps_per_second": 3.135,
"step": 35000
},
{
"epoch": 3.70718462823726,
"grad_norm": 3.300046920776367,
"learning_rate": 6.450000000000001e-05,
"loss": 1.1464,
"step": 35500
},
{
"epoch": 3.70718462823726,
"eval_accuracy": 0.7726826510494151,
"eval_loss": 1.1917306184768677,
"eval_runtime": 632.4563,
"eval_samples_per_second": 100.312,
"eval_steps_per_second": 3.135,
"step": 35500
},
{
"epoch": 3.7593984962406015,
"grad_norm": 3.469228982925415,
"learning_rate": 6.400000000000001e-05,
"loss": 1.1413,
"step": 36000
},
{
"epoch": 3.7593984962406015,
"eval_accuracy": 0.7728226251433048,
"eval_loss": 1.1853009462356567,
"eval_runtime": 674.7454,
"eval_samples_per_second": 94.025,
"eval_steps_per_second": 2.939,
"step": 36000
},
{
"epoch": 3.8116123642439432,
"grad_norm": 3.0387423038482666,
"learning_rate": 6.35e-05,
"loss": 1.1437,
"step": 36500
},
{
"epoch": 3.8116123642439432,
"eval_accuracy": 0.7730040846904885,
"eval_loss": 1.197380542755127,
"eval_runtime": 642.657,
"eval_samples_per_second": 98.72,
"eval_steps_per_second": 3.086,
"step": 36500
},
{
"epoch": 3.863826232247285,
"grad_norm": 3.7750790119171143,
"learning_rate": 6.3e-05,
"loss": 1.1381,
"step": 37000
},
{
"epoch": 3.863826232247285,
"eval_accuracy": 0.7732420751700277,
"eval_loss": 1.1929783821105957,
"eval_runtime": 635.6748,
"eval_samples_per_second": 99.804,
"eval_steps_per_second": 3.12,
"step": 37000
},
{
"epoch": 3.9160401002506267,
"grad_norm": 2.7454237937927246,
"learning_rate": 6.25e-05,
"loss": 1.1319,
"step": 37500
},
{
"epoch": 3.9160401002506267,
"eval_accuracy": 0.7740359547829445,
"eval_loss": 1.1876792907714844,
"eval_runtime": 646.5493,
"eval_samples_per_second": 98.126,
"eval_steps_per_second": 3.067,
"step": 37500
},
{
"epoch": 3.9682539682539684,
"grad_norm": 3.1448304653167725,
"learning_rate": 6.2e-05,
"loss": 1.1217,
"step": 38000
},
{
"epoch": 3.9682539682539684,
"eval_accuracy": 0.7746280010171029,
"eval_loss": 1.1808606386184692,
"eval_runtime": 651.1984,
"eval_samples_per_second": 97.425,
"eval_steps_per_second": 3.045,
"step": 38000
},
{
"epoch": 4.02046783625731,
"grad_norm": 3.3443033695220947,
"learning_rate": 6.15e-05,
"loss": 1.1297,
"step": 38500
},
{
"epoch": 4.02046783625731,
"eval_accuracy": 0.7744960875837769,
"eval_loss": 1.1759783029556274,
"eval_runtime": 633.4471,
"eval_samples_per_second": 100.155,
"eval_steps_per_second": 3.13,
"step": 38500
},
{
"epoch": 4.072681704260652,
"grad_norm": 3.656834602355957,
"learning_rate": 6.1e-05,
"loss": 1.116,
"step": 39000
},
{
"epoch": 4.072681704260652,
"eval_accuracy": 0.7753487577160052,
"eval_loss": 1.171563744544983,
"eval_runtime": 636.4311,
"eval_samples_per_second": 99.686,
"eval_steps_per_second": 3.116,
"step": 39000
},
{
"epoch": 4.124895572263993,
"grad_norm": 3.2146315574645996,
"learning_rate": 6.05e-05,
"loss": 1.1262,
"step": 39500
},
{
"epoch": 4.124895572263993,
"eval_accuracy": 0.7748815741788464,
"eval_loss": 1.1831566095352173,
"eval_runtime": 626.1408,
"eval_samples_per_second": 101.324,
"eval_steps_per_second": 3.167,
"step": 39500
},
{
"epoch": 4.177109440267335,
"grad_norm": 3.5292935371398926,
"learning_rate": 6e-05,
"loss": 1.1038,
"step": 40000
},
{
"epoch": 4.177109440267335,
"eval_accuracy": 0.775054087211447,
"eval_loss": 1.1849807500839233,
"eval_runtime": 625.7328,
"eval_samples_per_second": 101.39,
"eval_steps_per_second": 3.169,
"step": 40000
},
{
"epoch": 4.2293233082706765,
"grad_norm": 3.17421817779541,
"learning_rate": 5.95e-05,
"loss": 1.1144,
"step": 40500
},
{
"epoch": 4.2293233082706765,
"eval_accuracy": 0.7753948446129992,
"eval_loss": 1.176754117012024,
"eval_runtime": 642.2309,
"eval_samples_per_second": 98.785,
"eval_steps_per_second": 3.088,
"step": 40500
},
{
"epoch": 4.281537176274019,
"grad_norm": 3.185692071914673,
"learning_rate": 5.9e-05,
"loss": 1.1068,
"step": 41000
},
{
"epoch": 4.281537176274019,
"eval_accuracy": 0.7759707359303026,
"eval_loss": 1.1789684295654297,
"eval_runtime": 627.5525,
"eval_samples_per_second": 101.096,
"eval_steps_per_second": 3.16,
"step": 41000
},
{
"epoch": 4.33375104427736,
"grad_norm": 3.033123731613159,
"learning_rate": 5.85e-05,
"loss": 1.1088,
"step": 41500
},
{
"epoch": 4.33375104427736,
"eval_accuracy": 0.7761587182255788,
"eval_loss": 1.1775932312011719,
"eval_runtime": 629.8113,
"eval_samples_per_second": 100.733,
"eval_steps_per_second": 3.149,
"step": 41500
},
{
"epoch": 4.385964912280702,
"grad_norm": 3.112304210662842,
"learning_rate": 5.8e-05,
"loss": 1.1243,
"step": 42000
},
{
"epoch": 4.385964912280702,
"eval_accuracy": 0.7764218763015075,
"eval_loss": 1.173019528388977,
"eval_runtime": 655.1515,
"eval_samples_per_second": 96.837,
"eval_steps_per_second": 3.027,
"step": 42000
},
{
"epoch": 4.438178780284043,
"grad_norm": 3.216050624847412,
"learning_rate": 5.7499999999999995e-05,
"loss": 1.1196,
"step": 42500
},
{
"epoch": 4.438178780284043,
"eval_accuracy": 0.7775044227327209,
"eval_loss": 1.1682883501052856,
"eval_runtime": 627.0953,
"eval_samples_per_second": 101.17,
"eval_steps_per_second": 3.162,
"step": 42500
},
{
"epoch": 4.4903926482873855,
"grad_norm": 3.4211995601654053,
"learning_rate": 5.6999999999999996e-05,
"loss": 1.0985,
"step": 43000
},
{
"epoch": 4.4903926482873855,
"eval_accuracy": 0.7770348312966414,
"eval_loss": 1.1709975004196167,
"eval_runtime": 626.8729,
"eval_samples_per_second": 101.206,
"eval_steps_per_second": 3.163,
"step": 43000
},
{
"epoch": 4.542606516290727,
"grad_norm": 3.0796425342559814,
"learning_rate": 5.65e-05,
"loss": 1.0994,
"step": 43500
},
{
"epoch": 4.542606516290727,
"eval_accuracy": 0.7772564904147641,
"eval_loss": 1.1671358346939087,
"eval_runtime": 626.7232,
"eval_samples_per_second": 101.23,
"eval_steps_per_second": 3.164,
"step": 43500
},
{
"epoch": 4.594820384294069,
"grad_norm": 3.7232813835144043,
"learning_rate": 5.6000000000000006e-05,
"loss": 1.1017,
"step": 44000
},
{
"epoch": 4.594820384294069,
"eval_accuracy": 0.7776795841513819,
"eval_loss": 1.1589510440826416,
"eval_runtime": 639.6154,
"eval_samples_per_second": 99.189,
"eval_steps_per_second": 3.1,
"step": 44000
},
{
"epoch": 4.64703425229741,
"grad_norm": 3.0733375549316406,
"learning_rate": 5.550000000000001e-05,
"loss": 1.1071,
"step": 44500
},
{
"epoch": 4.64703425229741,
"eval_accuracy": 0.7780140462141538,
"eval_loss": 1.1811403036117554,
"eval_runtime": 637.3252,
"eval_samples_per_second": 99.546,
"eval_steps_per_second": 3.111,
"step": 44500
},
{
"epoch": 4.6992481203007515,
"grad_norm": 3.2747066020965576,
"learning_rate": 5.500000000000001e-05,
"loss": 1.0993,
"step": 45000
},
{
"epoch": 4.6992481203007515,
"eval_accuracy": 0.7782215810670943,
"eval_loss": 1.153876781463623,
"eval_runtime": 625.6185,
"eval_samples_per_second": 101.408,
"eval_steps_per_second": 3.17,
"step": 45000
},
{
"epoch": 4.751461988304094,
"grad_norm": 3.6057417392730713,
"learning_rate": 5.45e-05,
"loss": 1.0998,
"step": 45500
},
{
"epoch": 4.751461988304094,
"eval_accuracy": 0.7785066434739322,
"eval_loss": 1.151453971862793,
"eval_runtime": 626.7653,
"eval_samples_per_second": 101.223,
"eval_steps_per_second": 3.164,
"step": 45500
},
{
"epoch": 4.803675856307435,
"grad_norm": 2.923815965652466,
"learning_rate": 5.4000000000000005e-05,
"loss": 1.0966,
"step": 46000
},
{
"epoch": 4.803675856307435,
"eval_accuracy": 0.7786641899675826,
"eval_loss": 1.1550520658493042,
"eval_runtime": 626.8592,
"eval_samples_per_second": 101.208,
"eval_steps_per_second": 3.163,
"step": 46000
},
{
"epoch": 4.855889724310777,
"grad_norm": 2.843076229095459,
"learning_rate": 5.3500000000000006e-05,
"loss": 1.0977,
"step": 46500
},
{
"epoch": 4.855889724310777,
"eval_accuracy": 0.7794697444838651,
"eval_loss": 1.151402473449707,
"eval_runtime": 625.4097,
"eval_samples_per_second": 101.442,
"eval_steps_per_second": 3.171,
"step": 46500
},
{
"epoch": 4.908103592314118,
"grad_norm": 3.2491214275360107,
"learning_rate": 5.300000000000001e-05,
"loss": 1.0962,
"step": 47000
},
{
"epoch": 4.908103592314118,
"eval_accuracy": 0.7797745180661694,
"eval_loss": 1.1584906578063965,
"eval_runtime": 624.1222,
"eval_samples_per_second": 101.652,
"eval_steps_per_second": 3.177,
"step": 47000
},
{
"epoch": 4.9603174603174605,
"grad_norm": 2.8548121452331543,
"learning_rate": 5.25e-05,
"loss": 1.0933,
"step": 47500
},
{
"epoch": 4.9603174603174605,
"eval_accuracy": 0.7794554886066851,
"eval_loss": 1.1526641845703125,
"eval_runtime": 644.9796,
"eval_samples_per_second": 98.364,
"eval_steps_per_second": 3.075,
"step": 47500
},
{
"epoch": 5.012531328320802,
"grad_norm": 3.3320724964141846,
"learning_rate": 5.2000000000000004e-05,
"loss": 1.0897,
"step": 48000
},
{
"epoch": 5.012531328320802,
"eval_accuracy": 0.7798054689379954,
"eval_loss": 1.153532862663269,
"eval_runtime": 631.6644,
"eval_samples_per_second": 100.438,
"eval_steps_per_second": 3.139,
"step": 48000
},
{
"epoch": 5.064745196324144,
"grad_norm": 3.1418938636779785,
"learning_rate": 5.1500000000000005e-05,
"loss": 1.0727,
"step": 48500
},
{
"epoch": 5.064745196324144,
"eval_accuracy": 0.7807417083407959,
"eval_loss": 1.151798963546753,
"eval_runtime": 628.4763,
"eval_samples_per_second": 100.947,
"eval_steps_per_second": 3.155,
"step": 48500
},
{
"epoch": 5.116959064327485,
"grad_norm": 3.242654800415039,
"learning_rate": 5.1000000000000006e-05,
"loss": 1.0846,
"step": 49000
},
{
"epoch": 5.116959064327485,
"eval_accuracy": 0.7803700732682441,
"eval_loss": 1.143381953239441,
"eval_runtime": 626.0556,
"eval_samples_per_second": 101.338,
"eval_steps_per_second": 3.167,
"step": 49000
},
{
"epoch": 5.169172932330827,
"grad_norm": 3.165130615234375,
"learning_rate": 5.05e-05,
"loss": 1.0817,
"step": 49500
},
{
"epoch": 5.169172932330827,
"eval_accuracy": 0.7805262215474418,
"eval_loss": 1.1557620763778687,
"eval_runtime": 638.2573,
"eval_samples_per_second": 99.4,
"eval_steps_per_second": 3.107,
"step": 49500
},
{
"epoch": 5.221386800334169,
"grad_norm": 3.074112892150879,
"learning_rate": 5e-05,
"loss": 1.0725,
"step": 50000
},
{
"epoch": 5.221386800334169,
"eval_accuracy": 0.7803664543536556,
"eval_loss": 1.1497862339019775,
"eval_runtime": 635.6514,
"eval_samples_per_second": 99.808,
"eval_steps_per_second": 3.12,
"step": 50000
},
{
"epoch": 5.273600668337511,
"grad_norm": 3.5100748538970947,
"learning_rate": 4.9500000000000004e-05,
"loss": 1.076,
"step": 50500
},
{
"epoch": 5.273600668337511,
"eval_accuracy": 0.7819452223531255,
"eval_loss": 1.1426007747650146,
"eval_runtime": 627.3013,
"eval_samples_per_second": 101.136,
"eval_steps_per_second": 3.161,
"step": 50500
},
{
"epoch": 5.325814536340852,
"grad_norm": 2.9352290630340576,
"learning_rate": 4.9e-05,
"loss": 1.0786,
"step": 51000
},
{
"epoch": 5.325814536340852,
"eval_accuracy": 0.7809871489797384,
"eval_loss": 1.1469109058380127,
"eval_runtime": 651.0651,
"eval_samples_per_second": 97.445,
"eval_steps_per_second": 3.046,
"step": 51000
},
{
"epoch": 5.378028404344194,
"grad_norm": 3.215942859649658,
"learning_rate": 4.85e-05,
"loss": 1.0756,
"step": 51500
},
{
"epoch": 5.378028404344194,
"eval_accuracy": 0.7814986150426574,
"eval_loss": 1.1447529792785645,
"eval_runtime": 637.8608,
"eval_samples_per_second": 99.462,
"eval_steps_per_second": 3.109,
"step": 51500
},
{
"epoch": 5.430242272347535,
"grad_norm": 3.225491523742676,
"learning_rate": 4.8e-05,
"loss": 1.0782,
"step": 52000
},
{
"epoch": 5.430242272347535,
"eval_accuracy": 0.781914062234545,
"eval_loss": 1.1327252388000488,
"eval_runtime": 636.3098,
"eval_samples_per_second": 99.705,
"eval_steps_per_second": 3.116,
"step": 52000
},
{
"epoch": 5.482456140350878,
"grad_norm": 3.2323813438415527,
"learning_rate": 4.75e-05,
"loss": 1.0718,
"step": 52500
},
{
"epoch": 5.482456140350878,
"eval_accuracy": 0.7824720975904875,
"eval_loss": 1.1324148178100586,
"eval_runtime": 634.3817,
"eval_samples_per_second": 100.008,
"eval_steps_per_second": 3.126,
"step": 52500
},
{
"epoch": 5.534670008354219,
"grad_norm": 2.8218557834625244,
"learning_rate": 4.7e-05,
"loss": 1.0646,
"step": 53000
},
{
"epoch": 5.534670008354219,
"eval_accuracy": 0.7827635872108434,
"eval_loss": 1.139945387840271,
"eval_runtime": 685.8615,
"eval_samples_per_second": 92.501,
"eval_steps_per_second": 2.891,
"step": 53000
},
{
"epoch": 5.586883876357561,
"grad_norm": 3.848472833633423,
"learning_rate": 4.6500000000000005e-05,
"loss": 1.0692,
"step": 53500
},
{
"epoch": 5.586883876357561,
"eval_accuracy": 0.7828145845856118,
"eval_loss": 1.1383306980133057,
"eval_runtime": 640.36,
"eval_samples_per_second": 99.074,
"eval_steps_per_second": 3.097,
"step": 53500
},
{
"epoch": 5.639097744360902,
"grad_norm": 3.0220718383789062,
"learning_rate": 4.600000000000001e-05,
"loss": 1.0605,
"step": 54000
},
{
"epoch": 5.639097744360902,
"eval_accuracy": 0.7828910126440776,
"eval_loss": 1.1487065553665161,
"eval_runtime": 639.7494,
"eval_samples_per_second": 99.169,
"eval_steps_per_second": 3.1,
"step": 54000
},
{
"epoch": 5.6913116123642435,
"grad_norm": 3.226999044418335,
"learning_rate": 4.55e-05,
"loss": 1.0678,
"step": 54500
},
{
"epoch": 5.6913116123642435,
"eval_accuracy": 0.783646746192109,
"eval_loss": 1.1216100454330444,
"eval_runtime": 637.7267,
"eval_samples_per_second": 99.483,
"eval_steps_per_second": 3.109,
"step": 54500
},
{
"epoch": 5.743525480367586,
"grad_norm": 3.0716323852539062,
"learning_rate": 4.5e-05,
"loss": 1.068,
"step": 55000
},
{
"epoch": 5.743525480367586,
"eval_accuracy": 0.7840726797935829,
"eval_loss": 1.1208263635635376,
"eval_runtime": 661.1104,
"eval_samples_per_second": 95.964,
"eval_steps_per_second": 2.999,
"step": 55000
},
{
"epoch": 5.795739348370927,
"grad_norm": 3.032036542892456,
"learning_rate": 4.4500000000000004e-05,
"loss": 1.0588,
"step": 55500
},
{
"epoch": 5.795739348370927,
"eval_accuracy": 0.7842573639982784,
"eval_loss": 1.1288572549819946,
"eval_runtime": 644.8754,
"eval_samples_per_second": 98.38,
"eval_steps_per_second": 3.075,
"step": 55500
},
{
"epoch": 5.847953216374269,
"grad_norm": 2.896811008453369,
"learning_rate": 4.4000000000000006e-05,
"loss": 1.0723,
"step": 56000
},
{
"epoch": 5.847953216374269,
"eval_accuracy": 0.7839231768143949,
"eval_loss": 1.11968195438385,
"eval_runtime": 643.1751,
"eval_samples_per_second": 98.64,
"eval_steps_per_second": 3.083,
"step": 56000
},
{
"epoch": 5.90016708437761,
"grad_norm": 3.429722785949707,
"learning_rate": 4.35e-05,
"loss": 1.057,
"step": 56500
},
{
"epoch": 5.90016708437761,
"eval_accuracy": 0.7850116259552448,
"eval_loss": 1.1247141361236572,
"eval_runtime": 639.1011,
"eval_samples_per_second": 99.269,
"eval_steps_per_second": 3.103,
"step": 56500
},
{
"epoch": 5.9523809523809526,
"grad_norm": 2.920243740081787,
"learning_rate": 4.3e-05,
"loss": 1.0628,
"step": 57000
},
{
"epoch": 5.9523809523809526,
"eval_accuracy": 0.7846882337268002,
"eval_loss": NaN,
"eval_runtime": 629.1614,
"eval_samples_per_second": 100.837,
"eval_steps_per_second": 3.152,
"step": 57000
},
{
"epoch": 6.004594820384294,
"grad_norm": 2.937222957611084,
"learning_rate": 4.25e-05,
"loss": 1.0619,
"step": 57500
},
{
"epoch": 6.004594820384294,
"eval_accuracy": 0.7845797476831528,
"eval_loss": NaN,
"eval_runtime": 628.6009,
"eval_samples_per_second": 100.927,
"eval_steps_per_second": 3.155,
"step": 57500
},
{
"epoch": 6.056808688387636,
"grad_norm": 2.854961395263672,
"learning_rate": 4.2e-05,
"loss": 1.0386,
"step": 58000
},
{
"epoch": 6.056808688387636,
"eval_accuracy": 0.7856027081868581,
"eval_loss": 1.1315497159957886,
"eval_runtime": 626.9314,
"eval_samples_per_second": 101.196,
"eval_steps_per_second": 3.163,
"step": 58000
},
{
"epoch": 6.109022556390977,
"grad_norm": 3.4655332565307617,
"learning_rate": 4.15e-05,
"loss": 1.0446,
"step": 58500
},
{
"epoch": 6.109022556390977,
"eval_accuracy": 0.785224256670098,
"eval_loss": NaN,
"eval_runtime": 674.346,
"eval_samples_per_second": 94.081,
"eval_steps_per_second": 2.941,
"step": 58500
},
{
"epoch": 6.161236424394319,
"grad_norm": 2.93487811088562,
"learning_rate": 4.1e-05,
"loss": 1.0532,
"step": 59000
},
{
"epoch": 6.161236424394319,
"eval_accuracy": 0.7855367902419893,
"eval_loss": 1.1223821640014648,
"eval_runtime": 630.6681,
"eval_samples_per_second": 100.596,
"eval_steps_per_second": 3.144,
"step": 59000
},
{
"epoch": 6.213450292397661,
"grad_norm": 3.50022554397583,
"learning_rate": 4.05e-05,
"loss": 1.0465,
"step": 59500
},
{
"epoch": 6.213450292397661,
"eval_accuracy": 0.7857459334681388,
"eval_loss": 1.1025224924087524,
"eval_runtime": 625.4125,
"eval_samples_per_second": 101.442,
"eval_steps_per_second": 3.171,
"step": 59500
},
{
"epoch": 6.265664160401003,
"grad_norm": 3.2295687198638916,
"learning_rate": 4e-05,
"loss": 1.0448,
"step": 60000
},
{
"epoch": 6.265664160401003,
"eval_accuracy": 0.7862344900229137,
"eval_loss": 1.104607105255127,
"eval_runtime": 668.9552,
"eval_samples_per_second": 94.839,
"eval_steps_per_second": 2.964,
"step": 60000
},
{
"epoch": 6.317878028404344,
"grad_norm": 3.1386146545410156,
"learning_rate": 3.9500000000000005e-05,
"loss": 1.0304,
"step": 60500
},
{
"epoch": 6.317878028404344,
"eval_accuracy": 0.7869643708248876,
"eval_loss": 1.1088374853134155,
"eval_runtime": 633.3446,
"eval_samples_per_second": 100.171,
"eval_steps_per_second": 3.131,
"step": 60500
},
{
"epoch": 6.370091896407686,
"grad_norm": 3.1814932823181152,
"learning_rate": 3.9000000000000006e-05,
"loss": 1.0354,
"step": 61000
},
{
"epoch": 6.370091896407686,
"eval_accuracy": 0.7865247818397344,
"eval_loss": 1.108305811882019,
"eval_runtime": 636.3927,
"eval_samples_per_second": 99.692,
"eval_steps_per_second": 3.116,
"step": 61000
},
{
"epoch": 6.4223057644110275,
"grad_norm": 3.155350685119629,
"learning_rate": 3.85e-05,
"loss": 1.0437,
"step": 61500
},
{
"epoch": 6.4223057644110275,
"eval_accuracy": 0.7868538533540672,
"eval_loss": NaN,
"eval_runtime": 630.5912,
"eval_samples_per_second": 100.609,
"eval_steps_per_second": 3.145,
"step": 61500
},
{
"epoch": 6.474519632414369,
"grad_norm": 2.9944610595703125,
"learning_rate": 3.8e-05,
"loss": 1.0347,
"step": 62000
},
{
"epoch": 6.474519632414369,
"eval_accuracy": 0.7876532991281543,
"eval_loss": 1.0992841720581055,
"eval_runtime": 649.6208,
"eval_samples_per_second": 97.662,
"eval_steps_per_second": 3.053,
"step": 62000
},
{
"epoch": 6.526733500417711,
"grad_norm": 2.89630126953125,
"learning_rate": 3.7500000000000003e-05,
"loss": 1.041,
"step": 62500
},
{
"epoch": 6.526733500417711,
"eval_accuracy": 0.7875077762136071,
"eval_loss": 1.1099497079849243,
"eval_runtime": 631.7247,
"eval_samples_per_second": 100.428,
"eval_steps_per_second": 3.139,
"step": 62500
},
{
"epoch": 6.578947368421053,
"grad_norm": 3.2351603507995605,
"learning_rate": 3.7e-05,
"loss": 1.0388,
"step": 63000
},
{
"epoch": 6.578947368421053,
"eval_accuracy": 0.78800361330496,
"eval_loss": 1.11760413646698,
"eval_runtime": 629.3158,
"eval_samples_per_second": 100.813,
"eval_steps_per_second": 3.151,
"step": 63000
},
{
"epoch": 6.631161236424394,
"grad_norm": 3.2929039001464844,
"learning_rate": 3.65e-05,
"loss": 1.0383,
"step": 63500
},
{
"epoch": 6.631161236424394,
"eval_accuracy": 0.7879278944035151,
"eval_loss": 1.1053088903427124,
"eval_runtime": 627.1228,
"eval_samples_per_second": 101.165,
"eval_steps_per_second": 3.162,
"step": 63500
},
{
"epoch": 6.683375104427736,
"grad_norm": 2.9945790767669678,
"learning_rate": 3.6e-05,
"loss": 1.0373,
"step": 64000
},
{
"epoch": 6.683375104427736,
"eval_accuracy": 0.7881556360589377,
"eval_loss": 1.1067023277282715,
"eval_runtime": 664.2963,
"eval_samples_per_second": 95.504,
"eval_steps_per_second": 2.985,
"step": 64000
},
{
"epoch": 6.735588972431078,
"grad_norm": 3.4380545616149902,
"learning_rate": 3.55e-05,
"loss": 1.0382,
"step": 64500
},
{
"epoch": 6.735588972431078,
"eval_accuracy": 0.7882482565200516,
"eval_loss": 1.116519570350647,
"eval_runtime": 629.5022,
"eval_samples_per_second": 100.783,
"eval_steps_per_second": 3.15,
"step": 64500
},
{
"epoch": 6.787802840434419,
"grad_norm": 3.368530750274658,
"learning_rate": 3.5e-05,
"loss": 1.0277,
"step": 65000
},
{
"epoch": 6.787802840434419,
"eval_accuracy": 0.7891739950636399,
"eval_loss": 1.0915908813476562,
"eval_runtime": 632.7041,
"eval_samples_per_second": 100.273,
"eval_steps_per_second": 3.134,
"step": 65000
},
{
"epoch": 6.840016708437761,
"grad_norm": 3.3649165630340576,
"learning_rate": 3.45e-05,
"loss": 1.0337,
"step": 65500
},
{
"epoch": 6.840016708437761,
"eval_accuracy": 0.7883815172332914,
"eval_loss": 1.1139589548110962,
"eval_runtime": 640.7807,
"eval_samples_per_second": 99.009,
"eval_steps_per_second": 3.095,
"step": 65500
},
{
"epoch": 6.8922305764411025,
"grad_norm": 3.242825746536255,
"learning_rate": 3.4000000000000007e-05,
"loss": 1.033,
"step": 66000
},
{
"epoch": 6.8922305764411025,
"eval_accuracy": 0.789335987558676,
"eval_loss": 1.0902156829833984,
"eval_runtime": 634.5325,
"eval_samples_per_second": 99.984,
"eval_steps_per_second": 3.125,
"step": 66000
},
{
"epoch": 6.944444444444445,
"grad_norm": 2.7917425632476807,
"learning_rate": 3.35e-05,
"loss": 1.0298,
"step": 66500
},
{
"epoch": 6.944444444444445,
"eval_accuracy": 0.7897472088960492,
"eval_loss": 1.0988086462020874,
"eval_runtime": 636.5285,
"eval_samples_per_second": 99.67,
"eval_steps_per_second": 3.115,
"step": 66500
},
{
"epoch": 6.996658312447786,
"grad_norm": 2.9491727352142334,
"learning_rate": 3.3e-05,
"loss": 1.0325,
"step": 67000
},
{
"epoch": 6.996658312447786,
"eval_accuracy": 0.7894358436331047,
"eval_loss": 1.097759485244751,
"eval_runtime": 631.2152,
"eval_samples_per_second": 100.509,
"eval_steps_per_second": 3.142,
"step": 67000
},
{
"epoch": 7.048872180451128,
"grad_norm": 3.81508469581604,
"learning_rate": 3.2500000000000004e-05,
"loss": 1.013,
"step": 67500
},
{
"epoch": 7.048872180451128,
"eval_accuracy": 0.7897459597854384,
"eval_loss": 1.1025677919387817,
"eval_runtime": 667.978,
"eval_samples_per_second": 94.978,
"eval_steps_per_second": 2.969,
"step": 67500
},
{
"epoch": 7.101086048454469,
"grad_norm": 2.6366662979125977,
"learning_rate": 3.2000000000000005e-05,
"loss": 1.0199,
"step": 68000
},
{
"epoch": 7.101086048454469,
"eval_accuracy": 0.7899999141985645,
"eval_loss": 1.1025761365890503,
"eval_runtime": 630.4778,
"eval_samples_per_second": 100.627,
"eval_steps_per_second": 3.145,
"step": 68000
},
{
"epoch": 7.1532999164578115,
"grad_norm": 3.155102014541626,
"learning_rate": 3.15e-05,
"loss": 1.0157,
"step": 68500
},
{
"epoch": 7.1532999164578115,
"eval_accuracy": 0.7898970170010094,
"eval_loss": 1.1064562797546387,
"eval_runtime": 625.4866,
"eval_samples_per_second": 101.43,
"eval_steps_per_second": 3.17,
"step": 68500
},
{
"epoch": 7.205513784461153,
"grad_norm": 2.9615111351013184,
"learning_rate": 3.1e-05,
"loss": 1.0137,
"step": 69000
},
{
"epoch": 7.205513784461153,
"eval_accuracy": 0.7904352669747394,
"eval_loss": 1.0876559019088745,
"eval_runtime": 646.3487,
"eval_samples_per_second": 98.156,
"eval_steps_per_second": 3.068,
"step": 69000
},
{
"epoch": 7.257727652464495,
"grad_norm": 3.723841905593872,
"learning_rate": 3.05e-05,
"loss": 1.0137,
"step": 69500
},
{
"epoch": 7.257727652464495,
"eval_accuracy": 0.7899743837210673,
"eval_loss": 1.0912542343139648,
"eval_runtime": 652.0884,
"eval_samples_per_second": 97.292,
"eval_steps_per_second": 3.041,
"step": 69500
},
{
"epoch": 7.309941520467836,
"grad_norm": 3.6348705291748047,
"learning_rate": 3e-05,
"loss": 1.0167,
"step": 70000
},
{
"epoch": 7.309941520467836,
"eval_accuracy": 0.7903721199263885,
"eval_loss": NaN,
"eval_runtime": 630.9965,
"eval_samples_per_second": 100.544,
"eval_steps_per_second": 3.143,
"step": 70000
},
{
"epoch": 7.362155388471178,
"grad_norm": 3.2423222064971924,
"learning_rate": 2.95e-05,
"loss": 1.0089,
"step": 70500
},
{
"epoch": 7.362155388471178,
"eval_accuracy": 0.7906133078105837,
"eval_loss": 1.0956330299377441,
"eval_runtime": 633.5776,
"eval_samples_per_second": 100.135,
"eval_steps_per_second": 3.13,
"step": 70500
},
{
"epoch": 7.41436925647452,
"grad_norm": 3.15704083442688,
"learning_rate": 2.9e-05,
"loss": 1.0055,
"step": 71000
},
{
"epoch": 7.41436925647452,
"eval_accuracy": 0.7912404700602101,
"eval_loss": NaN,
"eval_runtime": 649.7229,
"eval_samples_per_second": 97.646,
"eval_steps_per_second": 3.052,
"step": 71000
},
{
"epoch": 7.466583124477861,
"grad_norm": 3.387059211730957,
"learning_rate": 2.8499999999999998e-05,
"loss": 1.0135,
"step": 71500
},
{
"epoch": 7.466583124477861,
"eval_accuracy": 0.7914904478088925,
"eval_loss": 1.0922181606292725,
"eval_runtime": 633.8807,
"eval_samples_per_second": 100.087,
"eval_steps_per_second": 3.128,
"step": 71500
},
{
"epoch": 7.518796992481203,
"grad_norm": 3.3285696506500244,
"learning_rate": 2.8000000000000003e-05,
"loss": 1.0041,
"step": 72000
},
{
"epoch": 7.518796992481203,
"eval_accuracy": 0.7919370824790591,
"eval_loss": 1.0766669511795044,
"eval_runtime": 640.2628,
"eval_samples_per_second": 99.089,
"eval_steps_per_second": 3.097,
"step": 72000
},
{
"epoch": 7.571010860484545,
"grad_norm": 3.0725443363189697,
"learning_rate": 2.7500000000000004e-05,
"loss": 0.9982,
"step": 72500
},
{
"epoch": 7.571010860484545,
"eval_accuracy": 0.7914295287916221,
"eval_loss": 1.0892815589904785,
"eval_runtime": 636.4517,
"eval_samples_per_second": 99.682,
"eval_steps_per_second": 3.116,
"step": 72500
},
{
"epoch": 7.6232247284878865,
"grad_norm": 3.2176389694213867,
"learning_rate": 2.7000000000000002e-05,
"loss": 1.0066,
"step": 73000
},
{
"epoch": 7.6232247284878865,
"eval_accuracy": 0.7917736483647045,
"eval_loss": 1.092371940612793,
"eval_runtime": 621.243,
"eval_samples_per_second": 102.123,
"eval_steps_per_second": 3.192,
"step": 73000
},
{
"epoch": 7.675438596491228,
"grad_norm": 2.7997543811798096,
"learning_rate": 2.6500000000000004e-05,
"loss": 1.0073,
"step": 73500
},
{
"epoch": 7.675438596491228,
"eval_accuracy": 0.7922002806270909,
"eval_loss": 1.0904649496078491,
"eval_runtime": 620.7253,
"eval_samples_per_second": 102.208,
"eval_steps_per_second": 3.195,
"step": 73500
},
{
"epoch": 7.72765246449457,
"grad_norm": 3.626774549484253,
"learning_rate": 2.6000000000000002e-05,
"loss": 1.0015,
"step": 74000
},
{
"epoch": 7.72765246449457,
"eval_accuracy": 0.7917596219688585,
"eval_loss": 1.0867533683776855,
"eval_runtime": 618.3607,
"eval_samples_per_second": 102.599,
"eval_steps_per_second": 3.207,
"step": 74000
},
{
"epoch": 7.779866332497911,
"grad_norm": 2.8419744968414307,
"learning_rate": 2.5500000000000003e-05,
"loss": 1.0049,
"step": 74500
},
{
"epoch": 7.779866332497911,
"eval_accuracy": 0.7928055223543873,
"eval_loss": 1.0817667245864868,
"eval_runtime": 617.1968,
"eval_samples_per_second": 102.792,
"eval_steps_per_second": 3.213,
"step": 74500
},
{
"epoch": 7.832080200501253,
"grad_norm": 2.822751760482788,
"learning_rate": 2.5e-05,
"loss": 0.9957,
"step": 75000
},
{
"epoch": 7.832080200501253,
"eval_accuracy": 0.7932603950321594,
"eval_loss": 1.0652581453323364,
"eval_runtime": 616.9484,
"eval_samples_per_second": 102.834,
"eval_steps_per_second": 3.214,
"step": 75000
},
{
"epoch": 7.884294068504595,
"grad_norm": 2.9094507694244385,
"learning_rate": 2.45e-05,
"loss": 1.0042,
"step": 75500
},
{
"epoch": 7.884294068504595,
"eval_accuracy": 0.7934020652494004,
"eval_loss": 1.0694156885147095,
"eval_runtime": 617.9112,
"eval_samples_per_second": 102.673,
"eval_steps_per_second": 3.209,
"step": 75500
},
{
"epoch": 7.936507936507937,
"grad_norm": 2.5249078273773193,
"learning_rate": 2.4e-05,
"loss": 1.0031,
"step": 76000
},
{
"epoch": 7.936507936507937,
"eval_accuracy": 0.7935896894661485,
"eval_loss": 1.0808907747268677,
"eval_runtime": 618.1232,
"eval_samples_per_second": 102.638,
"eval_steps_per_second": 3.208,
"step": 76000
},
{
"epoch": 7.988721804511278,
"grad_norm": 3.253228187561035,
"learning_rate": 2.35e-05,
"loss": 0.9979,
"step": 76500
},
{
"epoch": 7.988721804511278,
"eval_accuracy": 0.7933455565489351,
"eval_loss": 1.0679088830947876,
"eval_runtime": 618.0488,
"eval_samples_per_second": 102.65,
"eval_steps_per_second": 3.208,
"step": 76500
},
{
"epoch": 8.04093567251462,
"grad_norm": 2.6395580768585205,
"learning_rate": 2.3000000000000003e-05,
"loss": 0.9825,
"step": 77000
},
{
"epoch": 8.04093567251462,
"eval_accuracy": 0.7935253800154058,
"eval_loss": 1.07496976852417,
"eval_runtime": 617.9799,
"eval_samples_per_second": 102.662,
"eval_steps_per_second": 3.209,
"step": 77000
},
{
"epoch": 8.093149540517961,
"grad_norm": 3.0883655548095703,
"learning_rate": 2.25e-05,
"loss": 0.9926,
"step": 77500
},
{
"epoch": 8.093149540517961,
"eval_accuracy": 0.7942360815200339,
"eval_loss": 1.0814706087112427,
"eval_runtime": 618.6019,
"eval_samples_per_second": 102.559,
"eval_steps_per_second": 3.206,
"step": 77500
},
{
"epoch": 8.145363408521304,
"grad_norm": 3.356086015701294,
"learning_rate": 2.2000000000000003e-05,
"loss": 0.9897,
"step": 78000
},
{
"epoch": 8.145363408521304,
"eval_accuracy": 0.7933161062503574,
"eval_loss": NaN,
"eval_runtime": 617.0688,
"eval_samples_per_second": 102.813,
"eval_steps_per_second": 3.214,
"step": 78000
},
{
"epoch": 8.197577276524646,
"grad_norm": 3.31772780418396,
"learning_rate": 2.15e-05,
"loss": 0.9871,
"step": 78500
},
{
"epoch": 8.197577276524646,
"eval_accuracy": 0.7943132034313805,
"eval_loss": 1.0676552057266235,
"eval_runtime": 623.639,
"eval_samples_per_second": 101.73,
"eval_steps_per_second": 3.18,
"step": 78500
},
{
"epoch": 8.249791144527986,
"grad_norm": 3.543698310852051,
"learning_rate": 2.1e-05,
"loss": 1.0001,
"step": 79000
},
{
"epoch": 8.249791144527986,
"eval_accuracy": 0.7938355754922207,
"eval_loss": 1.0734678506851196,
"eval_runtime": 617.8987,
"eval_samples_per_second": 102.675,
"eval_steps_per_second": 3.209,
"step": 79000
},
{
"epoch": 8.302005012531328,
"grad_norm": 2.846734046936035,
"learning_rate": 2.05e-05,
"loss": 0.987,
"step": 79500
},
{
"epoch": 8.302005012531328,
"eval_accuracy": 0.7944878324075739,
"eval_loss": 1.0765976905822754,
"eval_runtime": 618.672,
"eval_samples_per_second": 102.547,
"eval_steps_per_second": 3.205,
"step": 79500
},
{
"epoch": 8.35421888053467,
"grad_norm": 2.871553897857666,
"learning_rate": 2e-05,
"loss": 0.9806,
"step": 80000
},
{
"epoch": 8.35421888053467,
"eval_accuracy": 0.794915350004184,
"eval_loss": 1.0597549676895142,
"eval_runtime": 618.1763,
"eval_samples_per_second": 102.629,
"eval_steps_per_second": 3.208,
"step": 80000
},
{
"epoch": 8.406432748538013,
"grad_norm": 2.959845542907715,
"learning_rate": 1.9500000000000003e-05,
"loss": 0.984,
"step": 80500
},
{
"epoch": 8.406432748538013,
"eval_accuracy": 0.7946179832675856,
"eval_loss": 1.0595872402191162,
"eval_runtime": 617.7039,
"eval_samples_per_second": 102.708,
"eval_steps_per_second": 3.21,
"step": 80500
},
{
"epoch": 8.458646616541353,
"grad_norm": 2.7344448566436768,
"learning_rate": 1.9e-05,
"loss": 0.9822,
"step": 81000
},
{
"epoch": 8.458646616541353,
"eval_accuracy": 0.7951521505243792,
"eval_loss": 1.0736610889434814,
"eval_runtime": 618.3207,
"eval_samples_per_second": 102.605,
"eval_steps_per_second": 3.207,
"step": 81000
},
{
"epoch": 8.510860484544695,
"grad_norm": 3.7596890926361084,
"learning_rate": 1.85e-05,
"loss": 0.9791,
"step": 81500
},
{
"epoch": 8.510860484544695,
"eval_accuracy": 0.795253175296364,
"eval_loss": 1.0666213035583496,
"eval_runtime": 620.245,
"eval_samples_per_second": 102.287,
"eval_steps_per_second": 3.197,
"step": 81500
},
{
"epoch": 8.563074352548037,
"grad_norm": 2.727389335632324,
"learning_rate": 1.8e-05,
"loss": 0.9804,
"step": 82000
},
{
"epoch": 8.563074352548037,
"eval_accuracy": 0.7953146030482228,
"eval_loss": 1.0600041151046753,
"eval_runtime": 618.8007,
"eval_samples_per_second": 102.526,
"eval_steps_per_second": 3.205,
"step": 82000
},
{
"epoch": 8.615288220551378,
"grad_norm": 2.9671452045440674,
"learning_rate": 1.75e-05,
"loss": 0.9703,
"step": 82500
},
{
"epoch": 8.615288220551378,
"eval_accuracy": 0.7963827692254926,
"eval_loss": 1.0601171255111694,
"eval_runtime": 616.1581,
"eval_samples_per_second": 102.965,
"eval_steps_per_second": 3.218,
"step": 82500
},
{
"epoch": 8.66750208855472,
"grad_norm": 2.9628329277038574,
"learning_rate": 1.7000000000000003e-05,
"loss": 0.9768,
"step": 83000
},
{
"epoch": 8.66750208855472,
"eval_accuracy": 0.7952524450397661,
"eval_loss": 1.0768334865570068,
"eval_runtime": 616.7531,
"eval_samples_per_second": 102.866,
"eval_steps_per_second": 3.215,
"step": 83000
},
{
"epoch": 8.719715956558062,
"grad_norm": 3.6838812828063965,
"learning_rate": 1.65e-05,
"loss": 0.9722,
"step": 83500
},
{
"epoch": 8.719715956558062,
"eval_accuracy": 0.796164760278738,
"eval_loss": NaN,
"eval_runtime": 616.0678,
"eval_samples_per_second": 102.981,
"eval_steps_per_second": 3.219,
"step": 83500
},
{
"epoch": 8.771929824561404,
"grad_norm": 3.7852931022644043,
"learning_rate": 1.6000000000000003e-05,
"loss": 0.9813,
"step": 84000
},
{
"epoch": 8.771929824561404,
"eval_accuracy": 0.7964795153494032,
"eval_loss": NaN,
"eval_runtime": 616.9446,
"eval_samples_per_second": 102.834,
"eval_steps_per_second": 3.214,
"step": 84000
},
{
"epoch": 8.824143692564745,
"grad_norm": 2.85229229927063,
"learning_rate": 1.55e-05,
"loss": 0.9732,
"step": 84500
},
{
"epoch": 8.824143692564745,
"eval_accuracy": 0.7960116027401912,
"eval_loss": 1.061353325843811,
"eval_runtime": 624.2524,
"eval_samples_per_second": 101.63,
"eval_steps_per_second": 3.177,
"step": 84500
},
{
"epoch": 8.876357560568087,
"grad_norm": 3.463848114013672,
"learning_rate": 1.5e-05,
"loss": 0.9764,
"step": 85000
},
{
"epoch": 8.876357560568087,
"eval_accuracy": 0.7963621203122637,
"eval_loss": 1.049773097038269,
"eval_runtime": 618.458,
"eval_samples_per_second": 102.583,
"eval_steps_per_second": 3.206,
"step": 85000
},
{
"epoch": 8.928571428571429,
"grad_norm": 2.7122750282287598,
"learning_rate": 1.45e-05,
"loss": 0.9829,
"step": 85500
},
{
"epoch": 8.928571428571429,
"eval_accuracy": 0.7965627753528177,
"eval_loss": 1.069191813468933,
"eval_runtime": 617.5416,
"eval_samples_per_second": 102.735,
"eval_steps_per_second": 3.211,
"step": 85500
},
{
"epoch": 8.980785296574771,
"grad_norm": 2.976637840270996,
"learning_rate": 1.4000000000000001e-05,
"loss": 0.9741,
"step": 86000
},
{
"epoch": 8.980785296574771,
"eval_accuracy": 0.7967220469069121,
"eval_loss": 1.055444598197937,
"eval_runtime": 614.0894,
"eval_samples_per_second": 103.312,
"eval_steps_per_second": 3.229,
"step": 86000
},
{
"epoch": 9.032999164578111,
"grad_norm": 2.89384126663208,
"learning_rate": 1.3500000000000001e-05,
"loss": 0.9648,
"step": 86500
},
{
"epoch": 9.032999164578111,
"eval_accuracy": 0.797160834758222,
"eval_loss": 1.058738350868225,
"eval_runtime": 615.784,
"eval_samples_per_second": 103.028,
"eval_steps_per_second": 3.22,
"step": 86500
},
{
"epoch": 9.085213032581454,
"grad_norm": 3.009979248046875,
"learning_rate": 1.3000000000000001e-05,
"loss": 0.9606,
"step": 87000
},
{
"epoch": 9.085213032581454,
"eval_accuracy": 0.7971404883516657,
"eval_loss": 1.0621484518051147,
"eval_runtime": 618.535,
"eval_samples_per_second": 102.57,
"eval_steps_per_second": 3.206,
"step": 87000
},
{
"epoch": 9.137426900584796,
"grad_norm": 3.301671266555786,
"learning_rate": 1.25e-05,
"loss": 0.9682,
"step": 87500
},
{
"epoch": 9.137426900584796,
"eval_accuracy": 0.7973926994154387,
"eval_loss": 1.0569199323654175,
"eval_runtime": 616.4943,
"eval_samples_per_second": 102.909,
"eval_steps_per_second": 3.217,
"step": 87500
},
{
"epoch": 9.189640768588138,
"grad_norm": 2.932385206222534,
"learning_rate": 1.2e-05,
"loss": 0.9644,
"step": 88000
},
{
"epoch": 9.189640768588138,
"eval_accuracy": 0.7974666942085554,
"eval_loss": 1.0533905029296875,
"eval_runtime": 620.5893,
"eval_samples_per_second": 102.23,
"eval_steps_per_second": 3.195,
"step": 88000
},
{
"epoch": 9.241854636591478,
"grad_norm": 3.1355013847351074,
"learning_rate": 1.1500000000000002e-05,
"loss": 0.9658,
"step": 88500
},
{
"epoch": 9.241854636591478,
"eval_accuracy": 0.7977740520228777,
"eval_loss": 1.0476280450820923,
"eval_runtime": 618.7726,
"eval_samples_per_second": 102.53,
"eval_steps_per_second": 3.205,
"step": 88500
},
{
"epoch": 9.29406850459482,
"grad_norm": 3.353086233139038,
"learning_rate": 1.1000000000000001e-05,
"loss": 0.9641,
"step": 89000
},
{
"epoch": 9.29406850459482,
"eval_accuracy": 0.7973864228195986,
"eval_loss": 1.041751742362976,
"eval_runtime": 614.3896,
"eval_samples_per_second": 103.262,
"eval_steps_per_second": 3.228,
"step": 89000
},
{
"epoch": 9.346282372598163,
"grad_norm": 2.8681247234344482,
"learning_rate": 1.05e-05,
"loss": 0.963,
"step": 89500
},
{
"epoch": 9.346282372598163,
"eval_accuracy": 0.798292731668655,
"eval_loss": 1.0465270280838013,
"eval_runtime": 615.1098,
"eval_samples_per_second": 103.141,
"eval_steps_per_second": 3.224,
"step": 89500
},
{
"epoch": 9.398496240601503,
"grad_norm": 3.2927052974700928,
"learning_rate": 1e-05,
"loss": 0.9562,
"step": 90000
},
{
"epoch": 9.398496240601503,
"eval_accuracy": 0.798140924991145,
"eval_loss": 1.0476934909820557,
"eval_runtime": 616.723,
"eval_samples_per_second": 102.871,
"eval_steps_per_second": 3.215,
"step": 90000
},
{
"epoch": 9.450710108604845,
"grad_norm": 3.1475863456726074,
"learning_rate": 9.5e-06,
"loss": 0.9605,
"step": 90500
},
{
"epoch": 9.450710108604845,
"eval_accuracy": 0.7977010212122122,
"eval_loss": 1.0535070896148682,
"eval_runtime": 615.5187,
"eval_samples_per_second": 103.072,
"eval_steps_per_second": 3.222,
"step": 90500
},
{
"epoch": 9.502923976608187,
"grad_norm": 3.662548780441284,
"learning_rate": 9e-06,
"loss": 0.9692,
"step": 91000
},
{
"epoch": 9.502923976608187,
"eval_accuracy": 0.7981552799718717,
"eval_loss": 1.0515964031219482,
"eval_runtime": 615.1273,
"eval_samples_per_second": 103.138,
"eval_steps_per_second": 3.224,
"step": 91000
},
{
"epoch": 9.55513784461153,
"grad_norm": 3.5501346588134766,
"learning_rate": 8.500000000000002e-06,
"loss": 0.966,
"step": 91500
},
{
"epoch": 9.55513784461153,
"eval_accuracy": 0.798176074856451,
"eval_loss": 1.0500941276550293,
"eval_runtime": 616.9498,
"eval_samples_per_second": 102.833,
"eval_steps_per_second": 3.214,
"step": 91500
},
{
"epoch": 9.60735171261487,
"grad_norm": 3.102790117263794,
"learning_rate": 8.000000000000001e-06,
"loss": 0.9614,
"step": 92000
},
{
"epoch": 9.60735171261487,
"eval_accuracy": 0.7986090909415797,
"eval_loss": 1.0444718599319458,
"eval_runtime": 614.407,
"eval_samples_per_second": 103.259,
"eval_steps_per_second": 3.228,
"step": 92000
},
{
"epoch": 9.659565580618212,
"grad_norm": 3.504926919937134,
"learning_rate": 7.5e-06,
"loss": 0.9606,
"step": 92500
},
{
"epoch": 9.659565580618212,
"eval_accuracy": 0.7988812850380725,
"eval_loss": 1.0497493743896484,
"eval_runtime": 615.4873,
"eval_samples_per_second": 103.078,
"eval_steps_per_second": 3.222,
"step": 92500
},
{
"epoch": 9.711779448621554,
"grad_norm": 3.648078203201294,
"learning_rate": 7.000000000000001e-06,
"loss": 0.9507,
"step": 93000
},
{
"epoch": 9.711779448621554,
"eval_accuracy": 0.798754673907639,
"eval_loss": 1.0451751947402954,
"eval_runtime": 615.9945,
"eval_samples_per_second": 102.993,
"eval_steps_per_second": 3.219,
"step": 93000
},
{
"epoch": 9.763993316624896,
"grad_norm": 3.316922903060913,
"learning_rate": 6.5000000000000004e-06,
"loss": 0.9584,
"step": 93500
},
{
"epoch": 9.763993316624896,
"eval_accuracy": 0.7987269314893306,
"eval_loss": 1.0408653020858765,
"eval_runtime": 618.0487,
"eval_samples_per_second": 102.65,
"eval_steps_per_second": 3.208,
"step": 93500
},
{
"epoch": 9.816207184628237,
"grad_norm": 2.943169116973877,
"learning_rate": 6e-06,
"loss": 0.9572,
"step": 94000
},
{
"epoch": 9.816207184628237,
"eval_accuracy": 0.7989658711747047,
"eval_loss": 1.0431112051010132,
"eval_runtime": 619.8767,
"eval_samples_per_second": 102.348,
"eval_steps_per_second": 3.199,
"step": 94000
},
{
"epoch": 9.868421052631579,
"grad_norm": 2.713733196258545,
"learning_rate": 5.500000000000001e-06,
"loss": 0.9552,
"step": 94500
},
{
"epoch": 9.868421052631579,
"eval_accuracy": 0.7991049545929755,
"eval_loss": 1.0465214252471924,
"eval_runtime": 615.6457,
"eval_samples_per_second": 103.051,
"eval_steps_per_second": 3.221,
"step": 94500
},
{
"epoch": 9.920634920634921,
"grad_norm": 3.5131709575653076,
"learning_rate": 5e-06,
"loss": 0.9542,
"step": 95000
},
{
"epoch": 9.920634920634921,
"eval_accuracy": 0.7993720778709359,
"eval_loss": 1.043885350227356,
"eval_runtime": 616.5178,
"eval_samples_per_second": 102.905,
"eval_steps_per_second": 3.216,
"step": 95000
},
{
"epoch": 9.972848788638263,
"grad_norm": 3.0255749225616455,
"learning_rate": 4.5e-06,
"loss": 0.9505,
"step": 95500
},
{
"epoch": 9.972848788638263,
"eval_accuracy": 0.7994996190810016,
"eval_loss": 1.0470497608184814,
"eval_runtime": 616.7354,
"eval_samples_per_second": 102.869,
"eval_steps_per_second": 3.215,
"step": 95500
},
{
"epoch": 10.025062656641603,
"grad_norm": 4.134832382202148,
"learning_rate": 4.000000000000001e-06,
"loss": 0.9472,
"step": 96000
},
{
"epoch": 10.025062656641603,
"eval_accuracy": 0.7994244357414679,
"eval_loss": 1.0445414781570435,
"eval_runtime": 617.5631,
"eval_samples_per_second": 102.731,
"eval_steps_per_second": 3.211,
"step": 96000
},
{
"epoch": 10.077276524644946,
"grad_norm": 3.3011510372161865,
"learning_rate": 3.5000000000000004e-06,
"loss": 0.9467,
"step": 96500
},
{
"epoch": 10.077276524644946,
"eval_accuracy": 0.7997884398890616,
"eval_loss": 1.0422955751419067,
"eval_runtime": 617.5269,
"eval_samples_per_second": 102.737,
"eval_steps_per_second": 3.211,
"step": 96500
},
{
"epoch": 10.129490392648288,
"grad_norm": 3.65120792388916,
"learning_rate": 3e-06,
"loss": 0.9497,
"step": 97000
},
{
"epoch": 10.129490392648288,
"eval_accuracy": 0.7994027266856298,
"eval_loss": 1.0433976650238037,
"eval_runtime": 615.8756,
"eval_samples_per_second": 103.013,
"eval_steps_per_second": 3.22,
"step": 97000
},
{
"epoch": 10.18170426065163,
"grad_norm": 3.1329991817474365,
"learning_rate": 2.5e-06,
"loss": 0.9575,
"step": 97500
},
{
"epoch": 10.18170426065163,
"eval_accuracy": 0.8000353846923677,
"eval_loss": 1.0379133224487305,
"eval_runtime": 615.8872,
"eval_samples_per_second": 103.011,
"eval_steps_per_second": 3.22,
"step": 97500
},
{
"epoch": 10.23391812865497,
"grad_norm": 3.3150088787078857,
"learning_rate": 2.0000000000000003e-06,
"loss": 0.9478,
"step": 98000
},
{
"epoch": 10.23391812865497,
"eval_accuracy": 0.7997029578657322,
"eval_loss": 1.039129614830017,
"eval_runtime": 615.7144,
"eval_samples_per_second": 103.04,
"eval_steps_per_second": 3.221,
"step": 98000
},
{
"epoch": 10.286131996658312,
"grad_norm": 2.9896209239959717,
"learning_rate": 1.5e-06,
"loss": 0.9428,
"step": 98500
},
{
"epoch": 10.286131996658312,
"eval_accuracy": 0.799764992788252,
"eval_loss": 1.0273813009262085,
"eval_runtime": 615.1763,
"eval_samples_per_second": 103.13,
"eval_steps_per_second": 3.223,
"step": 98500
}
],
"logging_steps": 500,
"max_steps": 100000,
"num_input_tokens_seen": 0,
"num_train_epochs": 11,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 8.317352524927468e+17,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}