{ "best_metric": 1.0273813009262085, "best_model_checkpoint": "./model_fine-tune/glot/xlm-r/dan-Latn/checkpoint-98500", "epoch": 10.286131996658312, "eval_steps": 500, "global_step": 98500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.052213868003341685, "grad_norm": 5.384862422943115, "learning_rate": 9.95e-05, "loss": 1.4559, "step": 500 }, { "epoch": 0.052213868003341685, "eval_accuracy": 0.7296120738961336, "eval_loss": 1.6221407651901245, "eval_runtime": 620.8615, "eval_samples_per_second": 102.185, "eval_steps_per_second": 3.194, "step": 500 }, { "epoch": 0.10442773600668337, "grad_norm": 4.473601818084717, "learning_rate": 9.900000000000001e-05, "loss": 1.4355, "step": 1000 }, { "epoch": 0.10442773600668337, "eval_accuracy": 0.7314203520980975, "eval_loss": 1.563644289970398, "eval_runtime": 631.0902, "eval_samples_per_second": 100.529, "eval_steps_per_second": 3.142, "step": 1000 }, { "epoch": 0.15664160401002505, "grad_norm": 4.604930877685547, "learning_rate": 9.850000000000001e-05, "loss": 1.4326, "step": 1500 }, { "epoch": 0.15664160401002505, "eval_accuracy": 0.732858138551035, "eval_loss": 1.538730502128601, "eval_runtime": 627.1459, "eval_samples_per_second": 101.161, "eval_steps_per_second": 3.162, "step": 1500 }, { "epoch": 0.20885547201336674, "grad_norm": 5.0758867263793945, "learning_rate": 9.8e-05, "loss": 1.4082, "step": 2000 }, { "epoch": 0.20885547201336674, "eval_accuracy": 0.7337186853777712, "eval_loss": 1.514768123626709, "eval_runtime": 623.7693, "eval_samples_per_second": 101.709, "eval_steps_per_second": 3.179, "step": 2000 }, { "epoch": 0.26106934001670845, "grad_norm": 4.955592155456543, "learning_rate": 9.75e-05, "loss": 1.4066, "step": 2500 }, { "epoch": 0.26106934001670845, "eval_accuracy": 0.7373838924395351, "eval_loss": 1.4860447645187378, "eval_runtime": 626.6763, "eval_samples_per_second": 101.237, "eval_steps_per_second": 3.164, "step": 2500 }, { "epoch": 0.3132832080200501, "grad_norm": 4.4695611000061035, "learning_rate": 9.7e-05, "loss": 1.3965, "step": 3000 }, { "epoch": 0.3132832080200501, "eval_accuracy": 0.7386060798818527, "eval_loss": 1.4720885753631592, "eval_runtime": 626.8807, "eval_samples_per_second": 101.204, "eval_steps_per_second": 3.163, "step": 3000 }, { "epoch": 0.3654970760233918, "grad_norm": 4.595192909240723, "learning_rate": 9.65e-05, "loss": 1.3674, "step": 3500 }, { "epoch": 0.3654970760233918, "eval_accuracy": 0.7389515252696706, "eval_loss": 1.466160774230957, "eval_runtime": 626.2854, "eval_samples_per_second": 101.3, "eval_steps_per_second": 3.166, "step": 3500 }, { "epoch": 0.4177109440267335, "grad_norm": 4.489499092102051, "learning_rate": 9.6e-05, "loss": 1.3789, "step": 4000 }, { "epoch": 0.4177109440267335, "eval_accuracy": 0.7408971720895672, "eval_loss": 1.4427233934402466, "eval_runtime": 628.8309, "eval_samples_per_second": 100.89, "eval_steps_per_second": 3.153, "step": 4000 }, { "epoch": 0.4699248120300752, "grad_norm": 4.887514114379883, "learning_rate": 9.55e-05, "loss": 1.3677, "step": 4500 }, { "epoch": 0.4699248120300752, "eval_accuracy": 0.7418176415206207, "eval_loss": NaN, "eval_runtime": 626.3841, "eval_samples_per_second": 101.285, "eval_steps_per_second": 3.166, "step": 4500 }, { "epoch": 0.5221386800334169, "grad_norm": 4.255401611328125, "learning_rate": 9.5e-05, "loss": 1.3591, "step": 5000 }, { "epoch": 0.5221386800334169, "eval_accuracy": 0.7420801988701987, "eval_loss": 1.4378492832183838, "eval_runtime": 628.3584, "eval_samples_per_second": 100.966, "eval_steps_per_second": 3.156, "step": 5000 }, { "epoch": 0.5743525480367586, "grad_norm": 3.9552218914031982, "learning_rate": 9.449999999999999e-05, "loss": 1.3528, "step": 5500 }, { "epoch": 0.5743525480367586, "eval_accuracy": 0.7430785963224286, "eval_loss": 1.4063905477523804, "eval_runtime": 630.6125, "eval_samples_per_second": 100.605, "eval_steps_per_second": 3.145, "step": 5500 }, { "epoch": 0.6265664160401002, "grad_norm": 4.043545246124268, "learning_rate": 9.4e-05, "loss": 1.3467, "step": 6000 }, { "epoch": 0.6265664160401002, "eval_accuracy": 0.7439164901309249, "eval_loss": 1.4179232120513916, "eval_runtime": 627.0519, "eval_samples_per_second": 101.177, "eval_steps_per_second": 3.162, "step": 6000 }, { "epoch": 0.6787802840434419, "grad_norm": 65.49678802490234, "learning_rate": 9.350000000000001e-05, "loss": 1.3578, "step": 6500 }, { "epoch": 0.6787802840434419, "eval_accuracy": 0.7436562018580148, "eval_loss": 1.4101091623306274, "eval_runtime": 631.9563, "eval_samples_per_second": 100.391, "eval_steps_per_second": 3.138, "step": 6500 }, { "epoch": 0.7309941520467836, "grad_norm": 3.9676353931427, "learning_rate": 9.300000000000001e-05, "loss": 1.3377, "step": 7000 }, { "epoch": 0.7309941520467836, "eval_accuracy": 0.7452273902409787, "eval_loss": 1.3957942724227905, "eval_runtime": 625.612, "eval_samples_per_second": 101.41, "eval_steps_per_second": 3.17, "step": 7000 }, { "epoch": 0.7832080200501254, "grad_norm": 3.6173226833343506, "learning_rate": 9.250000000000001e-05, "loss": 1.3303, "step": 7500 }, { "epoch": 0.7832080200501254, "eval_accuracy": 0.7475488951341268, "eval_loss": 1.3813687562942505, "eval_runtime": 627.0109, "eval_samples_per_second": 101.183, "eval_steps_per_second": 3.163, "step": 7500 }, { "epoch": 0.835421888053467, "grad_norm": 3.980341911315918, "learning_rate": 9.200000000000001e-05, "loss": 1.3221, "step": 8000 }, { "epoch": 0.835421888053467, "eval_accuracy": 0.7475162804135862, "eval_loss": 1.3788014650344849, "eval_runtime": 625.2826, "eval_samples_per_second": 101.463, "eval_steps_per_second": 3.171, "step": 8000 }, { "epoch": 0.8876357560568087, "grad_norm": 3.367668390274048, "learning_rate": 9.15e-05, "loss": 1.3273, "step": 8500 }, { "epoch": 0.8876357560568087, "eval_accuracy": 0.7486955192165756, "eval_loss": 1.3720810413360596, "eval_runtime": 626.1965, "eval_samples_per_second": 101.315, "eval_steps_per_second": 3.167, "step": 8500 }, { "epoch": 0.9398496240601504, "grad_norm": 3.82468843460083, "learning_rate": 9.1e-05, "loss": 1.3104, "step": 9000 }, { "epoch": 0.9398496240601504, "eval_accuracy": 0.7470319869853385, "eval_loss": 1.3732112646102905, "eval_runtime": 630.8001, "eval_samples_per_second": 100.575, "eval_steps_per_second": 3.144, "step": 9000 }, { "epoch": 0.9920634920634921, "grad_norm": 3.946758270263672, "learning_rate": 9.05e-05, "loss": 1.3158, "step": 9500 }, { "epoch": 0.9920634920634921, "eval_accuracy": 0.7499583551090239, "eval_loss": 1.3603472709655762, "eval_runtime": 652.6391, "eval_samples_per_second": 97.21, "eval_steps_per_second": 3.038, "step": 9500 }, { "epoch": 1.0442773600668338, "grad_norm": 3.916361093521118, "learning_rate": 9e-05, "loss": 1.2953, "step": 10000 }, { "epoch": 1.0442773600668338, "eval_accuracy": 0.7500646093431366, "eval_loss": 1.3458328247070312, "eval_runtime": 626.4497, "eval_samples_per_second": 101.274, "eval_steps_per_second": 3.165, "step": 10000 }, { "epoch": 1.0964912280701755, "grad_norm": 3.9930801391601562, "learning_rate": 8.950000000000001e-05, "loss": 1.2932, "step": 10500 }, { "epoch": 1.0964912280701755, "eval_accuracy": 0.7504904150215282, "eval_loss": 1.3514128923416138, "eval_runtime": 630.1314, "eval_samples_per_second": 100.682, "eval_steps_per_second": 3.147, "step": 10500 }, { "epoch": 1.1487050960735172, "grad_norm": 4.74911642074585, "learning_rate": 8.900000000000001e-05, "loss": 1.2944, "step": 11000 }, { "epoch": 1.1487050960735172, "eval_accuracy": 0.7512302865503326, "eval_loss": 1.3358726501464844, "eval_runtime": 631.5659, "eval_samples_per_second": 100.453, "eval_steps_per_second": 3.14, "step": 11000 }, { "epoch": 1.2009189640768587, "grad_norm": 3.7969741821289062, "learning_rate": 8.850000000000001e-05, "loss": 1.2853, "step": 11500 }, { "epoch": 1.2009189640768587, "eval_accuracy": 0.7516760587886216, "eval_loss": 1.3410056829452515, "eval_runtime": 631.8461, "eval_samples_per_second": 100.409, "eval_steps_per_second": 3.138, "step": 11500 }, { "epoch": 1.2531328320802004, "grad_norm": 4.402622699737549, "learning_rate": 8.800000000000001e-05, "loss": 1.2904, "step": 12000 }, { "epoch": 1.2531328320802004, "eval_accuracy": 0.7529186511157855, "eval_loss": 1.3296958208084106, "eval_runtime": 625.8019, "eval_samples_per_second": 101.379, "eval_steps_per_second": 3.169, "step": 12000 }, { "epoch": 1.3053467000835421, "grad_norm": 4.738062858581543, "learning_rate": 8.75e-05, "loss": 1.2768, "step": 12500 }, { "epoch": 1.3053467000835421, "eval_accuracy": 0.7535759135206747, "eval_loss": 1.323601484298706, "eval_runtime": 625.8627, "eval_samples_per_second": 101.369, "eval_steps_per_second": 3.168, "step": 12500 }, { "epoch": 1.3575605680868839, "grad_norm": 3.4203574657440186, "learning_rate": 8.7e-05, "loss": 1.2702, "step": 13000 }, { "epoch": 1.3575605680868839, "eval_accuracy": 0.7543550936371498, "eval_loss": 1.311880350112915, "eval_runtime": 632.5695, "eval_samples_per_second": 100.294, "eval_steps_per_second": 3.135, "step": 13000 }, { "epoch": 1.4097744360902256, "grad_norm": 4.80487060546875, "learning_rate": 8.65e-05, "loss": 1.2686, "step": 13500 }, { "epoch": 1.4097744360902256, "eval_accuracy": 0.7551145942334886, "eval_loss": 1.3116217851638794, "eval_runtime": 622.4069, "eval_samples_per_second": 101.932, "eval_steps_per_second": 3.186, "step": 13500 }, { "epoch": 1.4619883040935673, "grad_norm": 3.3763253688812256, "learning_rate": 8.6e-05, "loss": 1.256, "step": 14000 }, { "epoch": 1.4619883040935673, "eval_accuracy": 0.7550032122033714, "eval_loss": NaN, "eval_runtime": 623.0788, "eval_samples_per_second": 101.822, "eval_steps_per_second": 3.183, "step": 14000 }, { "epoch": 1.514202172096909, "grad_norm": 3.4600539207458496, "learning_rate": 8.55e-05, "loss": 1.2689, "step": 14500 }, { "epoch": 1.514202172096909, "eval_accuracy": 0.7562542149486124, "eval_loss": 1.2966333627700806, "eval_runtime": 630.2089, "eval_samples_per_second": 100.67, "eval_steps_per_second": 3.147, "step": 14500 }, { "epoch": 1.5664160401002505, "grad_norm": 3.6656503677368164, "learning_rate": 8.5e-05, "loss": 1.2536, "step": 15000 }, { "epoch": 1.5664160401002505, "eval_accuracy": 0.7558193144413351, "eval_loss": 1.3074419498443604, "eval_runtime": 633.7511, "eval_samples_per_second": 100.107, "eval_steps_per_second": 3.129, "step": 15000 }, { "epoch": 1.6186299081035922, "grad_norm": 3.7582554817199707, "learning_rate": 8.450000000000001e-05, "loss": 1.262, "step": 15500 }, { "epoch": 1.6186299081035922, "eval_accuracy": 0.755967622936909, "eval_loss": 1.2850651741027832, "eval_runtime": 626.4542, "eval_samples_per_second": 101.273, "eval_steps_per_second": 3.165, "step": 15500 }, { "epoch": 1.670843776106934, "grad_norm": 3.8284225463867188, "learning_rate": 8.4e-05, "loss": 1.2427, "step": 16000 }, { "epoch": 1.670843776106934, "eval_accuracy": 0.7575149255490681, "eval_loss": 1.2897659540176392, "eval_runtime": 624.5044, "eval_samples_per_second": 101.589, "eval_steps_per_second": 3.175, "step": 16000 }, { "epoch": 1.7230576441102756, "grad_norm": 3.8263819217681885, "learning_rate": 8.35e-05, "loss": 1.2389, "step": 16500 }, { "epoch": 1.7230576441102756, "eval_accuracy": 0.7580776326367493, "eval_loss": 1.2844797372817993, "eval_runtime": 627.1556, "eval_samples_per_second": 101.16, "eval_steps_per_second": 3.162, "step": 16500 }, { "epoch": 1.7752715121136173, "grad_norm": 3.7820005416870117, "learning_rate": 8.3e-05, "loss": 1.2368, "step": 17000 }, { "epoch": 1.7752715121136173, "eval_accuracy": 0.7585746778078989, "eval_loss": 1.2964411973953247, "eval_runtime": 635.3019, "eval_samples_per_second": 99.863, "eval_steps_per_second": 3.121, "step": 17000 }, { "epoch": 1.827485380116959, "grad_norm": 3.492671012878418, "learning_rate": 8.25e-05, "loss": 1.2432, "step": 17500 }, { "epoch": 1.827485380116959, "eval_accuracy": 0.7584533967429662, "eval_loss": NaN, "eval_runtime": 623.9705, "eval_samples_per_second": 101.676, "eval_steps_per_second": 3.178, "step": 17500 }, { "epoch": 1.8796992481203008, "grad_norm": 3.714395761489868, "learning_rate": 8.2e-05, "loss": 1.2375, "step": 18000 }, { "epoch": 1.8796992481203008, "eval_accuracy": 0.7593634528855384, "eval_loss": NaN, "eval_runtime": 623.2531, "eval_samples_per_second": 101.793, "eval_steps_per_second": 3.182, "step": 18000 }, { "epoch": 1.9319131161236425, "grad_norm": 3.700199604034424, "learning_rate": 8.15e-05, "loss": 1.2308, "step": 18500 }, { "epoch": 1.9319131161236425, "eval_accuracy": 0.7597829603112618, "eval_loss": 1.2819814682006836, "eval_runtime": 636.6684, "eval_samples_per_second": 99.648, "eval_steps_per_second": 3.115, "step": 18500 }, { "epoch": 1.9841269841269842, "grad_norm": 3.390286922454834, "learning_rate": 8.1e-05, "loss": 1.2374, "step": 19000 }, { "epoch": 1.9841269841269842, "eval_accuracy": 0.7603004154522052, "eval_loss": 1.2687220573425293, "eval_runtime": 630.1502, "eval_samples_per_second": 100.679, "eval_steps_per_second": 3.147, "step": 19000 }, { "epoch": 2.036340852130326, "grad_norm": 4.251028060913086, "learning_rate": 8.05e-05, "loss": 1.218, "step": 19500 }, { "epoch": 2.036340852130326, "eval_accuracy": 0.760354434691265, "eval_loss": 1.2784004211425781, "eval_runtime": 629.6482, "eval_samples_per_second": 100.759, "eval_steps_per_second": 3.149, "step": 19500 }, { "epoch": 2.0885547201336676, "grad_norm": 3.714747905731201, "learning_rate": 8e-05, "loss": 1.2052, "step": 20000 }, { "epoch": 2.0885547201336676, "eval_accuracy": 0.7609532576337851, "eval_loss": 1.2718734741210938, "eval_runtime": 644.9629, "eval_samples_per_second": 98.367, "eval_steps_per_second": 3.075, "step": 20000 }, { "epoch": 2.1407685881370093, "grad_norm": 3.934018611907959, "learning_rate": 7.950000000000001e-05, "loss": 1.2133, "step": 20500 }, { "epoch": 2.1407685881370093, "eval_accuracy": 0.7613764467527375, "eval_loss": 1.2647244930267334, "eval_runtime": 631.2806, "eval_samples_per_second": 100.499, "eval_steps_per_second": 3.141, "step": 20500 }, { "epoch": 2.192982456140351, "grad_norm": 3.6540703773498535, "learning_rate": 7.900000000000001e-05, "loss": 1.2072, "step": 21000 }, { "epoch": 2.192982456140351, "eval_accuracy": 0.7618574927985354, "eval_loss": 1.2702383995056152, "eval_runtime": 628.7442, "eval_samples_per_second": 100.904, "eval_steps_per_second": 3.154, "step": 21000 }, { "epoch": 2.2451963241436927, "grad_norm": 3.0740270614624023, "learning_rate": 7.850000000000001e-05, "loss": 1.2051, "step": 21500 }, { "epoch": 2.2451963241436927, "eval_accuracy": 0.7627010896236089, "eval_loss": 1.272460699081421, "eval_runtime": 628.6858, "eval_samples_per_second": 100.914, "eval_steps_per_second": 3.154, "step": 21500 }, { "epoch": 2.2974101921470345, "grad_norm": 4.153682708740234, "learning_rate": 7.800000000000001e-05, "loss": 1.1978, "step": 22000 }, { "epoch": 2.2974101921470345, "eval_accuracy": 0.7618345683871698, "eval_loss": 1.2531063556671143, "eval_runtime": 646.652, "eval_samples_per_second": 98.11, "eval_steps_per_second": 3.067, "step": 22000 }, { "epoch": 2.3496240601503757, "grad_norm": 3.2469732761383057, "learning_rate": 7.75e-05, "loss": 1.2017, "step": 22500 }, { "epoch": 2.3496240601503757, "eval_accuracy": 0.7624854045335798, "eval_loss": NaN, "eval_runtime": 639.4743, "eval_samples_per_second": 99.211, "eval_steps_per_second": 3.101, "step": 22500 }, { "epoch": 2.4018379281537174, "grad_norm": 3.4860455989837646, "learning_rate": 7.7e-05, "loss": 1.2036, "step": 23000 }, { "epoch": 2.4018379281537174, "eval_accuracy": 0.7637592883681051, "eval_loss": 1.2505515813827515, "eval_runtime": 626.8199, "eval_samples_per_second": 101.214, "eval_steps_per_second": 3.164, "step": 23000 }, { "epoch": 2.454051796157059, "grad_norm": 3.0207018852233887, "learning_rate": 7.65e-05, "loss": 1.2039, "step": 23500 }, { "epoch": 2.454051796157059, "eval_accuracy": 0.7636972843433327, "eval_loss": 1.2614232301712036, "eval_runtime": 626.1594, "eval_samples_per_second": 101.321, "eval_steps_per_second": 3.167, "step": 23500 }, { "epoch": 2.506265664160401, "grad_norm": 2.820202589035034, "learning_rate": 7.6e-05, "loss": 1.1931, "step": 24000 }, { "epoch": 2.506265664160401, "eval_accuracy": 0.7648100527367832, "eval_loss": 1.2478718757629395, "eval_runtime": 654.189, "eval_samples_per_second": 96.98, "eval_steps_per_second": 3.031, "step": 24000 }, { "epoch": 2.5584795321637426, "grad_norm": 4.114190101623535, "learning_rate": 7.55e-05, "loss": 1.1814, "step": 24500 }, { "epoch": 2.5584795321637426, "eval_accuracy": 0.7652067630736408, "eval_loss": 1.2430387735366821, "eval_runtime": 630.6204, "eval_samples_per_second": 100.604, "eval_steps_per_second": 3.145, "step": 24500 }, { "epoch": 2.6106934001670843, "grad_norm": 3.205641746520996, "learning_rate": 7.500000000000001e-05, "loss": 1.1934, "step": 25000 }, { "epoch": 2.6106934001670843, "eval_accuracy": 0.7653472241088651, "eval_loss": 1.2560120820999146, "eval_runtime": 633.6789, "eval_samples_per_second": 100.119, "eval_steps_per_second": 3.129, "step": 25000 }, { "epoch": 2.662907268170426, "grad_norm": 3.286635637283325, "learning_rate": 7.450000000000001e-05, "loss": 1.1913, "step": 25500 }, { "epoch": 2.662907268170426, "eval_accuracy": 0.7653709204327143, "eval_loss": 1.2623989582061768, "eval_runtime": 636.0478, "eval_samples_per_second": 99.746, "eval_steps_per_second": 3.118, "step": 25500 }, { "epoch": 2.7151211361737677, "grad_norm": 3.4579050540924072, "learning_rate": 7.4e-05, "loss": 1.1835, "step": 26000 }, { "epoch": 2.7151211361737677, "eval_accuracy": 0.766565753562764, "eval_loss": 1.2262078523635864, "eval_runtime": 628.0716, "eval_samples_per_second": 101.012, "eval_steps_per_second": 3.157, "step": 26000 }, { "epoch": 2.7673350041771094, "grad_norm": 3.3125433921813965, "learning_rate": 7.35e-05, "loss": 1.1821, "step": 26500 }, { "epoch": 2.7673350041771094, "eval_accuracy": 0.7665199419874659, "eval_loss": 1.249574065208435, "eval_runtime": 626.5772, "eval_samples_per_second": 101.253, "eval_steps_per_second": 3.165, "step": 26500 }, { "epoch": 2.819548872180451, "grad_norm": 3.5980172157287598, "learning_rate": 7.3e-05, "loss": 1.1879, "step": 27000 }, { "epoch": 2.819548872180451, "eval_accuracy": 0.7665586654611716, "eval_loss": 1.2500699758529663, "eval_runtime": 626.2963, "eval_samples_per_second": 101.299, "eval_steps_per_second": 3.166, "step": 27000 }, { "epoch": 2.871762740183793, "grad_norm": 3.683032512664795, "learning_rate": 7.25e-05, "loss": 1.1865, "step": 27500 }, { "epoch": 2.871762740183793, "eval_accuracy": 0.767375596206993, "eval_loss": 1.2403781414031982, "eval_runtime": 647.9501, "eval_samples_per_second": 97.913, "eval_steps_per_second": 3.06, "step": 27500 }, { "epoch": 2.9239766081871346, "grad_norm": 3.2120792865753174, "learning_rate": 7.2e-05, "loss": 1.1811, "step": 28000 }, { "epoch": 2.9239766081871346, "eval_accuracy": 0.7678975328660842, "eval_loss": 1.2281544208526611, "eval_runtime": 641.5519, "eval_samples_per_second": 98.89, "eval_steps_per_second": 3.091, "step": 28000 }, { "epoch": 2.9761904761904763, "grad_norm": 3.5708224773406982, "learning_rate": 7.15e-05, "loss": 1.1771, "step": 28500 }, { "epoch": 2.9761904761904763, "eval_accuracy": 0.7670722352455037, "eval_loss": 1.2305808067321777, "eval_runtime": 629.6736, "eval_samples_per_second": 100.755, "eval_steps_per_second": 3.149, "step": 28500 }, { "epoch": 3.028404344193818, "grad_norm": 3.624859094619751, "learning_rate": 7.1e-05, "loss": 1.1719, "step": 29000 }, { "epoch": 3.028404344193818, "eval_accuracy": 0.7678675260574238, "eval_loss": NaN, "eval_runtime": 653.9665, "eval_samples_per_second": 97.013, "eval_steps_per_second": 3.032, "step": 29000 }, { "epoch": 3.0806182121971597, "grad_norm": 3.066574811935425, "learning_rate": 7.05e-05, "loss": 1.1625, "step": 29500 }, { "epoch": 3.0806182121971597, "eval_accuracy": 0.7687440934734672, "eval_loss": 1.2233607769012451, "eval_runtime": 641.0412, "eval_samples_per_second": 98.969, "eval_steps_per_second": 3.093, "step": 29500 }, { "epoch": 3.1328320802005014, "grad_norm": 2.951267719268799, "learning_rate": 7e-05, "loss": 1.1667, "step": 30000 }, { "epoch": 3.1328320802005014, "eval_accuracy": 0.7692466135515155, "eval_loss": 1.2097790241241455, "eval_runtime": 634.5544, "eval_samples_per_second": 99.98, "eval_steps_per_second": 3.125, "step": 30000 }, { "epoch": 3.185045948203843, "grad_norm": 3.134652614593506, "learning_rate": 6.95e-05, "loss": 1.161, "step": 30500 }, { "epoch": 3.185045948203843, "eval_accuracy": 0.7697799908068673, "eval_loss": 1.2196942567825317, "eval_runtime": 639.2853, "eval_samples_per_second": 99.241, "eval_steps_per_second": 3.102, "step": 30500 }, { "epoch": 3.2372598162071844, "grad_norm": 3.199575662612915, "learning_rate": 6.9e-05, "loss": 1.1591, "step": 31000 }, { "epoch": 3.2372598162071844, "eval_accuracy": 0.7695065031514984, "eval_loss": 1.2124276161193848, "eval_runtime": 682.4327, "eval_samples_per_second": 92.966, "eval_steps_per_second": 2.906, "step": 31000 }, { "epoch": 3.2894736842105265, "grad_norm": 2.954310178756714, "learning_rate": 6.850000000000001e-05, "loss": 1.1587, "step": 31500 }, { "epoch": 3.2894736842105265, "eval_accuracy": 0.7700210273355362, "eval_loss": 1.2054550647735596, "eval_runtime": 639.9415, "eval_samples_per_second": 99.139, "eval_steps_per_second": 3.099, "step": 31500 }, { "epoch": 3.341687552213868, "grad_norm": 2.8550124168395996, "learning_rate": 6.800000000000001e-05, "loss": 1.1575, "step": 32000 }, { "epoch": 3.341687552213868, "eval_accuracy": 0.7699819172054071, "eval_loss": 1.214800238609314, "eval_runtime": 638.9546, "eval_samples_per_second": 99.292, "eval_steps_per_second": 3.104, "step": 32000 }, { "epoch": 3.3939014202172095, "grad_norm": 2.92110013961792, "learning_rate": 6.750000000000001e-05, "loss": 1.1523, "step": 32500 }, { "epoch": 3.3939014202172095, "eval_accuracy": 0.769908237207266, "eval_loss": 1.2140088081359863, "eval_runtime": 644.5498, "eval_samples_per_second": 98.43, "eval_steps_per_second": 3.077, "step": 32500 }, { "epoch": 3.4461152882205512, "grad_norm": 3.346374988555908, "learning_rate": 6.7e-05, "loss": 1.1572, "step": 33000 }, { "epoch": 3.4461152882205512, "eval_accuracy": 0.7708846584546473, "eval_loss": 1.2036925554275513, "eval_runtime": 640.768, "eval_samples_per_second": 99.011, "eval_steps_per_second": 3.095, "step": 33000 }, { "epoch": 3.498329156223893, "grad_norm": 3.251553773880005, "learning_rate": 6.65e-05, "loss": 1.1435, "step": 33500 }, { "epoch": 3.498329156223893, "eval_accuracy": 0.7710189865489193, "eval_loss": 1.2105178833007812, "eval_runtime": 649.4058, "eval_samples_per_second": 97.694, "eval_steps_per_second": 3.054, "step": 33500 }, { "epoch": 3.5505430242272347, "grad_norm": 3.583970785140991, "learning_rate": 6.6e-05, "loss": 1.1377, "step": 34000 }, { "epoch": 3.5505430242272347, "eval_accuracy": 0.7721632980503305, "eval_loss": 1.2023558616638184, "eval_runtime": 638.8527, "eval_samples_per_second": 99.308, "eval_steps_per_second": 3.104, "step": 34000 }, { "epoch": 3.6027568922305764, "grad_norm": 3.588223934173584, "learning_rate": 6.55e-05, "loss": 1.1369, "step": 34500 }, { "epoch": 3.6027568922305764, "eval_accuracy": 0.7718440341835643, "eval_loss": 1.196567416191101, "eval_runtime": 659.4538, "eval_samples_per_second": 96.205, "eval_steps_per_second": 3.007, "step": 34500 }, { "epoch": 3.654970760233918, "grad_norm": 3.4205751419067383, "learning_rate": 6.500000000000001e-05, "loss": 1.1417, "step": 35000 }, { "epoch": 3.654970760233918, "eval_accuracy": 0.7722229829571152, "eval_loss": 1.1913775205612183, "eval_runtime": 632.4781, "eval_samples_per_second": 100.309, "eval_steps_per_second": 3.135, "step": 35000 }, { "epoch": 3.70718462823726, "grad_norm": 3.300046920776367, "learning_rate": 6.450000000000001e-05, "loss": 1.1464, "step": 35500 }, { "epoch": 3.70718462823726, "eval_accuracy": 0.7726826510494151, "eval_loss": 1.1917306184768677, "eval_runtime": 632.4563, "eval_samples_per_second": 100.312, "eval_steps_per_second": 3.135, "step": 35500 }, { "epoch": 3.7593984962406015, "grad_norm": 3.469228982925415, "learning_rate": 6.400000000000001e-05, "loss": 1.1413, "step": 36000 }, { "epoch": 3.7593984962406015, "eval_accuracy": 0.7728226251433048, "eval_loss": 1.1853009462356567, "eval_runtime": 674.7454, "eval_samples_per_second": 94.025, "eval_steps_per_second": 2.939, "step": 36000 }, { "epoch": 3.8116123642439432, "grad_norm": 3.0387423038482666, "learning_rate": 6.35e-05, "loss": 1.1437, "step": 36500 }, { "epoch": 3.8116123642439432, "eval_accuracy": 0.7730040846904885, "eval_loss": 1.197380542755127, "eval_runtime": 642.657, "eval_samples_per_second": 98.72, "eval_steps_per_second": 3.086, "step": 36500 }, { "epoch": 3.863826232247285, "grad_norm": 3.7750790119171143, "learning_rate": 6.3e-05, "loss": 1.1381, "step": 37000 }, { "epoch": 3.863826232247285, "eval_accuracy": 0.7732420751700277, "eval_loss": 1.1929783821105957, "eval_runtime": 635.6748, "eval_samples_per_second": 99.804, "eval_steps_per_second": 3.12, "step": 37000 }, { "epoch": 3.9160401002506267, "grad_norm": 2.7454237937927246, "learning_rate": 6.25e-05, "loss": 1.1319, "step": 37500 }, { "epoch": 3.9160401002506267, "eval_accuracy": 0.7740359547829445, "eval_loss": 1.1876792907714844, "eval_runtime": 646.5493, "eval_samples_per_second": 98.126, "eval_steps_per_second": 3.067, "step": 37500 }, { "epoch": 3.9682539682539684, "grad_norm": 3.1448304653167725, "learning_rate": 6.2e-05, "loss": 1.1217, "step": 38000 }, { "epoch": 3.9682539682539684, "eval_accuracy": 0.7746280010171029, "eval_loss": 1.1808606386184692, "eval_runtime": 651.1984, "eval_samples_per_second": 97.425, "eval_steps_per_second": 3.045, "step": 38000 }, { "epoch": 4.02046783625731, "grad_norm": 3.3443033695220947, "learning_rate": 6.15e-05, "loss": 1.1297, "step": 38500 }, { "epoch": 4.02046783625731, "eval_accuracy": 0.7744960875837769, "eval_loss": 1.1759783029556274, "eval_runtime": 633.4471, "eval_samples_per_second": 100.155, "eval_steps_per_second": 3.13, "step": 38500 }, { "epoch": 4.072681704260652, "grad_norm": 3.656834602355957, "learning_rate": 6.1e-05, "loss": 1.116, "step": 39000 }, { "epoch": 4.072681704260652, "eval_accuracy": 0.7753487577160052, "eval_loss": 1.171563744544983, "eval_runtime": 636.4311, "eval_samples_per_second": 99.686, "eval_steps_per_second": 3.116, "step": 39000 }, { "epoch": 4.124895572263993, "grad_norm": 3.2146315574645996, "learning_rate": 6.05e-05, "loss": 1.1262, "step": 39500 }, { "epoch": 4.124895572263993, "eval_accuracy": 0.7748815741788464, "eval_loss": 1.1831566095352173, "eval_runtime": 626.1408, "eval_samples_per_second": 101.324, "eval_steps_per_second": 3.167, "step": 39500 }, { "epoch": 4.177109440267335, "grad_norm": 3.5292935371398926, "learning_rate": 6e-05, "loss": 1.1038, "step": 40000 }, { "epoch": 4.177109440267335, "eval_accuracy": 0.775054087211447, "eval_loss": 1.1849807500839233, "eval_runtime": 625.7328, "eval_samples_per_second": 101.39, "eval_steps_per_second": 3.169, "step": 40000 }, { "epoch": 4.2293233082706765, "grad_norm": 3.17421817779541, "learning_rate": 5.95e-05, "loss": 1.1144, "step": 40500 }, { "epoch": 4.2293233082706765, "eval_accuracy": 0.7753948446129992, "eval_loss": 1.176754117012024, "eval_runtime": 642.2309, "eval_samples_per_second": 98.785, "eval_steps_per_second": 3.088, "step": 40500 }, { "epoch": 4.281537176274019, "grad_norm": 3.185692071914673, "learning_rate": 5.9e-05, "loss": 1.1068, "step": 41000 }, { "epoch": 4.281537176274019, "eval_accuracy": 0.7759707359303026, "eval_loss": 1.1789684295654297, "eval_runtime": 627.5525, "eval_samples_per_second": 101.096, "eval_steps_per_second": 3.16, "step": 41000 }, { "epoch": 4.33375104427736, "grad_norm": 3.033123731613159, "learning_rate": 5.85e-05, "loss": 1.1088, "step": 41500 }, { "epoch": 4.33375104427736, "eval_accuracy": 0.7761587182255788, "eval_loss": 1.1775932312011719, "eval_runtime": 629.8113, "eval_samples_per_second": 100.733, "eval_steps_per_second": 3.149, "step": 41500 }, { "epoch": 4.385964912280702, "grad_norm": 3.112304210662842, "learning_rate": 5.8e-05, "loss": 1.1243, "step": 42000 }, { "epoch": 4.385964912280702, "eval_accuracy": 0.7764218763015075, "eval_loss": 1.173019528388977, "eval_runtime": 655.1515, "eval_samples_per_second": 96.837, "eval_steps_per_second": 3.027, "step": 42000 }, { "epoch": 4.438178780284043, "grad_norm": 3.216050624847412, "learning_rate": 5.7499999999999995e-05, "loss": 1.1196, "step": 42500 }, { "epoch": 4.438178780284043, "eval_accuracy": 0.7775044227327209, "eval_loss": 1.1682883501052856, "eval_runtime": 627.0953, "eval_samples_per_second": 101.17, "eval_steps_per_second": 3.162, "step": 42500 }, { "epoch": 4.4903926482873855, "grad_norm": 3.4211995601654053, "learning_rate": 5.6999999999999996e-05, "loss": 1.0985, "step": 43000 }, { "epoch": 4.4903926482873855, "eval_accuracy": 0.7770348312966414, "eval_loss": 1.1709975004196167, "eval_runtime": 626.8729, "eval_samples_per_second": 101.206, "eval_steps_per_second": 3.163, "step": 43000 }, { "epoch": 4.542606516290727, "grad_norm": 3.0796425342559814, "learning_rate": 5.65e-05, "loss": 1.0994, "step": 43500 }, { "epoch": 4.542606516290727, "eval_accuracy": 0.7772564904147641, "eval_loss": 1.1671358346939087, "eval_runtime": 626.7232, "eval_samples_per_second": 101.23, "eval_steps_per_second": 3.164, "step": 43500 }, { "epoch": 4.594820384294069, "grad_norm": 3.7232813835144043, "learning_rate": 5.6000000000000006e-05, "loss": 1.1017, "step": 44000 }, { "epoch": 4.594820384294069, "eval_accuracy": 0.7776795841513819, "eval_loss": 1.1589510440826416, "eval_runtime": 639.6154, "eval_samples_per_second": 99.189, "eval_steps_per_second": 3.1, "step": 44000 }, { "epoch": 4.64703425229741, "grad_norm": 3.0733375549316406, "learning_rate": 5.550000000000001e-05, "loss": 1.1071, "step": 44500 }, { "epoch": 4.64703425229741, "eval_accuracy": 0.7780140462141538, "eval_loss": 1.1811403036117554, "eval_runtime": 637.3252, "eval_samples_per_second": 99.546, "eval_steps_per_second": 3.111, "step": 44500 }, { "epoch": 4.6992481203007515, "grad_norm": 3.2747066020965576, "learning_rate": 5.500000000000001e-05, "loss": 1.0993, "step": 45000 }, { "epoch": 4.6992481203007515, "eval_accuracy": 0.7782215810670943, "eval_loss": 1.153876781463623, "eval_runtime": 625.6185, "eval_samples_per_second": 101.408, "eval_steps_per_second": 3.17, "step": 45000 }, { "epoch": 4.751461988304094, "grad_norm": 3.6057417392730713, "learning_rate": 5.45e-05, "loss": 1.0998, "step": 45500 }, { "epoch": 4.751461988304094, "eval_accuracy": 0.7785066434739322, "eval_loss": 1.151453971862793, "eval_runtime": 626.7653, "eval_samples_per_second": 101.223, "eval_steps_per_second": 3.164, "step": 45500 }, { "epoch": 4.803675856307435, "grad_norm": 2.923815965652466, "learning_rate": 5.4000000000000005e-05, "loss": 1.0966, "step": 46000 }, { "epoch": 4.803675856307435, "eval_accuracy": 0.7786641899675826, "eval_loss": 1.1550520658493042, "eval_runtime": 626.8592, "eval_samples_per_second": 101.208, "eval_steps_per_second": 3.163, "step": 46000 }, { "epoch": 4.855889724310777, "grad_norm": 2.843076229095459, "learning_rate": 5.3500000000000006e-05, "loss": 1.0977, "step": 46500 }, { "epoch": 4.855889724310777, "eval_accuracy": 0.7794697444838651, "eval_loss": 1.151402473449707, "eval_runtime": 625.4097, "eval_samples_per_second": 101.442, "eval_steps_per_second": 3.171, "step": 46500 }, { "epoch": 4.908103592314118, "grad_norm": 3.2491214275360107, "learning_rate": 5.300000000000001e-05, "loss": 1.0962, "step": 47000 }, { "epoch": 4.908103592314118, "eval_accuracy": 0.7797745180661694, "eval_loss": 1.1584906578063965, "eval_runtime": 624.1222, "eval_samples_per_second": 101.652, "eval_steps_per_second": 3.177, "step": 47000 }, { "epoch": 4.9603174603174605, "grad_norm": 2.8548121452331543, "learning_rate": 5.25e-05, "loss": 1.0933, "step": 47500 }, { "epoch": 4.9603174603174605, "eval_accuracy": 0.7794554886066851, "eval_loss": 1.1526641845703125, "eval_runtime": 644.9796, "eval_samples_per_second": 98.364, "eval_steps_per_second": 3.075, "step": 47500 }, { "epoch": 5.012531328320802, "grad_norm": 3.3320724964141846, "learning_rate": 5.2000000000000004e-05, "loss": 1.0897, "step": 48000 }, { "epoch": 5.012531328320802, "eval_accuracy": 0.7798054689379954, "eval_loss": 1.153532862663269, "eval_runtime": 631.6644, "eval_samples_per_second": 100.438, "eval_steps_per_second": 3.139, "step": 48000 }, { "epoch": 5.064745196324144, "grad_norm": 3.1418938636779785, "learning_rate": 5.1500000000000005e-05, "loss": 1.0727, "step": 48500 }, { "epoch": 5.064745196324144, "eval_accuracy": 0.7807417083407959, "eval_loss": 1.151798963546753, "eval_runtime": 628.4763, "eval_samples_per_second": 100.947, "eval_steps_per_second": 3.155, "step": 48500 }, { "epoch": 5.116959064327485, "grad_norm": 3.242654800415039, "learning_rate": 5.1000000000000006e-05, "loss": 1.0846, "step": 49000 }, { "epoch": 5.116959064327485, "eval_accuracy": 0.7803700732682441, "eval_loss": 1.143381953239441, "eval_runtime": 626.0556, "eval_samples_per_second": 101.338, "eval_steps_per_second": 3.167, "step": 49000 }, { "epoch": 5.169172932330827, "grad_norm": 3.165130615234375, "learning_rate": 5.05e-05, "loss": 1.0817, "step": 49500 }, { "epoch": 5.169172932330827, "eval_accuracy": 0.7805262215474418, "eval_loss": 1.1557620763778687, "eval_runtime": 638.2573, "eval_samples_per_second": 99.4, "eval_steps_per_second": 3.107, "step": 49500 }, { "epoch": 5.221386800334169, "grad_norm": 3.074112892150879, "learning_rate": 5e-05, "loss": 1.0725, "step": 50000 }, { "epoch": 5.221386800334169, "eval_accuracy": 0.7803664543536556, "eval_loss": 1.1497862339019775, "eval_runtime": 635.6514, "eval_samples_per_second": 99.808, "eval_steps_per_second": 3.12, "step": 50000 }, { "epoch": 5.273600668337511, "grad_norm": 3.5100748538970947, "learning_rate": 4.9500000000000004e-05, "loss": 1.076, "step": 50500 }, { "epoch": 5.273600668337511, "eval_accuracy": 0.7819452223531255, "eval_loss": 1.1426007747650146, "eval_runtime": 627.3013, "eval_samples_per_second": 101.136, "eval_steps_per_second": 3.161, "step": 50500 }, { "epoch": 5.325814536340852, "grad_norm": 2.9352290630340576, "learning_rate": 4.9e-05, "loss": 1.0786, "step": 51000 }, { "epoch": 5.325814536340852, "eval_accuracy": 0.7809871489797384, "eval_loss": 1.1469109058380127, "eval_runtime": 651.0651, "eval_samples_per_second": 97.445, "eval_steps_per_second": 3.046, "step": 51000 }, { "epoch": 5.378028404344194, "grad_norm": 3.215942859649658, "learning_rate": 4.85e-05, "loss": 1.0756, "step": 51500 }, { "epoch": 5.378028404344194, "eval_accuracy": 0.7814986150426574, "eval_loss": 1.1447529792785645, "eval_runtime": 637.8608, "eval_samples_per_second": 99.462, "eval_steps_per_second": 3.109, "step": 51500 }, { "epoch": 5.430242272347535, "grad_norm": 3.225491523742676, "learning_rate": 4.8e-05, "loss": 1.0782, "step": 52000 }, { "epoch": 5.430242272347535, "eval_accuracy": 0.781914062234545, "eval_loss": 1.1327252388000488, "eval_runtime": 636.3098, "eval_samples_per_second": 99.705, "eval_steps_per_second": 3.116, "step": 52000 }, { "epoch": 5.482456140350878, "grad_norm": 3.2323813438415527, "learning_rate": 4.75e-05, "loss": 1.0718, "step": 52500 }, { "epoch": 5.482456140350878, "eval_accuracy": 0.7824720975904875, "eval_loss": 1.1324148178100586, "eval_runtime": 634.3817, "eval_samples_per_second": 100.008, "eval_steps_per_second": 3.126, "step": 52500 }, { "epoch": 5.534670008354219, "grad_norm": 2.8218557834625244, "learning_rate": 4.7e-05, "loss": 1.0646, "step": 53000 }, { "epoch": 5.534670008354219, "eval_accuracy": 0.7827635872108434, "eval_loss": 1.139945387840271, "eval_runtime": 685.8615, "eval_samples_per_second": 92.501, "eval_steps_per_second": 2.891, "step": 53000 }, { "epoch": 5.586883876357561, "grad_norm": 3.848472833633423, "learning_rate": 4.6500000000000005e-05, "loss": 1.0692, "step": 53500 }, { "epoch": 5.586883876357561, "eval_accuracy": 0.7828145845856118, "eval_loss": 1.1383306980133057, "eval_runtime": 640.36, "eval_samples_per_second": 99.074, "eval_steps_per_second": 3.097, "step": 53500 }, { "epoch": 5.639097744360902, "grad_norm": 3.0220718383789062, "learning_rate": 4.600000000000001e-05, "loss": 1.0605, "step": 54000 }, { "epoch": 5.639097744360902, "eval_accuracy": 0.7828910126440776, "eval_loss": 1.1487065553665161, "eval_runtime": 639.7494, "eval_samples_per_second": 99.169, "eval_steps_per_second": 3.1, "step": 54000 }, { "epoch": 5.6913116123642435, "grad_norm": 3.226999044418335, "learning_rate": 4.55e-05, "loss": 1.0678, "step": 54500 }, { "epoch": 5.6913116123642435, "eval_accuracy": 0.783646746192109, "eval_loss": 1.1216100454330444, "eval_runtime": 637.7267, "eval_samples_per_second": 99.483, "eval_steps_per_second": 3.109, "step": 54500 }, { "epoch": 5.743525480367586, "grad_norm": 3.0716323852539062, "learning_rate": 4.5e-05, "loss": 1.068, "step": 55000 }, { "epoch": 5.743525480367586, "eval_accuracy": 0.7840726797935829, "eval_loss": 1.1208263635635376, "eval_runtime": 661.1104, "eval_samples_per_second": 95.964, "eval_steps_per_second": 2.999, "step": 55000 }, { "epoch": 5.795739348370927, "grad_norm": 3.032036542892456, "learning_rate": 4.4500000000000004e-05, "loss": 1.0588, "step": 55500 }, { "epoch": 5.795739348370927, "eval_accuracy": 0.7842573639982784, "eval_loss": 1.1288572549819946, "eval_runtime": 644.8754, "eval_samples_per_second": 98.38, "eval_steps_per_second": 3.075, "step": 55500 }, { "epoch": 5.847953216374269, "grad_norm": 2.896811008453369, "learning_rate": 4.4000000000000006e-05, "loss": 1.0723, "step": 56000 }, { "epoch": 5.847953216374269, "eval_accuracy": 0.7839231768143949, "eval_loss": 1.11968195438385, "eval_runtime": 643.1751, "eval_samples_per_second": 98.64, "eval_steps_per_second": 3.083, "step": 56000 }, { "epoch": 5.90016708437761, "grad_norm": 3.429722785949707, "learning_rate": 4.35e-05, "loss": 1.057, "step": 56500 }, { "epoch": 5.90016708437761, "eval_accuracy": 0.7850116259552448, "eval_loss": 1.1247141361236572, "eval_runtime": 639.1011, "eval_samples_per_second": 99.269, "eval_steps_per_second": 3.103, "step": 56500 }, { "epoch": 5.9523809523809526, "grad_norm": 2.920243740081787, "learning_rate": 4.3e-05, "loss": 1.0628, "step": 57000 }, { "epoch": 5.9523809523809526, "eval_accuracy": 0.7846882337268002, "eval_loss": NaN, "eval_runtime": 629.1614, "eval_samples_per_second": 100.837, "eval_steps_per_second": 3.152, "step": 57000 }, { "epoch": 6.004594820384294, "grad_norm": 2.937222957611084, "learning_rate": 4.25e-05, "loss": 1.0619, "step": 57500 }, { "epoch": 6.004594820384294, "eval_accuracy": 0.7845797476831528, "eval_loss": NaN, "eval_runtime": 628.6009, "eval_samples_per_second": 100.927, "eval_steps_per_second": 3.155, "step": 57500 }, { "epoch": 6.056808688387636, "grad_norm": 2.854961395263672, "learning_rate": 4.2e-05, "loss": 1.0386, "step": 58000 }, { "epoch": 6.056808688387636, "eval_accuracy": 0.7856027081868581, "eval_loss": 1.1315497159957886, "eval_runtime": 626.9314, "eval_samples_per_second": 101.196, "eval_steps_per_second": 3.163, "step": 58000 }, { "epoch": 6.109022556390977, "grad_norm": 3.4655332565307617, "learning_rate": 4.15e-05, "loss": 1.0446, "step": 58500 }, { "epoch": 6.109022556390977, "eval_accuracy": 0.785224256670098, "eval_loss": NaN, "eval_runtime": 674.346, "eval_samples_per_second": 94.081, "eval_steps_per_second": 2.941, "step": 58500 }, { "epoch": 6.161236424394319, "grad_norm": 2.93487811088562, "learning_rate": 4.1e-05, "loss": 1.0532, "step": 59000 }, { "epoch": 6.161236424394319, "eval_accuracy": 0.7855367902419893, "eval_loss": 1.1223821640014648, "eval_runtime": 630.6681, "eval_samples_per_second": 100.596, "eval_steps_per_second": 3.144, "step": 59000 }, { "epoch": 6.213450292397661, "grad_norm": 3.50022554397583, "learning_rate": 4.05e-05, "loss": 1.0465, "step": 59500 }, { "epoch": 6.213450292397661, "eval_accuracy": 0.7857459334681388, "eval_loss": 1.1025224924087524, "eval_runtime": 625.4125, "eval_samples_per_second": 101.442, "eval_steps_per_second": 3.171, "step": 59500 }, { "epoch": 6.265664160401003, "grad_norm": 3.2295687198638916, "learning_rate": 4e-05, "loss": 1.0448, "step": 60000 }, { "epoch": 6.265664160401003, "eval_accuracy": 0.7862344900229137, "eval_loss": 1.104607105255127, "eval_runtime": 668.9552, "eval_samples_per_second": 94.839, "eval_steps_per_second": 2.964, "step": 60000 }, { "epoch": 6.317878028404344, "grad_norm": 3.1386146545410156, "learning_rate": 3.9500000000000005e-05, "loss": 1.0304, "step": 60500 }, { "epoch": 6.317878028404344, "eval_accuracy": 0.7869643708248876, "eval_loss": 1.1088374853134155, "eval_runtime": 633.3446, "eval_samples_per_second": 100.171, "eval_steps_per_second": 3.131, "step": 60500 }, { "epoch": 6.370091896407686, "grad_norm": 3.1814932823181152, "learning_rate": 3.9000000000000006e-05, "loss": 1.0354, "step": 61000 }, { "epoch": 6.370091896407686, "eval_accuracy": 0.7865247818397344, "eval_loss": 1.108305811882019, "eval_runtime": 636.3927, "eval_samples_per_second": 99.692, "eval_steps_per_second": 3.116, "step": 61000 }, { "epoch": 6.4223057644110275, "grad_norm": 3.155350685119629, "learning_rate": 3.85e-05, "loss": 1.0437, "step": 61500 }, { "epoch": 6.4223057644110275, "eval_accuracy": 0.7868538533540672, "eval_loss": NaN, "eval_runtime": 630.5912, "eval_samples_per_second": 100.609, "eval_steps_per_second": 3.145, "step": 61500 }, { "epoch": 6.474519632414369, "grad_norm": 2.9944610595703125, "learning_rate": 3.8e-05, "loss": 1.0347, "step": 62000 }, { "epoch": 6.474519632414369, "eval_accuracy": 0.7876532991281543, "eval_loss": 1.0992841720581055, "eval_runtime": 649.6208, "eval_samples_per_second": 97.662, "eval_steps_per_second": 3.053, "step": 62000 }, { "epoch": 6.526733500417711, "grad_norm": 2.89630126953125, "learning_rate": 3.7500000000000003e-05, "loss": 1.041, "step": 62500 }, { "epoch": 6.526733500417711, "eval_accuracy": 0.7875077762136071, "eval_loss": 1.1099497079849243, "eval_runtime": 631.7247, "eval_samples_per_second": 100.428, "eval_steps_per_second": 3.139, "step": 62500 }, { "epoch": 6.578947368421053, "grad_norm": 3.2351603507995605, "learning_rate": 3.7e-05, "loss": 1.0388, "step": 63000 }, { "epoch": 6.578947368421053, "eval_accuracy": 0.78800361330496, "eval_loss": 1.11760413646698, "eval_runtime": 629.3158, "eval_samples_per_second": 100.813, "eval_steps_per_second": 3.151, "step": 63000 }, { "epoch": 6.631161236424394, "grad_norm": 3.2929039001464844, "learning_rate": 3.65e-05, "loss": 1.0383, "step": 63500 }, { "epoch": 6.631161236424394, "eval_accuracy": 0.7879278944035151, "eval_loss": 1.1053088903427124, "eval_runtime": 627.1228, "eval_samples_per_second": 101.165, "eval_steps_per_second": 3.162, "step": 63500 }, { "epoch": 6.683375104427736, "grad_norm": 2.9945790767669678, "learning_rate": 3.6e-05, "loss": 1.0373, "step": 64000 }, { "epoch": 6.683375104427736, "eval_accuracy": 0.7881556360589377, "eval_loss": 1.1067023277282715, "eval_runtime": 664.2963, "eval_samples_per_second": 95.504, "eval_steps_per_second": 2.985, "step": 64000 }, { "epoch": 6.735588972431078, "grad_norm": 3.4380545616149902, "learning_rate": 3.55e-05, "loss": 1.0382, "step": 64500 }, { "epoch": 6.735588972431078, "eval_accuracy": 0.7882482565200516, "eval_loss": 1.116519570350647, "eval_runtime": 629.5022, "eval_samples_per_second": 100.783, "eval_steps_per_second": 3.15, "step": 64500 }, { "epoch": 6.787802840434419, "grad_norm": 3.368530750274658, "learning_rate": 3.5e-05, "loss": 1.0277, "step": 65000 }, { "epoch": 6.787802840434419, "eval_accuracy": 0.7891739950636399, "eval_loss": 1.0915908813476562, "eval_runtime": 632.7041, "eval_samples_per_second": 100.273, "eval_steps_per_second": 3.134, "step": 65000 }, { "epoch": 6.840016708437761, "grad_norm": 3.3649165630340576, "learning_rate": 3.45e-05, "loss": 1.0337, "step": 65500 }, { "epoch": 6.840016708437761, "eval_accuracy": 0.7883815172332914, "eval_loss": 1.1139589548110962, "eval_runtime": 640.7807, "eval_samples_per_second": 99.009, "eval_steps_per_second": 3.095, "step": 65500 }, { "epoch": 6.8922305764411025, "grad_norm": 3.242825746536255, "learning_rate": 3.4000000000000007e-05, "loss": 1.033, "step": 66000 }, { "epoch": 6.8922305764411025, "eval_accuracy": 0.789335987558676, "eval_loss": 1.0902156829833984, "eval_runtime": 634.5325, "eval_samples_per_second": 99.984, "eval_steps_per_second": 3.125, "step": 66000 }, { "epoch": 6.944444444444445, "grad_norm": 2.7917425632476807, "learning_rate": 3.35e-05, "loss": 1.0298, "step": 66500 }, { "epoch": 6.944444444444445, "eval_accuracy": 0.7897472088960492, "eval_loss": 1.0988086462020874, "eval_runtime": 636.5285, "eval_samples_per_second": 99.67, "eval_steps_per_second": 3.115, "step": 66500 }, { "epoch": 6.996658312447786, "grad_norm": 2.9491727352142334, "learning_rate": 3.3e-05, "loss": 1.0325, "step": 67000 }, { "epoch": 6.996658312447786, "eval_accuracy": 0.7894358436331047, "eval_loss": 1.097759485244751, "eval_runtime": 631.2152, "eval_samples_per_second": 100.509, "eval_steps_per_second": 3.142, "step": 67000 }, { "epoch": 7.048872180451128, "grad_norm": 3.81508469581604, "learning_rate": 3.2500000000000004e-05, "loss": 1.013, "step": 67500 }, { "epoch": 7.048872180451128, "eval_accuracy": 0.7897459597854384, "eval_loss": 1.1025677919387817, "eval_runtime": 667.978, "eval_samples_per_second": 94.978, "eval_steps_per_second": 2.969, "step": 67500 }, { "epoch": 7.101086048454469, "grad_norm": 2.6366662979125977, "learning_rate": 3.2000000000000005e-05, "loss": 1.0199, "step": 68000 }, { "epoch": 7.101086048454469, "eval_accuracy": 0.7899999141985645, "eval_loss": 1.1025761365890503, "eval_runtime": 630.4778, "eval_samples_per_second": 100.627, "eval_steps_per_second": 3.145, "step": 68000 }, { "epoch": 7.1532999164578115, "grad_norm": 3.155102014541626, "learning_rate": 3.15e-05, "loss": 1.0157, "step": 68500 }, { "epoch": 7.1532999164578115, "eval_accuracy": 0.7898970170010094, "eval_loss": 1.1064562797546387, "eval_runtime": 625.4866, "eval_samples_per_second": 101.43, "eval_steps_per_second": 3.17, "step": 68500 }, { "epoch": 7.205513784461153, "grad_norm": 2.9615111351013184, "learning_rate": 3.1e-05, "loss": 1.0137, "step": 69000 }, { "epoch": 7.205513784461153, "eval_accuracy": 0.7904352669747394, "eval_loss": 1.0876559019088745, "eval_runtime": 646.3487, "eval_samples_per_second": 98.156, "eval_steps_per_second": 3.068, "step": 69000 }, { "epoch": 7.257727652464495, "grad_norm": 3.723841905593872, "learning_rate": 3.05e-05, "loss": 1.0137, "step": 69500 }, { "epoch": 7.257727652464495, "eval_accuracy": 0.7899743837210673, "eval_loss": 1.0912542343139648, "eval_runtime": 652.0884, "eval_samples_per_second": 97.292, "eval_steps_per_second": 3.041, "step": 69500 }, { "epoch": 7.309941520467836, "grad_norm": 3.6348705291748047, "learning_rate": 3e-05, "loss": 1.0167, "step": 70000 }, { "epoch": 7.309941520467836, "eval_accuracy": 0.7903721199263885, "eval_loss": NaN, "eval_runtime": 630.9965, "eval_samples_per_second": 100.544, "eval_steps_per_second": 3.143, "step": 70000 }, { "epoch": 7.362155388471178, "grad_norm": 3.2423222064971924, "learning_rate": 2.95e-05, "loss": 1.0089, "step": 70500 }, { "epoch": 7.362155388471178, "eval_accuracy": 0.7906133078105837, "eval_loss": 1.0956330299377441, "eval_runtime": 633.5776, "eval_samples_per_second": 100.135, "eval_steps_per_second": 3.13, "step": 70500 }, { "epoch": 7.41436925647452, "grad_norm": 3.15704083442688, "learning_rate": 2.9e-05, "loss": 1.0055, "step": 71000 }, { "epoch": 7.41436925647452, "eval_accuracy": 0.7912404700602101, "eval_loss": NaN, "eval_runtime": 649.7229, "eval_samples_per_second": 97.646, "eval_steps_per_second": 3.052, "step": 71000 }, { "epoch": 7.466583124477861, "grad_norm": 3.387059211730957, "learning_rate": 2.8499999999999998e-05, "loss": 1.0135, "step": 71500 }, { "epoch": 7.466583124477861, "eval_accuracy": 0.7914904478088925, "eval_loss": 1.0922181606292725, "eval_runtime": 633.8807, "eval_samples_per_second": 100.087, "eval_steps_per_second": 3.128, "step": 71500 }, { "epoch": 7.518796992481203, "grad_norm": 3.3285696506500244, "learning_rate": 2.8000000000000003e-05, "loss": 1.0041, "step": 72000 }, { "epoch": 7.518796992481203, "eval_accuracy": 0.7919370824790591, "eval_loss": 1.0766669511795044, "eval_runtime": 640.2628, "eval_samples_per_second": 99.089, "eval_steps_per_second": 3.097, "step": 72000 }, { "epoch": 7.571010860484545, "grad_norm": 3.0725443363189697, "learning_rate": 2.7500000000000004e-05, "loss": 0.9982, "step": 72500 }, { "epoch": 7.571010860484545, "eval_accuracy": 0.7914295287916221, "eval_loss": 1.0892815589904785, "eval_runtime": 636.4517, "eval_samples_per_second": 99.682, "eval_steps_per_second": 3.116, "step": 72500 }, { "epoch": 7.6232247284878865, "grad_norm": 3.2176389694213867, "learning_rate": 2.7000000000000002e-05, "loss": 1.0066, "step": 73000 }, { "epoch": 7.6232247284878865, "eval_accuracy": 0.7917736483647045, "eval_loss": 1.092371940612793, "eval_runtime": 621.243, "eval_samples_per_second": 102.123, "eval_steps_per_second": 3.192, "step": 73000 }, { "epoch": 7.675438596491228, "grad_norm": 2.7997543811798096, "learning_rate": 2.6500000000000004e-05, "loss": 1.0073, "step": 73500 }, { "epoch": 7.675438596491228, "eval_accuracy": 0.7922002806270909, "eval_loss": 1.0904649496078491, "eval_runtime": 620.7253, "eval_samples_per_second": 102.208, "eval_steps_per_second": 3.195, "step": 73500 }, { "epoch": 7.72765246449457, "grad_norm": 3.626774549484253, "learning_rate": 2.6000000000000002e-05, "loss": 1.0015, "step": 74000 }, { "epoch": 7.72765246449457, "eval_accuracy": 0.7917596219688585, "eval_loss": 1.0867533683776855, "eval_runtime": 618.3607, "eval_samples_per_second": 102.599, "eval_steps_per_second": 3.207, "step": 74000 }, { "epoch": 7.779866332497911, "grad_norm": 2.8419744968414307, "learning_rate": 2.5500000000000003e-05, "loss": 1.0049, "step": 74500 }, { "epoch": 7.779866332497911, "eval_accuracy": 0.7928055223543873, "eval_loss": 1.0817667245864868, "eval_runtime": 617.1968, "eval_samples_per_second": 102.792, "eval_steps_per_second": 3.213, "step": 74500 }, { "epoch": 7.832080200501253, "grad_norm": 2.822751760482788, "learning_rate": 2.5e-05, "loss": 0.9957, "step": 75000 }, { "epoch": 7.832080200501253, "eval_accuracy": 0.7932603950321594, "eval_loss": 1.0652581453323364, "eval_runtime": 616.9484, "eval_samples_per_second": 102.834, "eval_steps_per_second": 3.214, "step": 75000 }, { "epoch": 7.884294068504595, "grad_norm": 2.9094507694244385, "learning_rate": 2.45e-05, "loss": 1.0042, "step": 75500 }, { "epoch": 7.884294068504595, "eval_accuracy": 0.7934020652494004, "eval_loss": 1.0694156885147095, "eval_runtime": 617.9112, "eval_samples_per_second": 102.673, "eval_steps_per_second": 3.209, "step": 75500 }, { "epoch": 7.936507936507937, "grad_norm": 2.5249078273773193, "learning_rate": 2.4e-05, "loss": 1.0031, "step": 76000 }, { "epoch": 7.936507936507937, "eval_accuracy": 0.7935896894661485, "eval_loss": 1.0808907747268677, "eval_runtime": 618.1232, "eval_samples_per_second": 102.638, "eval_steps_per_second": 3.208, "step": 76000 }, { "epoch": 7.988721804511278, "grad_norm": 3.253228187561035, "learning_rate": 2.35e-05, "loss": 0.9979, "step": 76500 }, { "epoch": 7.988721804511278, "eval_accuracy": 0.7933455565489351, "eval_loss": 1.0679088830947876, "eval_runtime": 618.0488, "eval_samples_per_second": 102.65, "eval_steps_per_second": 3.208, "step": 76500 }, { "epoch": 8.04093567251462, "grad_norm": 2.6395580768585205, "learning_rate": 2.3000000000000003e-05, "loss": 0.9825, "step": 77000 }, { "epoch": 8.04093567251462, "eval_accuracy": 0.7935253800154058, "eval_loss": 1.07496976852417, "eval_runtime": 617.9799, "eval_samples_per_second": 102.662, "eval_steps_per_second": 3.209, "step": 77000 }, { "epoch": 8.093149540517961, "grad_norm": 3.0883655548095703, "learning_rate": 2.25e-05, "loss": 0.9926, "step": 77500 }, { "epoch": 8.093149540517961, "eval_accuracy": 0.7942360815200339, "eval_loss": 1.0814706087112427, "eval_runtime": 618.6019, "eval_samples_per_second": 102.559, "eval_steps_per_second": 3.206, "step": 77500 }, { "epoch": 8.145363408521304, "grad_norm": 3.356086015701294, "learning_rate": 2.2000000000000003e-05, "loss": 0.9897, "step": 78000 }, { "epoch": 8.145363408521304, "eval_accuracy": 0.7933161062503574, "eval_loss": NaN, "eval_runtime": 617.0688, "eval_samples_per_second": 102.813, "eval_steps_per_second": 3.214, "step": 78000 }, { "epoch": 8.197577276524646, "grad_norm": 3.31772780418396, "learning_rate": 2.15e-05, "loss": 0.9871, "step": 78500 }, { "epoch": 8.197577276524646, "eval_accuracy": 0.7943132034313805, "eval_loss": 1.0676552057266235, "eval_runtime": 623.639, "eval_samples_per_second": 101.73, "eval_steps_per_second": 3.18, "step": 78500 }, { "epoch": 8.249791144527986, "grad_norm": 3.543698310852051, "learning_rate": 2.1e-05, "loss": 1.0001, "step": 79000 }, { "epoch": 8.249791144527986, "eval_accuracy": 0.7938355754922207, "eval_loss": 1.0734678506851196, "eval_runtime": 617.8987, "eval_samples_per_second": 102.675, "eval_steps_per_second": 3.209, "step": 79000 }, { "epoch": 8.302005012531328, "grad_norm": 2.846734046936035, "learning_rate": 2.05e-05, "loss": 0.987, "step": 79500 }, { "epoch": 8.302005012531328, "eval_accuracy": 0.7944878324075739, "eval_loss": 1.0765976905822754, "eval_runtime": 618.672, "eval_samples_per_second": 102.547, "eval_steps_per_second": 3.205, "step": 79500 }, { "epoch": 8.35421888053467, "grad_norm": 2.871553897857666, "learning_rate": 2e-05, "loss": 0.9806, "step": 80000 }, { "epoch": 8.35421888053467, "eval_accuracy": 0.794915350004184, "eval_loss": 1.0597549676895142, "eval_runtime": 618.1763, "eval_samples_per_second": 102.629, "eval_steps_per_second": 3.208, "step": 80000 }, { "epoch": 8.406432748538013, "grad_norm": 2.959845542907715, "learning_rate": 1.9500000000000003e-05, "loss": 0.984, "step": 80500 }, { "epoch": 8.406432748538013, "eval_accuracy": 0.7946179832675856, "eval_loss": 1.0595872402191162, "eval_runtime": 617.7039, "eval_samples_per_second": 102.708, "eval_steps_per_second": 3.21, "step": 80500 }, { "epoch": 8.458646616541353, "grad_norm": 2.7344448566436768, "learning_rate": 1.9e-05, "loss": 0.9822, "step": 81000 }, { "epoch": 8.458646616541353, "eval_accuracy": 0.7951521505243792, "eval_loss": 1.0736610889434814, "eval_runtime": 618.3207, "eval_samples_per_second": 102.605, "eval_steps_per_second": 3.207, "step": 81000 }, { "epoch": 8.510860484544695, "grad_norm": 3.7596890926361084, "learning_rate": 1.85e-05, "loss": 0.9791, "step": 81500 }, { "epoch": 8.510860484544695, "eval_accuracy": 0.795253175296364, "eval_loss": 1.0666213035583496, "eval_runtime": 620.245, "eval_samples_per_second": 102.287, "eval_steps_per_second": 3.197, "step": 81500 }, { "epoch": 8.563074352548037, "grad_norm": 2.727389335632324, "learning_rate": 1.8e-05, "loss": 0.9804, "step": 82000 }, { "epoch": 8.563074352548037, "eval_accuracy": 0.7953146030482228, "eval_loss": 1.0600041151046753, "eval_runtime": 618.8007, "eval_samples_per_second": 102.526, "eval_steps_per_second": 3.205, "step": 82000 }, { "epoch": 8.615288220551378, "grad_norm": 2.9671452045440674, "learning_rate": 1.75e-05, "loss": 0.9703, "step": 82500 }, { "epoch": 8.615288220551378, "eval_accuracy": 0.7963827692254926, "eval_loss": 1.0601171255111694, "eval_runtime": 616.1581, "eval_samples_per_second": 102.965, "eval_steps_per_second": 3.218, "step": 82500 }, { "epoch": 8.66750208855472, "grad_norm": 2.9628329277038574, "learning_rate": 1.7000000000000003e-05, "loss": 0.9768, "step": 83000 }, { "epoch": 8.66750208855472, "eval_accuracy": 0.7952524450397661, "eval_loss": 1.0768334865570068, "eval_runtime": 616.7531, "eval_samples_per_second": 102.866, "eval_steps_per_second": 3.215, "step": 83000 }, { "epoch": 8.719715956558062, "grad_norm": 3.6838812828063965, "learning_rate": 1.65e-05, "loss": 0.9722, "step": 83500 }, { "epoch": 8.719715956558062, "eval_accuracy": 0.796164760278738, "eval_loss": NaN, "eval_runtime": 616.0678, "eval_samples_per_second": 102.981, "eval_steps_per_second": 3.219, "step": 83500 }, { "epoch": 8.771929824561404, "grad_norm": 3.7852931022644043, "learning_rate": 1.6000000000000003e-05, "loss": 0.9813, "step": 84000 }, { "epoch": 8.771929824561404, "eval_accuracy": 0.7964795153494032, "eval_loss": NaN, "eval_runtime": 616.9446, "eval_samples_per_second": 102.834, "eval_steps_per_second": 3.214, "step": 84000 }, { "epoch": 8.824143692564745, "grad_norm": 2.85229229927063, "learning_rate": 1.55e-05, "loss": 0.9732, "step": 84500 }, { "epoch": 8.824143692564745, "eval_accuracy": 0.7960116027401912, "eval_loss": 1.061353325843811, "eval_runtime": 624.2524, "eval_samples_per_second": 101.63, "eval_steps_per_second": 3.177, "step": 84500 }, { "epoch": 8.876357560568087, "grad_norm": 3.463848114013672, "learning_rate": 1.5e-05, "loss": 0.9764, "step": 85000 }, { "epoch": 8.876357560568087, "eval_accuracy": 0.7963621203122637, "eval_loss": 1.049773097038269, "eval_runtime": 618.458, "eval_samples_per_second": 102.583, "eval_steps_per_second": 3.206, "step": 85000 }, { "epoch": 8.928571428571429, "grad_norm": 2.7122750282287598, "learning_rate": 1.45e-05, "loss": 0.9829, "step": 85500 }, { "epoch": 8.928571428571429, "eval_accuracy": 0.7965627753528177, "eval_loss": 1.069191813468933, "eval_runtime": 617.5416, "eval_samples_per_second": 102.735, "eval_steps_per_second": 3.211, "step": 85500 }, { "epoch": 8.980785296574771, "grad_norm": 2.976637840270996, "learning_rate": 1.4000000000000001e-05, "loss": 0.9741, "step": 86000 }, { "epoch": 8.980785296574771, "eval_accuracy": 0.7967220469069121, "eval_loss": 1.055444598197937, "eval_runtime": 614.0894, "eval_samples_per_second": 103.312, "eval_steps_per_second": 3.229, "step": 86000 }, { "epoch": 9.032999164578111, "grad_norm": 2.89384126663208, "learning_rate": 1.3500000000000001e-05, "loss": 0.9648, "step": 86500 }, { "epoch": 9.032999164578111, "eval_accuracy": 0.797160834758222, "eval_loss": 1.058738350868225, "eval_runtime": 615.784, "eval_samples_per_second": 103.028, "eval_steps_per_second": 3.22, "step": 86500 }, { "epoch": 9.085213032581454, "grad_norm": 3.009979248046875, "learning_rate": 1.3000000000000001e-05, "loss": 0.9606, "step": 87000 }, { "epoch": 9.085213032581454, "eval_accuracy": 0.7971404883516657, "eval_loss": 1.0621484518051147, "eval_runtime": 618.535, "eval_samples_per_second": 102.57, "eval_steps_per_second": 3.206, "step": 87000 }, { "epoch": 9.137426900584796, "grad_norm": 3.301671266555786, "learning_rate": 1.25e-05, "loss": 0.9682, "step": 87500 }, { "epoch": 9.137426900584796, "eval_accuracy": 0.7973926994154387, "eval_loss": 1.0569199323654175, "eval_runtime": 616.4943, "eval_samples_per_second": 102.909, "eval_steps_per_second": 3.217, "step": 87500 }, { "epoch": 9.189640768588138, "grad_norm": 2.932385206222534, "learning_rate": 1.2e-05, "loss": 0.9644, "step": 88000 }, { "epoch": 9.189640768588138, "eval_accuracy": 0.7974666942085554, "eval_loss": 1.0533905029296875, "eval_runtime": 620.5893, "eval_samples_per_second": 102.23, "eval_steps_per_second": 3.195, "step": 88000 }, { "epoch": 9.241854636591478, "grad_norm": 3.1355013847351074, "learning_rate": 1.1500000000000002e-05, "loss": 0.9658, "step": 88500 }, { "epoch": 9.241854636591478, "eval_accuracy": 0.7977740520228777, "eval_loss": 1.0476280450820923, "eval_runtime": 618.7726, "eval_samples_per_second": 102.53, "eval_steps_per_second": 3.205, "step": 88500 }, { "epoch": 9.29406850459482, "grad_norm": 3.353086233139038, "learning_rate": 1.1000000000000001e-05, "loss": 0.9641, "step": 89000 }, { "epoch": 9.29406850459482, "eval_accuracy": 0.7973864228195986, "eval_loss": 1.041751742362976, "eval_runtime": 614.3896, "eval_samples_per_second": 103.262, "eval_steps_per_second": 3.228, "step": 89000 }, { "epoch": 9.346282372598163, "grad_norm": 2.8681247234344482, "learning_rate": 1.05e-05, "loss": 0.963, "step": 89500 }, { "epoch": 9.346282372598163, "eval_accuracy": 0.798292731668655, "eval_loss": 1.0465270280838013, "eval_runtime": 615.1098, "eval_samples_per_second": 103.141, "eval_steps_per_second": 3.224, "step": 89500 }, { "epoch": 9.398496240601503, "grad_norm": 3.2927052974700928, "learning_rate": 1e-05, "loss": 0.9562, "step": 90000 }, { "epoch": 9.398496240601503, "eval_accuracy": 0.798140924991145, "eval_loss": 1.0476934909820557, "eval_runtime": 616.723, "eval_samples_per_second": 102.871, "eval_steps_per_second": 3.215, "step": 90000 }, { "epoch": 9.450710108604845, "grad_norm": 3.1475863456726074, "learning_rate": 9.5e-06, "loss": 0.9605, "step": 90500 }, { "epoch": 9.450710108604845, "eval_accuracy": 0.7977010212122122, "eval_loss": 1.0535070896148682, "eval_runtime": 615.5187, "eval_samples_per_second": 103.072, "eval_steps_per_second": 3.222, "step": 90500 }, { "epoch": 9.502923976608187, "grad_norm": 3.662548780441284, "learning_rate": 9e-06, "loss": 0.9692, "step": 91000 }, { "epoch": 9.502923976608187, "eval_accuracy": 0.7981552799718717, "eval_loss": 1.0515964031219482, "eval_runtime": 615.1273, "eval_samples_per_second": 103.138, "eval_steps_per_second": 3.224, "step": 91000 }, { "epoch": 9.55513784461153, "grad_norm": 3.5501346588134766, "learning_rate": 8.500000000000002e-06, "loss": 0.966, "step": 91500 }, { "epoch": 9.55513784461153, "eval_accuracy": 0.798176074856451, "eval_loss": 1.0500941276550293, "eval_runtime": 616.9498, "eval_samples_per_second": 102.833, "eval_steps_per_second": 3.214, "step": 91500 }, { "epoch": 9.60735171261487, "grad_norm": 3.102790117263794, "learning_rate": 8.000000000000001e-06, "loss": 0.9614, "step": 92000 }, { "epoch": 9.60735171261487, "eval_accuracy": 0.7986090909415797, "eval_loss": 1.0444718599319458, "eval_runtime": 614.407, "eval_samples_per_second": 103.259, "eval_steps_per_second": 3.228, "step": 92000 }, { "epoch": 9.659565580618212, "grad_norm": 3.504926919937134, "learning_rate": 7.5e-06, "loss": 0.9606, "step": 92500 }, { "epoch": 9.659565580618212, "eval_accuracy": 0.7988812850380725, "eval_loss": 1.0497493743896484, "eval_runtime": 615.4873, "eval_samples_per_second": 103.078, "eval_steps_per_second": 3.222, "step": 92500 }, { "epoch": 9.711779448621554, "grad_norm": 3.648078203201294, "learning_rate": 7.000000000000001e-06, "loss": 0.9507, "step": 93000 }, { "epoch": 9.711779448621554, "eval_accuracy": 0.798754673907639, "eval_loss": 1.0451751947402954, "eval_runtime": 615.9945, "eval_samples_per_second": 102.993, "eval_steps_per_second": 3.219, "step": 93000 }, { "epoch": 9.763993316624896, "grad_norm": 3.316922903060913, "learning_rate": 6.5000000000000004e-06, "loss": 0.9584, "step": 93500 }, { "epoch": 9.763993316624896, "eval_accuracy": 0.7987269314893306, "eval_loss": 1.0408653020858765, "eval_runtime": 618.0487, "eval_samples_per_second": 102.65, "eval_steps_per_second": 3.208, "step": 93500 }, { "epoch": 9.816207184628237, "grad_norm": 2.943169116973877, "learning_rate": 6e-06, "loss": 0.9572, "step": 94000 }, { "epoch": 9.816207184628237, "eval_accuracy": 0.7989658711747047, "eval_loss": 1.0431112051010132, "eval_runtime": 619.8767, "eval_samples_per_second": 102.348, "eval_steps_per_second": 3.199, "step": 94000 }, { "epoch": 9.868421052631579, "grad_norm": 2.713733196258545, "learning_rate": 5.500000000000001e-06, "loss": 0.9552, "step": 94500 }, { "epoch": 9.868421052631579, "eval_accuracy": 0.7991049545929755, "eval_loss": 1.0465214252471924, "eval_runtime": 615.6457, "eval_samples_per_second": 103.051, "eval_steps_per_second": 3.221, "step": 94500 }, { "epoch": 9.920634920634921, "grad_norm": 3.5131709575653076, "learning_rate": 5e-06, "loss": 0.9542, "step": 95000 }, { "epoch": 9.920634920634921, "eval_accuracy": 0.7993720778709359, "eval_loss": 1.043885350227356, "eval_runtime": 616.5178, "eval_samples_per_second": 102.905, "eval_steps_per_second": 3.216, "step": 95000 }, { "epoch": 9.972848788638263, "grad_norm": 3.0255749225616455, "learning_rate": 4.5e-06, "loss": 0.9505, "step": 95500 }, { "epoch": 9.972848788638263, "eval_accuracy": 0.7994996190810016, "eval_loss": 1.0470497608184814, "eval_runtime": 616.7354, "eval_samples_per_second": 102.869, "eval_steps_per_second": 3.215, "step": 95500 }, { "epoch": 10.025062656641603, "grad_norm": 4.134832382202148, "learning_rate": 4.000000000000001e-06, "loss": 0.9472, "step": 96000 }, { "epoch": 10.025062656641603, "eval_accuracy": 0.7994244357414679, "eval_loss": 1.0445414781570435, "eval_runtime": 617.5631, "eval_samples_per_second": 102.731, "eval_steps_per_second": 3.211, "step": 96000 }, { "epoch": 10.077276524644946, "grad_norm": 3.3011510372161865, "learning_rate": 3.5000000000000004e-06, "loss": 0.9467, "step": 96500 }, { "epoch": 10.077276524644946, "eval_accuracy": 0.7997884398890616, "eval_loss": 1.0422955751419067, "eval_runtime": 617.5269, "eval_samples_per_second": 102.737, "eval_steps_per_second": 3.211, "step": 96500 }, { "epoch": 10.129490392648288, "grad_norm": 3.65120792388916, "learning_rate": 3e-06, "loss": 0.9497, "step": 97000 }, { "epoch": 10.129490392648288, "eval_accuracy": 0.7994027266856298, "eval_loss": 1.0433976650238037, "eval_runtime": 615.8756, "eval_samples_per_second": 103.013, "eval_steps_per_second": 3.22, "step": 97000 }, { "epoch": 10.18170426065163, "grad_norm": 3.1329991817474365, "learning_rate": 2.5e-06, "loss": 0.9575, "step": 97500 }, { "epoch": 10.18170426065163, "eval_accuracy": 0.8000353846923677, "eval_loss": 1.0379133224487305, "eval_runtime": 615.8872, "eval_samples_per_second": 103.011, "eval_steps_per_second": 3.22, "step": 97500 }, { "epoch": 10.23391812865497, "grad_norm": 3.3150088787078857, "learning_rate": 2.0000000000000003e-06, "loss": 0.9478, "step": 98000 }, { "epoch": 10.23391812865497, "eval_accuracy": 0.7997029578657322, "eval_loss": 1.039129614830017, "eval_runtime": 615.7144, "eval_samples_per_second": 103.04, "eval_steps_per_second": 3.221, "step": 98000 }, { "epoch": 10.286131996658312, "grad_norm": 2.9896209239959717, "learning_rate": 1.5e-06, "loss": 0.9428, "step": 98500 }, { "epoch": 10.286131996658312, "eval_accuracy": 0.799764992788252, "eval_loss": 1.0273813009262085, "eval_runtime": 615.1763, "eval_samples_per_second": 103.13, "eval_steps_per_second": 3.223, "step": 98500 } ], "logging_steps": 500, "max_steps": 100000, "num_input_tokens_seen": 0, "num_train_epochs": 11, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 8.317352524927468e+17, "train_batch_size": 16, "trial_name": null, "trial_params": null }