{ "best_metric": 0.839046835899353, "best_model_checkpoint": "./bert-large-jpn-grammar\\checkpoint-5980", "epoch": 10.0, "eval_steps": 500, "global_step": 5980, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0016722408026755853, "grad_norm": 4.53647518157959, "learning_rate": 2.5083612040133778e-08, "loss": 7.8555, "step": 1 }, { "epoch": 0.08361204013377926, "grad_norm": 4.600866317749023, "learning_rate": 1.254180602006689e-06, "loss": 7.8831, "step": 50 }, { "epoch": 0.16722408026755853, "grad_norm": 4.168644428253174, "learning_rate": 2.508361204013378e-06, "loss": 7.8681, "step": 100 }, { "epoch": 0.2508361204013378, "grad_norm": 13.43441104888916, "learning_rate": 3.762541806020067e-06, "loss": 7.8594, "step": 150 }, { "epoch": 0.33444816053511706, "grad_norm": 3.9283835887908936, "learning_rate": 5.016722408026756e-06, "loss": 7.8337, "step": 200 }, { "epoch": 0.4180602006688963, "grad_norm": 3.899462938308716, "learning_rate": 6.270903010033445e-06, "loss": 7.8177, "step": 250 }, { "epoch": 0.5016722408026756, "grad_norm": 4.085901737213135, "learning_rate": 7.525083612040134e-06, "loss": 7.7797, "step": 300 }, { "epoch": 0.5852842809364549, "grad_norm": 4.916100025177002, "learning_rate": 8.779264214046824e-06, "loss": 7.7515, "step": 350 }, { "epoch": 0.6688963210702341, "grad_norm": 5.722199440002441, "learning_rate": 1.0033444816053512e-05, "loss": 7.7095, "step": 400 }, { "epoch": 0.7525083612040134, "grad_norm": 6.974526405334473, "learning_rate": 1.1287625418060201e-05, "loss": 7.6088, "step": 450 }, { "epoch": 0.8361204013377926, "grad_norm": 7.1739888191223145, "learning_rate": 1.254180602006689e-05, "loss": 7.4935, "step": 500 }, { "epoch": 0.919732441471572, "grad_norm": 7.732913494110107, "learning_rate": 1.379598662207358e-05, "loss": 7.3773, "step": 550 }, { "epoch": 1.0, "eval_accuracy": 0.1417224109172821, "eval_f1": 0.1417224109172821, "eval_loss": 7.010268688201904, "eval_runtime": 59.6669, "eval_samples_per_second": 80.178, "eval_steps_per_second": 1.257, "step": 598 }, { "epoch": 1.0033444816053512, "grad_norm": 7.402344703674316, "learning_rate": 1.5050167224080267e-05, "loss": 7.2391, "step": 600 }, { "epoch": 1.0869565217391304, "grad_norm": 7.742281913757324, "learning_rate": 1.6304347826086955e-05, "loss": 6.9797, "step": 650 }, { "epoch": 1.1705685618729098, "grad_norm": 7.522943019866943, "learning_rate": 1.7558528428093648e-05, "loss": 6.8021, "step": 700 }, { "epoch": 1.254180602006689, "grad_norm": 7.755581378936768, "learning_rate": 1.8812709030100337e-05, "loss": 6.6368, "step": 750 }, { "epoch": 1.3377926421404682, "grad_norm": 7.403460502624512, "learning_rate": 2.0066889632107023e-05, "loss": 6.4424, "step": 800 }, { "epoch": 1.4214046822742474, "grad_norm": 7.275426864624023, "learning_rate": 2.1321070234113713e-05, "loss": 6.2322, "step": 850 }, { "epoch": 1.5050167224080266, "grad_norm": 7.691336631774902, "learning_rate": 2.2575250836120402e-05, "loss": 6.0312, "step": 900 }, { "epoch": 1.588628762541806, "grad_norm": 7.337137699127197, "learning_rate": 2.382943143812709e-05, "loss": 5.8301, "step": 950 }, { "epoch": 1.6722408026755853, "grad_norm": 7.574013710021973, "learning_rate": 2.508361204013378e-05, "loss": 5.6195, "step": 1000 }, { "epoch": 1.7558528428093645, "grad_norm": 7.1535773277282715, "learning_rate": 2.6337792642140467e-05, "loss": 5.3459, "step": 1050 }, { "epoch": 1.839464882943144, "grad_norm": 7.483964443206787, "learning_rate": 2.759197324414716e-05, "loss": 5.1481, "step": 1100 }, { "epoch": 1.9230769230769231, "grad_norm": 7.383283615112305, "learning_rate": 2.884615384615385e-05, "loss": 4.9299, "step": 1150 }, { "epoch": 2.0, "eval_accuracy": 0.6444398164749146, "eval_f1": 0.6444398164749146, "eval_loss": 4.192617893218994, "eval_runtime": 55.4042, "eval_samples_per_second": 86.347, "eval_steps_per_second": 1.354, "step": 1196 }, { "epoch": 2.0066889632107023, "grad_norm": 7.1753668785095215, "learning_rate": 2.9974916387959864e-05, "loss": 4.6789, "step": 1200 }, { "epoch": 2.0903010033444818, "grad_norm": 7.401775360107422, "learning_rate": 2.9661371237458197e-05, "loss": 4.2448, "step": 1250 }, { "epoch": 2.1739130434782608, "grad_norm": 7.254606246948242, "learning_rate": 2.9347826086956523e-05, "loss": 4.0163, "step": 1300 }, { "epoch": 2.25752508361204, "grad_norm": 7.533504962921143, "learning_rate": 2.903428093645485e-05, "loss": 3.8083, "step": 1350 }, { "epoch": 2.3411371237458196, "grad_norm": 6.658289432525635, "learning_rate": 2.8720735785953178e-05, "loss": 3.5772, "step": 1400 }, { "epoch": 2.4247491638795986, "grad_norm": 9.382271766662598, "learning_rate": 2.8407190635451504e-05, "loss": 3.4262, "step": 1450 }, { "epoch": 2.508361204013378, "grad_norm": 6.964470863342285, "learning_rate": 2.8093645484949833e-05, "loss": 3.2302, "step": 1500 }, { "epoch": 2.591973244147157, "grad_norm": 6.510575294494629, "learning_rate": 2.7780100334448163e-05, "loss": 3.0355, "step": 1550 }, { "epoch": 2.6755852842809364, "grad_norm": 6.38152551651001, "learning_rate": 2.746655518394649e-05, "loss": 2.8789, "step": 1600 }, { "epoch": 2.759197324414716, "grad_norm": 6.31973934173584, "learning_rate": 2.7153010033444814e-05, "loss": 2.6744, "step": 1650 }, { "epoch": 2.842809364548495, "grad_norm": 8.131747245788574, "learning_rate": 2.6839464882943144e-05, "loss": 2.5437, "step": 1700 }, { "epoch": 2.9264214046822743, "grad_norm": 6.337752342224121, "learning_rate": 2.6525919732441473e-05, "loss": 2.3661, "step": 1750 }, { "epoch": 3.0, "eval_accuracy": 0.735785961151123, "eval_f1": 0.735785961151123, "eval_loss": 1.8872631788253784, "eval_runtime": 58.5991, "eval_samples_per_second": 81.64, "eval_steps_per_second": 1.28, "step": 1794 }, { "epoch": 3.0100334448160537, "grad_norm": 6.063199520111084, "learning_rate": 2.6212374581939803e-05, "loss": 2.2655, "step": 1800 }, { "epoch": 3.0936454849498327, "grad_norm": 5.415772914886475, "learning_rate": 2.589882943143813e-05, "loss": 1.9437, "step": 1850 }, { "epoch": 3.177257525083612, "grad_norm": 5.352977752685547, "learning_rate": 2.5585284280936454e-05, "loss": 1.8229, "step": 1900 }, { "epoch": 3.260869565217391, "grad_norm": 6.909513473510742, "learning_rate": 2.5271739130434784e-05, "loss": 1.7459, "step": 1950 }, { "epoch": 3.3444816053511706, "grad_norm": 5.768405914306641, "learning_rate": 2.495819397993311e-05, "loss": 1.6214, "step": 2000 }, { "epoch": 3.42809364548495, "grad_norm": 4.940030574798584, "learning_rate": 2.464464882943144e-05, "loss": 1.5412, "step": 2050 }, { "epoch": 3.511705685618729, "grad_norm": 5.3233323097229, "learning_rate": 2.433110367892977e-05, "loss": 1.4467, "step": 2100 }, { "epoch": 3.5953177257525084, "grad_norm": 5.229661464691162, "learning_rate": 2.4017558528428094e-05, "loss": 1.3918, "step": 2150 }, { "epoch": 3.678929765886288, "grad_norm": 4.888563632965088, "learning_rate": 2.370401337792642e-05, "loss": 1.3623, "step": 2200 }, { "epoch": 3.762541806020067, "grad_norm": 4.426875591278076, "learning_rate": 2.339046822742475e-05, "loss": 1.2494, "step": 2250 }, { "epoch": 3.8461538461538463, "grad_norm": 5.58463191986084, "learning_rate": 2.307692307692308e-05, "loss": 1.2034, "step": 2300 }, { "epoch": 3.9297658862876252, "grad_norm": 4.331861972808838, "learning_rate": 2.2763377926421405e-05, "loss": 1.1543, "step": 2350 }, { "epoch": 4.0, "eval_accuracy": 0.7759197354316711, "eval_f1": 0.7759197354316711, "eval_loss": 0.9816091060638428, "eval_runtime": 56.0644, "eval_samples_per_second": 85.33, "eval_steps_per_second": 1.338, "step": 2392 }, { "epoch": 4.013377926421405, "grad_norm": 4.542367935180664, "learning_rate": 2.2449832775919734e-05, "loss": 1.0647, "step": 2400 }, { "epoch": 4.096989966555184, "grad_norm": 4.851340293884277, "learning_rate": 2.213628762541806e-05, "loss": 0.9377, "step": 2450 }, { "epoch": 4.1806020066889635, "grad_norm": 4.438965797424316, "learning_rate": 2.1822742474916386e-05, "loss": 0.9529, "step": 2500 }, { "epoch": 4.264214046822742, "grad_norm": 4.728175163269043, "learning_rate": 2.150919732441472e-05, "loss": 0.8806, "step": 2550 }, { "epoch": 4.3478260869565215, "grad_norm": 4.2397284507751465, "learning_rate": 2.1195652173913045e-05, "loss": 0.8575, "step": 2600 }, { "epoch": 4.431438127090301, "grad_norm": 3.8949732780456543, "learning_rate": 2.088210702341137e-05, "loss": 0.8447, "step": 2650 }, { "epoch": 4.51505016722408, "grad_norm": 4.523480415344238, "learning_rate": 2.05685618729097e-05, "loss": 0.8167, "step": 2700 }, { "epoch": 4.59866220735786, "grad_norm": 5.011022090911865, "learning_rate": 2.0255016722408026e-05, "loss": 0.7993, "step": 2750 }, { "epoch": 4.682274247491639, "grad_norm": 4.6291399002075195, "learning_rate": 1.9941471571906352e-05, "loss": 0.7959, "step": 2800 }, { "epoch": 4.765886287625418, "grad_norm": 4.647440433502197, "learning_rate": 1.9627926421404685e-05, "loss": 0.7582, "step": 2850 }, { "epoch": 4.849498327759197, "grad_norm": 3.1524765491485596, "learning_rate": 1.931438127090301e-05, "loss": 0.7298, "step": 2900 }, { "epoch": 4.933110367892977, "grad_norm": 5.388834476470947, "learning_rate": 1.9000836120401337e-05, "loss": 0.6885, "step": 2950 }, { "epoch": 5.0, "eval_accuracy": 0.8018394708633423, "eval_f1": 0.8018394708633423, "eval_loss": 0.6958563327789307, "eval_runtime": 57.0066, "eval_samples_per_second": 83.92, "eval_steps_per_second": 1.316, "step": 2990 }, { "epoch": 5.016722408026756, "grad_norm": 2.930847644805908, "learning_rate": 1.8687290969899666e-05, "loss": 0.6759, "step": 3000 }, { "epoch": 5.1003344481605355, "grad_norm": 4.471439361572266, "learning_rate": 1.8373745819397992e-05, "loss": 0.6077, "step": 3050 }, { "epoch": 5.183946488294314, "grad_norm": 3.0460543632507324, "learning_rate": 1.806020066889632e-05, "loss": 0.6214, "step": 3100 }, { "epoch": 5.2675585284280935, "grad_norm": 3.273836851119995, "learning_rate": 1.774665551839465e-05, "loss": 0.6055, "step": 3150 }, { "epoch": 5.351170568561873, "grad_norm": 3.214381456375122, "learning_rate": 1.7433110367892977e-05, "loss": 0.5718, "step": 3200 }, { "epoch": 5.434782608695652, "grad_norm": 2.87959361076355, "learning_rate": 1.7119565217391303e-05, "loss": 0.565, "step": 3250 }, { "epoch": 5.518394648829432, "grad_norm": 4.1827521324157715, "learning_rate": 1.6806020066889632e-05, "loss": 0.5923, "step": 3300 }, { "epoch": 5.602006688963211, "grad_norm": 2.584045886993408, "learning_rate": 1.649247491638796e-05, "loss": 0.5553, "step": 3350 }, { "epoch": 5.68561872909699, "grad_norm": 2.441519260406494, "learning_rate": 1.6178929765886288e-05, "loss": 0.557, "step": 3400 }, { "epoch": 5.769230769230769, "grad_norm": 3.489903688430786, "learning_rate": 1.5865384615384617e-05, "loss": 0.5212, "step": 3450 }, { "epoch": 5.852842809364549, "grad_norm": 3.7490415573120117, "learning_rate": 1.5551839464882943e-05, "loss": 0.5246, "step": 3500 }, { "epoch": 5.936454849498328, "grad_norm": 3.0013935565948486, "learning_rate": 1.523829431438127e-05, "loss": 0.5341, "step": 3550 }, { "epoch": 6.0, "eval_accuracy": 0.8160535097122192, "eval_f1": 0.8160535097122192, "eval_loss": 0.610401451587677, "eval_runtime": 57.2156, "eval_samples_per_second": 83.614, "eval_steps_per_second": 1.311, "step": 3588 }, { "epoch": 6.0200668896321075, "grad_norm": 3.0697216987609863, "learning_rate": 1.49247491638796e-05, "loss": 0.5124, "step": 3600 }, { "epoch": 6.103678929765886, "grad_norm": 3.071810722351074, "learning_rate": 1.4611204013377926e-05, "loss": 0.4665, "step": 3650 }, { "epoch": 6.187290969899665, "grad_norm": 2.360440969467163, "learning_rate": 1.4297658862876255e-05, "loss": 0.4523, "step": 3700 }, { "epoch": 6.270903010033445, "grad_norm": 3.7450270652770996, "learning_rate": 1.3984113712374583e-05, "loss": 0.4557, "step": 3750 }, { "epoch": 6.354515050167224, "grad_norm": 2.299044370651245, "learning_rate": 1.3670568561872909e-05, "loss": 0.4486, "step": 3800 }, { "epoch": 6.438127090301004, "grad_norm": 3.2113373279571533, "learning_rate": 1.3357023411371238e-05, "loss": 0.4441, "step": 3850 }, { "epoch": 6.521739130434782, "grad_norm": 3.265704393386841, "learning_rate": 1.3043478260869566e-05, "loss": 0.4476, "step": 3900 }, { "epoch": 6.605351170568562, "grad_norm": 2.365684747695923, "learning_rate": 1.2729933110367893e-05, "loss": 0.4326, "step": 3950 }, { "epoch": 6.688963210702341, "grad_norm": 2.623091697692871, "learning_rate": 1.2416387959866221e-05, "loss": 0.4493, "step": 4000 }, { "epoch": 6.7725752508361206, "grad_norm": 4.2816362380981445, "learning_rate": 1.2102842809364549e-05, "loss": 0.4374, "step": 4050 }, { "epoch": 6.8561872909699, "grad_norm": 2.976745367050171, "learning_rate": 1.1789297658862876e-05, "loss": 0.4246, "step": 4100 }, { "epoch": 6.9397993311036785, "grad_norm": 2.723569631576538, "learning_rate": 1.1475752508361204e-05, "loss": 0.4104, "step": 4150 }, { "epoch": 7.0, "eval_accuracy": 0.8235785961151123, "eval_f1": 0.8235785961151123, "eval_loss": 0.5483660697937012, "eval_runtime": 56.6001, "eval_samples_per_second": 84.523, "eval_steps_per_second": 1.325, "step": 4186 }, { "epoch": 7.023411371237458, "grad_norm": 2.4986026287078857, "learning_rate": 1.1162207357859532e-05, "loss": 0.413, "step": 4200 }, { "epoch": 7.107023411371237, "grad_norm": 3.6669883728027344, "learning_rate": 1.084866220735786e-05, "loss": 0.3636, "step": 4250 }, { "epoch": 7.190635451505017, "grad_norm": 2.7657742500305176, "learning_rate": 1.0535117056856187e-05, "loss": 0.3762, "step": 4300 }, { "epoch": 7.274247491638796, "grad_norm": 2.6917202472686768, "learning_rate": 1.0221571906354516e-05, "loss": 0.3688, "step": 4350 }, { "epoch": 7.357859531772576, "grad_norm": 2.6124022006988525, "learning_rate": 9.908026755852842e-06, "loss": 0.3534, "step": 4400 }, { "epoch": 7.441471571906354, "grad_norm": 2.3492910861968994, "learning_rate": 9.59448160535117e-06, "loss": 0.3648, "step": 4450 }, { "epoch": 7.525083612040134, "grad_norm": 1.842259168624878, "learning_rate": 9.2809364548495e-06, "loss": 0.3457, "step": 4500 }, { "epoch": 7.608695652173913, "grad_norm": 2.880143404006958, "learning_rate": 8.967391304347827e-06, "loss": 0.3747, "step": 4550 }, { "epoch": 7.6923076923076925, "grad_norm": 2.546765089035034, "learning_rate": 8.653846153846153e-06, "loss": 0.349, "step": 4600 }, { "epoch": 7.775919732441472, "grad_norm": 2.572425127029419, "learning_rate": 8.340301003344482e-06, "loss": 0.3479, "step": 4650 }, { "epoch": 7.8595317725752505, "grad_norm": 3.774538278579712, "learning_rate": 8.02675585284281e-06, "loss": 0.3458, "step": 4700 }, { "epoch": 7.94314381270903, "grad_norm": 2.3983170986175537, "learning_rate": 7.713210702341138e-06, "loss": 0.3549, "step": 4750 }, { "epoch": 8.0, "eval_accuracy": 0.8317307829856873, "eval_f1": 0.8317307829856873, "eval_loss": 0.5173951983451843, "eval_runtime": 57.872, "eval_samples_per_second": 82.665, "eval_steps_per_second": 1.296, "step": 4784 }, { "epoch": 8.02675585284281, "grad_norm": 2.7807955741882324, "learning_rate": 7.399665551839465e-06, "loss": 0.3324, "step": 4800 }, { "epoch": 8.110367892976589, "grad_norm": 3.530358076095581, "learning_rate": 7.086120401337793e-06, "loss": 0.3212, "step": 4850 }, { "epoch": 8.193979933110368, "grad_norm": 2.1531357765197754, "learning_rate": 6.772575250836121e-06, "loss": 0.3141, "step": 4900 }, { "epoch": 8.277591973244148, "grad_norm": 1.996618628501892, "learning_rate": 6.459030100334448e-06, "loss": 0.288, "step": 4950 }, { "epoch": 8.361204013377927, "grad_norm": 2.6897034645080566, "learning_rate": 6.145484949832776e-06, "loss": 0.2943, "step": 5000 }, { "epoch": 8.444816053511706, "grad_norm": 2.446686267852783, "learning_rate": 5.831939799331104e-06, "loss": 0.308, "step": 5050 }, { "epoch": 8.528428093645484, "grad_norm": 2.7985997200012207, "learning_rate": 5.518394648829431e-06, "loss": 0.3014, "step": 5100 }, { "epoch": 8.612040133779264, "grad_norm": 2.4453961849212646, "learning_rate": 5.204849498327759e-06, "loss": 0.2996, "step": 5150 }, { "epoch": 8.695652173913043, "grad_norm": 3.4949986934661865, "learning_rate": 4.891304347826087e-06, "loss": 0.3024, "step": 5200 }, { "epoch": 8.779264214046822, "grad_norm": 3.0933334827423096, "learning_rate": 4.577759197324415e-06, "loss": 0.318, "step": 5250 }, { "epoch": 8.862876254180602, "grad_norm": 2.353951930999756, "learning_rate": 4.264214046822742e-06, "loss": 0.3042, "step": 5300 }, { "epoch": 8.946488294314381, "grad_norm": 3.0331270694732666, "learning_rate": 3.95066889632107e-06, "loss": 0.299, "step": 5350 }, { "epoch": 9.0, "eval_accuracy": 0.8377926349639893, "eval_f1": 0.8377926349639893, "eval_loss": 0.4919103980064392, "eval_runtime": 57.5334, "eval_samples_per_second": 83.152, "eval_steps_per_second": 1.304, "step": 5382 }, { "epoch": 9.03010033444816, "grad_norm": 1.635090947151184, "learning_rate": 3.637123745819398e-06, "loss": 0.27, "step": 5400 }, { "epoch": 9.11371237458194, "grad_norm": 2.002584934234619, "learning_rate": 3.3235785953177257e-06, "loss": 0.2604, "step": 5450 }, { "epoch": 9.19732441471572, "grad_norm": 2.2020089626312256, "learning_rate": 3.0100334448160537e-06, "loss": 0.2643, "step": 5500 }, { "epoch": 9.280936454849499, "grad_norm": 2.3574795722961426, "learning_rate": 2.6964882943143814e-06, "loss": 0.2754, "step": 5550 }, { "epoch": 9.364548494983278, "grad_norm": 2.37418794631958, "learning_rate": 2.382943143812709e-06, "loss": 0.2833, "step": 5600 }, { "epoch": 9.448160535117056, "grad_norm": 2.4970650672912598, "learning_rate": 2.0693979933110367e-06, "loss": 0.2598, "step": 5650 }, { "epoch": 9.531772575250836, "grad_norm": 2.6536877155303955, "learning_rate": 1.7558528428093646e-06, "loss": 0.2685, "step": 5700 }, { "epoch": 9.615384615384615, "grad_norm": 2.722958564758301, "learning_rate": 1.4423076923076924e-06, "loss": 0.2588, "step": 5750 }, { "epoch": 9.698996655518394, "grad_norm": 2.4485061168670654, "learning_rate": 1.12876254180602e-06, "loss": 0.2728, "step": 5800 }, { "epoch": 9.782608695652174, "grad_norm": 2.3278067111968994, "learning_rate": 8.152173913043479e-07, "loss": 0.2752, "step": 5850 }, { "epoch": 9.866220735785953, "grad_norm": 1.9324153661727905, "learning_rate": 5.016722408026756e-07, "loss": 0.2604, "step": 5900 }, { "epoch": 9.949832775919733, "grad_norm": 1.9075779914855957, "learning_rate": 1.8812709030100336e-07, "loss": 0.2599, "step": 5950 }, { "epoch": 10.0, "eval_accuracy": 0.839046835899353, "eval_f1": 0.839046835899353, "eval_loss": 0.49083635210990906, "eval_runtime": 59.1954, "eval_samples_per_second": 80.817, "eval_steps_per_second": 1.267, "step": 5980 } ], "logging_steps": 50, "max_steps": 5980, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 500, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.6022352226511616e+16, "train_batch_size": 64, "trial_name": null, "trial_params": null }