{ "best_metric": 1.6703945398330688, "best_model_checkpoint": "models/models/mbert-3-5e-5/checkpoint-116000", "epoch": 49.998677383253636, "eval_steps": 2000, "global_step": 122850, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.8139179977617255, "grad_norm": 9.055466651916504, "learning_rate": 4.796499796499796e-05, "loss": 5.8944, "step": 2000 }, { "epoch": 0.8139179977617255, "eval_loss": 5.142313480377197, "eval_runtime": 73.0566, "eval_samples_per_second": 136.88, "eval_steps_per_second": 2.149, "step": 2000 }, { "epoch": 1.6275307762742903, "grad_norm": 14.413078308105469, "learning_rate": 4.592999592999594e-05, "loss": 4.8554, "step": 4000 }, { "epoch": 1.6275307762742903, "eval_loss": 4.169687747955322, "eval_runtime": 73.0805, "eval_samples_per_second": 136.835, "eval_steps_per_second": 2.148, "step": 4000 }, { "epoch": 2.4411435547868554, "grad_norm": 10.98070240020752, "learning_rate": 4.38949938949939e-05, "loss": 3.8936, "step": 6000 }, { "epoch": 2.4411435547868554, "eval_loss": 3.520249605178833, "eval_runtime": 73.0468, "eval_samples_per_second": 136.898, "eval_steps_per_second": 2.149, "step": 6000 }, { "epoch": 3.25475633329942, "grad_norm": 12.734247207641602, "learning_rate": 4.185999185999186e-05, "loss": 3.3719, "step": 8000 }, { "epoch": 3.25475633329942, "eval_loss": 3.012561082839966, "eval_runtime": 73.0351, "eval_samples_per_second": 136.92, "eval_steps_per_second": 2.15, "step": 8000 }, { "epoch": 4.068369111811985, "grad_norm": 11.761553764343262, "learning_rate": 3.9824989824989825e-05, "loss": 2.9884, "step": 10000 }, { "epoch": 4.068369111811985, "eval_loss": 2.7652907371520996, "eval_runtime": 73.0559, "eval_samples_per_second": 136.882, "eval_steps_per_second": 2.149, "step": 10000 }, { "epoch": 4.882287109573711, "grad_norm": 12.824328422546387, "learning_rate": 3.778998778998779e-05, "loss": 2.7846, "step": 12000 }, { "epoch": 4.882287109573711, "eval_loss": 2.616189956665039, "eval_runtime": 73.0493, "eval_samples_per_second": 136.894, "eval_steps_per_second": 2.149, "step": 12000 }, { "epoch": 5.6958998880862755, "grad_norm": 12.507063865661621, "learning_rate": 3.575498575498576e-05, "loss": 2.6443, "step": 14000 }, { "epoch": 5.6958998880862755, "eval_loss": 2.491734504699707, "eval_runtime": 73.0626, "eval_samples_per_second": 136.869, "eval_steps_per_second": 2.149, "step": 14000 }, { "epoch": 6.50951266659884, "grad_norm": 12.569066047668457, "learning_rate": 3.371998371998372e-05, "loss": 2.5361, "step": 16000 }, { "epoch": 6.50951266659884, "eval_loss": 2.402878522872925, "eval_runtime": 73.0516, "eval_samples_per_second": 136.89, "eval_steps_per_second": 2.149, "step": 16000 }, { "epoch": 7.323125445111405, "grad_norm": 12.419079780578613, "learning_rate": 3.168498168498169e-05, "loss": 2.4558, "step": 18000 }, { "epoch": 7.323125445111405, "eval_loss": 2.3287837505340576, "eval_runtime": 73.0774, "eval_samples_per_second": 136.841, "eval_steps_per_second": 2.148, "step": 18000 }, { "epoch": 8.13673822362397, "grad_norm": 12.206134796142578, "learning_rate": 2.9649979649979655e-05, "loss": 2.3847, "step": 20000 }, { "epoch": 8.13673822362397, "eval_loss": 2.2792084217071533, "eval_runtime": 73.059, "eval_samples_per_second": 136.876, "eval_steps_per_second": 2.149, "step": 20000 }, { "epoch": 8.950656221385696, "grad_norm": 12.273974418640137, "learning_rate": 2.7614977614977615e-05, "loss": 2.3293, "step": 22000 }, { "epoch": 8.950656221385696, "eval_loss": 2.2214159965515137, "eval_runtime": 73.07, "eval_samples_per_second": 136.855, "eval_steps_per_second": 2.149, "step": 22000 }, { "epoch": 9.76426899989826, "grad_norm": 12.672439575195312, "learning_rate": 2.557997557997558e-05, "loss": 2.2763, "step": 24000 }, { "epoch": 9.76426899989826, "eval_loss": 2.1826987266540527, "eval_runtime": 73.0461, "eval_samples_per_second": 136.9, "eval_steps_per_second": 2.149, "step": 24000 }, { "epoch": 10.577881778410825, "grad_norm": 12.514191627502441, "learning_rate": 2.3544973544973546e-05, "loss": 2.2361, "step": 26000 }, { "epoch": 10.577881778410825, "eval_loss": 2.1435160636901855, "eval_runtime": 73.0345, "eval_samples_per_second": 136.922, "eval_steps_per_second": 2.15, "step": 26000 }, { "epoch": 11.39149455692339, "grad_norm": 12.286685943603516, "learning_rate": 2.150997150997151e-05, "loss": 2.1973, "step": 28000 }, { "epoch": 11.39149455692339, "eval_loss": 2.1050431728363037, "eval_runtime": 73.0521, "eval_samples_per_second": 136.889, "eval_steps_per_second": 2.149, "step": 28000 }, { "epoch": 12.209990843422526, "grad_norm": 12.680660247802734, "learning_rate": 2.9649979649979655e-05, "loss": 2.1805, "step": 30000 }, { "epoch": 12.209990843422526, "eval_loss": 2.0837137699127197, "eval_runtime": 73.1666, "eval_samples_per_second": 136.674, "eval_steps_per_second": 2.146, "step": 30000 }, { "epoch": 13.02360362193509, "grad_norm": 12.36361026763916, "learning_rate": 2.8293311626644965e-05, "loss": 2.1418, "step": 32000 }, { "epoch": 13.02360362193509, "eval_loss": 2.052241802215576, "eval_runtime": 73.1973, "eval_samples_per_second": 136.617, "eval_steps_per_second": 2.145, "step": 32000 }, { "epoch": 13.837521619696815, "grad_norm": 12.344305992126465, "learning_rate": 2.693664360331027e-05, "loss": 2.1107, "step": 34000 }, { "epoch": 13.837521619696815, "eval_loss": 2.0308263301849365, "eval_runtime": 73.2003, "eval_samples_per_second": 136.611, "eval_steps_per_second": 2.145, "step": 34000 }, { "epoch": 14.651134398209381, "grad_norm": 12.52969741821289, "learning_rate": 2.557997557997558e-05, "loss": 2.0818, "step": 36000 }, { "epoch": 14.651134398209381, "eval_loss": 2.0151000022888184, "eval_runtime": 73.21, "eval_samples_per_second": 136.593, "eval_steps_per_second": 2.145, "step": 36000 }, { "epoch": 15.464747176721945, "grad_norm": 12.168683052062988, "learning_rate": 2.4223307556640893e-05, "loss": 2.0559, "step": 38000 }, { "epoch": 15.464747176721945, "eval_loss": 1.9706333875656128, "eval_runtime": 73.1868, "eval_samples_per_second": 136.637, "eval_steps_per_second": 2.145, "step": 38000 }, { "epoch": 16.27835995523451, "grad_norm": 12.408997535705566, "learning_rate": 2.28666395333062e-05, "loss": 2.0272, "step": 40000 }, { "epoch": 16.27835995523451, "eval_loss": 1.9658160209655762, "eval_runtime": 73.2254, "eval_samples_per_second": 136.565, "eval_steps_per_second": 2.144, "step": 40000 }, { "epoch": 17.091972733747074, "grad_norm": 13.222212791442871, "learning_rate": 2.150997150997151e-05, "loss": 2.0069, "step": 42000 }, { "epoch": 17.091972733747074, "eval_loss": 1.9462146759033203, "eval_runtime": 73.2046, "eval_samples_per_second": 136.603, "eval_steps_per_second": 2.145, "step": 42000 }, { "epoch": 17.9058907315088, "grad_norm": 12.432639122009277, "learning_rate": 2.015330348663682e-05, "loss": 1.9873, "step": 44000 }, { "epoch": 17.9058907315088, "eval_loss": 1.9287099838256836, "eval_runtime": 73.196, "eval_samples_per_second": 136.619, "eval_steps_per_second": 2.145, "step": 44000 }, { "epoch": 18.719503510021365, "grad_norm": 12.828729629516602, "learning_rate": 1.879663546330213e-05, "loss": 1.9696, "step": 46000 }, { "epoch": 18.719503510021365, "eval_loss": 1.9102226495742798, "eval_runtime": 73.1921, "eval_samples_per_second": 136.627, "eval_steps_per_second": 2.145, "step": 46000 }, { "epoch": 19.53311628853393, "grad_norm": 12.529792785644531, "learning_rate": 1.743996743996744e-05, "loss": 1.9506, "step": 48000 }, { "epoch": 19.53311628853393, "eval_loss": 1.8978015184402466, "eval_runtime": 73.2056, "eval_samples_per_second": 136.602, "eval_steps_per_second": 2.145, "step": 48000 }, { "epoch": 20.346729067046496, "grad_norm": 12.476061820983887, "learning_rate": 1.6083299416632748e-05, "loss": 1.937, "step": 50000 }, { "epoch": 20.346729067046496, "eval_loss": 1.8884605169296265, "eval_runtime": 73.211, "eval_samples_per_second": 136.592, "eval_steps_per_second": 2.144, "step": 50000 }, { "epoch": 21.16034184555906, "grad_norm": 12.546550750732422, "learning_rate": 1.472663139329806e-05, "loss": 1.9239, "step": 52000 }, { "epoch": 21.16034184555906, "eval_loss": 1.8613293170928955, "eval_runtime": 73.2024, "eval_samples_per_second": 136.608, "eval_steps_per_second": 2.145, "step": 52000 }, { "epoch": 21.974259843320784, "grad_norm": 12.588421821594238, "learning_rate": 1.336996336996337e-05, "loss": 1.9106, "step": 54000 }, { "epoch": 21.974259843320784, "eval_loss": 1.8496237993240356, "eval_runtime": 73.1844, "eval_samples_per_second": 136.641, "eval_steps_per_second": 2.145, "step": 54000 }, { "epoch": 22.78787262183335, "grad_norm": 12.508979797363281, "learning_rate": 1.2013295346628681e-05, "loss": 1.9004, "step": 56000 }, { "epoch": 22.78787262183335, "eval_loss": 1.8513011932373047, "eval_runtime": 73.1881, "eval_samples_per_second": 136.634, "eval_steps_per_second": 2.145, "step": 56000 }, { "epoch": 23.605961949333604, "grad_norm": 12.510702133178711, "learning_rate": 2.0492470492470494e-05, "loss": 1.8995, "step": 58000 }, { "epoch": 23.605961949333604, "eval_loss": 1.8457722663879395, "eval_runtime": 73.2143, "eval_samples_per_second": 136.585, "eval_steps_per_second": 2.144, "step": 58000 }, { "epoch": 24.419574727846168, "grad_norm": 12.43933391571045, "learning_rate": 1.9474969474969477e-05, "loss": 1.8903, "step": 60000 }, { "epoch": 24.419574727846168, "eval_loss": 1.8323436975479126, "eval_runtime": 73.2612, "eval_samples_per_second": 136.498, "eval_steps_per_second": 2.143, "step": 60000 }, { "epoch": 25.233187506358735, "grad_norm": 12.256065368652344, "learning_rate": 1.8457468457468458e-05, "loss": 1.8754, "step": 62000 }, { "epoch": 25.233187506358735, "eval_loss": 1.8291531801223755, "eval_runtime": 73.2439, "eval_samples_per_second": 136.53, "eval_steps_per_second": 2.144, "step": 62000 }, { "epoch": 26.0468002848713, "grad_norm": 12.613372802734375, "learning_rate": 1.743996743996744e-05, "loss": 1.8644, "step": 64000 }, { "epoch": 26.0468002848713, "eval_loss": 1.823506236076355, "eval_runtime": 73.2487, "eval_samples_per_second": 136.521, "eval_steps_per_second": 2.143, "step": 64000 }, { "epoch": 26.860718282633023, "grad_norm": 12.547616958618164, "learning_rate": 1.642246642246642e-05, "loss": 1.8537, "step": 66000 }, { "epoch": 26.860718282633023, "eval_loss": 1.7943871021270752, "eval_runtime": 73.2466, "eval_samples_per_second": 136.525, "eval_steps_per_second": 2.143, "step": 66000 }, { "epoch": 27.67433106114559, "grad_norm": 12.599259376525879, "learning_rate": 1.5404965404965405e-05, "loss": 1.8426, "step": 68000 }, { "epoch": 27.67433106114559, "eval_loss": 1.787021517753601, "eval_runtime": 73.2491, "eval_samples_per_second": 136.52, "eval_steps_per_second": 2.143, "step": 68000 }, { "epoch": 28.487943839658154, "grad_norm": 12.874439239501953, "learning_rate": 1.4387464387464389e-05, "loss": 1.8307, "step": 70000 }, { "epoch": 28.487943839658154, "eval_loss": 1.7922056913375854, "eval_runtime": 73.2621, "eval_samples_per_second": 136.496, "eval_steps_per_second": 2.143, "step": 70000 }, { "epoch": 29.301556618170718, "grad_norm": 12.278645515441895, "learning_rate": 1.336996336996337e-05, "loss": 1.8235, "step": 72000 }, { "epoch": 29.301556618170718, "eval_loss": 1.7722290754318237, "eval_runtime": 73.2392, "eval_samples_per_second": 136.539, "eval_steps_per_second": 2.144, "step": 72000 }, { "epoch": 30.115169396683285, "grad_norm": 12.372228622436523, "learning_rate": 1.2352462352462353e-05, "loss": 1.8142, "step": 74000 }, { "epoch": 30.115169396683285, "eval_loss": 1.7639870643615723, "eval_runtime": 73.2474, "eval_samples_per_second": 136.524, "eval_steps_per_second": 2.143, "step": 74000 }, { "epoch": 30.92908739444501, "grad_norm": 13.026344299316406, "learning_rate": 1.1334961334961336e-05, "loss": 1.8052, "step": 76000 }, { "epoch": 30.92908739444501, "eval_loss": 1.7618211507797241, "eval_runtime": 73.2535, "eval_samples_per_second": 136.512, "eval_steps_per_second": 2.143, "step": 76000 }, { "epoch": 31.742700172957573, "grad_norm": 12.346455574035645, "learning_rate": 1.0317460317460318e-05, "loss": 1.7968, "step": 78000 }, { "epoch": 31.742700172957573, "eval_loss": 1.7621996402740479, "eval_runtime": 73.2566, "eval_samples_per_second": 136.506, "eval_steps_per_second": 2.143, "step": 78000 }, { "epoch": 32.55631295147014, "grad_norm": 12.404403686523438, "learning_rate": 9.2999592999593e-06, "loss": 1.7906, "step": 80000 }, { "epoch": 32.55631295147014, "eval_loss": 1.7546992301940918, "eval_runtime": 73.2436, "eval_samples_per_second": 136.531, "eval_steps_per_second": 2.144, "step": 80000 }, { "epoch": 33.369925729982704, "grad_norm": 13.407048225402832, "learning_rate": 8.282458282458284e-06, "loss": 1.7831, "step": 82000 }, { "epoch": 33.369925729982704, "eval_loss": 1.7637808322906494, "eval_runtime": 73.252, "eval_samples_per_second": 136.515, "eval_steps_per_second": 2.143, "step": 82000 }, { "epoch": 34.18353850849527, "grad_norm": 12.677322387695312, "learning_rate": 7.264957264957266e-06, "loss": 1.7781, "step": 84000 }, { "epoch": 34.18353850849527, "eval_loss": 1.7423206567764282, "eval_runtime": 73.2424, "eval_samples_per_second": 136.533, "eval_steps_per_second": 2.144, "step": 84000 }, { "epoch": 35.0020347949944, "grad_norm": 12.919666290283203, "learning_rate": 1.4997964997965e-05, "loss": 1.7839, "step": 86000 }, { "epoch": 35.0020347949944, "eval_loss": 1.7404271364212036, "eval_runtime": 73.1021, "eval_samples_per_second": 136.795, "eval_steps_per_second": 2.148, "step": 86000 }, { "epoch": 35.81595279275613, "grad_norm": 13.499661445617676, "learning_rate": 1.4183964183964185e-05, "loss": 1.7779, "step": 88000 }, { "epoch": 35.81595279275613, "eval_loss": 1.7452839612960815, "eval_runtime": 73.1263, "eval_samples_per_second": 136.75, "eval_steps_per_second": 2.147, "step": 88000 }, { "epoch": 36.62956557126869, "grad_norm": 12.777277946472168, "learning_rate": 1.336996336996337e-05, "loss": 1.7694, "step": 90000 }, { "epoch": 36.62956557126869, "eval_loss": 1.7361302375793457, "eval_runtime": 73.1394, "eval_samples_per_second": 136.725, "eval_steps_per_second": 2.147, "step": 90000 }, { "epoch": 37.443178349781256, "grad_norm": 12.409318923950195, "learning_rate": 1.2555962555962556e-05, "loss": 1.7634, "step": 92000 }, { "epoch": 37.443178349781256, "eval_loss": 1.7353118658065796, "eval_runtime": 73.1692, "eval_samples_per_second": 136.669, "eval_steps_per_second": 2.146, "step": 92000 }, { "epoch": 38.25679112829383, "grad_norm": 13.48429012298584, "learning_rate": 1.1741961741961743e-05, "loss": 1.7593, "step": 94000 }, { "epoch": 38.25679112829383, "eval_loss": 1.7299132347106934, "eval_runtime": 73.1218, "eval_samples_per_second": 136.758, "eval_steps_per_second": 2.147, "step": 94000 }, { "epoch": 39.07040390680639, "grad_norm": 12.476696014404297, "learning_rate": 1.0927960927960928e-05, "loss": 1.7498, "step": 96000 }, { "epoch": 39.07040390680639, "eval_loss": 1.7091785669326782, "eval_runtime": 73.0912, "eval_samples_per_second": 136.815, "eval_steps_per_second": 2.148, "step": 96000 }, { "epoch": 39.88432190456812, "grad_norm": 13.220356941223145, "learning_rate": 1.0113960113960115e-05, "loss": 1.7446, "step": 98000 }, { "epoch": 39.88432190456812, "eval_loss": 1.7064162492752075, "eval_runtime": 73.1495, "eval_samples_per_second": 136.706, "eval_steps_per_second": 2.146, "step": 98000 }, { "epoch": 40.69793468308068, "grad_norm": 13.10384750366211, "learning_rate": 9.2999592999593e-06, "loss": 1.7376, "step": 100000 }, { "epoch": 40.69793468308068, "eval_loss": 1.7068718671798706, "eval_runtime": 73.1342, "eval_samples_per_second": 136.735, "eval_steps_per_second": 2.147, "step": 100000 }, { "epoch": 41.511547461593246, "grad_norm": 13.384297370910645, "learning_rate": 8.485958485958487e-06, "loss": 1.732, "step": 102000 }, { "epoch": 41.511547461593246, "eval_loss": 1.7083535194396973, "eval_runtime": 73.1173, "eval_samples_per_second": 136.767, "eval_steps_per_second": 2.147, "step": 102000 }, { "epoch": 42.32516024010581, "grad_norm": 12.881793022155762, "learning_rate": 7.671957671957672e-06, "loss": 1.7282, "step": 104000 }, { "epoch": 42.32516024010581, "eval_loss": 1.7061749696731567, "eval_runtime": 73.1059, "eval_samples_per_second": 136.788, "eval_steps_per_second": 2.148, "step": 104000 }, { "epoch": 43.13877301861837, "grad_norm": 13.3131685256958, "learning_rate": 6.857956857956858e-06, "loss": 1.7218, "step": 106000 }, { "epoch": 43.13877301861837, "eval_loss": 1.6910033226013184, "eval_runtime": 73.0946, "eval_samples_per_second": 136.809, "eval_steps_per_second": 2.148, "step": 106000 }, { "epoch": 43.9526910163801, "grad_norm": 12.788925170898438, "learning_rate": 6.043956043956044e-06, "loss": 1.7205, "step": 108000 }, { "epoch": 43.9526910163801, "eval_loss": 1.6898643970489502, "eval_runtime": 73.1074, "eval_samples_per_second": 136.785, "eval_steps_per_second": 2.148, "step": 108000 }, { "epoch": 44.766303794892664, "grad_norm": 12.509169578552246, "learning_rate": 5.22995522995523e-06, "loss": 1.7151, "step": 110000 }, { "epoch": 44.766303794892664, "eval_loss": 1.6873972415924072, "eval_runtime": 73.1077, "eval_samples_per_second": 136.784, "eval_steps_per_second": 2.148, "step": 110000 }, { "epoch": 45.57991657340523, "grad_norm": 12.945860862731934, "learning_rate": 4.415954415954416e-06, "loss": 1.7109, "step": 112000 }, { "epoch": 45.57991657340523, "eval_loss": 1.677456021308899, "eval_runtime": 73.1203, "eval_samples_per_second": 136.761, "eval_steps_per_second": 2.147, "step": 112000 }, { "epoch": 46.39800590090548, "grad_norm": 12.642820358276367, "learning_rate": 3.6019536019536017e-06, "loss": 1.7058, "step": 114000 }, { "epoch": 46.39800590090548, "eval_loss": 1.6775144338607788, "eval_runtime": 73.1275, "eval_samples_per_second": 136.747, "eval_steps_per_second": 2.147, "step": 114000 }, { "epoch": 47.21161867941805, "grad_norm": 13.197073936462402, "learning_rate": 2.787952787952788e-06, "loss": 1.7048, "step": 116000 }, { "epoch": 47.21161867941805, "eval_loss": 1.6703945398330688, "eval_runtime": 73.1574, "eval_samples_per_second": 136.691, "eval_steps_per_second": 2.146, "step": 116000 }, { "epoch": 48.025231457930616, "grad_norm": 12.470918655395508, "learning_rate": 1.9739519739519742e-06, "loss": 1.7037, "step": 118000 }, { "epoch": 48.025231457930616, "eval_loss": 1.6824976205825806, "eval_runtime": 73.1867, "eval_samples_per_second": 136.637, "eval_steps_per_second": 2.145, "step": 118000 }, { "epoch": 48.839149455692336, "grad_norm": 12.579526901245117, "learning_rate": 1.15995115995116e-06, "loss": 1.6999, "step": 120000 }, { "epoch": 48.839149455692336, "eval_loss": 1.678907871246338, "eval_runtime": 73.1871, "eval_samples_per_second": 136.636, "eval_steps_per_second": 2.145, "step": 120000 }, { "epoch": 49.65276223420491, "grad_norm": 13.10560131072998, "learning_rate": 3.4595034595034596e-07, "loss": 1.6969, "step": 122000 }, { "epoch": 49.65276223420491, "eval_loss": 1.6780418157577515, "eval_runtime": 73.1913, "eval_samples_per_second": 136.628, "eval_steps_per_second": 2.145, "step": 122000 } ], "logging_steps": 2000, "max_steps": 122850, "num_input_tokens_seen": 0, "num_train_epochs": 50, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.275195636696166e+18, "train_batch_size": 32, "trial_name": null, "trial_params": null }