|
{ |
|
"best_metric": 0.7950243949890137, |
|
"best_model_checkpoint": "./model_fine-tune/glot/xlm-r/kat-Geor/checkpoint-99500", |
|
"epoch": 32.34720416124838, |
|
"eval_steps": 500, |
|
"global_step": 99500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.1625487646293888, |
|
"grad_norm": 4.931751251220703, |
|
"learning_rate": 9.95e-05, |
|
"loss": 1.3006, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.1625487646293888, |
|
"eval_accuracy": 0.7623556146386671, |
|
"eval_loss": 1.3419054746627808, |
|
"eval_runtime": 224.5592, |
|
"eval_samples_per_second": 90.996, |
|
"eval_steps_per_second": 2.846, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.3250975292587776, |
|
"grad_norm": 4.145026683807373, |
|
"learning_rate": 9.900000000000001e-05, |
|
"loss": 1.2154, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.3250975292587776, |
|
"eval_accuracy": 0.7721005611086652, |
|
"eval_loss": 1.2983847856521606, |
|
"eval_runtime": 225.9654, |
|
"eval_samples_per_second": 90.43, |
|
"eval_steps_per_second": 2.828, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.48764629388816644, |
|
"grad_norm": 4.589369297027588, |
|
"learning_rate": 9.850000000000001e-05, |
|
"loss": 1.1779, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.48764629388816644, |
|
"eval_accuracy": 0.7790585887983502, |
|
"eval_loss": 1.2703238725662231, |
|
"eval_runtime": 226.7078, |
|
"eval_samples_per_second": 90.134, |
|
"eval_steps_per_second": 2.819, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.6501950585175552, |
|
"grad_norm": 4.552799701690674, |
|
"learning_rate": 9.8e-05, |
|
"loss": 1.1546, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.6501950585175552, |
|
"eval_accuracy": 0.7819611016961427, |
|
"eval_loss": 1.2609333992004395, |
|
"eval_runtime": 226.665, |
|
"eval_samples_per_second": 90.151, |
|
"eval_steps_per_second": 2.819, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.812743823146944, |
|
"grad_norm": 3.709968328475952, |
|
"learning_rate": 9.75e-05, |
|
"loss": 1.1259, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.812743823146944, |
|
"eval_accuracy": 0.7877356734428504, |
|
"eval_loss": 1.2131903171539307, |
|
"eval_runtime": 226.8657, |
|
"eval_samples_per_second": 90.071, |
|
"eval_steps_per_second": 2.817, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.9752925877763329, |
|
"grad_norm": 3.6233718395233154, |
|
"learning_rate": 9.7e-05, |
|
"loss": 1.0938, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.9752925877763329, |
|
"eval_accuracy": 0.789409001949858, |
|
"eval_loss": 1.2194353342056274, |
|
"eval_runtime": 226.5843, |
|
"eval_samples_per_second": 90.183, |
|
"eval_steps_per_second": 2.82, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 1.1378413524057218, |
|
"grad_norm": 3.7706315517425537, |
|
"learning_rate": 9.65e-05, |
|
"loss": 1.0728, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 1.1378413524057218, |
|
"eval_accuracy": 0.7930909730559011, |
|
"eval_loss": 1.2004605531692505, |
|
"eval_runtime": 225.3386, |
|
"eval_samples_per_second": 90.681, |
|
"eval_steps_per_second": 2.836, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 1.3003901170351106, |
|
"grad_norm": 3.654762029647827, |
|
"learning_rate": 9.6e-05, |
|
"loss": 1.067, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 1.3003901170351106, |
|
"eval_accuracy": 0.7911667822625772, |
|
"eval_loss": 1.206479549407959, |
|
"eval_runtime": 229.9442, |
|
"eval_samples_per_second": 88.865, |
|
"eval_steps_per_second": 2.779, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 1.4629388816644995, |
|
"grad_norm": 3.6047844886779785, |
|
"learning_rate": 9.55e-05, |
|
"loss": 1.0623, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 1.4629388816644995, |
|
"eval_accuracy": 0.7950484617863337, |
|
"eval_loss": 1.186224341392517, |
|
"eval_runtime": 229.5346, |
|
"eval_samples_per_second": 89.024, |
|
"eval_steps_per_second": 2.784, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 1.6254876462938883, |
|
"grad_norm": 3.174414873123169, |
|
"learning_rate": 9.5e-05, |
|
"loss": 1.0452, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 1.6254876462938883, |
|
"eval_accuracy": 0.7986192006290228, |
|
"eval_loss": 1.1624418497085571, |
|
"eval_runtime": 227.8114, |
|
"eval_samples_per_second": 89.697, |
|
"eval_steps_per_second": 2.805, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 1.7880364109232771, |
|
"grad_norm": 3.455960988998413, |
|
"learning_rate": 9.449999999999999e-05, |
|
"loss": 1.0203, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 1.7880364109232771, |
|
"eval_accuracy": 0.7996825782405662, |
|
"eval_loss": 1.160246729850769, |
|
"eval_runtime": 227.8469, |
|
"eval_samples_per_second": 89.683, |
|
"eval_steps_per_second": 2.805, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 1.9505851755526658, |
|
"grad_norm": 3.4478368759155273, |
|
"learning_rate": 9.4e-05, |
|
"loss": 1.0102, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 1.9505851755526658, |
|
"eval_accuracy": 0.8028851426121806, |
|
"eval_loss": 1.1382070779800415, |
|
"eval_runtime": 225.8399, |
|
"eval_samples_per_second": 90.48, |
|
"eval_steps_per_second": 2.829, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 2.113133940182055, |
|
"grad_norm": 3.5076828002929688, |
|
"learning_rate": 9.350000000000001e-05, |
|
"loss": 1.0007, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 2.113133940182055, |
|
"eval_accuracy": 0.8036015201468353, |
|
"eval_loss": 1.1308486461639404, |
|
"eval_runtime": 227.3542, |
|
"eval_samples_per_second": 89.877, |
|
"eval_steps_per_second": 2.811, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 2.2756827048114436, |
|
"grad_norm": 2.954378366470337, |
|
"learning_rate": 9.300000000000001e-05, |
|
"loss": 0.9828, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 2.2756827048114436, |
|
"eval_accuracy": 0.80609597689472, |
|
"eval_loss": 1.1300016641616821, |
|
"eval_runtime": 226.3258, |
|
"eval_samples_per_second": 90.286, |
|
"eval_steps_per_second": 2.823, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 2.438231469440832, |
|
"grad_norm": 3.6220898628234863, |
|
"learning_rate": 9.250000000000001e-05, |
|
"loss": 0.9738, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 2.438231469440832, |
|
"eval_accuracy": 0.807549206838832, |
|
"eval_loss": 1.1173433065414429, |
|
"eval_runtime": 230.0766, |
|
"eval_samples_per_second": 88.814, |
|
"eval_steps_per_second": 2.777, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 2.6007802340702213, |
|
"grad_norm": 3.0354886054992676, |
|
"learning_rate": 9.200000000000001e-05, |
|
"loss": 0.9726, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 2.6007802340702213, |
|
"eval_accuracy": 0.8092077277721563, |
|
"eval_loss": 1.1192331314086914, |
|
"eval_runtime": 228.747, |
|
"eval_samples_per_second": 89.33, |
|
"eval_steps_per_second": 2.793, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 2.7633289986996097, |
|
"grad_norm": 2.905066967010498, |
|
"learning_rate": 9.15e-05, |
|
"loss": 0.9639, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 2.7633289986996097, |
|
"eval_accuracy": 0.8105359765383991, |
|
"eval_loss": 1.0964252948760986, |
|
"eval_runtime": 239.0977, |
|
"eval_samples_per_second": 85.463, |
|
"eval_steps_per_second": 2.673, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 2.925877763328999, |
|
"grad_norm": 2.5866756439208984, |
|
"learning_rate": 9.1e-05, |
|
"loss": 0.9562, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 2.925877763328999, |
|
"eval_accuracy": 0.8124049786000702, |
|
"eval_loss": 1.0979543924331665, |
|
"eval_runtime": 228.0298, |
|
"eval_samples_per_second": 89.611, |
|
"eval_steps_per_second": 2.802, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 3.0884265279583873, |
|
"grad_norm": 3.0912091732025146, |
|
"learning_rate": 9.05e-05, |
|
"loss": 0.9412, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 3.0884265279583873, |
|
"eval_accuracy": 0.8128664613154557, |
|
"eval_loss": 1.098233699798584, |
|
"eval_runtime": 226.3594, |
|
"eval_samples_per_second": 90.272, |
|
"eval_steps_per_second": 2.823, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 3.250975292587776, |
|
"grad_norm": 3.310755968093872, |
|
"learning_rate": 9e-05, |
|
"loss": 0.9404, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 3.250975292587776, |
|
"eval_accuracy": 0.8141734154155298, |
|
"eval_loss": 1.0834821462631226, |
|
"eval_runtime": 229.2889, |
|
"eval_samples_per_second": 89.119, |
|
"eval_steps_per_second": 2.787, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 3.413524057217165, |
|
"grad_norm": 3.2035183906555176, |
|
"learning_rate": 8.950000000000001e-05, |
|
"loss": 0.9345, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 3.413524057217165, |
|
"eval_accuracy": 0.8148430925920552, |
|
"eval_loss": 1.071053385734558, |
|
"eval_runtime": 228.6307, |
|
"eval_samples_per_second": 89.376, |
|
"eval_steps_per_second": 2.795, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 3.576072821846554, |
|
"grad_norm": 3.3350119590759277, |
|
"learning_rate": 8.900000000000001e-05, |
|
"loss": 0.9287, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 3.576072821846554, |
|
"eval_accuracy": 0.8148937367897325, |
|
"eval_loss": 1.0869466066360474, |
|
"eval_runtime": 229.0653, |
|
"eval_samples_per_second": 89.206, |
|
"eval_steps_per_second": 2.79, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 3.7386215864759427, |
|
"grad_norm": 3.0126075744628906, |
|
"learning_rate": 8.850000000000001e-05, |
|
"loss": 0.916, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 3.7386215864759427, |
|
"eval_accuracy": 0.8164226040931482, |
|
"eval_loss": 1.0795150995254517, |
|
"eval_runtime": 227.3062, |
|
"eval_samples_per_second": 89.896, |
|
"eval_steps_per_second": 2.811, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 3.9011703511053315, |
|
"grad_norm": 2.9025819301605225, |
|
"learning_rate": 8.800000000000001e-05, |
|
"loss": 0.9114, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 3.9011703511053315, |
|
"eval_accuracy": 0.8172531893844662, |
|
"eval_loss": 1.0739566087722778, |
|
"eval_runtime": 227.5695, |
|
"eval_samples_per_second": 89.792, |
|
"eval_steps_per_second": 2.808, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 4.06371911573472, |
|
"grad_norm": 3.1317036151885986, |
|
"learning_rate": 8.75e-05, |
|
"loss": 0.9104, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 4.06371911573472, |
|
"eval_accuracy": 0.8181712807844302, |
|
"eval_loss": 1.057742953300476, |
|
"eval_runtime": 226.8271, |
|
"eval_samples_per_second": 90.086, |
|
"eval_steps_per_second": 2.817, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 4.22626788036411, |
|
"grad_norm": 2.9879837036132812, |
|
"learning_rate": 8.7e-05, |
|
"loss": 0.8932, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 4.22626788036411, |
|
"eval_accuracy": 0.819528608544276, |
|
"eval_loss": 1.0524249076843262, |
|
"eval_runtime": 227.5673, |
|
"eval_samples_per_second": 89.793, |
|
"eval_steps_per_second": 2.808, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 4.388816644993498, |
|
"grad_norm": 3.1617486476898193, |
|
"learning_rate": 8.65e-05, |
|
"loss": 0.8884, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 4.388816644993498, |
|
"eval_accuracy": 0.8202382159332025, |
|
"eval_loss": 1.0618681907653809, |
|
"eval_runtime": 228.7279, |
|
"eval_samples_per_second": 89.338, |
|
"eval_steps_per_second": 2.794, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 4.551365409622887, |
|
"grad_norm": 3.287351131439209, |
|
"learning_rate": 8.6e-05, |
|
"loss": 0.8932, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 4.551365409622887, |
|
"eval_accuracy": 0.821493227612126, |
|
"eval_loss": 1.0429145097732544, |
|
"eval_runtime": 229.2177, |
|
"eval_samples_per_second": 89.147, |
|
"eval_steps_per_second": 2.788, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 4.713914174252276, |
|
"grad_norm": 3.1148979663848877, |
|
"learning_rate": 8.55e-05, |
|
"loss": 0.8871, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 4.713914174252276, |
|
"eval_accuracy": 0.821055764186474, |
|
"eval_loss": 1.0499247312545776, |
|
"eval_runtime": 227.9461, |
|
"eval_samples_per_second": 89.644, |
|
"eval_steps_per_second": 2.803, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 4.876462938881664, |
|
"grad_norm": 3.0099728107452393, |
|
"learning_rate": 8.5e-05, |
|
"loss": 0.8848, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 4.876462938881664, |
|
"eval_accuracy": 0.8221091492282949, |
|
"eval_loss": 1.0302263498306274, |
|
"eval_runtime": 229.0246, |
|
"eval_samples_per_second": 89.222, |
|
"eval_steps_per_second": 2.79, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 5.039011703511053, |
|
"grad_norm": 2.8249874114990234, |
|
"learning_rate": 8.450000000000001e-05, |
|
"loss": 0.8772, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 5.039011703511053, |
|
"eval_accuracy": 0.8225020975532324, |
|
"eval_loss": 1.0407395362854004, |
|
"eval_runtime": 230.4007, |
|
"eval_samples_per_second": 88.689, |
|
"eval_steps_per_second": 2.773, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 5.201560468140442, |
|
"grad_norm": 2.273552179336548, |
|
"learning_rate": 8.4e-05, |
|
"loss": 0.8715, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 5.201560468140442, |
|
"eval_accuracy": 0.8238052823384368, |
|
"eval_loss": 1.0325181484222412, |
|
"eval_runtime": 230.5347, |
|
"eval_samples_per_second": 88.637, |
|
"eval_steps_per_second": 2.772, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 5.364109232769831, |
|
"grad_norm": 2.884051561355591, |
|
"learning_rate": 8.35e-05, |
|
"loss": 0.8687, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 5.364109232769831, |
|
"eval_accuracy": 0.8243404153748926, |
|
"eval_loss": 1.0298871994018555, |
|
"eval_runtime": 231.4091, |
|
"eval_samples_per_second": 88.302, |
|
"eval_steps_per_second": 2.761, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 5.526657997399219, |
|
"grad_norm": 3.12709379196167, |
|
"learning_rate": 8.3e-05, |
|
"loss": 0.8674, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 5.526657997399219, |
|
"eval_accuracy": 0.8245251637171149, |
|
"eval_loss": 1.0233651399612427, |
|
"eval_runtime": 225.4766, |
|
"eval_samples_per_second": 90.626, |
|
"eval_steps_per_second": 2.834, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 5.689206762028609, |
|
"grad_norm": 2.7715749740600586, |
|
"learning_rate": 8.25e-05, |
|
"loss": 0.8597, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 5.689206762028609, |
|
"eval_accuracy": 0.8253348922363006, |
|
"eval_loss": 1.0265984535217285, |
|
"eval_runtime": 226.4158, |
|
"eval_samples_per_second": 90.25, |
|
"eval_steps_per_second": 2.822, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 5.851755526657997, |
|
"grad_norm": 2.630910873413086, |
|
"learning_rate": 8.2e-05, |
|
"loss": 0.8621, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 5.851755526657997, |
|
"eval_accuracy": 0.8264286938861036, |
|
"eval_loss": 1.025406002998352, |
|
"eval_runtime": 225.265, |
|
"eval_samples_per_second": 90.711, |
|
"eval_steps_per_second": 2.837, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 6.014304291287386, |
|
"grad_norm": 2.960498571395874, |
|
"learning_rate": 8.15e-05, |
|
"loss": 0.8505, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 6.014304291287386, |
|
"eval_accuracy": 0.8266802737227764, |
|
"eval_loss": 1.0281902551651, |
|
"eval_runtime": 225.5813, |
|
"eval_samples_per_second": 90.584, |
|
"eval_steps_per_second": 2.833, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 6.176853055916775, |
|
"grad_norm": 2.7250943183898926, |
|
"learning_rate": 8.1e-05, |
|
"loss": 0.8434, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 6.176853055916775, |
|
"eval_accuracy": 0.8273285977403005, |
|
"eval_loss": 1.019544005393982, |
|
"eval_runtime": 225.1972, |
|
"eval_samples_per_second": 90.738, |
|
"eval_steps_per_second": 2.838, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 6.339401820546164, |
|
"grad_norm": 2.45147967338562, |
|
"learning_rate": 8.05e-05, |
|
"loss": 0.8396, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 6.339401820546164, |
|
"eval_accuracy": 0.8278276606616609, |
|
"eval_loss": 1.016022801399231, |
|
"eval_runtime": 225.1071, |
|
"eval_samples_per_second": 90.775, |
|
"eval_steps_per_second": 2.839, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 6.501950585175552, |
|
"grad_norm": 3.0252089500427246, |
|
"learning_rate": 8e-05, |
|
"loss": 0.849, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 6.501950585175552, |
|
"eval_accuracy": 0.8286753807254127, |
|
"eval_loss": 0.9984493851661682, |
|
"eval_runtime": 225.6293, |
|
"eval_samples_per_second": 90.564, |
|
"eval_steps_per_second": 2.832, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 6.664499349804942, |
|
"grad_norm": 2.79874849319458, |
|
"learning_rate": 7.950000000000001e-05, |
|
"loss": 0.8344, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 6.664499349804942, |
|
"eval_accuracy": 0.8295993429023929, |
|
"eval_loss": 1.008083462715149, |
|
"eval_runtime": 226.4394, |
|
"eval_samples_per_second": 90.24, |
|
"eval_steps_per_second": 2.822, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 6.82704811443433, |
|
"grad_norm": 2.726980447769165, |
|
"learning_rate": 7.900000000000001e-05, |
|
"loss": 0.832, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 6.82704811443433, |
|
"eval_accuracy": 0.8289772346442087, |
|
"eval_loss": 1.0009534358978271, |
|
"eval_runtime": 225.2727, |
|
"eval_samples_per_second": 90.708, |
|
"eval_steps_per_second": 2.837, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 6.989596879063719, |
|
"grad_norm": 2.6177260875701904, |
|
"learning_rate": 7.850000000000001e-05, |
|
"loss": 0.8371, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 6.989596879063719, |
|
"eval_accuracy": 0.828972862081629, |
|
"eval_loss": 1.007731556892395, |
|
"eval_runtime": 225.1985, |
|
"eval_samples_per_second": 90.738, |
|
"eval_steps_per_second": 2.837, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 7.152145643693108, |
|
"grad_norm": 2.511810541152954, |
|
"learning_rate": 7.800000000000001e-05, |
|
"loss": 0.8221, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 7.152145643693108, |
|
"eval_accuracy": 0.8296968824079836, |
|
"eval_loss": 1.001420259475708, |
|
"eval_runtime": 226.3788, |
|
"eval_samples_per_second": 90.265, |
|
"eval_steps_per_second": 2.823, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 7.314694408322497, |
|
"grad_norm": 2.490196466445923, |
|
"learning_rate": 7.75e-05, |
|
"loss": 0.819, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 7.314694408322497, |
|
"eval_accuracy": 0.8315370661967273, |
|
"eval_loss": 1.0057367086410522, |
|
"eval_runtime": 226.9835, |
|
"eval_samples_per_second": 90.024, |
|
"eval_steps_per_second": 2.815, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 7.477243172951885, |
|
"grad_norm": 2.7740466594696045, |
|
"learning_rate": 7.7e-05, |
|
"loss": 0.8211, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 7.477243172951885, |
|
"eval_accuracy": 0.8325044106908798, |
|
"eval_loss": 0.9906172752380371, |
|
"eval_runtime": 226.1902, |
|
"eval_samples_per_second": 90.34, |
|
"eval_steps_per_second": 2.825, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 7.639791937581275, |
|
"grad_norm": 3.0789926052093506, |
|
"learning_rate": 7.65e-05, |
|
"loss": 0.818, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 7.639791937581275, |
|
"eval_accuracy": 0.832777941957673, |
|
"eval_loss": 0.9794951677322388, |
|
"eval_runtime": 226.5187, |
|
"eval_samples_per_second": 90.209, |
|
"eval_steps_per_second": 2.821, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 7.802340702210663, |
|
"grad_norm": 2.452514171600342, |
|
"learning_rate": 7.6e-05, |
|
"loss": 0.8128, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 7.802340702210663, |
|
"eval_accuracy": 0.8325327815535476, |
|
"eval_loss": 0.9824967384338379, |
|
"eval_runtime": 225.9154, |
|
"eval_samples_per_second": 90.45, |
|
"eval_steps_per_second": 2.828, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 7.964889466840052, |
|
"grad_norm": 2.486287832260132, |
|
"learning_rate": 7.55e-05, |
|
"loss": 0.8133, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 7.964889466840052, |
|
"eval_accuracy": 0.8329719850198115, |
|
"eval_loss": 0.9840763211250305, |
|
"eval_runtime": 225.7676, |
|
"eval_samples_per_second": 90.509, |
|
"eval_steps_per_second": 2.83, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 8.12743823146944, |
|
"grad_norm": 2.6238162517547607, |
|
"learning_rate": 7.500000000000001e-05, |
|
"loss": 0.7972, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 8.12743823146944, |
|
"eval_accuracy": 0.8329068936503207, |
|
"eval_loss": 0.9786662459373474, |
|
"eval_runtime": 227.0649, |
|
"eval_samples_per_second": 89.992, |
|
"eval_steps_per_second": 2.814, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 8.289986996098829, |
|
"grad_norm": 2.622385025024414, |
|
"learning_rate": 7.450000000000001e-05, |
|
"loss": 0.8012, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 8.289986996098829, |
|
"eval_accuracy": 0.83353766380846, |
|
"eval_loss": 0.9908860325813293, |
|
"eval_runtime": 225.4511, |
|
"eval_samples_per_second": 90.636, |
|
"eval_steps_per_second": 2.834, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 8.45253576072822, |
|
"grad_norm": 2.3659889698028564, |
|
"learning_rate": 7.4e-05, |
|
"loss": 0.8002, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 8.45253576072822, |
|
"eval_accuracy": 0.834269520268432, |
|
"eval_loss": 0.9886682629585266, |
|
"eval_runtime": 226.2835, |
|
"eval_samples_per_second": 90.303, |
|
"eval_steps_per_second": 2.824, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 8.615084525357608, |
|
"grad_norm": 2.6340997219085693, |
|
"learning_rate": 7.35e-05, |
|
"loss": 0.796, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 8.615084525357608, |
|
"eval_accuracy": 0.83431812146113, |
|
"eval_loss": 1.1027251482009888, |
|
"eval_runtime": 224.7718, |
|
"eval_samples_per_second": 90.91, |
|
"eval_steps_per_second": 2.843, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 8.777633289986996, |
|
"grad_norm": 2.351562976837158, |
|
"learning_rate": 7.3e-05, |
|
"loss": 0.8032, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 8.777633289986996, |
|
"eval_accuracy": 0.83473517326524, |
|
"eval_loss": 0.9701533317565918, |
|
"eval_runtime": 224.7228, |
|
"eval_samples_per_second": 90.93, |
|
"eval_steps_per_second": 2.844, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 8.940182054616384, |
|
"grad_norm": 3.179107189178467, |
|
"learning_rate": 7.25e-05, |
|
"loss": 0.7956, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 8.940182054616384, |
|
"eval_accuracy": 0.8351845716261748, |
|
"eval_loss": 0.978928804397583, |
|
"eval_runtime": 226.3807, |
|
"eval_samples_per_second": 90.264, |
|
"eval_steps_per_second": 2.823, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 9.102730819245775, |
|
"grad_norm": 2.7947020530700684, |
|
"learning_rate": 7.2e-05, |
|
"loss": 0.7897, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 9.102730819245775, |
|
"eval_accuracy": 0.8353527515006416, |
|
"eval_loss": 0.9703938961029053, |
|
"eval_runtime": 227.4785, |
|
"eval_samples_per_second": 89.828, |
|
"eval_steps_per_second": 2.809, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 9.265279583875163, |
|
"grad_norm": 2.4941184520721436, |
|
"learning_rate": 7.15e-05, |
|
"loss": 0.7822, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 9.265279583875163, |
|
"eval_accuracy": 0.8354081248952335, |
|
"eval_loss": 0.985792875289917, |
|
"eval_runtime": 225.177, |
|
"eval_samples_per_second": 90.746, |
|
"eval_steps_per_second": 2.838, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 9.427828348504551, |
|
"grad_norm": 2.623500108718872, |
|
"learning_rate": 7.1e-05, |
|
"loss": 0.7885, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 9.427828348504551, |
|
"eval_accuracy": 0.8367873538892325, |
|
"eval_loss": 0.9710731506347656, |
|
"eval_runtime": 224.2999, |
|
"eval_samples_per_second": 91.101, |
|
"eval_steps_per_second": 2.849, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 9.59037711313394, |
|
"grad_norm": 2.687804698944092, |
|
"learning_rate": 7.05e-05, |
|
"loss": 0.7853, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 9.59037711313394, |
|
"eval_accuracy": 0.8370918540482961, |
|
"eval_loss": 0.9488577246665955, |
|
"eval_runtime": 225.5849, |
|
"eval_samples_per_second": 90.582, |
|
"eval_steps_per_second": 2.833, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 9.752925877763328, |
|
"grad_norm": 2.415875196456909, |
|
"learning_rate": 7e-05, |
|
"loss": 0.7834, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 9.752925877763328, |
|
"eval_accuracy": 0.8376393865857097, |
|
"eval_loss": 0.9489056468009949, |
|
"eval_runtime": 224.5743, |
|
"eval_samples_per_second": 90.99, |
|
"eval_steps_per_second": 2.845, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 9.915474642392718, |
|
"grad_norm": 2.635620355606079, |
|
"learning_rate": 6.95e-05, |
|
"loss": 0.7795, |
|
"step": 30500 |
|
}, |
|
{ |
|
"epoch": 9.915474642392718, |
|
"eval_accuracy": 0.8379052346488646, |
|
"eval_loss": 0.9600683450698853, |
|
"eval_runtime": 227.5569, |
|
"eval_samples_per_second": 89.797, |
|
"eval_steps_per_second": 2.808, |
|
"step": 30500 |
|
}, |
|
{ |
|
"epoch": 10.078023407022107, |
|
"grad_norm": 2.2729740142822266, |
|
"learning_rate": 6.9e-05, |
|
"loss": 0.7784, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 10.078023407022107, |
|
"eval_accuracy": 0.8387711597381718, |
|
"eval_loss": 0.9676991701126099, |
|
"eval_runtime": 227.0441, |
|
"eval_samples_per_second": 90.0, |
|
"eval_steps_per_second": 2.814, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 10.240572171651495, |
|
"grad_norm": 2.665194511413574, |
|
"learning_rate": 6.850000000000001e-05, |
|
"loss": 0.7731, |
|
"step": 31500 |
|
}, |
|
{ |
|
"epoch": 10.240572171651495, |
|
"eval_accuracy": 0.8384039380286287, |
|
"eval_loss": 0.9660561084747314, |
|
"eval_runtime": 225.7122, |
|
"eval_samples_per_second": 90.531, |
|
"eval_steps_per_second": 2.831, |
|
"step": 31500 |
|
}, |
|
{ |
|
"epoch": 10.403120936280883, |
|
"grad_norm": 2.378326892852783, |
|
"learning_rate": 6.800000000000001e-05, |
|
"loss": 0.7732, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 10.403120936280883, |
|
"eval_accuracy": 0.8384703620413777, |
|
"eval_loss": 0.9522095322608948, |
|
"eval_runtime": 226.2237, |
|
"eval_samples_per_second": 90.327, |
|
"eval_steps_per_second": 2.825, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 10.565669700910274, |
|
"grad_norm": 2.3873043060302734, |
|
"learning_rate": 6.750000000000001e-05, |
|
"loss": 0.7675, |
|
"step": 32500 |
|
}, |
|
{ |
|
"epoch": 10.565669700910274, |
|
"eval_accuracy": 0.8387303829039866, |
|
"eval_loss": 0.9506338238716125, |
|
"eval_runtime": 225.3705, |
|
"eval_samples_per_second": 90.668, |
|
"eval_steps_per_second": 2.835, |
|
"step": 32500 |
|
}, |
|
{ |
|
"epoch": 10.728218465539662, |
|
"grad_norm": 2.908627986907959, |
|
"learning_rate": 6.7e-05, |
|
"loss": 0.7632, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 10.728218465539662, |
|
"eval_accuracy": 0.8393342612931991, |
|
"eval_loss": 0.9406708478927612, |
|
"eval_runtime": 226.5163, |
|
"eval_samples_per_second": 90.21, |
|
"eval_steps_per_second": 2.821, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 10.89076723016905, |
|
"grad_norm": 2.353163242340088, |
|
"learning_rate": 6.65e-05, |
|
"loss": 0.7652, |
|
"step": 33500 |
|
}, |
|
{ |
|
"epoch": 10.89076723016905, |
|
"eval_accuracy": 0.8397924279467366, |
|
"eval_loss": 0.9450008869171143, |
|
"eval_runtime": 228.8699, |
|
"eval_samples_per_second": 89.282, |
|
"eval_steps_per_second": 2.792, |
|
"step": 33500 |
|
}, |
|
{ |
|
"epoch": 11.053315994798439, |
|
"grad_norm": 2.5943007469177246, |
|
"learning_rate": 6.6e-05, |
|
"loss": 0.7569, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 11.053315994798439, |
|
"eval_accuracy": 0.8403287583785509, |
|
"eval_loss": 0.9399783611297607, |
|
"eval_runtime": 226.1121, |
|
"eval_samples_per_second": 90.371, |
|
"eval_steps_per_second": 2.826, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 11.215864759427829, |
|
"grad_norm": 2.693864107131958, |
|
"learning_rate": 6.55e-05, |
|
"loss": 0.7594, |
|
"step": 34500 |
|
}, |
|
{ |
|
"epoch": 11.215864759427829, |
|
"eval_accuracy": 0.8401560936743129, |
|
"eval_loss": 0.9480236172676086, |
|
"eval_runtime": 225.4517, |
|
"eval_samples_per_second": 90.636, |
|
"eval_steps_per_second": 2.834, |
|
"step": 34500 |
|
}, |
|
{ |
|
"epoch": 11.378413524057217, |
|
"grad_norm": 2.644212484359741, |
|
"learning_rate": 6.500000000000001e-05, |
|
"loss": 0.7549, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 11.378413524057217, |
|
"eval_accuracy": 0.8397807491674708, |
|
"eval_loss": 0.947754442691803, |
|
"eval_runtime": 224.7666, |
|
"eval_samples_per_second": 90.912, |
|
"eval_steps_per_second": 2.843, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 11.540962288686606, |
|
"grad_norm": 2.548055410385132, |
|
"learning_rate": 6.450000000000001e-05, |
|
"loss": 0.7508, |
|
"step": 35500 |
|
}, |
|
{ |
|
"epoch": 11.540962288686606, |
|
"eval_accuracy": 0.8408923535971057, |
|
"eval_loss": 0.9363004565238953, |
|
"eval_runtime": 226.2948, |
|
"eval_samples_per_second": 90.298, |
|
"eval_steps_per_second": 2.824, |
|
"step": 35500 |
|
}, |
|
{ |
|
"epoch": 11.703511053315994, |
|
"grad_norm": 2.2983460426330566, |
|
"learning_rate": 6.400000000000001e-05, |
|
"loss": 0.7498, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 11.703511053315994, |
|
"eval_accuracy": 0.8415923695179955, |
|
"eval_loss": 0.9559971690177917, |
|
"eval_runtime": 225.5228, |
|
"eval_samples_per_second": 90.607, |
|
"eval_steps_per_second": 2.833, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 11.866059817945384, |
|
"grad_norm": 2.585132122039795, |
|
"learning_rate": 6.35e-05, |
|
"loss": 0.7529, |
|
"step": 36500 |
|
}, |
|
{ |
|
"epoch": 11.866059817945384, |
|
"eval_accuracy": 0.8414500365516611, |
|
"eval_loss": 0.9397149085998535, |
|
"eval_runtime": 225.5184, |
|
"eval_samples_per_second": 90.609, |
|
"eval_steps_per_second": 2.833, |
|
"step": 36500 |
|
}, |
|
{ |
|
"epoch": 12.028608582574773, |
|
"grad_norm": 2.7008893489837646, |
|
"learning_rate": 6.3e-05, |
|
"loss": 0.7499, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 12.028608582574773, |
|
"eval_accuracy": 0.8429048722887895, |
|
"eval_loss": 0.9322752952575684, |
|
"eval_runtime": 226.0814, |
|
"eval_samples_per_second": 90.383, |
|
"eval_steps_per_second": 2.826, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 12.191157347204161, |
|
"grad_norm": 2.234344959259033, |
|
"learning_rate": 6.25e-05, |
|
"loss": 0.7516, |
|
"step": 37500 |
|
}, |
|
{ |
|
"epoch": 12.191157347204161, |
|
"eval_accuracy": 0.8423553938504648, |
|
"eval_loss": 0.9274143576622009, |
|
"eval_runtime": 224.6544, |
|
"eval_samples_per_second": 90.957, |
|
"eval_steps_per_second": 2.844, |
|
"step": 37500 |
|
}, |
|
{ |
|
"epoch": 12.35370611183355, |
|
"grad_norm": 2.38130259513855, |
|
"learning_rate": 6.2e-05, |
|
"loss": 0.742, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 12.35370611183355, |
|
"eval_accuracy": 0.8434239606449652, |
|
"eval_loss": 0.9298020005226135, |
|
"eval_runtime": 224.8881, |
|
"eval_samples_per_second": 90.863, |
|
"eval_steps_per_second": 2.841, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 12.51625487646294, |
|
"grad_norm": 2.4753472805023193, |
|
"learning_rate": 6.15e-05, |
|
"loss": 0.7349, |
|
"step": 38500 |
|
}, |
|
{ |
|
"epoch": 12.51625487646294, |
|
"eval_accuracy": 0.8435835943839733, |
|
"eval_loss": 0.9265833497047424, |
|
"eval_runtime": 225.4856, |
|
"eval_samples_per_second": 90.622, |
|
"eval_steps_per_second": 2.834, |
|
"step": 38500 |
|
}, |
|
{ |
|
"epoch": 12.678803641092328, |
|
"grad_norm": 2.6576738357543945, |
|
"learning_rate": 6.1e-05, |
|
"loss": 0.7408, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 12.678803641092328, |
|
"eval_accuracy": 0.8433502485604528, |
|
"eval_loss": 0.9311428070068359, |
|
"eval_runtime": 226.2881, |
|
"eval_samples_per_second": 90.301, |
|
"eval_steps_per_second": 2.824, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 12.841352405721716, |
|
"grad_norm": 2.318998336791992, |
|
"learning_rate": 6.05e-05, |
|
"loss": 0.742, |
|
"step": 39500 |
|
}, |
|
{ |
|
"epoch": 12.841352405721716, |
|
"eval_accuracy": 0.8439347116351804, |
|
"eval_loss": 0.9278241991996765, |
|
"eval_runtime": 226.17, |
|
"eval_samples_per_second": 90.348, |
|
"eval_steps_per_second": 2.825, |
|
"step": 39500 |
|
}, |
|
{ |
|
"epoch": 13.003901170351105, |
|
"grad_norm": 2.5888776779174805, |
|
"learning_rate": 6e-05, |
|
"loss": 0.735, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 13.003901170351105, |
|
"eval_accuracy": 0.8435309371055906, |
|
"eval_loss": 0.9364249110221863, |
|
"eval_runtime": 226.5769, |
|
"eval_samples_per_second": 90.186, |
|
"eval_steps_per_second": 2.82, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 13.166449934980495, |
|
"grad_norm": 2.5382964611053467, |
|
"learning_rate": 5.95e-05, |
|
"loss": 0.7317, |
|
"step": 40500 |
|
}, |
|
{ |
|
"epoch": 13.166449934980495, |
|
"eval_accuracy": 0.8435685143232774, |
|
"eval_loss": 0.9289753437042236, |
|
"eval_runtime": 226.0444, |
|
"eval_samples_per_second": 90.398, |
|
"eval_steps_per_second": 2.827, |
|
"step": 40500 |
|
}, |
|
{ |
|
"epoch": 13.328998699609883, |
|
"grad_norm": 2.3561949729919434, |
|
"learning_rate": 5.9e-05, |
|
"loss": 0.7287, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 13.328998699609883, |
|
"eval_accuracy": 0.8441468378237561, |
|
"eval_loss": 0.9176314473152161, |
|
"eval_runtime": 225.4845, |
|
"eval_samples_per_second": 90.623, |
|
"eval_steps_per_second": 2.834, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 13.491547464239272, |
|
"grad_norm": 2.4950144290924072, |
|
"learning_rate": 5.85e-05, |
|
"loss": 0.7236, |
|
"step": 41500 |
|
}, |
|
{ |
|
"epoch": 13.491547464239272, |
|
"eval_accuracy": 0.8440490876509515, |
|
"eval_loss": 0.9126904010772705, |
|
"eval_runtime": 225.1525, |
|
"eval_samples_per_second": 90.756, |
|
"eval_steps_per_second": 2.838, |
|
"step": 41500 |
|
}, |
|
{ |
|
"epoch": 13.65409622886866, |
|
"grad_norm": 2.5094358921051025, |
|
"learning_rate": 5.8e-05, |
|
"loss": 0.7271, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 13.65409622886866, |
|
"eval_accuracy": 0.844420508518643, |
|
"eval_loss": 0.9222070574760437, |
|
"eval_runtime": 226.8881, |
|
"eval_samples_per_second": 90.062, |
|
"eval_steps_per_second": 2.816, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 13.81664499349805, |
|
"grad_norm": 2.5358660221099854, |
|
"learning_rate": 5.7499999999999995e-05, |
|
"loss": 0.7247, |
|
"step": 42500 |
|
}, |
|
{ |
|
"epoch": 13.81664499349805, |
|
"eval_accuracy": 0.8447406385701589, |
|
"eval_loss": 0.911141037940979, |
|
"eval_runtime": 226.4876, |
|
"eval_samples_per_second": 90.221, |
|
"eval_steps_per_second": 2.821, |
|
"step": 42500 |
|
}, |
|
{ |
|
"epoch": 13.979193758127439, |
|
"grad_norm": 2.449117422103882, |
|
"learning_rate": 5.6999999999999996e-05, |
|
"loss": 0.724, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 13.979193758127439, |
|
"eval_accuracy": 0.8455180692523965, |
|
"eval_loss": 0.9233959317207336, |
|
"eval_runtime": 226.6454, |
|
"eval_samples_per_second": 90.158, |
|
"eval_steps_per_second": 2.819, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 14.141742522756827, |
|
"grad_norm": 2.9299075603485107, |
|
"learning_rate": 5.65e-05, |
|
"loss": 0.717, |
|
"step": 43500 |
|
}, |
|
{ |
|
"epoch": 14.141742522756827, |
|
"eval_accuracy": 0.8457589583161474, |
|
"eval_loss": 0.9212149381637573, |
|
"eval_runtime": 226.3542, |
|
"eval_samples_per_second": 90.274, |
|
"eval_steps_per_second": 2.823, |
|
"step": 43500 |
|
}, |
|
{ |
|
"epoch": 14.304291287386215, |
|
"grad_norm": 2.705927610397339, |
|
"learning_rate": 5.6000000000000006e-05, |
|
"loss": 0.7209, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 14.304291287386215, |
|
"eval_accuracy": 0.8461540554265815, |
|
"eval_loss": 0.9096006155014038, |
|
"eval_runtime": 225.3071, |
|
"eval_samples_per_second": 90.694, |
|
"eval_steps_per_second": 2.836, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 14.466840052015606, |
|
"grad_norm": 2.0237460136413574, |
|
"learning_rate": 5.550000000000001e-05, |
|
"loss": 0.7143, |
|
"step": 44500 |
|
}, |
|
{ |
|
"epoch": 14.466840052015606, |
|
"eval_accuracy": 0.8467122032637585, |
|
"eval_loss": 0.9058763980865479, |
|
"eval_runtime": 224.7066, |
|
"eval_samples_per_second": 90.936, |
|
"eval_steps_per_second": 2.844, |
|
"step": 44500 |
|
}, |
|
{ |
|
"epoch": 14.629388816644994, |
|
"grad_norm": 2.6412603855133057, |
|
"learning_rate": 5.500000000000001e-05, |
|
"loss": 0.7097, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 14.629388816644994, |
|
"eval_accuracy": 0.8469896352526538, |
|
"eval_loss": 0.9109433889389038, |
|
"eval_runtime": 226.4374, |
|
"eval_samples_per_second": 90.241, |
|
"eval_steps_per_second": 2.822, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 14.791937581274382, |
|
"grad_norm": 2.0750892162323, |
|
"learning_rate": 5.45e-05, |
|
"loss": 0.7167, |
|
"step": 45500 |
|
}, |
|
{ |
|
"epoch": 14.791937581274382, |
|
"eval_accuracy": 0.8459233510762237, |
|
"eval_loss": 0.9634975790977478, |
|
"eval_runtime": 224.623, |
|
"eval_samples_per_second": 90.97, |
|
"eval_steps_per_second": 2.845, |
|
"step": 45500 |
|
}, |
|
{ |
|
"epoch": 14.95448634590377, |
|
"grad_norm": 3.2299835681915283, |
|
"learning_rate": 5.4000000000000005e-05, |
|
"loss": 0.7175, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 14.95448634590377, |
|
"eval_accuracy": 0.8472973119877086, |
|
"eval_loss": 0.9039083123207092, |
|
"eval_runtime": 226.4661, |
|
"eval_samples_per_second": 90.23, |
|
"eval_steps_per_second": 2.822, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 15.117035110533159, |
|
"grad_norm": 2.562220573425293, |
|
"learning_rate": 5.3500000000000006e-05, |
|
"loss": 0.7032, |
|
"step": 46500 |
|
}, |
|
{ |
|
"epoch": 15.117035110533159, |
|
"eval_accuracy": 0.8473520786144324, |
|
"eval_loss": 0.911507785320282, |
|
"eval_runtime": 224.8892, |
|
"eval_samples_per_second": 90.863, |
|
"eval_steps_per_second": 2.841, |
|
"step": 46500 |
|
}, |
|
{ |
|
"epoch": 15.27958387516255, |
|
"grad_norm": 2.8594844341278076, |
|
"learning_rate": 5.300000000000001e-05, |
|
"loss": 0.7082, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 15.27958387516255, |
|
"eval_accuracy": 0.8476350142048673, |
|
"eval_loss": 0.909827470779419, |
|
"eval_runtime": 225.7006, |
|
"eval_samples_per_second": 90.536, |
|
"eval_steps_per_second": 2.831, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 15.442132639791938, |
|
"grad_norm": 2.468425989151001, |
|
"learning_rate": 5.25e-05, |
|
"loss": 0.7041, |
|
"step": 47500 |
|
}, |
|
{ |
|
"epoch": 15.442132639791938, |
|
"eval_accuracy": 0.8478791054352419, |
|
"eval_loss": 0.9049926996231079, |
|
"eval_runtime": 224.8771, |
|
"eval_samples_per_second": 90.867, |
|
"eval_steps_per_second": 2.842, |
|
"step": 47500 |
|
}, |
|
{ |
|
"epoch": 15.604681404421326, |
|
"grad_norm": 2.389726400375366, |
|
"learning_rate": 5.2000000000000004e-05, |
|
"loss": 0.7031, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 15.604681404421326, |
|
"eval_accuracy": 0.8481061886894397, |
|
"eval_loss": 0.890941858291626, |
|
"eval_runtime": 226.989, |
|
"eval_samples_per_second": 90.022, |
|
"eval_steps_per_second": 2.815, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 15.767230169050714, |
|
"grad_norm": 2.2124507427215576, |
|
"learning_rate": 5.1500000000000005e-05, |
|
"loss": 0.6986, |
|
"step": 48500 |
|
}, |
|
{ |
|
"epoch": 15.767230169050714, |
|
"eval_accuracy": 0.8491361761433549, |
|
"eval_loss": 0.9069699048995972, |
|
"eval_runtime": 224.9486, |
|
"eval_samples_per_second": 90.839, |
|
"eval_steps_per_second": 2.841, |
|
"step": 48500 |
|
}, |
|
{ |
|
"epoch": 15.929778933680105, |
|
"grad_norm": 2.2906134128570557, |
|
"learning_rate": 5.1000000000000006e-05, |
|
"loss": 0.707, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 15.929778933680105, |
|
"eval_accuracy": 0.8479968642310016, |
|
"eval_loss": 0.9036478996276855, |
|
"eval_runtime": 225.9102, |
|
"eval_samples_per_second": 90.452, |
|
"eval_steps_per_second": 2.829, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 16.092327698309493, |
|
"grad_norm": 2.6677193641662598, |
|
"learning_rate": 5.05e-05, |
|
"loss": 0.6948, |
|
"step": 49500 |
|
}, |
|
{ |
|
"epoch": 16.092327698309493, |
|
"eval_accuracy": 0.8493272111938716, |
|
"eval_loss": 0.9056188464164734, |
|
"eval_runtime": 225.0003, |
|
"eval_samples_per_second": 90.818, |
|
"eval_steps_per_second": 2.84, |
|
"step": 49500 |
|
}, |
|
{ |
|
"epoch": 16.25487646293888, |
|
"grad_norm": 2.4262144565582275, |
|
"learning_rate": 5e-05, |
|
"loss": 0.6967, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 16.25487646293888, |
|
"eval_accuracy": 0.8485872336104957, |
|
"eval_loss": 0.896799623966217, |
|
"eval_runtime": 224.1595, |
|
"eval_samples_per_second": 91.158, |
|
"eval_steps_per_second": 2.851, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 16.41742522756827, |
|
"grad_norm": 2.316122055053711, |
|
"learning_rate": 4.9500000000000004e-05, |
|
"loss": 0.6977, |
|
"step": 50500 |
|
}, |
|
{ |
|
"epoch": 16.41742522756827, |
|
"eval_accuracy": 0.8495556837644054, |
|
"eval_loss": 0.9065157175064087, |
|
"eval_runtime": 225.8926, |
|
"eval_samples_per_second": 90.459, |
|
"eval_steps_per_second": 2.829, |
|
"step": 50500 |
|
}, |
|
{ |
|
"epoch": 16.579973992197658, |
|
"grad_norm": 2.6691653728485107, |
|
"learning_rate": 4.9e-05, |
|
"loss": 0.6961, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 16.579973992197658, |
|
"eval_accuracy": 0.8500652800696321, |
|
"eval_loss": 0.896096408367157, |
|
"eval_runtime": 232.2659, |
|
"eval_samples_per_second": 87.977, |
|
"eval_steps_per_second": 2.751, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 16.742522756827046, |
|
"grad_norm": 2.475905656814575, |
|
"learning_rate": 4.85e-05, |
|
"loss": 0.6916, |
|
"step": 51500 |
|
}, |
|
{ |
|
"epoch": 16.742522756827046, |
|
"eval_accuracy": 0.8497042599251818, |
|
"eval_loss": 0.8976150155067444, |
|
"eval_runtime": 226.0766, |
|
"eval_samples_per_second": 90.385, |
|
"eval_steps_per_second": 2.826, |
|
"step": 51500 |
|
}, |
|
{ |
|
"epoch": 16.90507152145644, |
|
"grad_norm": 2.786370277404785, |
|
"learning_rate": 4.8e-05, |
|
"loss": 0.6914, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 16.90507152145644, |
|
"eval_accuracy": 0.8496545695743757, |
|
"eval_loss": 0.8938505053520203, |
|
"eval_runtime": 224.6085, |
|
"eval_samples_per_second": 90.976, |
|
"eval_steps_per_second": 2.845, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 17.067620286085827, |
|
"grad_norm": 2.8185195922851562, |
|
"learning_rate": 4.75e-05, |
|
"loss": 0.6843, |
|
"step": 52500 |
|
}, |
|
{ |
|
"epoch": 17.067620286085827, |
|
"eval_accuracy": 0.8506024205488569, |
|
"eval_loss": 0.902123749256134, |
|
"eval_runtime": 225.1853, |
|
"eval_samples_per_second": 90.743, |
|
"eval_steps_per_second": 2.838, |
|
"step": 52500 |
|
}, |
|
{ |
|
"epoch": 17.230169050715215, |
|
"grad_norm": 2.287614107131958, |
|
"learning_rate": 4.7e-05, |
|
"loss": 0.6886, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 17.230169050715215, |
|
"eval_accuracy": 0.8509516099496137, |
|
"eval_loss": 0.8869491815567017, |
|
"eval_runtime": 225.8223, |
|
"eval_samples_per_second": 90.487, |
|
"eval_steps_per_second": 2.83, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 17.392717815344604, |
|
"grad_norm": 2.545166492462158, |
|
"learning_rate": 4.6500000000000005e-05, |
|
"loss": 0.6895, |
|
"step": 53500 |
|
}, |
|
{ |
|
"epoch": 17.392717815344604, |
|
"eval_accuracy": 0.8509741315809264, |
|
"eval_loss": 0.88730388879776, |
|
"eval_runtime": 225.2327, |
|
"eval_samples_per_second": 90.724, |
|
"eval_steps_per_second": 2.837, |
|
"step": 53500 |
|
}, |
|
{ |
|
"epoch": 17.555266579973992, |
|
"grad_norm": 2.133690357208252, |
|
"learning_rate": 4.600000000000001e-05, |
|
"loss": 0.684, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 17.555266579973992, |
|
"eval_accuracy": 0.8506065339944305, |
|
"eval_loss": 0.8796634674072266, |
|
"eval_runtime": 225.9145, |
|
"eval_samples_per_second": 90.45, |
|
"eval_steps_per_second": 2.829, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 17.71781534460338, |
|
"grad_norm": 2.368135690689087, |
|
"learning_rate": 4.55e-05, |
|
"loss": 0.6814, |
|
"step": 54500 |
|
}, |
|
{ |
|
"epoch": 17.71781534460338, |
|
"eval_accuracy": 0.8518810256708096, |
|
"eval_loss": 0.8883038759231567, |
|
"eval_runtime": 226.0516, |
|
"eval_samples_per_second": 90.395, |
|
"eval_steps_per_second": 2.827, |
|
"step": 54500 |
|
}, |
|
{ |
|
"epoch": 17.88036410923277, |
|
"grad_norm": 2.93131685256958, |
|
"learning_rate": 4.5e-05, |
|
"loss": 0.6814, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 17.88036410923277, |
|
"eval_accuracy": 0.8515069138621577, |
|
"eval_loss": 0.8804932832717896, |
|
"eval_runtime": 224.3083, |
|
"eval_samples_per_second": 91.098, |
|
"eval_steps_per_second": 2.849, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 18.042912873862157, |
|
"grad_norm": 2.6115188598632812, |
|
"learning_rate": 4.4500000000000004e-05, |
|
"loss": 0.6825, |
|
"step": 55500 |
|
}, |
|
{ |
|
"epoch": 18.042912873862157, |
|
"eval_accuracy": 0.8524721846400168, |
|
"eval_loss": 0.8792157173156738, |
|
"eval_runtime": 225.8719, |
|
"eval_samples_per_second": 90.467, |
|
"eval_steps_per_second": 2.829, |
|
"step": 55500 |
|
}, |
|
{ |
|
"epoch": 18.20546163849155, |
|
"grad_norm": 3.650696039199829, |
|
"learning_rate": 4.4000000000000006e-05, |
|
"loss": 0.6754, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 18.20546163849155, |
|
"eval_accuracy": 0.8515890870808295, |
|
"eval_loss": 0.8762602210044861, |
|
"eval_runtime": 225.4458, |
|
"eval_samples_per_second": 90.638, |
|
"eval_steps_per_second": 2.834, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 18.368010403120937, |
|
"grad_norm": 2.48811936378479, |
|
"learning_rate": 4.35e-05, |
|
"loss": 0.6694, |
|
"step": 56500 |
|
}, |
|
{ |
|
"epoch": 18.368010403120937, |
|
"eval_accuracy": 0.852083188035357, |
|
"eval_loss": 0.8903321623802185, |
|
"eval_runtime": 231.1517, |
|
"eval_samples_per_second": 88.401, |
|
"eval_steps_per_second": 2.764, |
|
"step": 56500 |
|
}, |
|
{ |
|
"epoch": 18.530559167750326, |
|
"grad_norm": 2.664206027984619, |
|
"learning_rate": 4.3e-05, |
|
"loss": 0.6711, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 18.530559167750326, |
|
"eval_accuracy": 0.8524635038150364, |
|
"eval_loss": 0.8883496522903442, |
|
"eval_runtime": 224.7773, |
|
"eval_samples_per_second": 90.908, |
|
"eval_steps_per_second": 2.843, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 18.693107932379714, |
|
"grad_norm": 2.5020461082458496, |
|
"learning_rate": 4.25e-05, |
|
"loss": 0.6701, |
|
"step": 57500 |
|
}, |
|
{ |
|
"epoch": 18.693107932379714, |
|
"eval_accuracy": 0.8527774358008777, |
|
"eval_loss": 0.8737440705299377, |
|
"eval_runtime": 224.3031, |
|
"eval_samples_per_second": 91.1, |
|
"eval_steps_per_second": 2.849, |
|
"step": 57500 |
|
}, |
|
{ |
|
"epoch": 18.855656697009103, |
|
"grad_norm": 2.341423273086548, |
|
"learning_rate": 4.2e-05, |
|
"loss": 0.6722, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 18.855656697009103, |
|
"eval_accuracy": 0.8529963243635703, |
|
"eval_loss": 0.8735175132751465, |
|
"eval_runtime": 225.4167, |
|
"eval_samples_per_second": 90.65, |
|
"eval_steps_per_second": 2.835, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 19.01820546163849, |
|
"grad_norm": 2.4827890396118164, |
|
"learning_rate": 4.15e-05, |
|
"loss": 0.6723, |
|
"step": 58500 |
|
}, |
|
{ |
|
"epoch": 19.01820546163849, |
|
"eval_accuracy": 0.853864474597493, |
|
"eval_loss": 0.8714447021484375, |
|
"eval_runtime": 225.7934, |
|
"eval_samples_per_second": 90.499, |
|
"eval_steps_per_second": 2.83, |
|
"step": 58500 |
|
}, |
|
{ |
|
"epoch": 19.18075422626788, |
|
"grad_norm": 2.6257293224334717, |
|
"learning_rate": 4.1e-05, |
|
"loss": 0.6623, |
|
"step": 59000 |
|
}, |
|
{ |
|
"epoch": 19.18075422626788, |
|
"eval_accuracy": 0.8535918441752076, |
|
"eval_loss": 0.8707379698753357, |
|
"eval_runtime": 233.0012, |
|
"eval_samples_per_second": 87.699, |
|
"eval_steps_per_second": 2.742, |
|
"step": 59000 |
|
}, |
|
{ |
|
"epoch": 19.343302990897268, |
|
"grad_norm": 2.2881245613098145, |
|
"learning_rate": 4.05e-05, |
|
"loss": 0.6629, |
|
"step": 59500 |
|
}, |
|
{ |
|
"epoch": 19.343302990897268, |
|
"eval_accuracy": 0.8536284370228246, |
|
"eval_loss": 0.8575323820114136, |
|
"eval_runtime": 225.6251, |
|
"eval_samples_per_second": 90.566, |
|
"eval_steps_per_second": 2.832, |
|
"step": 59500 |
|
}, |
|
{ |
|
"epoch": 19.505851755526656, |
|
"grad_norm": 2.485198497772217, |
|
"learning_rate": 4e-05, |
|
"loss": 0.6629, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 19.505851755526656, |
|
"eval_accuracy": 0.8541063694677179, |
|
"eval_loss": 0.8660362958908081, |
|
"eval_runtime": 229.1829, |
|
"eval_samples_per_second": 89.16, |
|
"eval_steps_per_second": 2.788, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 19.668400520156048, |
|
"grad_norm": 2.365445375442505, |
|
"learning_rate": 3.9500000000000005e-05, |
|
"loss": 0.6636, |
|
"step": 60500 |
|
}, |
|
{ |
|
"epoch": 19.668400520156048, |
|
"eval_accuracy": 0.8544101305781451, |
|
"eval_loss": 0.8751095533370972, |
|
"eval_runtime": 226.4875, |
|
"eval_samples_per_second": 90.221, |
|
"eval_steps_per_second": 2.821, |
|
"step": 60500 |
|
}, |
|
{ |
|
"epoch": 19.830949284785437, |
|
"grad_norm": 2.7341248989105225, |
|
"learning_rate": 3.9000000000000006e-05, |
|
"loss": 0.6668, |
|
"step": 61000 |
|
}, |
|
{ |
|
"epoch": 19.830949284785437, |
|
"eval_accuracy": 0.8538102982406334, |
|
"eval_loss": 0.8749545812606812, |
|
"eval_runtime": 225.3871, |
|
"eval_samples_per_second": 90.662, |
|
"eval_steps_per_second": 2.835, |
|
"step": 61000 |
|
}, |
|
{ |
|
"epoch": 19.993498049414825, |
|
"grad_norm": 2.200584888458252, |
|
"learning_rate": 3.85e-05, |
|
"loss": 0.6658, |
|
"step": 61500 |
|
}, |
|
{ |
|
"epoch": 19.993498049414825, |
|
"eval_accuracy": 0.8542889960446646, |
|
"eval_loss": 0.8671930432319641, |
|
"eval_runtime": 225.4174, |
|
"eval_samples_per_second": 90.65, |
|
"eval_steps_per_second": 2.835, |
|
"step": 61500 |
|
}, |
|
{ |
|
"epoch": 20.156046814044213, |
|
"grad_norm": 2.7894246578216553, |
|
"learning_rate": 3.8e-05, |
|
"loss": 0.6616, |
|
"step": 62000 |
|
}, |
|
{ |
|
"epoch": 20.156046814044213, |
|
"eval_accuracy": 0.8548723750905142, |
|
"eval_loss": 0.8662538528442383, |
|
"eval_runtime": 224.781, |
|
"eval_samples_per_second": 90.906, |
|
"eval_steps_per_second": 2.843, |
|
"step": 62000 |
|
}, |
|
{ |
|
"epoch": 20.3185955786736, |
|
"grad_norm": 2.3690404891967773, |
|
"learning_rate": 3.7500000000000003e-05, |
|
"loss": 0.6541, |
|
"step": 62500 |
|
}, |
|
{ |
|
"epoch": 20.3185955786736, |
|
"eval_accuracy": 0.8550845489091575, |
|
"eval_loss": 0.865816593170166, |
|
"eval_runtime": 228.0119, |
|
"eval_samples_per_second": 89.618, |
|
"eval_steps_per_second": 2.802, |
|
"step": 62500 |
|
}, |
|
{ |
|
"epoch": 20.48114434330299, |
|
"grad_norm": 2.6345200538635254, |
|
"learning_rate": 3.7e-05, |
|
"loss": 0.6561, |
|
"step": 63000 |
|
}, |
|
{ |
|
"epoch": 20.48114434330299, |
|
"eval_accuracy": 0.8558111308868751, |
|
"eval_loss": 0.8506866097450256, |
|
"eval_runtime": 225.0923, |
|
"eval_samples_per_second": 90.781, |
|
"eval_steps_per_second": 2.839, |
|
"step": 63000 |
|
}, |
|
{ |
|
"epoch": 20.64369310793238, |
|
"grad_norm": 2.1237897872924805, |
|
"learning_rate": 3.65e-05, |
|
"loss": 0.6578, |
|
"step": 63500 |
|
}, |
|
{ |
|
"epoch": 20.64369310793238, |
|
"eval_accuracy": 0.8554877275609093, |
|
"eval_loss": 0.8685211539268494, |
|
"eval_runtime": 225.3889, |
|
"eval_samples_per_second": 90.661, |
|
"eval_steps_per_second": 2.835, |
|
"step": 63500 |
|
}, |
|
{ |
|
"epoch": 20.806241872561767, |
|
"grad_norm": 2.37259578704834, |
|
"learning_rate": 3.6e-05, |
|
"loss": 0.657, |
|
"step": 64000 |
|
}, |
|
{ |
|
"epoch": 20.806241872561767, |
|
"eval_accuracy": 0.8553885784782709, |
|
"eval_loss": 0.8665293455123901, |
|
"eval_runtime": 225.5872, |
|
"eval_samples_per_second": 90.581, |
|
"eval_steps_per_second": 2.833, |
|
"step": 64000 |
|
}, |
|
{ |
|
"epoch": 20.96879063719116, |
|
"grad_norm": 2.595446825027466, |
|
"learning_rate": 3.55e-05, |
|
"loss": 0.6495, |
|
"step": 64500 |
|
}, |
|
{ |
|
"epoch": 20.96879063719116, |
|
"eval_accuracy": 0.8558350742146078, |
|
"eval_loss": 0.8600612878799438, |
|
"eval_runtime": 225.3383, |
|
"eval_samples_per_second": 90.681, |
|
"eval_steps_per_second": 2.836, |
|
"step": 64500 |
|
}, |
|
{ |
|
"epoch": 21.131339401820547, |
|
"grad_norm": 2.689633846282959, |
|
"learning_rate": 3.5e-05, |
|
"loss": 0.6484, |
|
"step": 65000 |
|
}, |
|
{ |
|
"epoch": 21.131339401820547, |
|
"eval_accuracy": 0.8563225017454439, |
|
"eval_loss": 0.8686124086380005, |
|
"eval_runtime": 225.5265, |
|
"eval_samples_per_second": 90.606, |
|
"eval_steps_per_second": 2.833, |
|
"step": 65000 |
|
}, |
|
{ |
|
"epoch": 21.293888166449936, |
|
"grad_norm": 2.4909982681274414, |
|
"learning_rate": 3.45e-05, |
|
"loss": 0.6429, |
|
"step": 65500 |
|
}, |
|
{ |
|
"epoch": 21.293888166449936, |
|
"eval_accuracy": 0.8557131214639457, |
|
"eval_loss": 0.86305832862854, |
|
"eval_runtime": 225.2213, |
|
"eval_samples_per_second": 90.729, |
|
"eval_steps_per_second": 2.837, |
|
"step": 65500 |
|
}, |
|
{ |
|
"epoch": 21.456436931079324, |
|
"grad_norm": 2.7555160522460938, |
|
"learning_rate": 3.4000000000000007e-05, |
|
"loss": 0.6523, |
|
"step": 66000 |
|
}, |
|
{ |
|
"epoch": 21.456436931079324, |
|
"eval_accuracy": 0.8565990504258199, |
|
"eval_loss": 0.8672294616699219, |
|
"eval_runtime": 225.3745, |
|
"eval_samples_per_second": 90.667, |
|
"eval_steps_per_second": 2.835, |
|
"step": 66000 |
|
}, |
|
{ |
|
"epoch": 21.618985695708712, |
|
"grad_norm": 2.4315414428710938, |
|
"learning_rate": 3.35e-05, |
|
"loss": 0.6384, |
|
"step": 66500 |
|
}, |
|
{ |
|
"epoch": 21.618985695708712, |
|
"eval_accuracy": 0.8567521705315332, |
|
"eval_loss": 0.8545786738395691, |
|
"eval_runtime": 226.5184, |
|
"eval_samples_per_second": 90.209, |
|
"eval_steps_per_second": 2.821, |
|
"step": 66500 |
|
}, |
|
{ |
|
"epoch": 21.7815344603381, |
|
"grad_norm": 2.373359441757202, |
|
"learning_rate": 3.3e-05, |
|
"loss": 0.6512, |
|
"step": 67000 |
|
}, |
|
{ |
|
"epoch": 21.7815344603381, |
|
"eval_accuracy": 0.8568411511443462, |
|
"eval_loss": 0.8615005612373352, |
|
"eval_runtime": 226.7483, |
|
"eval_samples_per_second": 90.118, |
|
"eval_steps_per_second": 2.818, |
|
"step": 67000 |
|
}, |
|
{ |
|
"epoch": 21.94408322496749, |
|
"grad_norm": 2.5687010288238525, |
|
"learning_rate": 3.2500000000000004e-05, |
|
"loss": 0.6452, |
|
"step": 67500 |
|
}, |
|
{ |
|
"epoch": 21.94408322496749, |
|
"eval_accuracy": 0.856976936965993, |
|
"eval_loss": 0.8514290452003479, |
|
"eval_runtime": 224.8277, |
|
"eval_samples_per_second": 90.887, |
|
"eval_steps_per_second": 2.842, |
|
"step": 67500 |
|
}, |
|
{ |
|
"epoch": 22.106631989596877, |
|
"grad_norm": 2.3000574111938477, |
|
"learning_rate": 3.2000000000000005e-05, |
|
"loss": 0.6388, |
|
"step": 68000 |
|
}, |
|
{ |
|
"epoch": 22.106631989596877, |
|
"eval_accuracy": 0.8576112543945463, |
|
"eval_loss": 0.8523911833763123, |
|
"eval_runtime": 225.5646, |
|
"eval_samples_per_second": 90.59, |
|
"eval_steps_per_second": 2.833, |
|
"step": 68000 |
|
}, |
|
{ |
|
"epoch": 22.26918075422627, |
|
"grad_norm": 2.5039753913879395, |
|
"learning_rate": 3.15e-05, |
|
"loss": 0.6382, |
|
"step": 68500 |
|
}, |
|
{ |
|
"epoch": 22.26918075422627, |
|
"eval_accuracy": 0.8567441321838508, |
|
"eval_loss": 0.8560938835144043, |
|
"eval_runtime": 226.9717, |
|
"eval_samples_per_second": 90.029, |
|
"eval_steps_per_second": 2.815, |
|
"step": 68500 |
|
}, |
|
{ |
|
"epoch": 22.431729518855658, |
|
"grad_norm": 2.6623408794403076, |
|
"learning_rate": 3.1e-05, |
|
"loss": 0.6408, |
|
"step": 69000 |
|
}, |
|
{ |
|
"epoch": 22.431729518855658, |
|
"eval_accuracy": 0.8576559655770156, |
|
"eval_loss": 0.854763925075531, |
|
"eval_runtime": 226.0318, |
|
"eval_samples_per_second": 90.403, |
|
"eval_steps_per_second": 2.827, |
|
"step": 69000 |
|
}, |
|
{ |
|
"epoch": 22.594278283485046, |
|
"grad_norm": 2.3223702907562256, |
|
"learning_rate": 3.05e-05, |
|
"loss": 0.6433, |
|
"step": 69500 |
|
}, |
|
{ |
|
"epoch": 22.594278283485046, |
|
"eval_accuracy": 0.8581642158372993, |
|
"eval_loss": 0.8585615754127502, |
|
"eval_runtime": 225.6463, |
|
"eval_samples_per_second": 90.558, |
|
"eval_steps_per_second": 2.832, |
|
"step": 69500 |
|
}, |
|
{ |
|
"epoch": 22.756827048114435, |
|
"grad_norm": 2.545360803604126, |
|
"learning_rate": 3e-05, |
|
"loss": 0.6371, |
|
"step": 70000 |
|
}, |
|
{ |
|
"epoch": 22.756827048114435, |
|
"eval_accuracy": 0.8583454058165224, |
|
"eval_loss": 0.8482581973075867, |
|
"eval_runtime": 225.5921, |
|
"eval_samples_per_second": 90.579, |
|
"eval_steps_per_second": 2.833, |
|
"step": 70000 |
|
}, |
|
{ |
|
"epoch": 22.919375812743823, |
|
"grad_norm": 2.18627667427063, |
|
"learning_rate": 2.95e-05, |
|
"loss": 0.6331, |
|
"step": 70500 |
|
}, |
|
{ |
|
"epoch": 22.919375812743823, |
|
"eval_accuracy": 0.857949745807387, |
|
"eval_loss": 0.8507857918739319, |
|
"eval_runtime": 226.8073, |
|
"eval_samples_per_second": 90.094, |
|
"eval_steps_per_second": 2.817, |
|
"step": 70500 |
|
}, |
|
{ |
|
"epoch": 23.08192457737321, |
|
"grad_norm": 2.8513870239257812, |
|
"learning_rate": 2.9e-05, |
|
"loss": 0.6393, |
|
"step": 71000 |
|
}, |
|
{ |
|
"epoch": 23.08192457737321, |
|
"eval_accuracy": 0.8573411368207433, |
|
"eval_loss": 0.8502649664878845, |
|
"eval_runtime": 227.8606, |
|
"eval_samples_per_second": 89.678, |
|
"eval_steps_per_second": 2.804, |
|
"step": 71000 |
|
}, |
|
{ |
|
"epoch": 23.2444733420026, |
|
"grad_norm": 2.961089611053467, |
|
"learning_rate": 2.8499999999999998e-05, |
|
"loss": 0.6269, |
|
"step": 71500 |
|
}, |
|
{ |
|
"epoch": 23.2444733420026, |
|
"eval_accuracy": 0.8582647966293633, |
|
"eval_loss": 0.8488523364067078, |
|
"eval_runtime": 226.4431, |
|
"eval_samples_per_second": 90.239, |
|
"eval_steps_per_second": 2.822, |
|
"step": 71500 |
|
}, |
|
{ |
|
"epoch": 23.407022106631988, |
|
"grad_norm": 2.448005437850952, |
|
"learning_rate": 2.8000000000000003e-05, |
|
"loss": 0.6284, |
|
"step": 72000 |
|
}, |
|
{ |
|
"epoch": 23.407022106631988, |
|
"eval_accuracy": 0.8588356517259145, |
|
"eval_loss": 0.8428735136985779, |
|
"eval_runtime": 226.3587, |
|
"eval_samples_per_second": 90.273, |
|
"eval_steps_per_second": 2.823, |
|
"step": 72000 |
|
}, |
|
{ |
|
"epoch": 23.56957087126138, |
|
"grad_norm": 2.4801974296569824, |
|
"learning_rate": 2.7500000000000004e-05, |
|
"loss": 0.6311, |
|
"step": 72500 |
|
}, |
|
{ |
|
"epoch": 23.56957087126138, |
|
"eval_accuracy": 0.8585083391102231, |
|
"eval_loss": 0.8391257524490356, |
|
"eval_runtime": 225.3955, |
|
"eval_samples_per_second": 90.658, |
|
"eval_steps_per_second": 2.835, |
|
"step": 72500 |
|
}, |
|
{ |
|
"epoch": 23.73211963589077, |
|
"grad_norm": 2.4415576457977295, |
|
"learning_rate": 2.7000000000000002e-05, |
|
"loss": 0.6333, |
|
"step": 73000 |
|
}, |
|
{ |
|
"epoch": 23.73211963589077, |
|
"eval_accuracy": 0.8584229812472617, |
|
"eval_loss": 0.8474059104919434, |
|
"eval_runtime": 226.7984, |
|
"eval_samples_per_second": 90.098, |
|
"eval_steps_per_second": 2.817, |
|
"step": 73000 |
|
}, |
|
{ |
|
"epoch": 23.894668400520157, |
|
"grad_norm": 2.5334153175354004, |
|
"learning_rate": 2.6500000000000004e-05, |
|
"loss": 0.6291, |
|
"step": 73500 |
|
}, |
|
{ |
|
"epoch": 23.894668400520157, |
|
"eval_accuracy": 0.8590801539747378, |
|
"eval_loss": 0.837401270866394, |
|
"eval_runtime": 225.1842, |
|
"eval_samples_per_second": 90.743, |
|
"eval_steps_per_second": 2.838, |
|
"step": 73500 |
|
}, |
|
{ |
|
"epoch": 24.057217165149545, |
|
"grad_norm": 2.569241762161255, |
|
"learning_rate": 2.6000000000000002e-05, |
|
"loss": 0.6255, |
|
"step": 74000 |
|
}, |
|
{ |
|
"epoch": 24.057217165149545, |
|
"eval_accuracy": 0.8589290048653603, |
|
"eval_loss": 0.8332562446594238, |
|
"eval_runtime": 229.3684, |
|
"eval_samples_per_second": 89.088, |
|
"eval_steps_per_second": 2.786, |
|
"step": 74000 |
|
}, |
|
{ |
|
"epoch": 24.219765929778934, |
|
"grad_norm": 2.526848316192627, |
|
"learning_rate": 2.5500000000000003e-05, |
|
"loss": 0.6236, |
|
"step": 74500 |
|
}, |
|
{ |
|
"epoch": 24.219765929778934, |
|
"eval_accuracy": 0.8597087528316705, |
|
"eval_loss": 0.848160445690155, |
|
"eval_runtime": 226.4635, |
|
"eval_samples_per_second": 90.231, |
|
"eval_steps_per_second": 2.822, |
|
"step": 74500 |
|
}, |
|
{ |
|
"epoch": 24.382314694408322, |
|
"grad_norm": 2.655872106552124, |
|
"learning_rate": 2.5e-05, |
|
"loss": 0.624, |
|
"step": 75000 |
|
}, |
|
{ |
|
"epoch": 24.382314694408322, |
|
"eval_accuracy": 0.8598462984123302, |
|
"eval_loss": 0.8461793065071106, |
|
"eval_runtime": 225.3124, |
|
"eval_samples_per_second": 90.692, |
|
"eval_steps_per_second": 2.836, |
|
"step": 75000 |
|
}, |
|
{ |
|
"epoch": 24.54486345903771, |
|
"grad_norm": 2.109790325164795, |
|
"learning_rate": 2.45e-05, |
|
"loss": 0.6178, |
|
"step": 75500 |
|
}, |
|
{ |
|
"epoch": 24.54486345903771, |
|
"eval_accuracy": 0.8595766101602678, |
|
"eval_loss": 0.8289109468460083, |
|
"eval_runtime": 226.5489, |
|
"eval_samples_per_second": 90.197, |
|
"eval_steps_per_second": 2.821, |
|
"step": 75500 |
|
}, |
|
{ |
|
"epoch": 24.7074122236671, |
|
"grad_norm": 2.4060940742492676, |
|
"learning_rate": 2.4e-05, |
|
"loss": 0.6166, |
|
"step": 76000 |
|
}, |
|
{ |
|
"epoch": 24.7074122236671, |
|
"eval_accuracy": 0.8606532868210031, |
|
"eval_loss": 0.8234532475471497, |
|
"eval_runtime": 225.3276, |
|
"eval_samples_per_second": 90.686, |
|
"eval_steps_per_second": 2.836, |
|
"step": 76000 |
|
}, |
|
{ |
|
"epoch": 24.86996098829649, |
|
"grad_norm": 2.24013352394104, |
|
"learning_rate": 2.35e-05, |
|
"loss": 0.6247, |
|
"step": 76500 |
|
}, |
|
{ |
|
"epoch": 24.86996098829649, |
|
"eval_accuracy": 0.8600803414966184, |
|
"eval_loss": 0.8327089548110962, |
|
"eval_runtime": 227.0917, |
|
"eval_samples_per_second": 89.981, |
|
"eval_steps_per_second": 2.814, |
|
"step": 76500 |
|
}, |
|
{ |
|
"epoch": 25.03250975292588, |
|
"grad_norm": 2.370615005493164, |
|
"learning_rate": 2.3000000000000003e-05, |
|
"loss": 0.624, |
|
"step": 77000 |
|
}, |
|
{ |
|
"epoch": 25.03250975292588, |
|
"eval_accuracy": 0.8600759552685691, |
|
"eval_loss": 0.8379160761833191, |
|
"eval_runtime": 224.9721, |
|
"eval_samples_per_second": 90.829, |
|
"eval_steps_per_second": 2.84, |
|
"step": 77000 |
|
}, |
|
{ |
|
"epoch": 25.195058517555267, |
|
"grad_norm": 2.192373514175415, |
|
"learning_rate": 2.25e-05, |
|
"loss": 0.6161, |
|
"step": 77500 |
|
}, |
|
{ |
|
"epoch": 25.195058517555267, |
|
"eval_accuracy": 0.8597079760637308, |
|
"eval_loss": 0.8407602906227112, |
|
"eval_runtime": 225.6861, |
|
"eval_samples_per_second": 90.542, |
|
"eval_steps_per_second": 2.831, |
|
"step": 77500 |
|
}, |
|
{ |
|
"epoch": 25.357607282184656, |
|
"grad_norm": 2.4751832485198975, |
|
"learning_rate": 2.2000000000000003e-05, |
|
"loss": 0.6142, |
|
"step": 78000 |
|
}, |
|
{ |
|
"epoch": 25.357607282184656, |
|
"eval_accuracy": 0.8608490074683309, |
|
"eval_loss": 0.8316988348960876, |
|
"eval_runtime": 225.4697, |
|
"eval_samples_per_second": 90.629, |
|
"eval_steps_per_second": 2.834, |
|
"step": 78000 |
|
}, |
|
{ |
|
"epoch": 25.520156046814044, |
|
"grad_norm": 2.737602472305298, |
|
"learning_rate": 2.15e-05, |
|
"loss": 0.6193, |
|
"step": 78500 |
|
}, |
|
{ |
|
"epoch": 25.520156046814044, |
|
"eval_accuracy": 0.860743624097801, |
|
"eval_loss": 0.8356801867485046, |
|
"eval_runtime": 226.5781, |
|
"eval_samples_per_second": 90.185, |
|
"eval_steps_per_second": 2.82, |
|
"step": 78500 |
|
}, |
|
{ |
|
"epoch": 25.682704811443433, |
|
"grad_norm": 2.1464016437530518, |
|
"learning_rate": 2.1e-05, |
|
"loss": 0.6117, |
|
"step": 79000 |
|
}, |
|
{ |
|
"epoch": 25.682704811443433, |
|
"eval_accuracy": 0.8607643217464414, |
|
"eval_loss": 0.8443505167961121, |
|
"eval_runtime": 225.3999, |
|
"eval_samples_per_second": 90.657, |
|
"eval_steps_per_second": 2.835, |
|
"step": 79000 |
|
}, |
|
{ |
|
"epoch": 25.84525357607282, |
|
"grad_norm": 2.244690418243408, |
|
"learning_rate": 2.05e-05, |
|
"loss": 0.6155, |
|
"step": 79500 |
|
}, |
|
{ |
|
"epoch": 25.84525357607282, |
|
"eval_accuracy": 0.8608062892739589, |
|
"eval_loss": 0.8309040665626526, |
|
"eval_runtime": 227.6535, |
|
"eval_samples_per_second": 89.759, |
|
"eval_steps_per_second": 2.807, |
|
"step": 79500 |
|
}, |
|
{ |
|
"epoch": 26.00780234070221, |
|
"grad_norm": 2.2778093814849854, |
|
"learning_rate": 2e-05, |
|
"loss": 0.6159, |
|
"step": 80000 |
|
}, |
|
{ |
|
"epoch": 26.00780234070221, |
|
"eval_accuracy": 0.860849372733563, |
|
"eval_loss": 0.8468282222747803, |
|
"eval_runtime": 226.4415, |
|
"eval_samples_per_second": 90.24, |
|
"eval_steps_per_second": 2.822, |
|
"step": 80000 |
|
}, |
|
{ |
|
"epoch": 26.170351105331598, |
|
"grad_norm": 2.04994535446167, |
|
"learning_rate": 1.9500000000000003e-05, |
|
"loss": 0.6095, |
|
"step": 80500 |
|
}, |
|
{ |
|
"epoch": 26.170351105331598, |
|
"eval_accuracy": 0.8617138336966451, |
|
"eval_loss": 0.8285297155380249, |
|
"eval_runtime": 224.8524, |
|
"eval_samples_per_second": 90.877, |
|
"eval_steps_per_second": 2.842, |
|
"step": 80500 |
|
}, |
|
{ |
|
"epoch": 26.33289986996099, |
|
"grad_norm": 2.437809944152832, |
|
"learning_rate": 1.9e-05, |
|
"loss": 0.6105, |
|
"step": 81000 |
|
}, |
|
{ |
|
"epoch": 26.33289986996099, |
|
"eval_accuracy": 0.8610518577000663, |
|
"eval_loss": 0.8272110223770142, |
|
"eval_runtime": 225.6868, |
|
"eval_samples_per_second": 90.541, |
|
"eval_steps_per_second": 2.831, |
|
"step": 81000 |
|
}, |
|
{ |
|
"epoch": 26.495448634590378, |
|
"grad_norm": 2.2250375747680664, |
|
"learning_rate": 1.85e-05, |
|
"loss": 0.6087, |
|
"step": 81500 |
|
}, |
|
{ |
|
"epoch": 26.495448634590378, |
|
"eval_accuracy": 0.8626399458651031, |
|
"eval_loss": 0.812351644039154, |
|
"eval_runtime": 226.8612, |
|
"eval_samples_per_second": 90.073, |
|
"eval_steps_per_second": 2.817, |
|
"step": 81500 |
|
}, |
|
{ |
|
"epoch": 26.657997399219767, |
|
"grad_norm": 2.060518980026245, |
|
"learning_rate": 1.8e-05, |
|
"loss": 0.6151, |
|
"step": 82000 |
|
}, |
|
{ |
|
"epoch": 26.657997399219767, |
|
"eval_accuracy": 0.8616146515518304, |
|
"eval_loss": 0.8262282609939575, |
|
"eval_runtime": 226.3417, |
|
"eval_samples_per_second": 90.279, |
|
"eval_steps_per_second": 2.823, |
|
"step": 82000 |
|
}, |
|
{ |
|
"epoch": 26.820546163849155, |
|
"grad_norm": 2.0575308799743652, |
|
"learning_rate": 1.75e-05, |
|
"loss": 0.6095, |
|
"step": 82500 |
|
}, |
|
{ |
|
"epoch": 26.820546163849155, |
|
"eval_accuracy": 0.8622129077230878, |
|
"eval_loss": 0.8227624893188477, |
|
"eval_runtime": 228.246, |
|
"eval_samples_per_second": 89.526, |
|
"eval_steps_per_second": 2.8, |
|
"step": 82500 |
|
}, |
|
{ |
|
"epoch": 26.983094928478543, |
|
"grad_norm": 2.253310203552246, |
|
"learning_rate": 1.7000000000000003e-05, |
|
"loss": 0.6, |
|
"step": 83000 |
|
}, |
|
{ |
|
"epoch": 26.983094928478543, |
|
"eval_accuracy": 0.8627496487013281, |
|
"eval_loss": 0.8195393085479736, |
|
"eval_runtime": 226.7315, |
|
"eval_samples_per_second": 90.124, |
|
"eval_steps_per_second": 2.818, |
|
"step": 83000 |
|
}, |
|
{ |
|
"epoch": 27.14564369310793, |
|
"grad_norm": 2.459129810333252, |
|
"learning_rate": 1.65e-05, |
|
"loss": 0.6013, |
|
"step": 83500 |
|
}, |
|
{ |
|
"epoch": 27.14564369310793, |
|
"eval_accuracy": 0.8624795555933904, |
|
"eval_loss": 0.8182792663574219, |
|
"eval_runtime": 225.0241, |
|
"eval_samples_per_second": 90.808, |
|
"eval_steps_per_second": 2.84, |
|
"step": 83500 |
|
}, |
|
{ |
|
"epoch": 27.30819245773732, |
|
"grad_norm": 2.1593105792999268, |
|
"learning_rate": 1.6000000000000003e-05, |
|
"loss": 0.6001, |
|
"step": 84000 |
|
}, |
|
{ |
|
"epoch": 27.30819245773732, |
|
"eval_accuracy": 0.8627880443457693, |
|
"eval_loss": 0.8216105103492737, |
|
"eval_runtime": 224.8609, |
|
"eval_samples_per_second": 90.874, |
|
"eval_steps_per_second": 2.842, |
|
"step": 84000 |
|
}, |
|
{ |
|
"epoch": 27.47074122236671, |
|
"grad_norm": 2.086055040359497, |
|
"learning_rate": 1.55e-05, |
|
"loss": 0.6013, |
|
"step": 84500 |
|
}, |
|
{ |
|
"epoch": 27.47074122236671, |
|
"eval_accuracy": 0.8621898394825607, |
|
"eval_loss": 0.8210575580596924, |
|
"eval_runtime": 226.231, |
|
"eval_samples_per_second": 90.324, |
|
"eval_steps_per_second": 2.825, |
|
"step": 84500 |
|
}, |
|
{ |
|
"epoch": 27.6332899869961, |
|
"grad_norm": 2.5327186584472656, |
|
"learning_rate": 1.5e-05, |
|
"loss": 0.6058, |
|
"step": 85000 |
|
}, |
|
{ |
|
"epoch": 27.6332899869961, |
|
"eval_accuracy": 0.8627229258499225, |
|
"eval_loss": 0.8185027241706848, |
|
"eval_runtime": 226.1064, |
|
"eval_samples_per_second": 90.373, |
|
"eval_steps_per_second": 2.826, |
|
"step": 85000 |
|
}, |
|
{ |
|
"epoch": 27.79583875162549, |
|
"grad_norm": 2.3839502334594727, |
|
"learning_rate": 1.45e-05, |
|
"loss": 0.6042, |
|
"step": 85500 |
|
}, |
|
{ |
|
"epoch": 27.79583875162549, |
|
"eval_accuracy": 0.8627582312704439, |
|
"eval_loss": 0.8231362104415894, |
|
"eval_runtime": 228.8409, |
|
"eval_samples_per_second": 89.293, |
|
"eval_steps_per_second": 2.792, |
|
"step": 85500 |
|
}, |
|
{ |
|
"epoch": 27.958387516254877, |
|
"grad_norm": 2.195699691772461, |
|
"learning_rate": 1.4000000000000001e-05, |
|
"loss": 0.5997, |
|
"step": 86000 |
|
}, |
|
{ |
|
"epoch": 27.958387516254877, |
|
"eval_accuracy": 0.8631414086547167, |
|
"eval_loss": 0.8138222694396973, |
|
"eval_runtime": 225.1659, |
|
"eval_samples_per_second": 90.751, |
|
"eval_steps_per_second": 2.838, |
|
"step": 86000 |
|
}, |
|
{ |
|
"epoch": 28.120936280884266, |
|
"grad_norm": 2.942133665084839, |
|
"learning_rate": 1.3500000000000001e-05, |
|
"loss": 0.5976, |
|
"step": 86500 |
|
}, |
|
{ |
|
"epoch": 28.120936280884266, |
|
"eval_accuracy": 0.8629241152120203, |
|
"eval_loss": 0.8277115225791931, |
|
"eval_runtime": 226.384, |
|
"eval_samples_per_second": 90.263, |
|
"eval_steps_per_second": 2.823, |
|
"step": 86500 |
|
}, |
|
{ |
|
"epoch": 28.283485045513654, |
|
"grad_norm": 2.2104339599609375, |
|
"learning_rate": 1.3000000000000001e-05, |
|
"loss": 0.6005, |
|
"step": 87000 |
|
}, |
|
{ |
|
"epoch": 28.283485045513654, |
|
"eval_accuracy": 0.8635328035684335, |
|
"eval_loss": 0.8249954581260681, |
|
"eval_runtime": 225.8607, |
|
"eval_samples_per_second": 90.472, |
|
"eval_steps_per_second": 2.829, |
|
"step": 87000 |
|
}, |
|
{ |
|
"epoch": 28.446033810143042, |
|
"grad_norm": 2.4792349338531494, |
|
"learning_rate": 1.25e-05, |
|
"loss": 0.5964, |
|
"step": 87500 |
|
}, |
|
{ |
|
"epoch": 28.446033810143042, |
|
"eval_accuracy": 0.8631331114250733, |
|
"eval_loss": 0.8169623613357544, |
|
"eval_runtime": 224.4723, |
|
"eval_samples_per_second": 91.031, |
|
"eval_steps_per_second": 2.847, |
|
"step": 87500 |
|
}, |
|
{ |
|
"epoch": 28.60858257477243, |
|
"grad_norm": 2.2584621906280518, |
|
"learning_rate": 1.2e-05, |
|
"loss": 0.5978, |
|
"step": 88000 |
|
}, |
|
{ |
|
"epoch": 28.60858257477243, |
|
"eval_accuracy": 0.8637502162788443, |
|
"eval_loss": 0.8240325450897217, |
|
"eval_runtime": 226.0844, |
|
"eval_samples_per_second": 90.382, |
|
"eval_steps_per_second": 2.826, |
|
"step": 88000 |
|
}, |
|
{ |
|
"epoch": 28.77113133940182, |
|
"grad_norm": 2.4756360054016113, |
|
"learning_rate": 1.1500000000000002e-05, |
|
"loss": 0.5933, |
|
"step": 88500 |
|
}, |
|
{ |
|
"epoch": 28.77113133940182, |
|
"eval_accuracy": 0.863286343484686, |
|
"eval_loss": 0.821010947227478, |
|
"eval_runtime": 233.6708, |
|
"eval_samples_per_second": 87.448, |
|
"eval_steps_per_second": 2.735, |
|
"step": 88500 |
|
}, |
|
{ |
|
"epoch": 28.93368010403121, |
|
"grad_norm": 2.4466359615325928, |
|
"learning_rate": 1.1000000000000001e-05, |
|
"loss": 0.595, |
|
"step": 89000 |
|
}, |
|
{ |
|
"epoch": 28.93368010403121, |
|
"eval_accuracy": 0.8634251119288006, |
|
"eval_loss": 0.821042001247406, |
|
"eval_runtime": 225.7222, |
|
"eval_samples_per_second": 90.527, |
|
"eval_steps_per_second": 2.831, |
|
"step": 89000 |
|
}, |
|
{ |
|
"epoch": 29.0962288686606, |
|
"grad_norm": 2.5361807346343994, |
|
"learning_rate": 1.05e-05, |
|
"loss": 0.5941, |
|
"step": 89500 |
|
}, |
|
{ |
|
"epoch": 29.0962288686606, |
|
"eval_accuracy": 0.8634033694143568, |
|
"eval_loss": 0.8251069188117981, |
|
"eval_runtime": 226.448, |
|
"eval_samples_per_second": 90.237, |
|
"eval_steps_per_second": 2.822, |
|
"step": 89500 |
|
}, |
|
{ |
|
"epoch": 29.258777633289988, |
|
"grad_norm": 2.5238239765167236, |
|
"learning_rate": 1e-05, |
|
"loss": 0.5988, |
|
"step": 90000 |
|
}, |
|
{ |
|
"epoch": 29.258777633289988, |
|
"eval_accuracy": 0.8638910896653752, |
|
"eval_loss": 0.8146407604217529, |
|
"eval_runtime": 222.0422, |
|
"eval_samples_per_second": 92.028, |
|
"eval_steps_per_second": 2.878, |
|
"step": 90000 |
|
}, |
|
{ |
|
"epoch": 29.421326397919376, |
|
"grad_norm": 2.291041851043701, |
|
"learning_rate": 9.5e-06, |
|
"loss": 0.5923, |
|
"step": 90500 |
|
}, |
|
{ |
|
"epoch": 29.421326397919376, |
|
"eval_accuracy": 0.8637874627536761, |
|
"eval_loss": 0.8178155422210693, |
|
"eval_runtime": 221.0933, |
|
"eval_samples_per_second": 92.423, |
|
"eval_steps_per_second": 2.89, |
|
"step": 90500 |
|
}, |
|
{ |
|
"epoch": 29.583875162548765, |
|
"grad_norm": 2.601836681365967, |
|
"learning_rate": 9e-06, |
|
"loss": 0.5887, |
|
"step": 91000 |
|
}, |
|
{ |
|
"epoch": 29.583875162548765, |
|
"eval_accuracy": 0.864438274609619, |
|
"eval_loss": 0.8162264823913574, |
|
"eval_runtime": 228.6953, |
|
"eval_samples_per_second": 89.35, |
|
"eval_steps_per_second": 2.794, |
|
"step": 91000 |
|
}, |
|
{ |
|
"epoch": 29.746423927178153, |
|
"grad_norm": 2.56785249710083, |
|
"learning_rate": 8.500000000000002e-06, |
|
"loss": 0.5833, |
|
"step": 91500 |
|
}, |
|
{ |
|
"epoch": 29.746423927178153, |
|
"eval_accuracy": 0.8640183304537222, |
|
"eval_loss": 0.8157439827919006, |
|
"eval_runtime": 224.1585, |
|
"eval_samples_per_second": 91.159, |
|
"eval_steps_per_second": 2.851, |
|
"step": 91500 |
|
}, |
|
{ |
|
"epoch": 29.90897269180754, |
|
"grad_norm": 2.409670114517212, |
|
"learning_rate": 8.000000000000001e-06, |
|
"loss": 0.5951, |
|
"step": 92000 |
|
}, |
|
{ |
|
"epoch": 29.90897269180754, |
|
"eval_accuracy": 0.8643441985332161, |
|
"eval_loss": 0.812107264995575, |
|
"eval_runtime": 221.533, |
|
"eval_samples_per_second": 92.239, |
|
"eval_steps_per_second": 2.884, |
|
"step": 92000 |
|
}, |
|
{ |
|
"epoch": 30.07152145643693, |
|
"grad_norm": 2.19612717628479, |
|
"learning_rate": 7.5e-06, |
|
"loss": 0.5928, |
|
"step": 92500 |
|
}, |
|
{ |
|
"epoch": 30.07152145643693, |
|
"eval_accuracy": 0.8640605097561019, |
|
"eval_loss": 0.8203216791152954, |
|
"eval_runtime": 221.1524, |
|
"eval_samples_per_second": 92.398, |
|
"eval_steps_per_second": 2.889, |
|
"step": 92500 |
|
}, |
|
{ |
|
"epoch": 30.234070221066318, |
|
"grad_norm": 2.249537944793701, |
|
"learning_rate": 7.000000000000001e-06, |
|
"loss": 0.5878, |
|
"step": 93000 |
|
}, |
|
{ |
|
"epoch": 30.234070221066318, |
|
"eval_accuracy": 0.8644494113060075, |
|
"eval_loss": 0.8103818893432617, |
|
"eval_runtime": 221.6978, |
|
"eval_samples_per_second": 92.17, |
|
"eval_steps_per_second": 2.882, |
|
"step": 93000 |
|
}, |
|
{ |
|
"epoch": 30.39661898569571, |
|
"grad_norm": 2.63948917388916, |
|
"learning_rate": 6.5000000000000004e-06, |
|
"loss": 0.5831, |
|
"step": 93500 |
|
}, |
|
{ |
|
"epoch": 30.39661898569571, |
|
"eval_accuracy": 0.8646731645828304, |
|
"eval_loss": 0.8149096369743347, |
|
"eval_runtime": 231.6244, |
|
"eval_samples_per_second": 88.22, |
|
"eval_steps_per_second": 2.759, |
|
"step": 93500 |
|
}, |
|
{ |
|
"epoch": 30.5591677503251, |
|
"grad_norm": 2.257843494415283, |
|
"learning_rate": 6e-06, |
|
"loss": 0.5922, |
|
"step": 94000 |
|
}, |
|
{ |
|
"epoch": 30.5591677503251, |
|
"eval_accuracy": 0.8642099290677292, |
|
"eval_loss": 0.8136395215988159, |
|
"eval_runtime": 227.8537, |
|
"eval_samples_per_second": 89.68, |
|
"eval_steps_per_second": 2.804, |
|
"step": 94000 |
|
}, |
|
{ |
|
"epoch": 30.721716514954487, |
|
"grad_norm": 2.54514479637146, |
|
"learning_rate": 5.500000000000001e-06, |
|
"loss": 0.5834, |
|
"step": 94500 |
|
}, |
|
{ |
|
"epoch": 30.721716514954487, |
|
"eval_accuracy": 0.8647608401845014, |
|
"eval_loss": 0.8055408596992493, |
|
"eval_runtime": 222.2677, |
|
"eval_samples_per_second": 91.934, |
|
"eval_steps_per_second": 2.875, |
|
"step": 94500 |
|
}, |
|
{ |
|
"epoch": 30.884265279583875, |
|
"grad_norm": 2.5504415035247803, |
|
"learning_rate": 5e-06, |
|
"loss": 0.5851, |
|
"step": 95000 |
|
}, |
|
{ |
|
"epoch": 30.884265279583875, |
|
"eval_accuracy": 0.8655900321339209, |
|
"eval_loss": 0.8102414011955261, |
|
"eval_runtime": 222.3833, |
|
"eval_samples_per_second": 91.886, |
|
"eval_steps_per_second": 2.873, |
|
"step": 95000 |
|
}, |
|
{ |
|
"epoch": 31.046814044213264, |
|
"grad_norm": 2.5223162174224854, |
|
"learning_rate": 4.5e-06, |
|
"loss": 0.5858, |
|
"step": 95500 |
|
}, |
|
{ |
|
"epoch": 31.046814044213264, |
|
"eval_accuracy": 0.8648693759071118, |
|
"eval_loss": 0.8096651434898376, |
|
"eval_runtime": 222.4149, |
|
"eval_samples_per_second": 91.873, |
|
"eval_steps_per_second": 2.873, |
|
"step": 95500 |
|
}, |
|
{ |
|
"epoch": 31.209362808842652, |
|
"grad_norm": 2.704538583755493, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 0.5854, |
|
"step": 96000 |
|
}, |
|
{ |
|
"epoch": 31.209362808842652, |
|
"eval_accuracy": 0.8643289659912363, |
|
"eval_loss": 0.8139033317565918, |
|
"eval_runtime": 222.3309, |
|
"eval_samples_per_second": 91.908, |
|
"eval_steps_per_second": 2.874, |
|
"step": 96000 |
|
}, |
|
{ |
|
"epoch": 31.37191157347204, |
|
"grad_norm": 2.3286654949188232, |
|
"learning_rate": 3.5000000000000004e-06, |
|
"loss": 0.5809, |
|
"step": 96500 |
|
}, |
|
{ |
|
"epoch": 31.37191157347204, |
|
"eval_accuracy": 0.8655915682628923, |
|
"eval_loss": 0.8057557940483093, |
|
"eval_runtime": 222.7618, |
|
"eval_samples_per_second": 91.73, |
|
"eval_steps_per_second": 2.869, |
|
"step": 96500 |
|
}, |
|
{ |
|
"epoch": 31.53446033810143, |
|
"grad_norm": 2.572178363800049, |
|
"learning_rate": 3e-06, |
|
"loss": 0.5846, |
|
"step": 97000 |
|
}, |
|
{ |
|
"epoch": 31.53446033810143, |
|
"eval_accuracy": 0.8653023190042686, |
|
"eval_loss": 0.8064507842063904, |
|
"eval_runtime": 223.3613, |
|
"eval_samples_per_second": 91.484, |
|
"eval_steps_per_second": 2.861, |
|
"step": 97000 |
|
}, |
|
{ |
|
"epoch": 31.69700910273082, |
|
"grad_norm": 2.4184162616729736, |
|
"learning_rate": 2.5e-06, |
|
"loss": 0.585, |
|
"step": 97500 |
|
}, |
|
{ |
|
"epoch": 31.69700910273082, |
|
"eval_accuracy": 0.8657269498564959, |
|
"eval_loss": 0.7961297631263733, |
|
"eval_runtime": 225.1117, |
|
"eval_samples_per_second": 90.773, |
|
"eval_steps_per_second": 2.839, |
|
"step": 97500 |
|
}, |
|
{ |
|
"epoch": 31.85955786736021, |
|
"grad_norm": 2.121875047683716, |
|
"learning_rate": 2.0000000000000003e-06, |
|
"loss": 0.588, |
|
"step": 98000 |
|
}, |
|
{ |
|
"epoch": 31.85955786736021, |
|
"eval_accuracy": 0.8654759221390312, |
|
"eval_loss": 0.7994445562362671, |
|
"eval_runtime": 222.5184, |
|
"eval_samples_per_second": 91.831, |
|
"eval_steps_per_second": 2.872, |
|
"step": 98000 |
|
}, |
|
{ |
|
"epoch": 32.022106631989594, |
|
"grad_norm": 2.2691831588745117, |
|
"learning_rate": 1.5e-06, |
|
"loss": 0.58, |
|
"step": 98500 |
|
}, |
|
{ |
|
"epoch": 32.022106631989594, |
|
"eval_accuracy": 0.8651082523885062, |
|
"eval_loss": 0.8115944266319275, |
|
"eval_runtime": 222.6443, |
|
"eval_samples_per_second": 91.779, |
|
"eval_steps_per_second": 2.87, |
|
"step": 98500 |
|
}, |
|
{ |
|
"epoch": 32.184655396618986, |
|
"grad_norm": 2.047391891479492, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"loss": 0.5776, |
|
"step": 99000 |
|
}, |
|
{ |
|
"epoch": 32.184655396618986, |
|
"eval_accuracy": 0.8659600453633429, |
|
"eval_loss": 0.8002920150756836, |
|
"eval_runtime": 223.1406, |
|
"eval_samples_per_second": 91.575, |
|
"eval_steps_per_second": 2.864, |
|
"step": 99000 |
|
}, |
|
{ |
|
"epoch": 32.34720416124838, |
|
"grad_norm": 2.4214348793029785, |
|
"learning_rate": 5.000000000000001e-07, |
|
"loss": 0.581, |
|
"step": 99500 |
|
}, |
|
{ |
|
"epoch": 32.34720416124838, |
|
"eval_accuracy": 0.8655375271936185, |
|
"eval_loss": 0.7950243949890137, |
|
"eval_runtime": 240.505, |
|
"eval_samples_per_second": 84.963, |
|
"eval_steps_per_second": 2.657, |
|
"step": 99500 |
|
} |
|
], |
|
"logging_steps": 500, |
|
"max_steps": 100000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 33, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 8.401908931210772e+17, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|