|
{ |
|
"best_metric": 0.014622284099459648, |
|
"best_model_checkpoint": "sentiment-distillation-smollm/checkpoint-1875", |
|
"epoch": 3.0, |
|
"eval_steps": 500, |
|
"global_step": 1875, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.016, |
|
"grad_norm": 9.476157188415527, |
|
"learning_rate": 1.597444089456869e-06, |
|
"loss": 1.074, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.032, |
|
"grad_norm": 10.569486618041992, |
|
"learning_rate": 3.194888178913738e-06, |
|
"loss": 1.0372, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.048, |
|
"grad_norm": 8.480586051940918, |
|
"learning_rate": 4.792332268370607e-06, |
|
"loss": 0.9365, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.064, |
|
"grad_norm": 6.626020908355713, |
|
"learning_rate": 6.389776357827476e-06, |
|
"loss": 0.8842, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 12.755849838256836, |
|
"learning_rate": 7.987220447284345e-06, |
|
"loss": 0.9098, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.096, |
|
"grad_norm": 7.676151275634766, |
|
"learning_rate": 9.584664536741214e-06, |
|
"loss": 0.7348, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.112, |
|
"grad_norm": 5.426048278808594, |
|
"learning_rate": 1.1182108626198083e-05, |
|
"loss": 0.4936, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.128, |
|
"grad_norm": 3.9603002071380615, |
|
"learning_rate": 1.2779552715654951e-05, |
|
"loss": 0.3438, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.144, |
|
"grad_norm": 3.396127700805664, |
|
"learning_rate": 1.4376996805111822e-05, |
|
"loss": 0.1482, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 4.765463829040527, |
|
"learning_rate": 1.597444089456869e-05, |
|
"loss": 0.1855, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.176, |
|
"grad_norm": 0.04306459426879883, |
|
"learning_rate": 1.757188498402556e-05, |
|
"loss": 0.1788, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.192, |
|
"grad_norm": 0.04322272166609764, |
|
"learning_rate": 1.9169329073482428e-05, |
|
"loss": 0.1233, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.208, |
|
"grad_norm": 0.053582437336444855, |
|
"learning_rate": 2.07667731629393e-05, |
|
"loss": 0.0908, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.224, |
|
"grad_norm": 8.254273414611816, |
|
"learning_rate": 2.2364217252396165e-05, |
|
"loss": 0.1362, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.054730091243982315, |
|
"learning_rate": 2.3961661341853036e-05, |
|
"loss": 0.072, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.256, |
|
"grad_norm": 0.0763891264796257, |
|
"learning_rate": 2.5559105431309903e-05, |
|
"loss": 0.1823, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.272, |
|
"grad_norm": 6.117145538330078, |
|
"learning_rate": 2.7156549520766773e-05, |
|
"loss": 0.0963, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.288, |
|
"grad_norm": 1.8248779773712158, |
|
"learning_rate": 2.8753993610223644e-05, |
|
"loss": 0.2741, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.304, |
|
"grad_norm": 0.014617039822041988, |
|
"learning_rate": 3.0351437699680514e-05, |
|
"loss": 0.0691, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 2.316850423812866, |
|
"learning_rate": 3.194888178913738e-05, |
|
"loss": 0.0186, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.336, |
|
"grad_norm": 0.010957750491797924, |
|
"learning_rate": 3.354632587859425e-05, |
|
"loss": 0.1676, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.352, |
|
"grad_norm": 0.02525000460445881, |
|
"learning_rate": 3.514376996805112e-05, |
|
"loss": 0.1689, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.368, |
|
"grad_norm": 0.010558371432125568, |
|
"learning_rate": 3.6741214057507985e-05, |
|
"loss": 0.1759, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.384, |
|
"grad_norm": 0.009851646609604359, |
|
"learning_rate": 3.8338658146964856e-05, |
|
"loss": 0.1253, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.009924142621457577, |
|
"learning_rate": 3.9936102236421726e-05, |
|
"loss": 0.1141, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.416, |
|
"grad_norm": 0.00981380045413971, |
|
"learning_rate": 4.15335463258786e-05, |
|
"loss": 0.1227, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.432, |
|
"grad_norm": 4.685519218444824, |
|
"learning_rate": 4.313099041533547e-05, |
|
"loss": 0.2, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.448, |
|
"grad_norm": 1.1163793802261353, |
|
"learning_rate": 4.472843450479233e-05, |
|
"loss": 0.073, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.464, |
|
"grad_norm": 0.01022783387452364, |
|
"learning_rate": 4.632587859424921e-05, |
|
"loss": 0.1287, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.018457185477018356, |
|
"learning_rate": 4.792332268370607e-05, |
|
"loss": 0.0794, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.496, |
|
"grad_norm": 10.666505813598633, |
|
"learning_rate": 4.952076677316294e-05, |
|
"loss": 0.17, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.512, |
|
"grad_norm": 2.132030725479126, |
|
"learning_rate": 4.987553342816501e-05, |
|
"loss": 0.0952, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.528, |
|
"grad_norm": 11.730767250061035, |
|
"learning_rate": 4.969772403982931e-05, |
|
"loss": 0.1995, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.544, |
|
"grad_norm": 0.01148867979645729, |
|
"learning_rate": 4.95199146514936e-05, |
|
"loss": 0.1019, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.047348447144031525, |
|
"learning_rate": 4.9342105263157894e-05, |
|
"loss": 0.0706, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.576, |
|
"grad_norm": 0.010151210241019726, |
|
"learning_rate": 4.916429587482219e-05, |
|
"loss": 0.0878, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.592, |
|
"grad_norm": 0.4587936997413635, |
|
"learning_rate": 4.8986486486486486e-05, |
|
"loss": 0.0864, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.608, |
|
"grad_norm": 0.1231539249420166, |
|
"learning_rate": 4.8808677098150786e-05, |
|
"loss": 0.0096, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.624, |
|
"grad_norm": 0.005615161266177893, |
|
"learning_rate": 4.863086770981508e-05, |
|
"loss": 0.1351, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.02301601506769657, |
|
"learning_rate": 4.845305832147938e-05, |
|
"loss": 0.0363, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.656, |
|
"grad_norm": 3.3105900287628174, |
|
"learning_rate": 4.827524893314367e-05, |
|
"loss": 0.0764, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.672, |
|
"grad_norm": 0.0835251584649086, |
|
"learning_rate": 4.809743954480797e-05, |
|
"loss": 0.0238, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.688, |
|
"grad_norm": 0.0839819684624672, |
|
"learning_rate": 4.7919630156472264e-05, |
|
"loss": 0.1182, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.704, |
|
"grad_norm": 0.0160073135048151, |
|
"learning_rate": 4.774182076813656e-05, |
|
"loss": 0.1647, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.09903815388679504, |
|
"learning_rate": 4.756401137980086e-05, |
|
"loss": 0.138, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.736, |
|
"grad_norm": 8.14660930633545, |
|
"learning_rate": 4.738620199146515e-05, |
|
"loss": 0.0731, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.752, |
|
"grad_norm": 0.07308264076709747, |
|
"learning_rate": 4.720839260312945e-05, |
|
"loss": 0.017, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.768, |
|
"grad_norm": 0.8220946788787842, |
|
"learning_rate": 4.703058321479374e-05, |
|
"loss": 0.0299, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.784, |
|
"grad_norm": 0.007219772785902023, |
|
"learning_rate": 4.685277382645804e-05, |
|
"loss": 0.0162, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.009510455653071404, |
|
"learning_rate": 4.6674964438122335e-05, |
|
"loss": 0.0011, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.816, |
|
"grad_norm": 9.018925666809082, |
|
"learning_rate": 4.6497155049786634e-05, |
|
"loss": 0.1081, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.832, |
|
"grad_norm": 0.008953461423516273, |
|
"learning_rate": 4.631934566145093e-05, |
|
"loss": 0.0494, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.848, |
|
"grad_norm": 10.596187591552734, |
|
"learning_rate": 4.614153627311522e-05, |
|
"loss": 0.0158, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.864, |
|
"grad_norm": 2.3671252727508545, |
|
"learning_rate": 4.596372688477952e-05, |
|
"loss": 0.0816, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.002445698482915759, |
|
"learning_rate": 4.578591749644381e-05, |
|
"loss": 0.0042, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.896, |
|
"grad_norm": 11.779576301574707, |
|
"learning_rate": 4.560810810810811e-05, |
|
"loss": 0.1067, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.912, |
|
"grad_norm": 7.589051246643066, |
|
"learning_rate": 4.5430298719772405e-05, |
|
"loss": 0.0853, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.928, |
|
"grad_norm": 0.11888572573661804, |
|
"learning_rate": 4.5252489331436705e-05, |
|
"loss": 0.0317, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.944, |
|
"grad_norm": 0.07746365666389465, |
|
"learning_rate": 4.5074679943101e-05, |
|
"loss": 0.2303, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.004527238663285971, |
|
"learning_rate": 4.489687055476529e-05, |
|
"loss": 0.0665, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.976, |
|
"grad_norm": 4.120000839233398, |
|
"learning_rate": 4.471906116642959e-05, |
|
"loss": 0.0546, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.992, |
|
"grad_norm": 0.0016751989023759961, |
|
"learning_rate": 4.4541251778093884e-05, |
|
"loss": 0.0051, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_accuracy": 0.9939879759519038, |
|
"eval_f1_macro": 0.9861065168559803, |
|
"eval_f1_micro": 0.9939879759519038, |
|
"eval_f1_weighted": 0.9939373007372426, |
|
"eval_loss": 0.015998151153326035, |
|
"eval_precision_macro": 0.9959514170040485, |
|
"eval_precision_micro": 0.9939879759519038, |
|
"eval_precision_weighted": 0.994060996486901, |
|
"eval_recall_macro": 0.9770065284178188, |
|
"eval_recall_micro": 0.9939879759519038, |
|
"eval_recall_weighted": 0.9939879759519038, |
|
"eval_runtime": 10.7994, |
|
"eval_samples_per_second": 46.206, |
|
"eval_steps_per_second": 2.963, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 1.008, |
|
"grad_norm": 4.363436222076416, |
|
"learning_rate": 4.436344238975818e-05, |
|
"loss": 0.0119, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 1.024, |
|
"grad_norm": 0.002567918971180916, |
|
"learning_rate": 4.4185633001422476e-05, |
|
"loss": 0.0736, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"grad_norm": 0.00732471002265811, |
|
"learning_rate": 4.4007823613086776e-05, |
|
"loss": 0.0001, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 1.056, |
|
"grad_norm": 0.002863505156710744, |
|
"learning_rate": 4.383001422475107e-05, |
|
"loss": 0.0608, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 1.072, |
|
"grad_norm": 0.008520668372511864, |
|
"learning_rate": 4.365220483641537e-05, |
|
"loss": 0.0125, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 1.088, |
|
"grad_norm": 0.001538406009785831, |
|
"learning_rate": 4.347439544807966e-05, |
|
"loss": 0.0653, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 1.104, |
|
"grad_norm": 0.001671103062108159, |
|
"learning_rate": 4.3296586059743954e-05, |
|
"loss": 0.081, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"grad_norm": 0.20056261122226715, |
|
"learning_rate": 4.3118776671408254e-05, |
|
"loss": 0.0057, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.1360000000000001, |
|
"grad_norm": 0.008405996486544609, |
|
"learning_rate": 4.294096728307255e-05, |
|
"loss": 0.0091, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 1.152, |
|
"grad_norm": 0.19218170642852783, |
|
"learning_rate": 4.2763157894736847e-05, |
|
"loss": 0.0019, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 1.168, |
|
"grad_norm": 0.07524432241916656, |
|
"learning_rate": 4.258534850640114e-05, |
|
"loss": 0.0952, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 1.184, |
|
"grad_norm": 0.0015570322284474969, |
|
"learning_rate": 4.240753911806544e-05, |
|
"loss": 0.0435, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 0.0036333040334284306, |
|
"learning_rate": 4.222972972972973e-05, |
|
"loss": 0.0207, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 1.216, |
|
"grad_norm": 9.78964900970459, |
|
"learning_rate": 4.205192034139403e-05, |
|
"loss": 0.1004, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 1.232, |
|
"grad_norm": 0.0014888375299051404, |
|
"learning_rate": 4.187411095305832e-05, |
|
"loss": 0.0671, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 1.248, |
|
"grad_norm": 0.023692140355706215, |
|
"learning_rate": 4.169630156472262e-05, |
|
"loss": 0.0026, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 1.264, |
|
"grad_norm": 0.004593148361891508, |
|
"learning_rate": 4.151849217638692e-05, |
|
"loss": 0.0423, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"grad_norm": 0.003923716489225626, |
|
"learning_rate": 4.134068278805121e-05, |
|
"loss": 0.0236, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.296, |
|
"grad_norm": 0.002044517546892166, |
|
"learning_rate": 4.116287339971551e-05, |
|
"loss": 0.005, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 1.312, |
|
"grad_norm": 0.022982032969594002, |
|
"learning_rate": 4.09850640113798e-05, |
|
"loss": 0.0697, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 1.328, |
|
"grad_norm": 0.23058412969112396, |
|
"learning_rate": 4.08072546230441e-05, |
|
"loss": 0.1222, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 1.3439999999999999, |
|
"grad_norm": 0.0017761716153472662, |
|
"learning_rate": 4.0629445234708395e-05, |
|
"loss": 0.0022, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 1.3599999999999999, |
|
"grad_norm": 0.12850730121135712, |
|
"learning_rate": 4.0451635846372695e-05, |
|
"loss": 0.0189, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 1.376, |
|
"grad_norm": 0.0014099746476858854, |
|
"learning_rate": 4.027382645803698e-05, |
|
"loss": 0.0001, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 1.392, |
|
"grad_norm": 10.44194507598877, |
|
"learning_rate": 4.009601706970128e-05, |
|
"loss": 0.1423, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 1.408, |
|
"grad_norm": 5.446048736572266, |
|
"learning_rate": 3.9918207681365574e-05, |
|
"loss": 0.1013, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 1.424, |
|
"grad_norm": 0.017437923699617386, |
|
"learning_rate": 3.9740398293029873e-05, |
|
"loss": 0.0002, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"grad_norm": 0.007926377467811108, |
|
"learning_rate": 3.956258890469417e-05, |
|
"loss": 0.0661, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.456, |
|
"grad_norm": 0.0029465279076248407, |
|
"learning_rate": 3.9384779516358466e-05, |
|
"loss": 0.0338, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 1.472, |
|
"grad_norm": 0.0010107713751494884, |
|
"learning_rate": 3.9206970128022766e-05, |
|
"loss": 0.0171, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 1.488, |
|
"grad_norm": 0.036259789019823074, |
|
"learning_rate": 3.902916073968706e-05, |
|
"loss": 0.0004, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 1.504, |
|
"grad_norm": 0.7286566495895386, |
|
"learning_rate": 3.885135135135135e-05, |
|
"loss": 0.014, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 0.008453834801912308, |
|
"learning_rate": 3.8673541963015645e-05, |
|
"loss": 0.0002, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 1.536, |
|
"grad_norm": 0.1271582841873169, |
|
"learning_rate": 3.8495732574679944e-05, |
|
"loss": 0.0043, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 1.552, |
|
"grad_norm": 6.197739124298096, |
|
"learning_rate": 3.831792318634424e-05, |
|
"loss": 0.0075, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 1.568, |
|
"grad_norm": 0.008371386677026749, |
|
"learning_rate": 3.814011379800854e-05, |
|
"loss": 0.0299, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 1.584, |
|
"grad_norm": 0.0011657042196020484, |
|
"learning_rate": 3.796230440967283e-05, |
|
"loss": 0.1188, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 0.010952652432024479, |
|
"learning_rate": 3.778449502133713e-05, |
|
"loss": 0.0116, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.616, |
|
"grad_norm": 0.0010061347857117653, |
|
"learning_rate": 3.760668563300143e-05, |
|
"loss": 0.0043, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 1.6320000000000001, |
|
"grad_norm": 0.06395132839679718, |
|
"learning_rate": 3.742887624466572e-05, |
|
"loss": 0.0445, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 1.6480000000000001, |
|
"grad_norm": 0.001108819618821144, |
|
"learning_rate": 3.7251066856330015e-05, |
|
"loss": 0.0323, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 1.6640000000000001, |
|
"grad_norm": 0.0014280881732702255, |
|
"learning_rate": 3.707325746799431e-05, |
|
"loss": 0.0008, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 1.6800000000000002, |
|
"grad_norm": 0.0012123563792556524, |
|
"learning_rate": 3.689544807965861e-05, |
|
"loss": 0.0014, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 1.696, |
|
"grad_norm": 0.0011878483928740025, |
|
"learning_rate": 3.67176386913229e-05, |
|
"loss": 0.0524, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 1.712, |
|
"grad_norm": 0.0037567925173789263, |
|
"learning_rate": 3.65398293029872e-05, |
|
"loss": 0.0237, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 1.728, |
|
"grad_norm": 0.002096337964758277, |
|
"learning_rate": 3.636201991465149e-05, |
|
"loss": 0.0334, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 1.744, |
|
"grad_norm": 8.111098289489746, |
|
"learning_rate": 3.618421052631579e-05, |
|
"loss": 0.1467, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"grad_norm": 0.0023632964584976435, |
|
"learning_rate": 3.600640113798009e-05, |
|
"loss": 0.0003, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 1.776, |
|
"grad_norm": 9.071378707885742, |
|
"learning_rate": 3.5828591749644385e-05, |
|
"loss": 0.0176, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 1.792, |
|
"grad_norm": 0.08811729401350021, |
|
"learning_rate": 3.565078236130868e-05, |
|
"loss": 0.1454, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 1.808, |
|
"grad_norm": 0.0101834237575531, |
|
"learning_rate": 3.547297297297297e-05, |
|
"loss": 0.0172, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 1.8239999999999998, |
|
"grad_norm": 0.1676841378211975, |
|
"learning_rate": 3.529516358463727e-05, |
|
"loss": 0.049, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 1.8399999999999999, |
|
"grad_norm": 0.043762240558862686, |
|
"learning_rate": 3.5117354196301564e-05, |
|
"loss": 0.0432, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 1.8559999999999999, |
|
"grad_norm": 0.011058449745178223, |
|
"learning_rate": 3.4939544807965863e-05, |
|
"loss": 0.001, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 1.8719999999999999, |
|
"grad_norm": 0.0018902173032984138, |
|
"learning_rate": 3.4761735419630156e-05, |
|
"loss": 0.0399, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 1.888, |
|
"grad_norm": 0.001800389145500958, |
|
"learning_rate": 3.4583926031294456e-05, |
|
"loss": 0.0184, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 1.904, |
|
"grad_norm": 0.06505569815635681, |
|
"learning_rate": 3.440611664295875e-05, |
|
"loss": 0.0073, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"grad_norm": 0.5011488199234009, |
|
"learning_rate": 3.422830725462304e-05, |
|
"loss": 0.0007, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 1.936, |
|
"grad_norm": 0.013254979625344276, |
|
"learning_rate": 3.405049786628734e-05, |
|
"loss": 0.0059, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 1.952, |
|
"grad_norm": 0.7582728266716003, |
|
"learning_rate": 3.3872688477951634e-05, |
|
"loss": 0.0013, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 1.968, |
|
"grad_norm": 0.009909105487167835, |
|
"learning_rate": 3.3694879089615934e-05, |
|
"loss": 0.0002, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 1.984, |
|
"grad_norm": 0.000718823226634413, |
|
"learning_rate": 3.351706970128023e-05, |
|
"loss": 0.0047, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.6642739772796631, |
|
"learning_rate": 3.333926031294453e-05, |
|
"loss": 0.0015, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_accuracy": 0.9919839679358717, |
|
"eval_f1_macro": 0.9810672282141238, |
|
"eval_f1_micro": 0.9919839679358717, |
|
"eval_f1_weighted": 0.992107547860045, |
|
"eval_loss": 0.019453825429081917, |
|
"eval_precision_macro": 0.9692164931816348, |
|
"eval_precision_micro": 0.9919839679358717, |
|
"eval_precision_weighted": 0.9925061936994298, |
|
"eval_recall_macro": 0.9942915690866512, |
|
"eval_recall_micro": 0.9919839679358717, |
|
"eval_recall_weighted": 0.9919839679358717, |
|
"eval_runtime": 10.636, |
|
"eval_samples_per_second": 46.916, |
|
"eval_steps_per_second": 3.009, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 2.016, |
|
"grad_norm": 0.005395730957388878, |
|
"learning_rate": 3.316145092460882e-05, |
|
"loss": 0.0001, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 2.032, |
|
"grad_norm": 0.0008304046932607889, |
|
"learning_rate": 3.298364153627312e-05, |
|
"loss": 0.0001, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 2.048, |
|
"grad_norm": 0.0007791437674313784, |
|
"learning_rate": 3.280583214793741e-05, |
|
"loss": 0.0001, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 2.064, |
|
"grad_norm": 0.0008820474613457918, |
|
"learning_rate": 3.2628022759601705e-05, |
|
"loss": 0.0001, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 2.08, |
|
"grad_norm": 0.000594152370467782, |
|
"learning_rate": 3.2450213371266005e-05, |
|
"loss": 0.0002, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 2.096, |
|
"grad_norm": 0.0006243674433790147, |
|
"learning_rate": 3.22724039829303e-05, |
|
"loss": 0.0022, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 2.112, |
|
"grad_norm": 0.0007709413184784353, |
|
"learning_rate": 3.20945945945946e-05, |
|
"loss": 0.0154, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 2.128, |
|
"grad_norm": 0.004626740701496601, |
|
"learning_rate": 3.191678520625889e-05, |
|
"loss": 0.0012, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 2.144, |
|
"grad_norm": 0.002739989897236228, |
|
"learning_rate": 3.173897581792319e-05, |
|
"loss": 0.0004, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 2.16, |
|
"grad_norm": 0.007507917005568743, |
|
"learning_rate": 3.156116642958748e-05, |
|
"loss": 0.0001, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 2.176, |
|
"grad_norm": 0.004014975391328335, |
|
"learning_rate": 3.138335704125178e-05, |
|
"loss": 0.001, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 2.192, |
|
"grad_norm": 0.000455196452094242, |
|
"learning_rate": 3.1205547652916076e-05, |
|
"loss": 0.0005, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 2.208, |
|
"grad_norm": 0.00030664558289572597, |
|
"learning_rate": 3.102773826458037e-05, |
|
"loss": 0.0265, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 2.224, |
|
"grad_norm": 0.00043227567221038043, |
|
"learning_rate": 3.084992887624467e-05, |
|
"loss": 0.0001, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 2.24, |
|
"grad_norm": 0.0481143593788147, |
|
"learning_rate": 3.067211948790896e-05, |
|
"loss": 0.0023, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 2.2560000000000002, |
|
"grad_norm": 0.005968974903225899, |
|
"learning_rate": 3.0494310099573257e-05, |
|
"loss": 0.0025, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 2.2720000000000002, |
|
"grad_norm": 0.21466164290905, |
|
"learning_rate": 3.0316500711237557e-05, |
|
"loss": 0.0003, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 2.288, |
|
"grad_norm": 0.0018357799854129553, |
|
"learning_rate": 3.0138691322901853e-05, |
|
"loss": 0.0001, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 2.304, |
|
"grad_norm": 0.0004517412162385881, |
|
"learning_rate": 2.996088193456615e-05, |
|
"loss": 0.0001, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 2.32, |
|
"grad_norm": 0.005384071730077267, |
|
"learning_rate": 2.9783072546230446e-05, |
|
"loss": 0.002, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 2.336, |
|
"grad_norm": 0.00046941509936004877, |
|
"learning_rate": 2.9605263157894735e-05, |
|
"loss": 0.0001, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 2.352, |
|
"grad_norm": 0.011709867045283318, |
|
"learning_rate": 2.9427453769559032e-05, |
|
"loss": 0.0001, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 2.368, |
|
"grad_norm": 0.03616934269666672, |
|
"learning_rate": 2.9249644381223328e-05, |
|
"loss": 0.0008, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 2.384, |
|
"grad_norm": 0.0006385542219504714, |
|
"learning_rate": 2.9071834992887624e-05, |
|
"loss": 0.0001, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 2.4, |
|
"grad_norm": 1.0423965454101562, |
|
"learning_rate": 2.889402560455192e-05, |
|
"loss": 0.0021, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 2.416, |
|
"grad_norm": 0.21064622700214386, |
|
"learning_rate": 2.8716216216216217e-05, |
|
"loss": 0.0003, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 2.432, |
|
"grad_norm": 0.0016910170670598745, |
|
"learning_rate": 2.8538406827880517e-05, |
|
"loss": 0.0001, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 2.448, |
|
"grad_norm": 0.0004065225657541305, |
|
"learning_rate": 2.8360597439544813e-05, |
|
"loss": 0.0004, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 2.464, |
|
"grad_norm": 0.0006998078897595406, |
|
"learning_rate": 2.8182788051209103e-05, |
|
"loss": 0.0001, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 2.48, |
|
"grad_norm": 0.0057923863641917706, |
|
"learning_rate": 2.80049786628734e-05, |
|
"loss": 0.0012, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 2.496, |
|
"grad_norm": 0.00030601295293308794, |
|
"learning_rate": 2.7827169274537695e-05, |
|
"loss": 0.0236, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 2.512, |
|
"grad_norm": 0.00045617681462317705, |
|
"learning_rate": 2.764935988620199e-05, |
|
"loss": 0.0001, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 2.528, |
|
"grad_norm": 0.00044945545960217714, |
|
"learning_rate": 2.7471550497866288e-05, |
|
"loss": 0.0001, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 2.544, |
|
"grad_norm": 0.0003521046892274171, |
|
"learning_rate": 2.7293741109530584e-05, |
|
"loss": 0.0263, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 2.56, |
|
"grad_norm": 0.008946732617914677, |
|
"learning_rate": 2.711593172119488e-05, |
|
"loss": 0.0006, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 2.576, |
|
"grad_norm": 0.09531024098396301, |
|
"learning_rate": 2.6938122332859177e-05, |
|
"loss": 0.0003, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 2.592, |
|
"grad_norm": 0.0010131685994565487, |
|
"learning_rate": 2.6760312944523473e-05, |
|
"loss": 0.0001, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 2.608, |
|
"grad_norm": 0.0016723967855796218, |
|
"learning_rate": 2.6582503556187766e-05, |
|
"loss": 0.0018, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 2.624, |
|
"grad_norm": 0.008407480083405972, |
|
"learning_rate": 2.6404694167852062e-05, |
|
"loss": 0.0344, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 2.64, |
|
"grad_norm": 0.001671052654273808, |
|
"learning_rate": 2.622688477951636e-05, |
|
"loss": 0.0005, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 2.656, |
|
"grad_norm": 0.0003514339041430503, |
|
"learning_rate": 2.6049075391180655e-05, |
|
"loss": 0.0001, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 2.672, |
|
"grad_norm": 0.00044292688835412264, |
|
"learning_rate": 2.587126600284495e-05, |
|
"loss": 0.0017, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 2.6879999999999997, |
|
"grad_norm": 0.00033179231104440987, |
|
"learning_rate": 2.5693456614509247e-05, |
|
"loss": 0.0243, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 2.7039999999999997, |
|
"grad_norm": 0.0023233199026435614, |
|
"learning_rate": 2.5515647226173544e-05, |
|
"loss": 0.0001, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 2.7199999999999998, |
|
"grad_norm": 0.0012358427047729492, |
|
"learning_rate": 2.533783783783784e-05, |
|
"loss": 0.0002, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 2.7359999999999998, |
|
"grad_norm": 0.00035215960815548897, |
|
"learning_rate": 2.5160028449502136e-05, |
|
"loss": 0.0235, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 2.752, |
|
"grad_norm": 0.24424146115779877, |
|
"learning_rate": 2.4982219061166433e-05, |
|
"loss": 0.0005, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 2.768, |
|
"grad_norm": 2.014695167541504, |
|
"learning_rate": 2.480440967283073e-05, |
|
"loss": 0.0111, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 2.784, |
|
"grad_norm": 0.00039951372309587896, |
|
"learning_rate": 2.4626600284495022e-05, |
|
"loss": 0.0004, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"grad_norm": 0.0016597098438069224, |
|
"learning_rate": 2.4448790896159318e-05, |
|
"loss": 0.0002, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 2.816, |
|
"grad_norm": 0.010242385789752007, |
|
"learning_rate": 2.4270981507823614e-05, |
|
"loss": 0.0001, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 2.832, |
|
"grad_norm": 0.0023806928656995296, |
|
"learning_rate": 2.409317211948791e-05, |
|
"loss": 0.0001, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 2.848, |
|
"grad_norm": 0.01970355026423931, |
|
"learning_rate": 2.3915362731152204e-05, |
|
"loss": 0.0183, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 2.864, |
|
"grad_norm": 0.0002463227428961545, |
|
"learning_rate": 2.37375533428165e-05, |
|
"loss": 0.0, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 2.88, |
|
"grad_norm": 0.000822290952783078, |
|
"learning_rate": 2.35597439544808e-05, |
|
"loss": 0.0001, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 2.896, |
|
"grad_norm": 0.0026505696587264538, |
|
"learning_rate": 2.3381934566145096e-05, |
|
"loss": 0.0004, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 2.912, |
|
"grad_norm": 0.0005039023817516863, |
|
"learning_rate": 2.320412517780939e-05, |
|
"loss": 0.0001, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 2.928, |
|
"grad_norm": 0.009028232656419277, |
|
"learning_rate": 2.3026315789473685e-05, |
|
"loss": 0.0001, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 2.944, |
|
"grad_norm": 0.002705842722207308, |
|
"learning_rate": 2.284850640113798e-05, |
|
"loss": 0.0003, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 2.96, |
|
"grad_norm": 0.00383372837677598, |
|
"learning_rate": 2.2670697012802278e-05, |
|
"loss": 0.0017, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 2.976, |
|
"grad_norm": 0.00024894202942959964, |
|
"learning_rate": 2.2492887624466574e-05, |
|
"loss": 0.0002, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 2.992, |
|
"grad_norm": 0.0009176091407425702, |
|
"learning_rate": 2.2315078236130867e-05, |
|
"loss": 0.0, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_accuracy": 0.9939879759519038, |
|
"eval_f1_macro": 0.9910040909369707, |
|
"eval_f1_micro": 0.9939879759519038, |
|
"eval_f1_weighted": 0.9939809297459037, |
|
"eval_loss": 0.014622284099459648, |
|
"eval_precision_macro": 0.9959514170040485, |
|
"eval_precision_micro": 0.9939879759519038, |
|
"eval_precision_weighted": 0.994060996486901, |
|
"eval_recall_macro": 0.9862711213517666, |
|
"eval_recall_micro": 0.9939879759519038, |
|
"eval_recall_weighted": 0.9939879759519038, |
|
"eval_runtime": 10.5217, |
|
"eval_samples_per_second": 47.426, |
|
"eval_steps_per_second": 3.041, |
|
"step": 1875 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 3125, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 5, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 5, |
|
"early_stopping_threshold": 0.01 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 2 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2446967439360000.0, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|