{ "best_metric": 0.014622284099459648, "best_model_checkpoint": "sentiment-distillation-smollm/checkpoint-1875", "epoch": 3.0, "eval_steps": 500, "global_step": 1875, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.016, "grad_norm": 9.476157188415527, "learning_rate": 1.597444089456869e-06, "loss": 1.074, "step": 10 }, { "epoch": 0.032, "grad_norm": 10.569486618041992, "learning_rate": 3.194888178913738e-06, "loss": 1.0372, "step": 20 }, { "epoch": 0.048, "grad_norm": 8.480586051940918, "learning_rate": 4.792332268370607e-06, "loss": 0.9365, "step": 30 }, { "epoch": 0.064, "grad_norm": 6.626020908355713, "learning_rate": 6.389776357827476e-06, "loss": 0.8842, "step": 40 }, { "epoch": 0.08, "grad_norm": 12.755849838256836, "learning_rate": 7.987220447284345e-06, "loss": 0.9098, "step": 50 }, { "epoch": 0.096, "grad_norm": 7.676151275634766, "learning_rate": 9.584664536741214e-06, "loss": 0.7348, "step": 60 }, { "epoch": 0.112, "grad_norm": 5.426048278808594, "learning_rate": 1.1182108626198083e-05, "loss": 0.4936, "step": 70 }, { "epoch": 0.128, "grad_norm": 3.9603002071380615, "learning_rate": 1.2779552715654951e-05, "loss": 0.3438, "step": 80 }, { "epoch": 0.144, "grad_norm": 3.396127700805664, "learning_rate": 1.4376996805111822e-05, "loss": 0.1482, "step": 90 }, { "epoch": 0.16, "grad_norm": 4.765463829040527, "learning_rate": 1.597444089456869e-05, "loss": 0.1855, "step": 100 }, { "epoch": 0.176, "grad_norm": 0.04306459426879883, "learning_rate": 1.757188498402556e-05, "loss": 0.1788, "step": 110 }, { "epoch": 0.192, "grad_norm": 0.04322272166609764, "learning_rate": 1.9169329073482428e-05, "loss": 0.1233, "step": 120 }, { "epoch": 0.208, "grad_norm": 0.053582437336444855, "learning_rate": 2.07667731629393e-05, "loss": 0.0908, "step": 130 }, { "epoch": 0.224, "grad_norm": 8.254273414611816, "learning_rate": 2.2364217252396165e-05, "loss": 0.1362, "step": 140 }, { "epoch": 0.24, "grad_norm": 0.054730091243982315, "learning_rate": 2.3961661341853036e-05, "loss": 0.072, "step": 150 }, { "epoch": 0.256, "grad_norm": 0.0763891264796257, "learning_rate": 2.5559105431309903e-05, "loss": 0.1823, "step": 160 }, { "epoch": 0.272, "grad_norm": 6.117145538330078, "learning_rate": 2.7156549520766773e-05, "loss": 0.0963, "step": 170 }, { "epoch": 0.288, "grad_norm": 1.8248779773712158, "learning_rate": 2.8753993610223644e-05, "loss": 0.2741, "step": 180 }, { "epoch": 0.304, "grad_norm": 0.014617039822041988, "learning_rate": 3.0351437699680514e-05, "loss": 0.0691, "step": 190 }, { "epoch": 0.32, "grad_norm": 2.316850423812866, "learning_rate": 3.194888178913738e-05, "loss": 0.0186, "step": 200 }, { "epoch": 0.336, "grad_norm": 0.010957750491797924, "learning_rate": 3.354632587859425e-05, "loss": 0.1676, "step": 210 }, { "epoch": 0.352, "grad_norm": 0.02525000460445881, "learning_rate": 3.514376996805112e-05, "loss": 0.1689, "step": 220 }, { "epoch": 0.368, "grad_norm": 0.010558371432125568, "learning_rate": 3.6741214057507985e-05, "loss": 0.1759, "step": 230 }, { "epoch": 0.384, "grad_norm": 0.009851646609604359, "learning_rate": 3.8338658146964856e-05, "loss": 0.1253, "step": 240 }, { "epoch": 0.4, "grad_norm": 0.009924142621457577, "learning_rate": 3.9936102236421726e-05, "loss": 0.1141, "step": 250 }, { "epoch": 0.416, "grad_norm": 0.00981380045413971, "learning_rate": 4.15335463258786e-05, "loss": 0.1227, "step": 260 }, { "epoch": 0.432, "grad_norm": 4.685519218444824, "learning_rate": 4.313099041533547e-05, "loss": 0.2, "step": 270 }, { "epoch": 0.448, "grad_norm": 1.1163793802261353, "learning_rate": 4.472843450479233e-05, "loss": 0.073, "step": 280 }, { "epoch": 0.464, "grad_norm": 0.01022783387452364, "learning_rate": 4.632587859424921e-05, "loss": 0.1287, "step": 290 }, { "epoch": 0.48, "grad_norm": 0.018457185477018356, "learning_rate": 4.792332268370607e-05, "loss": 0.0794, "step": 300 }, { "epoch": 0.496, "grad_norm": 10.666505813598633, "learning_rate": 4.952076677316294e-05, "loss": 0.17, "step": 310 }, { "epoch": 0.512, "grad_norm": 2.132030725479126, "learning_rate": 4.987553342816501e-05, "loss": 0.0952, "step": 320 }, { "epoch": 0.528, "grad_norm": 11.730767250061035, "learning_rate": 4.969772403982931e-05, "loss": 0.1995, "step": 330 }, { "epoch": 0.544, "grad_norm": 0.01148867979645729, "learning_rate": 4.95199146514936e-05, "loss": 0.1019, "step": 340 }, { "epoch": 0.56, "grad_norm": 0.047348447144031525, "learning_rate": 4.9342105263157894e-05, "loss": 0.0706, "step": 350 }, { "epoch": 0.576, "grad_norm": 0.010151210241019726, "learning_rate": 4.916429587482219e-05, "loss": 0.0878, "step": 360 }, { "epoch": 0.592, "grad_norm": 0.4587936997413635, "learning_rate": 4.8986486486486486e-05, "loss": 0.0864, "step": 370 }, { "epoch": 0.608, "grad_norm": 0.1231539249420166, "learning_rate": 4.8808677098150786e-05, "loss": 0.0096, "step": 380 }, { "epoch": 0.624, "grad_norm": 0.005615161266177893, "learning_rate": 4.863086770981508e-05, "loss": 0.1351, "step": 390 }, { "epoch": 0.64, "grad_norm": 0.02301601506769657, "learning_rate": 4.845305832147938e-05, "loss": 0.0363, "step": 400 }, { "epoch": 0.656, "grad_norm": 3.3105900287628174, "learning_rate": 4.827524893314367e-05, "loss": 0.0764, "step": 410 }, { "epoch": 0.672, "grad_norm": 0.0835251584649086, "learning_rate": 4.809743954480797e-05, "loss": 0.0238, "step": 420 }, { "epoch": 0.688, "grad_norm": 0.0839819684624672, "learning_rate": 4.7919630156472264e-05, "loss": 0.1182, "step": 430 }, { "epoch": 0.704, "grad_norm": 0.0160073135048151, "learning_rate": 4.774182076813656e-05, "loss": 0.1647, "step": 440 }, { "epoch": 0.72, "grad_norm": 0.09903815388679504, "learning_rate": 4.756401137980086e-05, "loss": 0.138, "step": 450 }, { "epoch": 0.736, "grad_norm": 8.14660930633545, "learning_rate": 4.738620199146515e-05, "loss": 0.0731, "step": 460 }, { "epoch": 0.752, "grad_norm": 0.07308264076709747, "learning_rate": 4.720839260312945e-05, "loss": 0.017, "step": 470 }, { "epoch": 0.768, "grad_norm": 0.8220946788787842, "learning_rate": 4.703058321479374e-05, "loss": 0.0299, "step": 480 }, { "epoch": 0.784, "grad_norm": 0.007219772785902023, "learning_rate": 4.685277382645804e-05, "loss": 0.0162, "step": 490 }, { "epoch": 0.8, "grad_norm": 0.009510455653071404, "learning_rate": 4.6674964438122335e-05, "loss": 0.0011, "step": 500 }, { "epoch": 0.816, "grad_norm": 9.018925666809082, "learning_rate": 4.6497155049786634e-05, "loss": 0.1081, "step": 510 }, { "epoch": 0.832, "grad_norm": 0.008953461423516273, "learning_rate": 4.631934566145093e-05, "loss": 0.0494, "step": 520 }, { "epoch": 0.848, "grad_norm": 10.596187591552734, "learning_rate": 4.614153627311522e-05, "loss": 0.0158, "step": 530 }, { "epoch": 0.864, "grad_norm": 2.3671252727508545, "learning_rate": 4.596372688477952e-05, "loss": 0.0816, "step": 540 }, { "epoch": 0.88, "grad_norm": 0.002445698482915759, "learning_rate": 4.578591749644381e-05, "loss": 0.0042, "step": 550 }, { "epoch": 0.896, "grad_norm": 11.779576301574707, "learning_rate": 4.560810810810811e-05, "loss": 0.1067, "step": 560 }, { "epoch": 0.912, "grad_norm": 7.589051246643066, "learning_rate": 4.5430298719772405e-05, "loss": 0.0853, "step": 570 }, { "epoch": 0.928, "grad_norm": 0.11888572573661804, "learning_rate": 4.5252489331436705e-05, "loss": 0.0317, "step": 580 }, { "epoch": 0.944, "grad_norm": 0.07746365666389465, "learning_rate": 4.5074679943101e-05, "loss": 0.2303, "step": 590 }, { "epoch": 0.96, "grad_norm": 0.004527238663285971, "learning_rate": 4.489687055476529e-05, "loss": 0.0665, "step": 600 }, { "epoch": 0.976, "grad_norm": 4.120000839233398, "learning_rate": 4.471906116642959e-05, "loss": 0.0546, "step": 610 }, { "epoch": 0.992, "grad_norm": 0.0016751989023759961, "learning_rate": 4.4541251778093884e-05, "loss": 0.0051, "step": 620 }, { "epoch": 1.0, "eval_accuracy": 0.9939879759519038, "eval_f1_macro": 0.9861065168559803, "eval_f1_micro": 0.9939879759519038, "eval_f1_weighted": 0.9939373007372426, "eval_loss": 0.015998151153326035, "eval_precision_macro": 0.9959514170040485, "eval_precision_micro": 0.9939879759519038, "eval_precision_weighted": 0.994060996486901, "eval_recall_macro": 0.9770065284178188, "eval_recall_micro": 0.9939879759519038, "eval_recall_weighted": 0.9939879759519038, "eval_runtime": 10.7994, "eval_samples_per_second": 46.206, "eval_steps_per_second": 2.963, "step": 625 }, { "epoch": 1.008, "grad_norm": 4.363436222076416, "learning_rate": 4.436344238975818e-05, "loss": 0.0119, "step": 630 }, { "epoch": 1.024, "grad_norm": 0.002567918971180916, "learning_rate": 4.4185633001422476e-05, "loss": 0.0736, "step": 640 }, { "epoch": 1.04, "grad_norm": 0.00732471002265811, "learning_rate": 4.4007823613086776e-05, "loss": 0.0001, "step": 650 }, { "epoch": 1.056, "grad_norm": 0.002863505156710744, "learning_rate": 4.383001422475107e-05, "loss": 0.0608, "step": 660 }, { "epoch": 1.072, "grad_norm": 0.008520668372511864, "learning_rate": 4.365220483641537e-05, "loss": 0.0125, "step": 670 }, { "epoch": 1.088, "grad_norm": 0.001538406009785831, "learning_rate": 4.347439544807966e-05, "loss": 0.0653, "step": 680 }, { "epoch": 1.104, "grad_norm": 0.001671103062108159, "learning_rate": 4.3296586059743954e-05, "loss": 0.081, "step": 690 }, { "epoch": 1.12, "grad_norm": 0.20056261122226715, "learning_rate": 4.3118776671408254e-05, "loss": 0.0057, "step": 700 }, { "epoch": 1.1360000000000001, "grad_norm": 0.008405996486544609, "learning_rate": 4.294096728307255e-05, "loss": 0.0091, "step": 710 }, { "epoch": 1.152, "grad_norm": 0.19218170642852783, "learning_rate": 4.2763157894736847e-05, "loss": 0.0019, "step": 720 }, { "epoch": 1.168, "grad_norm": 0.07524432241916656, "learning_rate": 4.258534850640114e-05, "loss": 0.0952, "step": 730 }, { "epoch": 1.184, "grad_norm": 0.0015570322284474969, "learning_rate": 4.240753911806544e-05, "loss": 0.0435, "step": 740 }, { "epoch": 1.2, "grad_norm": 0.0036333040334284306, "learning_rate": 4.222972972972973e-05, "loss": 0.0207, "step": 750 }, { "epoch": 1.216, "grad_norm": 9.78964900970459, "learning_rate": 4.205192034139403e-05, "loss": 0.1004, "step": 760 }, { "epoch": 1.232, "grad_norm": 0.0014888375299051404, "learning_rate": 4.187411095305832e-05, "loss": 0.0671, "step": 770 }, { "epoch": 1.248, "grad_norm": 0.023692140355706215, "learning_rate": 4.169630156472262e-05, "loss": 0.0026, "step": 780 }, { "epoch": 1.264, "grad_norm": 0.004593148361891508, "learning_rate": 4.151849217638692e-05, "loss": 0.0423, "step": 790 }, { "epoch": 1.28, "grad_norm": 0.003923716489225626, "learning_rate": 4.134068278805121e-05, "loss": 0.0236, "step": 800 }, { "epoch": 1.296, "grad_norm": 0.002044517546892166, "learning_rate": 4.116287339971551e-05, "loss": 0.005, "step": 810 }, { "epoch": 1.312, "grad_norm": 0.022982032969594002, "learning_rate": 4.09850640113798e-05, "loss": 0.0697, "step": 820 }, { "epoch": 1.328, "grad_norm": 0.23058412969112396, "learning_rate": 4.08072546230441e-05, "loss": 0.1222, "step": 830 }, { "epoch": 1.3439999999999999, "grad_norm": 0.0017761716153472662, "learning_rate": 4.0629445234708395e-05, "loss": 0.0022, "step": 840 }, { "epoch": 1.3599999999999999, "grad_norm": 0.12850730121135712, "learning_rate": 4.0451635846372695e-05, "loss": 0.0189, "step": 850 }, { "epoch": 1.376, "grad_norm": 0.0014099746476858854, "learning_rate": 4.027382645803698e-05, "loss": 0.0001, "step": 860 }, { "epoch": 1.392, "grad_norm": 10.44194507598877, "learning_rate": 4.009601706970128e-05, "loss": 0.1423, "step": 870 }, { "epoch": 1.408, "grad_norm": 5.446048736572266, "learning_rate": 3.9918207681365574e-05, "loss": 0.1013, "step": 880 }, { "epoch": 1.424, "grad_norm": 0.017437923699617386, "learning_rate": 3.9740398293029873e-05, "loss": 0.0002, "step": 890 }, { "epoch": 1.44, "grad_norm": 0.007926377467811108, "learning_rate": 3.956258890469417e-05, "loss": 0.0661, "step": 900 }, { "epoch": 1.456, "grad_norm": 0.0029465279076248407, "learning_rate": 3.9384779516358466e-05, "loss": 0.0338, "step": 910 }, { "epoch": 1.472, "grad_norm": 0.0010107713751494884, "learning_rate": 3.9206970128022766e-05, "loss": 0.0171, "step": 920 }, { "epoch": 1.488, "grad_norm": 0.036259789019823074, "learning_rate": 3.902916073968706e-05, "loss": 0.0004, "step": 930 }, { "epoch": 1.504, "grad_norm": 0.7286566495895386, "learning_rate": 3.885135135135135e-05, "loss": 0.014, "step": 940 }, { "epoch": 1.52, "grad_norm": 0.008453834801912308, "learning_rate": 3.8673541963015645e-05, "loss": 0.0002, "step": 950 }, { "epoch": 1.536, "grad_norm": 0.1271582841873169, "learning_rate": 3.8495732574679944e-05, "loss": 0.0043, "step": 960 }, { "epoch": 1.552, "grad_norm": 6.197739124298096, "learning_rate": 3.831792318634424e-05, "loss": 0.0075, "step": 970 }, { "epoch": 1.568, "grad_norm": 0.008371386677026749, "learning_rate": 3.814011379800854e-05, "loss": 0.0299, "step": 980 }, { "epoch": 1.584, "grad_norm": 0.0011657042196020484, "learning_rate": 3.796230440967283e-05, "loss": 0.1188, "step": 990 }, { "epoch": 1.6, "grad_norm": 0.010952652432024479, "learning_rate": 3.778449502133713e-05, "loss": 0.0116, "step": 1000 }, { "epoch": 1.616, "grad_norm": 0.0010061347857117653, "learning_rate": 3.760668563300143e-05, "loss": 0.0043, "step": 1010 }, { "epoch": 1.6320000000000001, "grad_norm": 0.06395132839679718, "learning_rate": 3.742887624466572e-05, "loss": 0.0445, "step": 1020 }, { "epoch": 1.6480000000000001, "grad_norm": 0.001108819618821144, "learning_rate": 3.7251066856330015e-05, "loss": 0.0323, "step": 1030 }, { "epoch": 1.6640000000000001, "grad_norm": 0.0014280881732702255, "learning_rate": 3.707325746799431e-05, "loss": 0.0008, "step": 1040 }, { "epoch": 1.6800000000000002, "grad_norm": 0.0012123563792556524, "learning_rate": 3.689544807965861e-05, "loss": 0.0014, "step": 1050 }, { "epoch": 1.696, "grad_norm": 0.0011878483928740025, "learning_rate": 3.67176386913229e-05, "loss": 0.0524, "step": 1060 }, { "epoch": 1.712, "grad_norm": 0.0037567925173789263, "learning_rate": 3.65398293029872e-05, "loss": 0.0237, "step": 1070 }, { "epoch": 1.728, "grad_norm": 0.002096337964758277, "learning_rate": 3.636201991465149e-05, "loss": 0.0334, "step": 1080 }, { "epoch": 1.744, "grad_norm": 8.111098289489746, "learning_rate": 3.618421052631579e-05, "loss": 0.1467, "step": 1090 }, { "epoch": 1.76, "grad_norm": 0.0023632964584976435, "learning_rate": 3.600640113798009e-05, "loss": 0.0003, "step": 1100 }, { "epoch": 1.776, "grad_norm": 9.071378707885742, "learning_rate": 3.5828591749644385e-05, "loss": 0.0176, "step": 1110 }, { "epoch": 1.792, "grad_norm": 0.08811729401350021, "learning_rate": 3.565078236130868e-05, "loss": 0.1454, "step": 1120 }, { "epoch": 1.808, "grad_norm": 0.0101834237575531, "learning_rate": 3.547297297297297e-05, "loss": 0.0172, "step": 1130 }, { "epoch": 1.8239999999999998, "grad_norm": 0.1676841378211975, "learning_rate": 3.529516358463727e-05, "loss": 0.049, "step": 1140 }, { "epoch": 1.8399999999999999, "grad_norm": 0.043762240558862686, "learning_rate": 3.5117354196301564e-05, "loss": 0.0432, "step": 1150 }, { "epoch": 1.8559999999999999, "grad_norm": 0.011058449745178223, "learning_rate": 3.4939544807965863e-05, "loss": 0.001, "step": 1160 }, { "epoch": 1.8719999999999999, "grad_norm": 0.0018902173032984138, "learning_rate": 3.4761735419630156e-05, "loss": 0.0399, "step": 1170 }, { "epoch": 1.888, "grad_norm": 0.001800389145500958, "learning_rate": 3.4583926031294456e-05, "loss": 0.0184, "step": 1180 }, { "epoch": 1.904, "grad_norm": 0.06505569815635681, "learning_rate": 3.440611664295875e-05, "loss": 0.0073, "step": 1190 }, { "epoch": 1.92, "grad_norm": 0.5011488199234009, "learning_rate": 3.422830725462304e-05, "loss": 0.0007, "step": 1200 }, { "epoch": 1.936, "grad_norm": 0.013254979625344276, "learning_rate": 3.405049786628734e-05, "loss": 0.0059, "step": 1210 }, { "epoch": 1.952, "grad_norm": 0.7582728266716003, "learning_rate": 3.3872688477951634e-05, "loss": 0.0013, "step": 1220 }, { "epoch": 1.968, "grad_norm": 0.009909105487167835, "learning_rate": 3.3694879089615934e-05, "loss": 0.0002, "step": 1230 }, { "epoch": 1.984, "grad_norm": 0.000718823226634413, "learning_rate": 3.351706970128023e-05, "loss": 0.0047, "step": 1240 }, { "epoch": 2.0, "grad_norm": 0.6642739772796631, "learning_rate": 3.333926031294453e-05, "loss": 0.0015, "step": 1250 }, { "epoch": 2.0, "eval_accuracy": 0.9919839679358717, "eval_f1_macro": 0.9810672282141238, "eval_f1_micro": 0.9919839679358717, "eval_f1_weighted": 0.992107547860045, "eval_loss": 0.019453825429081917, "eval_precision_macro": 0.9692164931816348, "eval_precision_micro": 0.9919839679358717, "eval_precision_weighted": 0.9925061936994298, "eval_recall_macro": 0.9942915690866512, "eval_recall_micro": 0.9919839679358717, "eval_recall_weighted": 0.9919839679358717, "eval_runtime": 10.636, "eval_samples_per_second": 46.916, "eval_steps_per_second": 3.009, "step": 1250 }, { "epoch": 2.016, "grad_norm": 0.005395730957388878, "learning_rate": 3.316145092460882e-05, "loss": 0.0001, "step": 1260 }, { "epoch": 2.032, "grad_norm": 0.0008304046932607889, "learning_rate": 3.298364153627312e-05, "loss": 0.0001, "step": 1270 }, { "epoch": 2.048, "grad_norm": 0.0007791437674313784, "learning_rate": 3.280583214793741e-05, "loss": 0.0001, "step": 1280 }, { "epoch": 2.064, "grad_norm": 0.0008820474613457918, "learning_rate": 3.2628022759601705e-05, "loss": 0.0001, "step": 1290 }, { "epoch": 2.08, "grad_norm": 0.000594152370467782, "learning_rate": 3.2450213371266005e-05, "loss": 0.0002, "step": 1300 }, { "epoch": 2.096, "grad_norm": 0.0006243674433790147, "learning_rate": 3.22724039829303e-05, "loss": 0.0022, "step": 1310 }, { "epoch": 2.112, "grad_norm": 0.0007709413184784353, "learning_rate": 3.20945945945946e-05, "loss": 0.0154, "step": 1320 }, { "epoch": 2.128, "grad_norm": 0.004626740701496601, "learning_rate": 3.191678520625889e-05, "loss": 0.0012, "step": 1330 }, { "epoch": 2.144, "grad_norm": 0.002739989897236228, "learning_rate": 3.173897581792319e-05, "loss": 0.0004, "step": 1340 }, { "epoch": 2.16, "grad_norm": 0.007507917005568743, "learning_rate": 3.156116642958748e-05, "loss": 0.0001, "step": 1350 }, { "epoch": 2.176, "grad_norm": 0.004014975391328335, "learning_rate": 3.138335704125178e-05, "loss": 0.001, "step": 1360 }, { "epoch": 2.192, "grad_norm": 0.000455196452094242, "learning_rate": 3.1205547652916076e-05, "loss": 0.0005, "step": 1370 }, { "epoch": 2.208, "grad_norm": 0.00030664558289572597, "learning_rate": 3.102773826458037e-05, "loss": 0.0265, "step": 1380 }, { "epoch": 2.224, "grad_norm": 0.00043227567221038043, "learning_rate": 3.084992887624467e-05, "loss": 0.0001, "step": 1390 }, { "epoch": 2.24, "grad_norm": 0.0481143593788147, "learning_rate": 3.067211948790896e-05, "loss": 0.0023, "step": 1400 }, { "epoch": 2.2560000000000002, "grad_norm": 0.005968974903225899, "learning_rate": 3.0494310099573257e-05, "loss": 0.0025, "step": 1410 }, { "epoch": 2.2720000000000002, "grad_norm": 0.21466164290905, "learning_rate": 3.0316500711237557e-05, "loss": 0.0003, "step": 1420 }, { "epoch": 2.288, "grad_norm": 0.0018357799854129553, "learning_rate": 3.0138691322901853e-05, "loss": 0.0001, "step": 1430 }, { "epoch": 2.304, "grad_norm": 0.0004517412162385881, "learning_rate": 2.996088193456615e-05, "loss": 0.0001, "step": 1440 }, { "epoch": 2.32, "grad_norm": 0.005384071730077267, "learning_rate": 2.9783072546230446e-05, "loss": 0.002, "step": 1450 }, { "epoch": 2.336, "grad_norm": 0.00046941509936004877, "learning_rate": 2.9605263157894735e-05, "loss": 0.0001, "step": 1460 }, { "epoch": 2.352, "grad_norm": 0.011709867045283318, "learning_rate": 2.9427453769559032e-05, "loss": 0.0001, "step": 1470 }, { "epoch": 2.368, "grad_norm": 0.03616934269666672, "learning_rate": 2.9249644381223328e-05, "loss": 0.0008, "step": 1480 }, { "epoch": 2.384, "grad_norm": 0.0006385542219504714, "learning_rate": 2.9071834992887624e-05, "loss": 0.0001, "step": 1490 }, { "epoch": 2.4, "grad_norm": 1.0423965454101562, "learning_rate": 2.889402560455192e-05, "loss": 0.0021, "step": 1500 }, { "epoch": 2.416, "grad_norm": 0.21064622700214386, "learning_rate": 2.8716216216216217e-05, "loss": 0.0003, "step": 1510 }, { "epoch": 2.432, "grad_norm": 0.0016910170670598745, "learning_rate": 2.8538406827880517e-05, "loss": 0.0001, "step": 1520 }, { "epoch": 2.448, "grad_norm": 0.0004065225657541305, "learning_rate": 2.8360597439544813e-05, "loss": 0.0004, "step": 1530 }, { "epoch": 2.464, "grad_norm": 0.0006998078897595406, "learning_rate": 2.8182788051209103e-05, "loss": 0.0001, "step": 1540 }, { "epoch": 2.48, "grad_norm": 0.0057923863641917706, "learning_rate": 2.80049786628734e-05, "loss": 0.0012, "step": 1550 }, { "epoch": 2.496, "grad_norm": 0.00030601295293308794, "learning_rate": 2.7827169274537695e-05, "loss": 0.0236, "step": 1560 }, { "epoch": 2.512, "grad_norm": 0.00045617681462317705, "learning_rate": 2.764935988620199e-05, "loss": 0.0001, "step": 1570 }, { "epoch": 2.528, "grad_norm": 0.00044945545960217714, "learning_rate": 2.7471550497866288e-05, "loss": 0.0001, "step": 1580 }, { "epoch": 2.544, "grad_norm": 0.0003521046892274171, "learning_rate": 2.7293741109530584e-05, "loss": 0.0263, "step": 1590 }, { "epoch": 2.56, "grad_norm": 0.008946732617914677, "learning_rate": 2.711593172119488e-05, "loss": 0.0006, "step": 1600 }, { "epoch": 2.576, "grad_norm": 0.09531024098396301, "learning_rate": 2.6938122332859177e-05, "loss": 0.0003, "step": 1610 }, { "epoch": 2.592, "grad_norm": 0.0010131685994565487, "learning_rate": 2.6760312944523473e-05, "loss": 0.0001, "step": 1620 }, { "epoch": 2.608, "grad_norm": 0.0016723967855796218, "learning_rate": 2.6582503556187766e-05, "loss": 0.0018, "step": 1630 }, { "epoch": 2.624, "grad_norm": 0.008407480083405972, "learning_rate": 2.6404694167852062e-05, "loss": 0.0344, "step": 1640 }, { "epoch": 2.64, "grad_norm": 0.001671052654273808, "learning_rate": 2.622688477951636e-05, "loss": 0.0005, "step": 1650 }, { "epoch": 2.656, "grad_norm": 0.0003514339041430503, "learning_rate": 2.6049075391180655e-05, "loss": 0.0001, "step": 1660 }, { "epoch": 2.672, "grad_norm": 0.00044292688835412264, "learning_rate": 2.587126600284495e-05, "loss": 0.0017, "step": 1670 }, { "epoch": 2.6879999999999997, "grad_norm": 0.00033179231104440987, "learning_rate": 2.5693456614509247e-05, "loss": 0.0243, "step": 1680 }, { "epoch": 2.7039999999999997, "grad_norm": 0.0023233199026435614, "learning_rate": 2.5515647226173544e-05, "loss": 0.0001, "step": 1690 }, { "epoch": 2.7199999999999998, "grad_norm": 0.0012358427047729492, "learning_rate": 2.533783783783784e-05, "loss": 0.0002, "step": 1700 }, { "epoch": 2.7359999999999998, "grad_norm": 0.00035215960815548897, "learning_rate": 2.5160028449502136e-05, "loss": 0.0235, "step": 1710 }, { "epoch": 2.752, "grad_norm": 0.24424146115779877, "learning_rate": 2.4982219061166433e-05, "loss": 0.0005, "step": 1720 }, { "epoch": 2.768, "grad_norm": 2.014695167541504, "learning_rate": 2.480440967283073e-05, "loss": 0.0111, "step": 1730 }, { "epoch": 2.784, "grad_norm": 0.00039951372309587896, "learning_rate": 2.4626600284495022e-05, "loss": 0.0004, "step": 1740 }, { "epoch": 2.8, "grad_norm": 0.0016597098438069224, "learning_rate": 2.4448790896159318e-05, "loss": 0.0002, "step": 1750 }, { "epoch": 2.816, "grad_norm": 0.010242385789752007, "learning_rate": 2.4270981507823614e-05, "loss": 0.0001, "step": 1760 }, { "epoch": 2.832, "grad_norm": 0.0023806928656995296, "learning_rate": 2.409317211948791e-05, "loss": 0.0001, "step": 1770 }, { "epoch": 2.848, "grad_norm": 0.01970355026423931, "learning_rate": 2.3915362731152204e-05, "loss": 0.0183, "step": 1780 }, { "epoch": 2.864, "grad_norm": 0.0002463227428961545, "learning_rate": 2.37375533428165e-05, "loss": 0.0, "step": 1790 }, { "epoch": 2.88, "grad_norm": 0.000822290952783078, "learning_rate": 2.35597439544808e-05, "loss": 0.0001, "step": 1800 }, { "epoch": 2.896, "grad_norm": 0.0026505696587264538, "learning_rate": 2.3381934566145096e-05, "loss": 0.0004, "step": 1810 }, { "epoch": 2.912, "grad_norm": 0.0005039023817516863, "learning_rate": 2.320412517780939e-05, "loss": 0.0001, "step": 1820 }, { "epoch": 2.928, "grad_norm": 0.009028232656419277, "learning_rate": 2.3026315789473685e-05, "loss": 0.0001, "step": 1830 }, { "epoch": 2.944, "grad_norm": 0.002705842722207308, "learning_rate": 2.284850640113798e-05, "loss": 0.0003, "step": 1840 }, { "epoch": 2.96, "grad_norm": 0.00383372837677598, "learning_rate": 2.2670697012802278e-05, "loss": 0.0017, "step": 1850 }, { "epoch": 2.976, "grad_norm": 0.00024894202942959964, "learning_rate": 2.2492887624466574e-05, "loss": 0.0002, "step": 1860 }, { "epoch": 2.992, "grad_norm": 0.0009176091407425702, "learning_rate": 2.2315078236130867e-05, "loss": 0.0, "step": 1870 }, { "epoch": 3.0, "eval_accuracy": 0.9939879759519038, "eval_f1_macro": 0.9910040909369707, "eval_f1_micro": 0.9939879759519038, "eval_f1_weighted": 0.9939809297459037, "eval_loss": 0.014622284099459648, "eval_precision_macro": 0.9959514170040485, "eval_precision_micro": 0.9939879759519038, "eval_precision_weighted": 0.994060996486901, "eval_recall_macro": 0.9862711213517666, "eval_recall_micro": 0.9939879759519038, "eval_recall_weighted": 0.9939879759519038, "eval_runtime": 10.5217, "eval_samples_per_second": 47.426, "eval_steps_per_second": 3.041, "step": 1875 } ], "logging_steps": 10, "max_steps": 3125, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 5, "early_stopping_threshold": 0.01 }, "attributes": { "early_stopping_patience_counter": 2 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2446967439360000.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }