{ "best_global_step": 1000, "best_metric": 0.8438986049887841, "best_model_checkpoint": "./results/checkpoint-1000", "epoch": 0.029189526267256648, "eval_steps": 100, "global_step": 1100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00026535932970233315, "grad_norm": 4.446621894836426, "learning_rate": 3.6e-07, "loss": 1.855, "step": 10 }, { "epoch": 0.0005307186594046663, "grad_norm": 6.435311317443848, "learning_rate": 7.6e-07, "loss": 1.8131, "step": 20 }, { "epoch": 0.0007960779891069995, "grad_norm": 3.5201945304870605, "learning_rate": 1.1600000000000001e-06, "loss": 1.7821, "step": 30 }, { "epoch": 0.0010614373188093326, "grad_norm": 5.47129487991333, "learning_rate": 1.56e-06, "loss": 1.8187, "step": 40 }, { "epoch": 0.001326796648511666, "grad_norm": 7.340343475341797, "learning_rate": 1.9600000000000003e-06, "loss": 1.7791, "step": 50 }, { "epoch": 0.001592155978213999, "grad_norm": 5.010741233825684, "learning_rate": 2.3600000000000003e-06, "loss": 1.7705, "step": 60 }, { "epoch": 0.0018575153079163323, "grad_norm": 4.706456184387207, "learning_rate": 2.7600000000000003e-06, "loss": 1.7616, "step": 70 }, { "epoch": 0.002122874637618665, "grad_norm": 3.865488290786743, "learning_rate": 3.1600000000000002e-06, "loss": 1.7265, "step": 80 }, { "epoch": 0.0023882339673209985, "grad_norm": 6.390440464019775, "learning_rate": 3.5600000000000002e-06, "loss": 1.7143, "step": 90 }, { "epoch": 0.002653593297023332, "grad_norm": 5.747302532196045, "learning_rate": 3.96e-06, "loss": 1.6675, "step": 100 }, { "epoch": 0.002653593297023332, "eval_accuracy": 0.3978160782284494, "eval_f1": 0.22667861021073907, "eval_loss": 1.7215721607208252, "eval_precision": 0.3277163462999534, "eval_recall": 0.3978160782284494, "eval_runtime": 1141.6234, "eval_samples_per_second": 66.019, "eval_steps_per_second": 8.253, "step": 100 }, { "epoch": 0.0029189526267256647, "grad_norm": 5.64985466003418, "learning_rate": 4.360000000000001e-06, "loss": 1.6701, "step": 110 }, { "epoch": 0.003184311956427998, "grad_norm": 7.48480224609375, "learning_rate": 4.76e-06, "loss": 1.6665, "step": 120 }, { "epoch": 0.0034496712861303313, "grad_norm": 6.59235954284668, "learning_rate": 5.1600000000000006e-06, "loss": 1.6049, "step": 130 }, { "epoch": 0.0037150306158326646, "grad_norm": 5.440073013305664, "learning_rate": 5.560000000000001e-06, "loss": 1.6174, "step": 140 }, { "epoch": 0.0039803899455349975, "grad_norm": 5.735574245452881, "learning_rate": 5.9600000000000005e-06, "loss": 1.5654, "step": 150 }, { "epoch": 0.00424574927523733, "grad_norm": 5.891671180725098, "learning_rate": 6.360000000000001e-06, "loss": 1.5445, "step": 160 }, { "epoch": 0.004511108604939664, "grad_norm": 7.2471089363098145, "learning_rate": 6.760000000000001e-06, "loss": 1.5545, "step": 170 }, { "epoch": 0.004776467934641997, "grad_norm": 8.074533462524414, "learning_rate": 7.16e-06, "loss": 1.5121, "step": 180 }, { "epoch": 0.00504182726434433, "grad_norm": 6.5745768547058105, "learning_rate": 7.5600000000000005e-06, "loss": 1.4951, "step": 190 }, { "epoch": 0.005307186594046664, "grad_norm": 7.1547112464904785, "learning_rate": 7.960000000000002e-06, "loss": 1.5225, "step": 200 }, { "epoch": 0.005307186594046664, "eval_accuracy": 0.5399965503058286, "eval_f1": 0.4954655997286682, "eval_loss": 1.4961189031600952, "eval_precision": 0.5230960318752841, "eval_recall": 0.5399965503058286, "eval_runtime": 1146.418, "eval_samples_per_second": 65.743, "eval_steps_per_second": 8.219, "step": 200 }, { "epoch": 0.0055725459237489965, "grad_norm": 10.55716609954834, "learning_rate": 8.36e-06, "loss": 1.486, "step": 210 }, { "epoch": 0.005837905253451329, "grad_norm": 10.896256446838379, "learning_rate": 8.76e-06, "loss": 1.3551, "step": 220 }, { "epoch": 0.006103264583153663, "grad_norm": 10.593475341796875, "learning_rate": 9.16e-06, "loss": 1.3457, "step": 230 }, { "epoch": 0.006368623912855996, "grad_norm": 12.37448787689209, "learning_rate": 9.56e-06, "loss": 1.4264, "step": 240 }, { "epoch": 0.006633983242558329, "grad_norm": 15.805830001831055, "learning_rate": 9.960000000000001e-06, "loss": 1.1405, "step": 250 }, { "epoch": 0.006899342572260663, "grad_norm": 7.878324031829834, "learning_rate": 1.036e-05, "loss": 1.3623, "step": 260 }, { "epoch": 0.0071647019019629955, "grad_norm": 9.763978004455566, "learning_rate": 1.0760000000000002e-05, "loss": 1.1918, "step": 270 }, { "epoch": 0.007430061231665329, "grad_norm": 8.449042320251465, "learning_rate": 1.1160000000000002e-05, "loss": 1.1437, "step": 280 }, { "epoch": 0.007695420561367662, "grad_norm": 10.577520370483398, "learning_rate": 1.156e-05, "loss": 1.0793, "step": 290 }, { "epoch": 0.007960779891069995, "grad_norm": 11.707535743713379, "learning_rate": 1.196e-05, "loss": 1.1895, "step": 300 }, { "epoch": 0.007960779891069995, "eval_accuracy": 0.6815534238214651, "eval_f1": 0.6624035468309406, "eval_loss": 1.228468894958496, "eval_precision": 0.6548079319665443, "eval_recall": 0.6815534238214651, "eval_runtime": 1147.8213, "eval_samples_per_second": 65.663, "eval_steps_per_second": 8.209, "step": 300 }, { "epoch": 0.008226139220772328, "grad_norm": 7.825327396392822, "learning_rate": 1.236e-05, "loss": 1.0, "step": 310 }, { "epoch": 0.00849149855047466, "grad_norm": 9.7120943069458, "learning_rate": 1.2760000000000001e-05, "loss": 1.2152, "step": 320 }, { "epoch": 0.008756857880176995, "grad_norm": 10.450387954711914, "learning_rate": 1.3160000000000001e-05, "loss": 1.2838, "step": 330 }, { "epoch": 0.009022217209879328, "grad_norm": 6.901065826416016, "learning_rate": 1.3560000000000002e-05, "loss": 1.1989, "step": 340 }, { "epoch": 0.009287576539581661, "grad_norm": 7.151400089263916, "learning_rate": 1.396e-05, "loss": 1.0571, "step": 350 }, { "epoch": 0.009552935869283994, "grad_norm": 7.359131813049316, "learning_rate": 1.4360000000000001e-05, "loss": 1.2923, "step": 360 }, { "epoch": 0.009818295198986327, "grad_norm": 6.762883186340332, "learning_rate": 1.4760000000000001e-05, "loss": 1.2465, "step": 370 }, { "epoch": 0.01008365452868866, "grad_norm": 7.525689125061035, "learning_rate": 1.516e-05, "loss": 0.889, "step": 380 }, { "epoch": 0.010349013858390994, "grad_norm": 12.986421585083008, "learning_rate": 1.556e-05, "loss": 1.2117, "step": 390 }, { "epoch": 0.010614373188093327, "grad_norm": 10.896193504333496, "learning_rate": 1.5960000000000003e-05, "loss": 1.0, "step": 400 }, { "epoch": 0.010614373188093327, "eval_accuracy": 0.7040295081532195, "eval_f1": 0.7118047953474329, "eval_loss": 1.004681944847107, "eval_precision": 0.7430775529469658, "eval_recall": 0.7040295081532195, "eval_runtime": 1149.1166, "eval_samples_per_second": 65.589, "eval_steps_per_second": 8.199, "step": 400 }, { "epoch": 0.01087973251779566, "grad_norm": 10.948391914367676, "learning_rate": 1.636e-05, "loss": 1.1142, "step": 410 }, { "epoch": 0.011145091847497993, "grad_norm": 9.782318115234375, "learning_rate": 1.6760000000000002e-05, "loss": 1.0815, "step": 420 }, { "epoch": 0.011410451177200326, "grad_norm": 15.20601749420166, "learning_rate": 1.7160000000000002e-05, "loss": 1.1052, "step": 430 }, { "epoch": 0.011675810506902659, "grad_norm": 13.752850532531738, "learning_rate": 1.756e-05, "loss": 0.9449, "step": 440 }, { "epoch": 0.011941169836604993, "grad_norm": 14.253933906555176, "learning_rate": 1.796e-05, "loss": 0.8107, "step": 450 }, { "epoch": 0.012206529166307326, "grad_norm": 10.736539840698242, "learning_rate": 1.8360000000000004e-05, "loss": 0.8507, "step": 460 }, { "epoch": 0.012471888496009659, "grad_norm": 19.90713119506836, "learning_rate": 1.876e-05, "loss": 0.8218, "step": 470 }, { "epoch": 0.012737247825711992, "grad_norm": 5.8942084312438965, "learning_rate": 1.916e-05, "loss": 0.8322, "step": 480 }, { "epoch": 0.013002607155414325, "grad_norm": 5.852909088134766, "learning_rate": 1.9560000000000002e-05, "loss": 0.72, "step": 490 }, { "epoch": 0.013267966485116658, "grad_norm": 8.721864700317383, "learning_rate": 1.9960000000000002e-05, "loss": 0.8702, "step": 500 }, { "epoch": 0.013267966485116658, "eval_accuracy": 0.7919303692499569, "eval_f1": 0.7952185444780413, "eval_loss": 0.7821776270866394, "eval_precision": 0.8109789187273935, "eval_recall": 0.7919303692499569, "eval_runtime": 1149.5055, "eval_samples_per_second": 65.566, "eval_steps_per_second": 8.197, "step": 500 }, { "epoch": 0.013533325814818992, "grad_norm": 7.045879364013672, "learning_rate": 1.9999042145593872e-05, "loss": 0.8162, "step": 510 }, { "epoch": 0.013798685144521325, "grad_norm": 12.590984344482422, "learning_rate": 1.9997977862920393e-05, "loss": 0.8281, "step": 520 }, { "epoch": 0.014064044474223658, "grad_norm": 4.697333812713623, "learning_rate": 1.9996913580246914e-05, "loss": 0.8597, "step": 530 }, { "epoch": 0.014329403803925991, "grad_norm": 7.0437188148498535, "learning_rate": 1.999584929757344e-05, "loss": 0.6957, "step": 540 }, { "epoch": 0.014594763133628324, "grad_norm": 12.778396606445312, "learning_rate": 1.999478501489996e-05, "loss": 0.8493, "step": 550 }, { "epoch": 0.014860122463330658, "grad_norm": 30.98528480529785, "learning_rate": 1.999372073222648e-05, "loss": 0.8598, "step": 560 }, { "epoch": 0.015125481793032991, "grad_norm": 6.842329502105713, "learning_rate": 1.9992656449553e-05, "loss": 0.7805, "step": 570 }, { "epoch": 0.015390841122735324, "grad_norm": 7.865843772888184, "learning_rate": 1.9991592166879526e-05, "loss": 0.6858, "step": 580 }, { "epoch": 0.01565620045243766, "grad_norm": 7.990331172943115, "learning_rate": 1.9990527884206047e-05, "loss": 0.8487, "step": 590 }, { "epoch": 0.01592155978213999, "grad_norm": 7.9941935539245605, "learning_rate": 1.9989463601532568e-05, "loss": 0.7483, "step": 600 }, { "epoch": 0.01592155978213999, "eval_accuracy": 0.8143533813636906, "eval_f1": 0.8091828053145604, "eval_loss": 0.7799807190895081, "eval_precision": 0.8135668963695762, "eval_recall": 0.8143533813636906, "eval_runtime": 1149.6535, "eval_samples_per_second": 65.558, "eval_steps_per_second": 8.196, "step": 600 }, { "epoch": 0.016186919111842325, "grad_norm": 7.86228609085083, "learning_rate": 1.9988399318859092e-05, "loss": 0.8267, "step": 610 }, { "epoch": 0.016452278441544656, "grad_norm": 17.805816650390625, "learning_rate": 1.9987335036185613e-05, "loss": 0.7449, "step": 620 }, { "epoch": 0.01671763777124699, "grad_norm": 20.419509887695312, "learning_rate": 1.9986270753512134e-05, "loss": 0.806, "step": 630 }, { "epoch": 0.01698299710094932, "grad_norm": 8.158143043518066, "learning_rate": 1.9985206470838655e-05, "loss": 0.5455, "step": 640 }, { "epoch": 0.017248356430651656, "grad_norm": 19.537343978881836, "learning_rate": 1.998414218816518e-05, "loss": 0.6334, "step": 650 }, { "epoch": 0.01751371576035399, "grad_norm": 13.77968692779541, "learning_rate": 1.99830779054917e-05, "loss": 0.6848, "step": 660 }, { "epoch": 0.017779075090056322, "grad_norm": 13.80045223236084, "learning_rate": 1.9982013622818222e-05, "loss": 0.8403, "step": 670 }, { "epoch": 0.018044434419758656, "grad_norm": 7.564460277557373, "learning_rate": 1.9980949340144743e-05, "loss": 0.8387, "step": 680 }, { "epoch": 0.018309793749460988, "grad_norm": 9.932668685913086, "learning_rate": 1.9979885057471267e-05, "loss": 0.6252, "step": 690 }, { "epoch": 0.018575153079163322, "grad_norm": 3.894618034362793, "learning_rate": 1.997882077479779e-05, "loss": 0.6915, "step": 700 }, { "epoch": 0.018575153079163322, "eval_accuracy": 0.8227255237564516, "eval_f1": 0.8164082076886476, "eval_loss": 0.7442639470100403, "eval_precision": 0.8239897054500639, "eval_recall": 0.8227255237564516, "eval_runtime": 1149.8124, "eval_samples_per_second": 65.549, "eval_steps_per_second": 8.194, "step": 700 }, { "epoch": 0.018840512408865657, "grad_norm": 8.771450996398926, "learning_rate": 1.997775649212431e-05, "loss": 0.5164, "step": 710 }, { "epoch": 0.019105871738567988, "grad_norm": 13.675606727600098, "learning_rate": 1.997669220945083e-05, "loss": 0.7917, "step": 720 }, { "epoch": 0.019371231068270323, "grad_norm": 7.2742462158203125, "learning_rate": 1.9975627926777355e-05, "loss": 0.5966, "step": 730 }, { "epoch": 0.019636590397972654, "grad_norm": 8.653703689575195, "learning_rate": 1.9974563644103876e-05, "loss": 0.7009, "step": 740 }, { "epoch": 0.01990194972767499, "grad_norm": 4.837522983551025, "learning_rate": 1.9973499361430397e-05, "loss": 0.6471, "step": 750 }, { "epoch": 0.02016730905737732, "grad_norm": 51.784481048583984, "learning_rate": 1.997243507875692e-05, "loss": 0.5877, "step": 760 }, { "epoch": 0.020432668387079654, "grad_norm": 10.597892761230469, "learning_rate": 1.9971370796083442e-05, "loss": 0.5925, "step": 770 }, { "epoch": 0.02069802771678199, "grad_norm": 13.237262725830078, "learning_rate": 1.9970306513409963e-05, "loss": 0.6298, "step": 780 }, { "epoch": 0.02096338704648432, "grad_norm": 9.751429557800293, "learning_rate": 1.9969242230736484e-05, "loss": 0.5414, "step": 790 }, { "epoch": 0.021228746376186654, "grad_norm": 15.811433792114258, "learning_rate": 1.996817794806301e-05, "loss": 0.7402, "step": 800 }, { "epoch": 0.021228746376186654, "eval_accuracy": 0.8336318645596996, "eval_f1": 0.8326606774493011, "eval_loss": 0.6281165480613708, "eval_precision": 0.8348071025847044, "eval_recall": 0.8336318645596996, "eval_runtime": 1150.0313, "eval_samples_per_second": 65.536, "eval_steps_per_second": 8.193, "step": 800 }, { "epoch": 0.021494105705888986, "grad_norm": 6.421304225921631, "learning_rate": 1.9967113665389526e-05, "loss": 0.5246, "step": 810 }, { "epoch": 0.02175946503559132, "grad_norm": 10.94848918914795, "learning_rate": 1.996604938271605e-05, "loss": 0.4998, "step": 820 }, { "epoch": 0.022024824365293655, "grad_norm": 18.42159080505371, "learning_rate": 1.9964985100042572e-05, "loss": 0.3855, "step": 830 }, { "epoch": 0.022290183694995986, "grad_norm": 6.594385623931885, "learning_rate": 1.9963920817369096e-05, "loss": 0.6205, "step": 840 }, { "epoch": 0.02255554302469832, "grad_norm": 15.90577220916748, "learning_rate": 1.9962856534695617e-05, "loss": 0.5837, "step": 850 }, { "epoch": 0.022820902354400652, "grad_norm": 13.169767379760742, "learning_rate": 1.996179225202214e-05, "loss": 0.6141, "step": 860 }, { "epoch": 0.023086261684102986, "grad_norm": 18.30284881591797, "learning_rate": 1.9960727969348663e-05, "loss": 0.5903, "step": 870 }, { "epoch": 0.023351621013805317, "grad_norm": 18.575279235839844, "learning_rate": 1.9959663686675184e-05, "loss": 0.6436, "step": 880 }, { "epoch": 0.023616980343507652, "grad_norm": 7.2494587898254395, "learning_rate": 1.9958599404001705e-05, "loss": 0.5912, "step": 890 }, { "epoch": 0.023882339673209987, "grad_norm": 27.128528594970703, "learning_rate": 1.9957535121328226e-05, "loss": 0.6165, "step": 900 }, { "epoch": 0.023882339673209987, "eval_accuracy": 0.8347198450291233, "eval_f1": 0.8366928520828808, "eval_loss": 0.6065123081207275, "eval_precision": 0.8420416556337017, "eval_recall": 0.8347198450291233, "eval_runtime": 1150.012, "eval_samples_per_second": 65.538, "eval_steps_per_second": 8.193, "step": 900 }, { "epoch": 0.024147699002912318, "grad_norm": 11.817070960998535, "learning_rate": 1.995647083865475e-05, "loss": 0.6235, "step": 910 }, { "epoch": 0.024413058332614652, "grad_norm": 13.93792724609375, "learning_rate": 1.9955406555981268e-05, "loss": 0.5442, "step": 920 }, { "epoch": 0.024678417662316984, "grad_norm": 12.457510948181152, "learning_rate": 1.9954342273307792e-05, "loss": 0.5218, "step": 930 }, { "epoch": 0.024943776992019318, "grad_norm": 19.165714263916016, "learning_rate": 1.9953277990634313e-05, "loss": 0.6517, "step": 940 }, { "epoch": 0.025209136321721653, "grad_norm": 16.57741355895996, "learning_rate": 1.9952213707960838e-05, "loss": 0.6778, "step": 950 }, { "epoch": 0.025474495651423984, "grad_norm": 10.39408016204834, "learning_rate": 1.995114942528736e-05, "loss": 0.7138, "step": 960 }, { "epoch": 0.02573985498112632, "grad_norm": 9.648213386535645, "learning_rate": 1.995008514261388e-05, "loss": 0.63, "step": 970 }, { "epoch": 0.02600521431082865, "grad_norm": 18.695945739746094, "learning_rate": 1.99490208599404e-05, "loss": 0.6557, "step": 980 }, { "epoch": 0.026270573640530984, "grad_norm": 16.400314331054688, "learning_rate": 1.9947956577266925e-05, "loss": 0.7228, "step": 990 }, { "epoch": 0.026535932970233315, "grad_norm": 6.129698753356934, "learning_rate": 1.9946892294593446e-05, "loss": 0.4626, "step": 1000 }, { "epoch": 0.026535932970233315, "eval_accuracy": 0.846382464939166, "eval_f1": 0.8438986049887841, "eval_loss": 0.6415271759033203, "eval_precision": 0.8464411216655519, "eval_recall": 0.846382464939166, "eval_runtime": 1149.903, "eval_samples_per_second": 65.544, "eval_steps_per_second": 8.194, "step": 1000 }, { "epoch": 0.02680129229993565, "grad_norm": 9.027094841003418, "learning_rate": 1.9945828011919967e-05, "loss": 0.7249, "step": 1010 }, { "epoch": 0.027066651629637985, "grad_norm": 14.91533374786377, "learning_rate": 1.9944763729246492e-05, "loss": 0.6765, "step": 1020 }, { "epoch": 0.027332010959340316, "grad_norm": 3.9888482093811035, "learning_rate": 1.994369944657301e-05, "loss": 0.7675, "step": 1030 }, { "epoch": 0.02759737028904265, "grad_norm": 7.831120491027832, "learning_rate": 1.9942635163899534e-05, "loss": 0.5764, "step": 1040 }, { "epoch": 0.02786272961874498, "grad_norm": 7.661794185638428, "learning_rate": 1.9941570881226055e-05, "loss": 0.6496, "step": 1050 }, { "epoch": 0.028128088948447316, "grad_norm": 5.446905136108398, "learning_rate": 1.994050659855258e-05, "loss": 0.7055, "step": 1060 }, { "epoch": 0.02839344827814965, "grad_norm": 11.230253219604492, "learning_rate": 1.9939442315879097e-05, "loss": 0.6096, "step": 1070 }, { "epoch": 0.028658807607851982, "grad_norm": 4.4712982177734375, "learning_rate": 1.993837803320562e-05, "loss": 0.603, "step": 1080 }, { "epoch": 0.028924166937554317, "grad_norm": 6.14396858215332, "learning_rate": 1.9937313750532142e-05, "loss": 0.7755, "step": 1090 }, { "epoch": 0.029189526267256648, "grad_norm": 15.914993286132812, "learning_rate": 1.9936249467858667e-05, "loss": 0.4619, "step": 1100 }, { "epoch": 0.029189526267256648, "eval_accuracy": 0.846621289920259, "eval_f1": 0.842526016669277, "eval_loss": 0.6318368911743164, "eval_precision": 0.8510012460868045, "eval_recall": 0.846621289920259, "eval_runtime": 1149.9706, "eval_samples_per_second": 65.54, "eval_steps_per_second": 8.193, "step": 1100 } ], "logging_steps": 10, "max_steps": 188420, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 100, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.01 }, "attributes": { "early_stopping_patience_counter": 3 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4630920885043200.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }