{ "best_metric": null, "best_model_checkpoint": null, "epoch": 19.931506849315067, "eval_steps": 500, "global_step": 1455, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.136986301369863, "grad_norm": 7.088261127471924, "learning_rate": 2.7397260273972603e-05, "loss": 1.4774, "step": 10 }, { "epoch": 0.273972602739726, "grad_norm": 3.0658137798309326, "learning_rate": 5.479452054794521e-05, "loss": 0.5535, "step": 20 }, { "epoch": 0.410958904109589, "grad_norm": 1.415347695350647, "learning_rate": 8.219178082191781e-05, "loss": 0.2769, "step": 30 }, { "epoch": 0.547945205479452, "grad_norm": 1.2462260723114014, "learning_rate": 0.00010958904109589041, "loss": 0.2071, "step": 40 }, { "epoch": 0.684931506849315, "grad_norm": 1.1219278573989868, "learning_rate": 0.000136986301369863, "loss": 0.179, "step": 50 }, { "epoch": 0.821917808219178, "grad_norm": 2.4104228019714355, "learning_rate": 0.00016438356164383562, "loss": 0.1587, "step": 60 }, { "epoch": 0.958904109589041, "grad_norm": 1.2239787578582764, "learning_rate": 0.0001917808219178082, "loss": 0.1366, "step": 70 }, { "epoch": 1.095890410958904, "grad_norm": 0.9942715167999268, "learning_rate": 0.00019998733979961563, "loss": 0.1218, "step": 80 }, { "epoch": 1.2328767123287672, "grad_norm": 0.6293880939483643, "learning_rate": 0.0001999253383717226, "loss": 0.1168, "step": 90 }, { "epoch": 1.36986301369863, "grad_norm": 0.7170248031616211, "learning_rate": 0.00019981170237143067, "loss": 0.1052, "step": 100 }, { "epoch": 1.5068493150684932, "grad_norm": 0.7464343905448914, "learning_rate": 0.00019964649051804355, "loss": 0.1066, "step": 110 }, { "epoch": 1.643835616438356, "grad_norm": 0.6828764081001282, "learning_rate": 0.000199429788181734, "loss": 0.1057, "step": 120 }, { "epoch": 1.7808219178082192, "grad_norm": 0.6028720736503601, "learning_rate": 0.0001991617073394306, "loss": 0.0843, "step": 130 }, { "epoch": 1.9178082191780823, "grad_norm": 0.5440357327461243, "learning_rate": 0.00019884238651695556, "loss": 0.0948, "step": 140 }, { "epoch": 2.0547945205479454, "grad_norm": 0.8612964749336243, "learning_rate": 0.00019847199071744415, "loss": 0.085, "step": 150 }, { "epoch": 2.191780821917808, "grad_norm": 0.889124870300293, "learning_rate": 0.00019805071133608242, "loss": 0.0962, "step": 160 }, { "epoch": 2.328767123287671, "grad_norm": 0.45466411113739014, "learning_rate": 0.0001975787660612072, "loss": 0.0763, "step": 170 }, { "epoch": 2.4657534246575343, "grad_norm": 0.42088282108306885, "learning_rate": 0.00019705639876181969, "loss": 0.0635, "step": 180 }, { "epoch": 2.602739726027397, "grad_norm": 0.5170985460281372, "learning_rate": 0.00019648387936157068, "loss": 0.0726, "step": 190 }, { "epoch": 2.73972602739726, "grad_norm": 0.4313249886035919, "learning_rate": 0.00019586150369928245, "loss": 0.0669, "step": 200 }, { "epoch": 2.8767123287671232, "grad_norm": 0.3355115056037903, "learning_rate": 0.00019518959337607957, "loss": 0.0682, "step": 210 }, { "epoch": 3.0136986301369864, "grad_norm": 0.34427109360694885, "learning_rate": 0.0001944684955892075, "loss": 0.0638, "step": 220 }, { "epoch": 3.1506849315068495, "grad_norm": 0.2929873466491699, "learning_rate": 0.0001936985829526247, "loss": 0.0632, "step": 230 }, { "epoch": 3.287671232876712, "grad_norm": 0.3884938657283783, "learning_rate": 0.00019288025330446126, "loss": 0.0655, "step": 240 }, { "epoch": 3.4246575342465753, "grad_norm": 0.27399152517318726, "learning_rate": 0.00019201392950144363, "loss": 0.0533, "step": 250 }, { "epoch": 3.5616438356164384, "grad_norm": 0.2924444079399109, "learning_rate": 0.0001911000592003909, "loss": 0.0589, "step": 260 }, { "epoch": 3.6986301369863015, "grad_norm": 0.43013861775398254, "learning_rate": 0.00019013911462689668, "loss": 0.0615, "step": 270 }, { "epoch": 3.8356164383561646, "grad_norm": 0.5247001647949219, "learning_rate": 0.000189131592331315, "loss": 0.0583, "step": 280 }, { "epoch": 3.9726027397260273, "grad_norm": 0.5796880722045898, "learning_rate": 0.00018807801293217735, "loss": 0.0556, "step": 290 }, { "epoch": 4.109589041095891, "grad_norm": 0.5179729461669922, "learning_rate": 0.00018697892084717238, "loss": 0.056, "step": 300 }, { "epoch": 4.2465753424657535, "grad_norm": 0.42960262298583984, "learning_rate": 0.00018583488401182843, "loss": 0.0637, "step": 310 }, { "epoch": 4.383561643835616, "grad_norm": 0.3196163773536682, "learning_rate": 0.0001846464935860431, "loss": 0.0518, "step": 320 }, { "epoch": 4.52054794520548, "grad_norm": 0.4424096643924713, "learning_rate": 0.0001834143636486124, "loss": 0.0524, "step": 330 }, { "epoch": 4.657534246575342, "grad_norm": 0.50010746717453, "learning_rate": 0.00018213913087991685, "loss": 0.0629, "step": 340 }, { "epoch": 4.794520547945205, "grad_norm": 0.4036540389060974, "learning_rate": 0.00018082145423292868, "loss": 0.0531, "step": 350 }, { "epoch": 4.931506849315069, "grad_norm": 0.36036092042922974, "learning_rate": 0.0001794620145927101, "loss": 0.0556, "step": 360 }, { "epoch": 5.068493150684931, "grad_norm": 0.22472509741783142, "learning_rate": 0.00017806151442457827, "loss": 0.0446, "step": 370 }, { "epoch": 5.205479452054795, "grad_norm": 0.3514921963214874, "learning_rate": 0.00017662067741111974, "loss": 0.0443, "step": 380 }, { "epoch": 5.342465753424658, "grad_norm": 0.2920095920562744, "learning_rate": 0.00017514024807824055, "loss": 0.0451, "step": 390 }, { "epoch": 5.47945205479452, "grad_norm": 0.21051590144634247, "learning_rate": 0.00017362099141044626, "loss": 0.0476, "step": 400 }, { "epoch": 5.616438356164384, "grad_norm": 0.36196619272232056, "learning_rate": 0.00017206369245555036, "loss": 0.0521, "step": 410 }, { "epoch": 5.7534246575342465, "grad_norm": 0.3503723442554474, "learning_rate": 0.0001704691559190155, "loss": 0.0472, "step": 420 }, { "epoch": 5.890410958904109, "grad_norm": 0.3881896734237671, "learning_rate": 0.0001688382057481364, "loss": 0.0537, "step": 430 }, { "epoch": 6.027397260273973, "grad_norm": 0.29409492015838623, "learning_rate": 0.00016717168470628077, "loss": 0.0436, "step": 440 }, { "epoch": 6.164383561643835, "grad_norm": 0.2455558031797409, "learning_rate": 0.0001654704539374066, "loss": 0.0429, "step": 450 }, { "epoch": 6.301369863013699, "grad_norm": 0.30749672651290894, "learning_rate": 0.00016373539252108202, "loss": 0.042, "step": 460 }, { "epoch": 6.438356164383562, "grad_norm": 0.4117829501628876, "learning_rate": 0.00016196739701823716, "loss": 0.0422, "step": 470 }, { "epoch": 6.575342465753424, "grad_norm": 0.3047957718372345, "learning_rate": 0.00016016738100788297, "loss": 0.0456, "step": 480 }, { "epoch": 6.712328767123288, "grad_norm": 0.3104310631752014, "learning_rate": 0.00015833627461503595, "loss": 0.0405, "step": 490 }, { "epoch": 6.8493150684931505, "grad_norm": 0.3713166415691376, "learning_rate": 0.0001564750240300934, "loss": 0.0451, "step": 500 }, { "epoch": 6.986301369863014, "grad_norm": 0.23804673552513123, "learning_rate": 0.00015458459101990693, "loss": 0.0387, "step": 510 }, { "epoch": 7.123287671232877, "grad_norm": 0.4476951062679291, "learning_rate": 0.00015266595243080714, "loss": 0.0406, "step": 520 }, { "epoch": 7.260273972602739, "grad_norm": 0.27973777055740356, "learning_rate": 0.00015072009968383656, "loss": 0.0464, "step": 530 }, { "epoch": 7.397260273972603, "grad_norm": 0.3597777783870697, "learning_rate": 0.00014874803826245089, "loss": 0.0459, "step": 540 }, { "epoch": 7.534246575342466, "grad_norm": 0.27027377486228943, "learning_rate": 0.00014675078719295415, "loss": 0.0375, "step": 550 }, { "epoch": 7.671232876712329, "grad_norm": 0.27681443095207214, "learning_rate": 0.00014472937851793557, "loss": 0.0421, "step": 560 }, { "epoch": 7.808219178082192, "grad_norm": 0.3312411904335022, "learning_rate": 0.00014268485676298078, "loss": 0.048, "step": 570 }, { "epoch": 7.945205479452055, "grad_norm": 0.2358381599187851, "learning_rate": 0.0001406182783969324, "loss": 0.0409, "step": 580 }, { "epoch": 8.082191780821917, "grad_norm": 0.19072838127613068, "learning_rate": 0.00013853071128597924, "loss": 0.0417, "step": 590 }, { "epoch": 8.219178082191782, "grad_norm": 0.3328644931316376, "learning_rate": 0.0001364232341418564, "loss": 0.0397, "step": 600 }, { "epoch": 8.356164383561644, "grad_norm": 0.27157458662986755, "learning_rate": 0.00013429693596444067, "loss": 0.0395, "step": 610 }, { "epoch": 8.493150684931507, "grad_norm": 0.2969032824039459, "learning_rate": 0.00013215291547903006, "loss": 0.0406, "step": 620 }, { "epoch": 8.63013698630137, "grad_norm": 0.2864357829093933, "learning_rate": 0.00012999228056859784, "loss": 0.0424, "step": 630 }, { "epoch": 8.767123287671232, "grad_norm": 0.25885725021362305, "learning_rate": 0.00012781614770131442, "loss": 0.0392, "step": 640 }, { "epoch": 8.904109589041095, "grad_norm": 0.2456735372543335, "learning_rate": 0.00012562564135363313, "loss": 0.0415, "step": 650 }, { "epoch": 9.04109589041096, "grad_norm": 0.41431066393852234, "learning_rate": 0.0001234218934292376, "loss": 0.0407, "step": 660 }, { "epoch": 9.178082191780822, "grad_norm": 0.260213702917099, "learning_rate": 0.00012120604267415172, "loss": 0.0393, "step": 670 }, { "epoch": 9.315068493150685, "grad_norm": 0.3395901322364807, "learning_rate": 0.00011897923408831346, "loss": 0.035, "step": 680 }, { "epoch": 9.452054794520548, "grad_norm": 0.3405311405658722, "learning_rate": 0.0001167426183339174, "loss": 0.0342, "step": 690 }, { "epoch": 9.58904109589041, "grad_norm": 0.20802819728851318, "learning_rate": 0.00011449735114083127, "loss": 0.0347, "step": 700 }, { "epoch": 9.726027397260275, "grad_norm": 0.5094506144523621, "learning_rate": 0.00011224459270939384, "loss": 0.0373, "step": 710 }, { "epoch": 9.863013698630137, "grad_norm": 0.21799403429031372, "learning_rate": 0.000109985507110903, "loss": 0.0392, "step": 720 }, { "epoch": 10.0, "grad_norm": 0.28433603048324585, "learning_rate": 0.00010772126168610325, "loss": 0.0373, "step": 730 }, { "epoch": 10.136986301369863, "grad_norm": 0.3425813913345337, "learning_rate": 0.00010545302644198405, "loss": 0.0385, "step": 740 }, { "epoch": 10.273972602739725, "grad_norm": 0.2662697434425354, "learning_rate": 0.00010318197344720018, "loss": 0.0347, "step": 750 }, { "epoch": 10.41095890410959, "grad_norm": 0.2841816842556, "learning_rate": 0.0001009092762264271, "loss": 0.04, "step": 760 }, { "epoch": 10.547945205479452, "grad_norm": 0.2933363914489746, "learning_rate": 9.863610915396365e-05, "loss": 0.0363, "step": 770 }, { "epoch": 10.684931506849315, "grad_norm": 0.20692330598831177, "learning_rate": 9.63636468468959e-05, "loss": 0.0361, "step": 780 }, { "epoch": 10.821917808219178, "grad_norm": 0.24741721153259277, "learning_rate": 9.409306355813529e-05, "loss": 0.0341, "step": 790 }, { "epoch": 10.95890410958904, "grad_norm": 0.1948077529668808, "learning_rate": 9.18255325696454e-05, "loss": 0.0349, "step": 800 }, { "epoch": 11.095890410958905, "grad_norm": 0.16165360808372498, "learning_rate": 8.956222558616998e-05, "loss": 0.0318, "step": 810 }, { "epoch": 11.232876712328768, "grad_norm": 0.25702184438705444, "learning_rate": 8.730431212977625e-05, "loss": 0.0281, "step": 820 }, { "epoch": 11.36986301369863, "grad_norm": 0.27587395906448364, "learning_rate": 8.505295893552594e-05, "loss": 0.0349, "step": 830 }, { "epoch": 11.506849315068493, "grad_norm": 0.3140430152416229, "learning_rate": 8.280932934858652e-05, "loss": 0.0305, "step": 840 }, { "epoch": 11.643835616438356, "grad_norm": 0.21165433526039124, "learning_rate": 8.05745827230941e-05, "loss": 0.0314, "step": 850 }, { "epoch": 11.780821917808218, "grad_norm": 0.20445489883422852, "learning_rate": 7.834987382307861e-05, "loss": 0.0319, "step": 860 }, { "epoch": 11.917808219178083, "grad_norm": 0.27832481265068054, "learning_rate": 7.613635222576072e-05, "loss": 0.0334, "step": 870 }, { "epoch": 12.054794520547945, "grad_norm": 0.25728923082351685, "learning_rate": 7.393516172752919e-05, "loss": 0.033, "step": 880 }, { "epoch": 12.191780821917808, "grad_norm": 0.2254086136817932, "learning_rate": 7.174743975290513e-05, "loss": 0.0346, "step": 890 }, { "epoch": 12.32876712328767, "grad_norm": 0.31018713116645813, "learning_rate": 6.957431676679896e-05, "loss": 0.0329, "step": 900 }, { "epoch": 12.465753424657533, "grad_norm": 0.32662343978881836, "learning_rate": 6.741691569036338e-05, "loss": 0.0342, "step": 910 }, { "epoch": 12.602739726027398, "grad_norm": 0.2533169984817505, "learning_rate": 6.527635132074493e-05, "loss": 0.0264, "step": 920 }, { "epoch": 12.73972602739726, "grad_norm": 0.27445635199546814, "learning_rate": 6.315372975503285e-05, "loss": 0.0281, "step": 930 }, { "epoch": 12.876712328767123, "grad_norm": 0.21471256017684937, "learning_rate": 6.1050147818704e-05, "loss": 0.0321, "step": 940 }, { "epoch": 13.013698630136986, "grad_norm": 0.19105984270572662, "learning_rate": 5.896669249885851e-05, "loss": 0.0273, "step": 950 }, { "epoch": 13.150684931506849, "grad_norm": 0.3308360278606415, "learning_rate": 5.690444038253935e-05, "loss": 0.0343, "step": 960 }, { "epoch": 13.287671232876713, "grad_norm": 0.1988590806722641, "learning_rate": 5.4864457100425783e-05, "loss": 0.028, "step": 970 }, { "epoch": 13.424657534246576, "grad_norm": 0.1858794391155243, "learning_rate": 5.284779677618841e-05, "loss": 0.0273, "step": 980 }, { "epoch": 13.561643835616438, "grad_norm": 0.29671627283096313, "learning_rate": 5.0855501481790305e-05, "loss": 0.0271, "step": 990 }, { "epoch": 13.698630136986301, "grad_norm": 0.17693527042865753, "learning_rate": 4.8888600699015496e-05, "loss": 0.034, "step": 1000 }, { "epoch": 13.835616438356164, "grad_norm": 0.31038013100624084, "learning_rate": 4.694811078750338e-05, "loss": 0.0251, "step": 1010 }, { "epoch": 13.972602739726028, "grad_norm": 0.3317829668521881, "learning_rate": 4.50350344595635e-05, "loss": 0.0334, "step": 1020 }, { "epoch": 14.10958904109589, "grad_norm": 0.1818408966064453, "learning_rate": 4.315036026204262e-05, "loss": 0.0272, "step": 1030 }, { "epoch": 14.246575342465754, "grad_norm": 0.2105715572834015, "learning_rate": 4.129506206551138e-05, "loss": 0.025, "step": 1040 }, { "epoch": 14.383561643835616, "grad_norm": 0.18613150715827942, "learning_rate": 3.947009856103465e-05, "loss": 0.0238, "step": 1050 }, { "epoch": 14.520547945205479, "grad_norm": 0.2959461212158203, "learning_rate": 3.767641276478563e-05, "loss": 0.0249, "step": 1060 }, { "epoch": 14.657534246575342, "grad_norm": 0.18495745956897736, "learning_rate": 3.591493153075966e-05, "loss": 0.0214, "step": 1070 }, { "epoch": 14.794520547945206, "grad_norm": 0.1501263529062271, "learning_rate": 3.41865650718396e-05, "loss": 0.0266, "step": 1080 }, { "epoch": 14.931506849315069, "grad_norm": 0.3387095332145691, "learning_rate": 3.24922064894601e-05, "loss": 0.0268, "step": 1090 }, { "epoch": 15.068493150684931, "grad_norm": 0.23434942960739136, "learning_rate": 3.083273131211382e-05, "loss": 0.0272, "step": 1100 }, { "epoch": 15.205479452054794, "grad_norm": 0.163187175989151, "learning_rate": 2.920899704293849e-05, "loss": 0.0232, "step": 1110 }, { "epoch": 15.342465753424657, "grad_norm": 0.20000265538692474, "learning_rate": 2.762184271661785e-05, "loss": 0.0261, "step": 1120 }, { "epoch": 15.479452054794521, "grad_norm": 0.18943333625793457, "learning_rate": 2.6072088465826038e-05, "loss": 0.0246, "step": 1130 }, { "epoch": 15.616438356164384, "grad_norm": 0.2833252251148224, "learning_rate": 2.4560535097439108e-05, "loss": 0.0253, "step": 1140 }, { "epoch": 15.753424657534246, "grad_norm": 0.1302843540906906, "learning_rate": 2.308796367873296e-05, "loss": 0.0246, "step": 1150 }, { "epoch": 15.89041095890411, "grad_norm": 0.16615238785743713, "learning_rate": 2.165513513378121e-05, "loss": 0.0254, "step": 1160 }, { "epoch": 16.027397260273972, "grad_norm": 0.17113815248012543, "learning_rate": 2.0262789850261798e-05, "loss": 0.0288, "step": 1170 }, { "epoch": 16.164383561643834, "grad_norm": 0.21394069492816925, "learning_rate": 1.8911647296875147e-05, "loss": 0.025, "step": 1180 }, { "epoch": 16.301369863013697, "grad_norm": 0.2763649523258209, "learning_rate": 1.7602405651572275e-05, "loss": 0.0219, "step": 1190 }, { "epoch": 16.438356164383563, "grad_norm": 0.13925646245479584, "learning_rate": 1.6335741440784035e-05, "loss": 0.0217, "step": 1200 }, { "epoch": 16.575342465753426, "grad_norm": 0.20826192200183868, "learning_rate": 1.511230918983867e-05, "loss": 0.023, "step": 1210 }, { "epoch": 16.71232876712329, "grad_norm": 0.2256271094083786, "learning_rate": 1.3932741084747913e-05, "loss": 0.023, "step": 1220 }, { "epoch": 16.84931506849315, "grad_norm": 0.27016547322273254, "learning_rate": 1.2797646645536566e-05, "loss": 0.0211, "step": 1230 }, { "epoch": 16.986301369863014, "grad_norm": 0.26627489924430847, "learning_rate": 1.1707612411284253e-05, "loss": 0.0235, "step": 1240 }, { "epoch": 17.123287671232877, "grad_norm": 0.18498767912387848, "learning_rate": 1.0663201637042252e-05, "loss": 0.022, "step": 1250 }, { "epoch": 17.26027397260274, "grad_norm": 0.23852607607841492, "learning_rate": 9.664954002781745e-06, "loss": 0.0228, "step": 1260 }, { "epoch": 17.397260273972602, "grad_norm": 0.15411531925201416, "learning_rate": 8.713385334524283e-06, "loss": 0.0198, "step": 1270 }, { "epoch": 17.534246575342465, "grad_norm": 0.25403866171836853, "learning_rate": 7.808987337798158e-06, "loss": 0.0257, "step": 1280 }, { "epoch": 17.671232876712327, "grad_norm": 0.14403975009918213, "learning_rate": 6.952227343558671e-06, "loss": 0.0215, "step": 1290 }, { "epoch": 17.80821917808219, "grad_norm": 0.188527911901474, "learning_rate": 6.143548066703475e-06, "loss": 0.0224, "step": 1300 }, { "epoch": 17.945205479452056, "grad_norm": 0.1309424489736557, "learning_rate": 5.383367377307857e-06, "loss": 0.0215, "step": 1310 }, { "epoch": 18.08219178082192, "grad_norm": 0.11233002692461014, "learning_rate": 4.672078084698095e-06, "loss": 0.0211, "step": 1320 }, { "epoch": 18.21917808219178, "grad_norm": 0.22869743406772614, "learning_rate": 4.010047734474454e-06, "loss": 0.0215, "step": 1330 }, { "epoch": 18.356164383561644, "grad_norm": 0.11979719996452332, "learning_rate": 3.397618418588877e-06, "loss": 0.0273, "step": 1340 }, { "epoch": 18.493150684931507, "grad_norm": 0.2112375795841217, "learning_rate": 2.8351065985751766e-06, "loss": 0.0228, "step": 1350 }, { "epoch": 18.63013698630137, "grad_norm": 0.14134034514427185, "learning_rate": 2.322802942023461e-06, "loss": 0.0247, "step": 1360 }, { "epoch": 18.767123287671232, "grad_norm": 0.09884881973266602, "learning_rate": 1.8609721723830132e-06, "loss": 0.0196, "step": 1370 }, { "epoch": 18.904109589041095, "grad_norm": 0.14044946432113647, "learning_rate": 1.4498529321713584e-06, "loss": 0.0198, "step": 1380 }, { "epoch": 19.041095890410958, "grad_norm": 0.13853876292705536, "learning_rate": 1.0896576596600705e-06, "loss": 0.0182, "step": 1390 }, { "epoch": 19.17808219178082, "grad_norm": 0.1654110848903656, "learning_rate": 7.80572479101327e-07, "loss": 0.0229, "step": 1400 }, { "epoch": 19.315068493150687, "grad_norm": 0.15151838958263397, "learning_rate": 5.227571045515633e-07, "loss": 0.0202, "step": 1410 }, { "epoch": 19.45205479452055, "grad_norm": 0.2258201688528061, "learning_rate": 3.163447573422351e-07, "loss": 0.0197, "step": 1420 }, { "epoch": 19.589041095890412, "grad_norm": 0.24640779197216034, "learning_rate": 1.614420972401165e-07, "loss": 0.0187, "step": 1430 }, { "epoch": 19.726027397260275, "grad_norm": 0.21181590855121613, "learning_rate": 5.812916733284324e-08, "loss": 0.0198, "step": 1440 }, { "epoch": 19.863013698630137, "grad_norm": 0.14787183701992035, "learning_rate": 6.459352668164442e-09, "loss": 0.0186, "step": 1450 }, { "epoch": 19.931506849315067, "step": 1455, "total_flos": 1.1504025698630573e+17, "train_loss": 0.05927258820058554, "train_runtime": 1048.7401, "train_samples_per_second": 88.792, "train_steps_per_second": 1.387 } ], "logging_steps": 10, "max_steps": 1455, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.1504025698630573e+17, "train_batch_size": 64, "trial_name": null, "trial_params": null }