{ "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 500, "global_step": 1455, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03436426116838488, "grad_norm": 5.543055534362793, "learning_rate": 2.7397260273972603e-05, "loss": 1.3448, "step": 10 }, { "epoch": 0.06872852233676977, "grad_norm": 2.9320967197418213, "learning_rate": 5.479452054794521e-05, "loss": 0.574, "step": 20 }, { "epoch": 0.10309278350515463, "grad_norm": 2.3950774669647217, "learning_rate": 8.219178082191781e-05, "loss": 0.3403, "step": 30 }, { "epoch": 0.13745704467353953, "grad_norm": 1.2579808235168457, "learning_rate": 0.00010958904109589041, "loss": 0.2498, "step": 40 }, { "epoch": 0.1718213058419244, "grad_norm": 1.5484106540679932, "learning_rate": 0.000136986301369863, "loss": 0.2205, "step": 50 }, { "epoch": 0.20618556701030927, "grad_norm": 0.9280014634132385, "learning_rate": 0.00016438356164383562, "loss": 0.2014, "step": 60 }, { "epoch": 0.24054982817869416, "grad_norm": 0.8061118125915527, "learning_rate": 0.0001917808219178082, "loss": 0.1826, "step": 70 }, { "epoch": 0.27491408934707906, "grad_norm": 1.369801640510559, "learning_rate": 0.00019998733979961563, "loss": 0.1547, "step": 80 }, { "epoch": 0.30927835051546393, "grad_norm": 0.7772133350372314, "learning_rate": 0.0001999253383717226, "loss": 0.1451, "step": 90 }, { "epoch": 0.3436426116838488, "grad_norm": 1.0345059633255005, "learning_rate": 0.00019981170237143067, "loss": 0.1422, "step": 100 }, { "epoch": 0.37800687285223367, "grad_norm": 0.7138351798057556, "learning_rate": 0.00019964649051804355, "loss": 0.1441, "step": 110 }, { "epoch": 0.41237113402061853, "grad_norm": 0.7671028971672058, "learning_rate": 0.000199429788181734, "loss": 0.1323, "step": 120 }, { "epoch": 0.44673539518900346, "grad_norm": 0.7595092058181763, "learning_rate": 0.0001991617073394306, "loss": 0.1167, "step": 130 }, { "epoch": 0.48109965635738833, "grad_norm": 0.9330806732177734, "learning_rate": 0.00019884238651695556, "loss": 0.1146, "step": 140 }, { "epoch": 0.5154639175257731, "grad_norm": 0.6702235341072083, "learning_rate": 0.00019847199071744415, "loss": 0.1254, "step": 150 }, { "epoch": 0.5498281786941581, "grad_norm": 0.493888258934021, "learning_rate": 0.00019805071133608242, "loss": 0.1005, "step": 160 }, { "epoch": 0.584192439862543, "grad_norm": 0.5991718769073486, "learning_rate": 0.0001975787660612072, "loss": 0.0953, "step": 170 }, { "epoch": 0.6185567010309279, "grad_norm": 0.6898994445800781, "learning_rate": 0.00019705639876181969, "loss": 0.0816, "step": 180 }, { "epoch": 0.6529209621993127, "grad_norm": 0.6507448554039001, "learning_rate": 0.00019648387936157068, "loss": 0.0898, "step": 190 }, { "epoch": 0.6872852233676976, "grad_norm": 0.6411319971084595, "learning_rate": 0.00019586150369928245, "loss": 0.0887, "step": 200 }, { "epoch": 0.7216494845360825, "grad_norm": 0.5917101502418518, "learning_rate": 0.00019518959337607957, "loss": 0.079, "step": 210 }, { "epoch": 0.7560137457044673, "grad_norm": 0.6061806678771973, "learning_rate": 0.0001944684955892075, "loss": 0.0946, "step": 220 }, { "epoch": 0.7903780068728522, "grad_norm": 0.4965127408504486, "learning_rate": 0.0001936985829526247, "loss": 0.0769, "step": 230 }, { "epoch": 0.8247422680412371, "grad_norm": 0.6563584208488464, "learning_rate": 0.00019288025330446126, "loss": 0.0865, "step": 240 }, { "epoch": 0.8591065292096219, "grad_norm": 0.38654449582099915, "learning_rate": 0.00019201392950144363, "loss": 0.0767, "step": 250 }, { "epoch": 0.8934707903780069, "grad_norm": 0.4376271069049835, "learning_rate": 0.0001911000592003909, "loss": 0.0729, "step": 260 }, { "epoch": 0.9278350515463918, "grad_norm": 0.3565465211868286, "learning_rate": 0.00019013911462689668, "loss": 0.0697, "step": 270 }, { "epoch": 0.9621993127147767, "grad_norm": 0.6215035915374756, "learning_rate": 0.000189131592331315, "loss": 0.087, "step": 280 }, { "epoch": 0.9965635738831615, "grad_norm": 0.5725772976875305, "learning_rate": 0.00018807801293217735, "loss": 0.0703, "step": 290 }, { "epoch": 1.0309278350515463, "grad_norm": 0.3892384469509125, "learning_rate": 0.00018697892084717238, "loss": 0.0657, "step": 300 }, { "epoch": 1.0652920962199313, "grad_norm": 0.4619602859020233, "learning_rate": 0.00018583488401182843, "loss": 0.0686, "step": 310 }, { "epoch": 1.0996563573883162, "grad_norm": 0.48653048276901245, "learning_rate": 0.0001846464935860431, "loss": 0.0712, "step": 320 }, { "epoch": 1.134020618556701, "grad_norm": 0.42437320947647095, "learning_rate": 0.0001834143636486124, "loss": 0.0745, "step": 330 }, { "epoch": 1.168384879725086, "grad_norm": 0.5314765572547913, "learning_rate": 0.00018213913087991685, "loss": 0.0583, "step": 340 }, { "epoch": 1.2027491408934707, "grad_norm": 0.618599534034729, "learning_rate": 0.00018082145423292868, "loss": 0.0735, "step": 350 }, { "epoch": 1.2371134020618557, "grad_norm": 0.4557077884674072, "learning_rate": 0.0001794620145927101, "loss": 0.0724, "step": 360 }, { "epoch": 1.2714776632302405, "grad_norm": 0.3549683690071106, "learning_rate": 0.00017806151442457827, "loss": 0.0591, "step": 370 }, { "epoch": 1.3058419243986255, "grad_norm": 0.4492852985858917, "learning_rate": 0.00017662067741111974, "loss": 0.0586, "step": 380 }, { "epoch": 1.3402061855670104, "grad_norm": 0.392857164144516, "learning_rate": 0.00017514024807824055, "loss": 0.0765, "step": 390 }, { "epoch": 1.3745704467353952, "grad_norm": 0.3970784544944763, "learning_rate": 0.00017362099141044626, "loss": 0.0534, "step": 400 }, { "epoch": 1.40893470790378, "grad_norm": 0.44281744956970215, "learning_rate": 0.00017206369245555036, "loss": 0.0611, "step": 410 }, { "epoch": 1.443298969072165, "grad_norm": 0.3246597945690155, "learning_rate": 0.0001704691559190155, "loss": 0.0671, "step": 420 }, { "epoch": 1.47766323024055, "grad_norm": 0.33960649371147156, "learning_rate": 0.0001688382057481364, "loss": 0.0661, "step": 430 }, { "epoch": 1.5120274914089347, "grad_norm": 0.5256112813949585, "learning_rate": 0.00016717168470628077, "loss": 0.0675, "step": 440 }, { "epoch": 1.5463917525773194, "grad_norm": 0.5827927589416504, "learning_rate": 0.0001654704539374066, "loss": 0.0667, "step": 450 }, { "epoch": 1.5807560137457046, "grad_norm": 0.6631433963775635, "learning_rate": 0.00016373539252108202, "loss": 0.0636, "step": 460 }, { "epoch": 1.6151202749140894, "grad_norm": 0.5339364409446716, "learning_rate": 0.00016196739701823716, "loss": 0.0726, "step": 470 }, { "epoch": 1.6494845360824741, "grad_norm": 0.6372013092041016, "learning_rate": 0.00016016738100788297, "loss": 0.0556, "step": 480 }, { "epoch": 1.6838487972508591, "grad_norm": 0.46744048595428467, "learning_rate": 0.00015833627461503595, "loss": 0.059, "step": 490 }, { "epoch": 1.718213058419244, "grad_norm": 0.32253313064575195, "learning_rate": 0.0001564750240300934, "loss": 0.0475, "step": 500 }, { "epoch": 1.7525773195876289, "grad_norm": 0.4637961685657501, "learning_rate": 0.00015458459101990693, "loss": 0.0514, "step": 510 }, { "epoch": 1.7869415807560136, "grad_norm": 0.43408897519111633, "learning_rate": 0.00015266595243080714, "loss": 0.0509, "step": 520 }, { "epoch": 1.8213058419243986, "grad_norm": 0.5546535849571228, "learning_rate": 0.00015072009968383656, "loss": 0.0572, "step": 530 }, { "epoch": 1.8556701030927836, "grad_norm": 0.3202098309993744, "learning_rate": 0.00014874803826245089, "loss": 0.0615, "step": 540 }, { "epoch": 1.8900343642611683, "grad_norm": 0.4085174798965454, "learning_rate": 0.00014675078719295415, "loss": 0.0561, "step": 550 }, { "epoch": 1.9243986254295533, "grad_norm": 0.4084959030151367, "learning_rate": 0.00014472937851793557, "loss": 0.0616, "step": 560 }, { "epoch": 1.9587628865979383, "grad_norm": 0.4582497179508209, "learning_rate": 0.00014268485676298078, "loss": 0.0675, "step": 570 }, { "epoch": 1.993127147766323, "grad_norm": 0.25662359595298767, "learning_rate": 0.0001406182783969324, "loss": 0.0543, "step": 580 }, { "epoch": 2.027491408934708, "grad_norm": 0.2858852744102478, "learning_rate": 0.00013853071128597924, "loss": 0.0447, "step": 590 }, { "epoch": 2.0618556701030926, "grad_norm": 0.4853512942790985, "learning_rate": 0.0001364232341418564, "loss": 0.0537, "step": 600 }, { "epoch": 2.0962199312714778, "grad_norm": 0.40022608637809753, "learning_rate": 0.00013429693596444067, "loss": 0.0647, "step": 610 }, { "epoch": 2.1305841924398625, "grad_norm": 0.44074031710624695, "learning_rate": 0.00013215291547903006, "loss": 0.063, "step": 620 }, { "epoch": 2.1649484536082473, "grad_norm": 0.3592728078365326, "learning_rate": 0.00012999228056859784, "loss": 0.0608, "step": 630 }, { "epoch": 2.1993127147766325, "grad_norm": 0.3472447395324707, "learning_rate": 0.00012781614770131442, "loss": 0.0541, "step": 640 }, { "epoch": 2.2336769759450172, "grad_norm": 0.30898717045783997, "learning_rate": 0.00012562564135363313, "loss": 0.0454, "step": 650 }, { "epoch": 2.268041237113402, "grad_norm": 0.3706619441509247, "learning_rate": 0.0001234218934292376, "loss": 0.0524, "step": 660 }, { "epoch": 2.3024054982817868, "grad_norm": 0.35367798805236816, "learning_rate": 0.00012120604267415172, "loss": 0.0351, "step": 670 }, { "epoch": 2.336769759450172, "grad_norm": 0.36357077956199646, "learning_rate": 0.00011897923408831346, "loss": 0.0558, "step": 680 }, { "epoch": 2.3711340206185567, "grad_norm": 0.4092961251735687, "learning_rate": 0.0001167426183339174, "loss": 0.041, "step": 690 }, { "epoch": 2.4054982817869415, "grad_norm": 0.2752332389354706, "learning_rate": 0.00011449735114083127, "loss": 0.0407, "step": 700 }, { "epoch": 2.4398625429553267, "grad_norm": 0.38444244861602783, "learning_rate": 0.00011224459270939384, "loss": 0.044, "step": 710 }, { "epoch": 2.4742268041237114, "grad_norm": 0.3202449679374695, "learning_rate": 0.000109985507110903, "loss": 0.0422, "step": 720 }, { "epoch": 2.508591065292096, "grad_norm": 0.2754347324371338, "learning_rate": 0.00010772126168610325, "loss": 0.0468, "step": 730 }, { "epoch": 2.542955326460481, "grad_norm": 0.32674992084503174, "learning_rate": 0.00010545302644198405, "loss": 0.0461, "step": 740 }, { "epoch": 2.5773195876288657, "grad_norm": 0.27970951795578003, "learning_rate": 0.00010318197344720018, "loss": 0.0396, "step": 750 }, { "epoch": 2.611683848797251, "grad_norm": 0.3448905646800995, "learning_rate": 0.0001009092762264271, "loss": 0.039, "step": 760 }, { "epoch": 2.6460481099656357, "grad_norm": 0.32179659605026245, "learning_rate": 9.863610915396365e-05, "loss": 0.0446, "step": 770 }, { "epoch": 2.680412371134021, "grad_norm": 0.3091253340244293, "learning_rate": 9.63636468468959e-05, "loss": 0.0536, "step": 780 }, { "epoch": 2.7147766323024056, "grad_norm": 0.4326021671295166, "learning_rate": 9.409306355813529e-05, "loss": 0.0401, "step": 790 }, { "epoch": 2.7491408934707904, "grad_norm": 0.2855621874332428, "learning_rate": 9.18255325696454e-05, "loss": 0.0475, "step": 800 }, { "epoch": 2.783505154639175, "grad_norm": 0.33704933524131775, "learning_rate": 8.956222558616998e-05, "loss": 0.036, "step": 810 }, { "epoch": 2.81786941580756, "grad_norm": 0.3991442620754242, "learning_rate": 8.730431212977625e-05, "loss": 0.0582, "step": 820 }, { "epoch": 2.852233676975945, "grad_norm": 0.29364827275276184, "learning_rate": 8.505295893552594e-05, "loss": 0.0442, "step": 830 }, { "epoch": 2.88659793814433, "grad_norm": 0.38249287009239197, "learning_rate": 8.280932934858652e-05, "loss": 0.043, "step": 840 }, { "epoch": 2.9209621993127146, "grad_norm": 0.5033205151557922, "learning_rate": 8.05745827230941e-05, "loss": 0.0407, "step": 850 }, { "epoch": 2.9553264604811, "grad_norm": 0.24252034723758698, "learning_rate": 7.834987382307861e-05, "loss": 0.0459, "step": 860 }, { "epoch": 2.9896907216494846, "grad_norm": 0.24571438133716583, "learning_rate": 7.613635222576072e-05, "loss": 0.0452, "step": 870 }, { "epoch": 3.0240549828178693, "grad_norm": 0.32780882716178894, "learning_rate": 7.393516172752919e-05, "loss": 0.0427, "step": 880 }, { "epoch": 3.058419243986254, "grad_norm": 0.28867006301879883, "learning_rate": 7.174743975290513e-05, "loss": 0.0411, "step": 890 }, { "epoch": 3.0927835051546393, "grad_norm": 0.37426048517227173, "learning_rate": 6.957431676679896e-05, "loss": 0.0386, "step": 900 }, { "epoch": 3.127147766323024, "grad_norm": 0.24257159233093262, "learning_rate": 6.741691569036338e-05, "loss": 0.0329, "step": 910 }, { "epoch": 3.161512027491409, "grad_norm": 0.2398187667131424, "learning_rate": 6.527635132074493e-05, "loss": 0.0494, "step": 920 }, { "epoch": 3.195876288659794, "grad_norm": 0.35756927728652954, "learning_rate": 6.315372975503285e-05, "loss": 0.0494, "step": 930 }, { "epoch": 3.2302405498281788, "grad_norm": 0.33009472489356995, "learning_rate": 6.1050147818704e-05, "loss": 0.0375, "step": 940 }, { "epoch": 3.2646048109965635, "grad_norm": 0.30801263451576233, "learning_rate": 5.896669249885851e-05, "loss": 0.0361, "step": 950 }, { "epoch": 3.2989690721649483, "grad_norm": 0.31775572896003723, "learning_rate": 5.690444038253935e-05, "loss": 0.0473, "step": 960 }, { "epoch": 3.3333333333333335, "grad_norm": 0.3134918212890625, "learning_rate": 5.4864457100425783e-05, "loss": 0.0298, "step": 970 }, { "epoch": 3.3676975945017182, "grad_norm": 0.2736685574054718, "learning_rate": 5.284779677618841e-05, "loss": 0.0334, "step": 980 }, { "epoch": 3.402061855670103, "grad_norm": 0.5353654623031616, "learning_rate": 5.0855501481790305e-05, "loss": 0.0395, "step": 990 }, { "epoch": 3.436426116838488, "grad_norm": 0.40775638818740845, "learning_rate": 4.8888600699015496e-05, "loss": 0.0365, "step": 1000 }, { "epoch": 3.470790378006873, "grad_norm": 0.25919926166534424, "learning_rate": 4.694811078750338e-05, "loss": 0.0422, "step": 1010 }, { "epoch": 3.5051546391752577, "grad_norm": 0.3091573417186737, "learning_rate": 4.50350344595635e-05, "loss": 0.0318, "step": 1020 }, { "epoch": 3.5395189003436425, "grad_norm": 0.33824992179870605, "learning_rate": 4.315036026204262e-05, "loss": 0.034, "step": 1030 }, { "epoch": 3.5738831615120272, "grad_norm": 0.2815360128879547, "learning_rate": 4.129506206551138e-05, "loss": 0.0307, "step": 1040 }, { "epoch": 3.6082474226804124, "grad_norm": 0.15872405469417572, "learning_rate": 3.947009856103465e-05, "loss": 0.04, "step": 1050 }, { "epoch": 3.642611683848797, "grad_norm": 0.24633029103279114, "learning_rate": 3.767641276478563e-05, "loss": 0.047, "step": 1060 }, { "epoch": 3.6769759450171824, "grad_norm": 0.22606323659420013, "learning_rate": 3.591493153075966e-05, "loss": 0.0313, "step": 1070 }, { "epoch": 3.711340206185567, "grad_norm": 0.36013609170913696, "learning_rate": 3.41865650718396e-05, "loss": 0.033, "step": 1080 }, { "epoch": 3.745704467353952, "grad_norm": 0.2635957896709442, "learning_rate": 3.24922064894601e-05, "loss": 0.0377, "step": 1090 }, { "epoch": 3.7800687285223367, "grad_norm": 0.22290170192718506, "learning_rate": 3.083273131211382e-05, "loss": 0.032, "step": 1100 }, { "epoch": 3.8144329896907214, "grad_norm": 0.21059395372867584, "learning_rate": 2.920899704293849e-05, "loss": 0.0339, "step": 1110 }, { "epoch": 3.8487972508591066, "grad_norm": 0.22615396976470947, "learning_rate": 2.762184271661785e-05, "loss": 0.0282, "step": 1120 }, { "epoch": 3.8831615120274914, "grad_norm": 0.14210452139377594, "learning_rate": 2.6072088465826038e-05, "loss": 0.031, "step": 1130 }, { "epoch": 3.917525773195876, "grad_norm": 0.199430912733078, "learning_rate": 2.4560535097439108e-05, "loss": 0.0286, "step": 1140 }, { "epoch": 3.9518900343642613, "grad_norm": 0.22842490673065186, "learning_rate": 2.308796367873296e-05, "loss": 0.0343, "step": 1150 }, { "epoch": 3.986254295532646, "grad_norm": 0.20589038729667664, "learning_rate": 2.165513513378121e-05, "loss": 0.0266, "step": 1160 }, { "epoch": 4.020618556701031, "grad_norm": 0.19770939648151398, "learning_rate": 2.0262789850261798e-05, "loss": 0.0334, "step": 1170 }, { "epoch": 4.054982817869416, "grad_norm": 0.32386648654937744, "learning_rate": 1.8911647296875147e-05, "loss": 0.0274, "step": 1180 }, { "epoch": 4.0893470790378, "grad_norm": 0.20792323350906372, "learning_rate": 1.7602405651572275e-05, "loss": 0.0295, "step": 1190 }, { "epoch": 4.123711340206185, "grad_norm": 0.17066961526870728, "learning_rate": 1.6335741440784035e-05, "loss": 0.0385, "step": 1200 }, { "epoch": 4.158075601374571, "grad_norm": 0.3135523796081543, "learning_rate": 1.511230918983867e-05, "loss": 0.0432, "step": 1210 }, { "epoch": 4.1924398625429555, "grad_norm": 0.2972412407398224, "learning_rate": 1.3932741084747913e-05, "loss": 0.0316, "step": 1220 }, { "epoch": 4.22680412371134, "grad_norm": 0.19240647554397583, "learning_rate": 1.2797646645536566e-05, "loss": 0.0278, "step": 1230 }, { "epoch": 4.261168384879725, "grad_norm": 0.3429684638977051, "learning_rate": 1.1707612411284253e-05, "loss": 0.0369, "step": 1240 }, { "epoch": 4.29553264604811, "grad_norm": 0.2781321704387665, "learning_rate": 1.0663201637042252e-05, "loss": 0.0335, "step": 1250 }, { "epoch": 4.329896907216495, "grad_norm": 0.2124054729938507, "learning_rate": 9.664954002781745e-06, "loss": 0.0264, "step": 1260 }, { "epoch": 4.364261168384879, "grad_norm": 0.20779696106910706, "learning_rate": 8.713385334524283e-06, "loss": 0.0235, "step": 1270 }, { "epoch": 4.398625429553265, "grad_norm": 0.20655418932437897, "learning_rate": 7.808987337798158e-06, "loss": 0.029, "step": 1280 }, { "epoch": 4.43298969072165, "grad_norm": 0.3112201392650604, "learning_rate": 6.952227343558671e-06, "loss": 0.0273, "step": 1290 }, { "epoch": 4.4673539518900345, "grad_norm": 0.20721083879470825, "learning_rate": 6.143548066703475e-06, "loss": 0.0317, "step": 1300 }, { "epoch": 4.501718213058419, "grad_norm": 0.24077868461608887, "learning_rate": 5.383367377307857e-06, "loss": 0.0258, "step": 1310 }, { "epoch": 4.536082474226804, "grad_norm": 0.19729691743850708, "learning_rate": 4.672078084698095e-06, "loss": 0.0287, "step": 1320 }, { "epoch": 4.570446735395189, "grad_norm": 0.3005140423774719, "learning_rate": 4.010047734474454e-06, "loss": 0.0324, "step": 1330 }, { "epoch": 4.6048109965635735, "grad_norm": 0.3983416259288788, "learning_rate": 3.397618418588877e-06, "loss": 0.0419, "step": 1340 }, { "epoch": 4.639175257731958, "grad_norm": 0.24803949892520905, "learning_rate": 2.8351065985751766e-06, "loss": 0.0305, "step": 1350 }, { "epoch": 4.673539518900344, "grad_norm": 0.3299216628074646, "learning_rate": 2.322802942023461e-06, "loss": 0.0378, "step": 1360 }, { "epoch": 4.707903780068729, "grad_norm": 0.33067575097084045, "learning_rate": 1.8609721723830132e-06, "loss": 0.0278, "step": 1370 }, { "epoch": 4.742268041237113, "grad_norm": 0.3416236340999603, "learning_rate": 1.4498529321713584e-06, "loss": 0.0257, "step": 1380 }, { "epoch": 4.776632302405498, "grad_norm": 0.20940996706485748, "learning_rate": 1.0896576596600705e-06, "loss": 0.031, "step": 1390 }, { "epoch": 4.810996563573883, "grad_norm": 0.186074361205101, "learning_rate": 7.80572479101327e-07, "loss": 0.0295, "step": 1400 }, { "epoch": 4.845360824742268, "grad_norm": 0.19710496068000793, "learning_rate": 5.227571045515633e-07, "loss": 0.0416, "step": 1410 }, { "epoch": 4.879725085910653, "grad_norm": 0.20495687425136566, "learning_rate": 3.163447573422351e-07, "loss": 0.0329, "step": 1420 }, { "epoch": 4.914089347079038, "grad_norm": 0.4691133499145508, "learning_rate": 1.614420972401165e-07, "loss": 0.0323, "step": 1430 }, { "epoch": 4.948453608247423, "grad_norm": 0.23419992625713348, "learning_rate": 5.812916733284324e-08, "loss": 0.0205, "step": 1440 }, { "epoch": 4.982817869415808, "grad_norm": 0.21251557767391205, "learning_rate": 6.459352668164442e-09, "loss": 0.0332, "step": 1450 }, { "epoch": 5.0, "step": 1455, "total_flos": 2.885368061470752e+16, "train_loss": 0.07233676388603714, "train_runtime": 485.4593, "train_samples_per_second": 47.955, "train_steps_per_second": 2.997 } ], "logging_steps": 10, "max_steps": 1455, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.885368061470752e+16, "train_batch_size": 16, "trial_name": null, "trial_params": null }