{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9908256880733948, "eval_steps": 500, "global_step": 243, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.06116207951070336, "grad_norm": 1.7883533239364624, "learning_rate": 0.00019986631570270832, "loss": 2.3883, "num_input_tokens_seen": 4016, "step": 5 }, { "epoch": 0.12232415902140673, "grad_norm": 2.9251019954681396, "learning_rate": 0.00019932383577419432, "loss": 1.4752, "num_input_tokens_seen": 7952, "step": 10 }, { "epoch": 0.1834862385321101, "grad_norm": 5.290874004364014, "learning_rate": 0.0001983664691986601, "loss": 1.1174, "num_input_tokens_seen": 11936, "step": 15 }, { "epoch": 0.24464831804281345, "grad_norm": 6.394321918487549, "learning_rate": 0.00019699821500217434, "loss": 0.7815, "num_input_tokens_seen": 15808, "step": 20 }, { "epoch": 0.3058103975535168, "grad_norm": 1.3936750888824463, "learning_rate": 0.00019522478853384155, "loss": 0.632, "num_input_tokens_seen": 19888, "step": 25 }, { "epoch": 0.3669724770642202, "grad_norm": 0.6125549674034119, "learning_rate": 0.00019305359759215685, "loss": 0.4603, "num_input_tokens_seen": 23856, "step": 30 }, { "epoch": 0.42813455657492355, "grad_norm": 0.8421280980110168, "learning_rate": 0.00019049371148181253, "loss": 0.4761, "num_input_tokens_seen": 27936, "step": 35 }, { "epoch": 0.4892966360856269, "grad_norm": 0.7261258959770203, "learning_rate": 0.0001875558231302091, "loss": 0.4525, "num_input_tokens_seen": 31872, "step": 40 }, { "epoch": 0.5504587155963303, "grad_norm": 0.8600367903709412, "learning_rate": 0.00018425220442191495, "loss": 0.4803, "num_input_tokens_seen": 35696, "step": 45 }, { "epoch": 0.6116207951070336, "grad_norm": 0.8055685758590698, "learning_rate": 0.00018059665493764743, "loss": 0.3933, "num_input_tokens_seen": 39680, "step": 50 }, { "epoch": 0.672782874617737, "grad_norm": 0.7538366317749023, "learning_rate": 0.0001766044443118978, "loss": 0.3919, "num_input_tokens_seen": 43920, "step": 55 }, { "epoch": 0.7339449541284404, "grad_norm": 0.8907052874565125, "learning_rate": 0.00017229224844997928, "loss": 0.3804, "num_input_tokens_seen": 48160, "step": 60 }, { "epoch": 0.7951070336391437, "grad_norm": 0.7426741123199463, "learning_rate": 0.00016767807987092621, "loss": 0.3576, "num_input_tokens_seen": 52080, "step": 65 }, { "epoch": 0.8562691131498471, "grad_norm": 0.6457409262657166, "learning_rate": 0.00016278121246720987, "loss": 0.3343, "num_input_tokens_seen": 56128, "step": 70 }, { "epoch": 0.9174311926605505, "grad_norm": 0.9887730479240417, "learning_rate": 0.00015762210099555803, "loss": 0.4526, "num_input_tokens_seen": 59952, "step": 75 }, { "epoch": 0.9785932721712538, "grad_norm": 0.9649442434310913, "learning_rate": 0.00015222229563517385, "loss": 0.4048, "num_input_tokens_seen": 63840, "step": 80 }, { "epoch": 1.0489296636085628, "grad_norm": 0.8496802449226379, "learning_rate": 0.0001466043519702539, "loss": 0.3599, "num_input_tokens_seen": 68176, "step": 85 }, { "epoch": 1.110091743119266, "grad_norm": 0.6206537485122681, "learning_rate": 0.00014079173677281837, "loss": 0.2197, "num_input_tokens_seen": 72336, "step": 90 }, { "epoch": 1.1712538226299694, "grad_norm": 0.7237979173660278, "learning_rate": 0.00013480872997940905, "loss": 0.1816, "num_input_tokens_seen": 76384, "step": 95 }, { "epoch": 1.2324159021406729, "grad_norm": 1.0752713680267334, "learning_rate": 0.00012868032327110904, "loss": 0.1839, 
"num_input_tokens_seen": 80368, "step": 100 }, { "epoch": 1.2935779816513762, "grad_norm": 1.0526070594787598, "learning_rate": 0.00012243211568052677, "loss": 0.21, "num_input_tokens_seen": 84400, "step": 105 }, { "epoch": 1.3547400611620795, "grad_norm": 1.073427677154541, "learning_rate": 0.00011609020666180575, "loss": 0.2568, "num_input_tokens_seen": 88352, "step": 110 }, { "epoch": 1.4159021406727827, "grad_norm": 0.8212242722511292, "learning_rate": 0.00010968108707031792, "loss": 0.2049, "num_input_tokens_seen": 92464, "step": 115 }, { "epoch": 1.4770642201834863, "grad_norm": 1.1588611602783203, "learning_rate": 0.00010323152850743107, "loss": 0.2213, "num_input_tokens_seen": 96432, "step": 120 }, { "epoch": 1.5382262996941896, "grad_norm": 1.08698308467865, "learning_rate": 9.676847149256895e-05, "loss": 0.2836, "num_input_tokens_seen": 100416, "step": 125 }, { "epoch": 1.599388379204893, "grad_norm": 1.1150074005126953, "learning_rate": 9.03189129296821e-05, "loss": 0.228, "num_input_tokens_seen": 104432, "step": 130 }, { "epoch": 1.6605504587155964, "grad_norm": 0.7235325574874878, "learning_rate": 8.390979333819426e-05, "loss": 0.1778, "num_input_tokens_seen": 108432, "step": 135 }, { "epoch": 1.7217125382262997, "grad_norm": 0.8855745196342468, "learning_rate": 7.756788431947326e-05, "loss": 0.1871, "num_input_tokens_seen": 112320, "step": 140 }, { "epoch": 1.782874617737003, "grad_norm": 1.3950036764144897, "learning_rate": 7.131967672889101e-05, "loss": 0.2289, "num_input_tokens_seen": 116272, "step": 145 }, { "epoch": 1.8440366972477065, "grad_norm": 1.182417392730713, "learning_rate": 6.519127002059095e-05, "loss": 0.2889, "num_input_tokens_seen": 120224, "step": 150 }, { "epoch": 1.9051987767584098, "grad_norm": 1.020156741142273, "learning_rate": 5.920826322718165e-05, "loss": 0.2724, "num_input_tokens_seen": 124176, "step": 155 }, { "epoch": 1.9663608562691133, "grad_norm": 0.8830191493034363, "learning_rate": 5.339564802974615e-05, "loss": 0.2359, "num_input_tokens_seen": 128256, "step": 160 }, { "epoch": 2.036697247706422, "grad_norm": 0.8336369395256042, "learning_rate": 4.777770436482617e-05, "loss": 0.1864, "num_input_tokens_seen": 132576, "step": 165 }, { "epoch": 2.0978593272171255, "grad_norm": 0.583479106426239, "learning_rate": 4.2377899004441966e-05, "loss": 0.0671, "num_input_tokens_seen": 136608, "step": 170 }, { "epoch": 2.159021406727829, "grad_norm": 0.686152994632721, "learning_rate": 3.721878753279017e-05, "loss": 0.1465, "num_input_tokens_seen": 140416, "step": 175 }, { "epoch": 2.220183486238532, "grad_norm": 0.6811323761940002, "learning_rate": 3.2321920129073816e-05, "loss": 0.091, "num_input_tokens_seen": 144336, "step": 180 }, { "epoch": 2.2813455657492354, "grad_norm": 0.5875197052955627, "learning_rate": 2.770775155002071e-05, "loss": 0.114, "num_input_tokens_seen": 148256, "step": 185 }, { "epoch": 2.3425076452599387, "grad_norm": 0.6630216836929321, "learning_rate": 2.339555568810221e-05, "loss": 0.1381, "num_input_tokens_seen": 152416, "step": 190 }, { "epoch": 2.4036697247706424, "grad_norm": 0.7730478644371033, "learning_rate": 1.9403345062352573e-05, "loss": 0.0799, "num_input_tokens_seen": 156400, "step": 195 }, { "epoch": 2.4648318042813457, "grad_norm": 0.7855842709541321, "learning_rate": 1.5747795578085046e-05, "loss": 0.0722, "num_input_tokens_seen": 160528, "step": 200 }, { "epoch": 2.525993883792049, "grad_norm": 0.7239196300506592, "learning_rate": 1.2444176869790925e-05, "loss": 0.1003, "num_input_tokens_seen": 164720, "step": 
205 }, { "epoch": 2.5871559633027523, "grad_norm": 0.8815809488296509, "learning_rate": 9.506288518187467e-06, "loss": 0.1155, "num_input_tokens_seen": 168560, "step": 210 }, { "epoch": 2.6483180428134556, "grad_norm": 1.0388275384902954, "learning_rate": 6.946402407843155e-06, "loss": 0.0962, "num_input_tokens_seen": 172608, "step": 215 }, { "epoch": 2.709480122324159, "grad_norm": 0.9285043478012085, "learning_rate": 4.775211466158469e-06, "loss": 0.0808, "num_input_tokens_seen": 176656, "step": 220 }, { "epoch": 2.770642201834862, "grad_norm": 0.9785643815994263, "learning_rate": 3.0017849978256516e-06, "loss": 0.1206, "num_input_tokens_seen": 180608, "step": 225 }, { "epoch": 2.8318042813455655, "grad_norm": 0.9847484827041626, "learning_rate": 1.6335308013398886e-06, "loss": 0.1259, "num_input_tokens_seen": 184592, "step": 230 }, { "epoch": 2.8929663608562692, "grad_norm": 1.2911163568496704, "learning_rate": 6.761642258056978e-07, "loss": 0.158, "num_input_tokens_seen": 188496, "step": 235 }, { "epoch": 2.9541284403669725, "grad_norm": 0.7206448912620544, "learning_rate": 1.3368429729168076e-07, "loss": 0.062, "num_input_tokens_seen": 192368, "step": 240 }, { "epoch": 2.9908256880733948, "num_input_tokens_seen": 194864, "step": 243, "total_flos": 7611007265931264.0, "train_loss": 0.34003226710445106, "train_runtime": 5382.7216, "train_samples_per_second": 0.728, "train_steps_per_second": 0.045 } ], "logging_steps": 5, "max_steps": 243, "num_input_tokens_seen": 194864, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7611007265931264.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }