{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.9908256880733948,
  "eval_steps": 500,
  "global_step": 243,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.06116207951070336,
      "grad_norm": 1.7883533239364624,
      "learning_rate": 0.00019986631570270832,
      "loss": 2.3883,
      "num_input_tokens_seen": 4016,
      "step": 5
    },
    {
      "epoch": 0.12232415902140673,
      "grad_norm": 2.9251019954681396,
      "learning_rate": 0.00019932383577419432,
      "loss": 1.4752,
      "num_input_tokens_seen": 7952,
      "step": 10
    },
    {
      "epoch": 0.1834862385321101,
      "grad_norm": 5.290874004364014,
      "learning_rate": 0.0001983664691986601,
      "loss": 1.1174,
      "num_input_tokens_seen": 11936,
      "step": 15
    },
    {
      "epoch": 0.24464831804281345,
      "grad_norm": 6.394321918487549,
      "learning_rate": 0.00019699821500217434,
      "loss": 0.7815,
      "num_input_tokens_seen": 15808,
      "step": 20
    },
    {
      "epoch": 0.3058103975535168,
      "grad_norm": 1.3936750888824463,
      "learning_rate": 0.00019522478853384155,
      "loss": 0.632,
      "num_input_tokens_seen": 19888,
      "step": 25
    },
    {
      "epoch": 0.3669724770642202,
      "grad_norm": 0.6125549674034119,
      "learning_rate": 0.00019305359759215685,
      "loss": 0.4603,
      "num_input_tokens_seen": 23856,
      "step": 30
    },
    {
      "epoch": 0.42813455657492355,
      "grad_norm": 0.8421280980110168,
      "learning_rate": 0.00019049371148181253,
      "loss": 0.4761,
      "num_input_tokens_seen": 27936,
      "step": 35
    },
    {
      "epoch": 0.4892966360856269,
      "grad_norm": 0.7261258959770203,
      "learning_rate": 0.0001875558231302091,
      "loss": 0.4525,
      "num_input_tokens_seen": 31872,
      "step": 40
    },
    {
      "epoch": 0.5504587155963303,
      "grad_norm": 0.8600367903709412,
      "learning_rate": 0.00018425220442191495,
      "loss": 0.4803,
      "num_input_tokens_seen": 35696,
      "step": 45
    },
    {
      "epoch": 0.6116207951070336,
      "grad_norm": 0.8055685758590698,
      "learning_rate": 0.00018059665493764743,
      "loss": 0.3933,
      "num_input_tokens_seen": 39680,
      "step": 50
    },
    {
      "epoch": 0.672782874617737,
      "grad_norm": 0.7538366317749023,
      "learning_rate": 0.0001766044443118978,
      "loss": 0.3919,
      "num_input_tokens_seen": 43920,
      "step": 55
    },
    {
      "epoch": 0.7339449541284404,
      "grad_norm": 0.8907052874565125,
      "learning_rate": 0.00017229224844997928,
      "loss": 0.3804,
      "num_input_tokens_seen": 48160,
      "step": 60
    },
    {
      "epoch": 0.7951070336391437,
      "grad_norm": 0.7426741123199463,
      "learning_rate": 0.00016767807987092621,
      "loss": 0.3576,
      "num_input_tokens_seen": 52080,
      "step": 65
    },
    {
      "epoch": 0.8562691131498471,
      "grad_norm": 0.6457409262657166,
      "learning_rate": 0.00016278121246720987,
      "loss": 0.3343,
      "num_input_tokens_seen": 56128,
      "step": 70
    },
    {
      "epoch": 0.9174311926605505,
      "grad_norm": 0.9887730479240417,
      "learning_rate": 0.00015762210099555803,
      "loss": 0.4526,
      "num_input_tokens_seen": 59952,
      "step": 75
    },
    {
      "epoch": 0.9785932721712538,
      "grad_norm": 0.9649442434310913,
      "learning_rate": 0.00015222229563517385,
      "loss": 0.4048,
      "num_input_tokens_seen": 63840,
      "step": 80
    },
    {
      "epoch": 1.0489296636085628,
      "grad_norm": 0.8496802449226379,
      "learning_rate": 0.0001466043519702539,
      "loss": 0.3599,
      "num_input_tokens_seen": 68176,
      "step": 85
    },
    {
      "epoch": 1.110091743119266,
      "grad_norm": 0.6206537485122681,
      "learning_rate": 0.00014079173677281837,
      "loss": 0.2197,
      "num_input_tokens_seen": 72336,
      "step": 90
    },
    {
      "epoch": 1.1712538226299694,
      "grad_norm": 0.7237979173660278,
      "learning_rate": 0.00013480872997940905,
      "loss": 0.1816,
      "num_input_tokens_seen": 76384,
      "step": 95
    },
    {
      "epoch": 1.2324159021406729,
      "grad_norm": 1.0752713680267334,
      "learning_rate": 0.00012868032327110904,
      "loss": 0.1839,
      "num_input_tokens_seen": 80368,
      "step": 100
    },
    {
      "epoch": 1.2935779816513762,
      "grad_norm": 1.0526070594787598,
      "learning_rate": 0.00012243211568052677,
      "loss": 0.21,
      "num_input_tokens_seen": 84400,
      "step": 105
    },
    {
      "epoch": 1.3547400611620795,
      "grad_norm": 1.073427677154541,
      "learning_rate": 0.00011609020666180575,
      "loss": 0.2568,
      "num_input_tokens_seen": 88352,
      "step": 110
    },
    {
      "epoch": 1.4159021406727827,
      "grad_norm": 0.8212242722511292,
      "learning_rate": 0.00010968108707031792,
      "loss": 0.2049,
      "num_input_tokens_seen": 92464,
      "step": 115
    },
    {
      "epoch": 1.4770642201834863,
      "grad_norm": 1.1588611602783203,
      "learning_rate": 0.00010323152850743107,
      "loss": 0.2213,
      "num_input_tokens_seen": 96432,
      "step": 120
    },
    {
      "epoch": 1.5382262996941896,
      "grad_norm": 1.08698308467865,
      "learning_rate": 9.676847149256895e-05,
      "loss": 0.2836,
      "num_input_tokens_seen": 100416,
      "step": 125
    },
    {
      "epoch": 1.599388379204893,
      "grad_norm": 1.1150074005126953,
      "learning_rate": 9.03189129296821e-05,
      "loss": 0.228,
      "num_input_tokens_seen": 104432,
      "step": 130
    },
    {
      "epoch": 1.6605504587155964,
      "grad_norm": 0.7235325574874878,
      "learning_rate": 8.390979333819426e-05,
      "loss": 0.1778,
      "num_input_tokens_seen": 108432,
      "step": 135
    },
    {
      "epoch": 1.7217125382262997,
      "grad_norm": 0.8855745196342468,
      "learning_rate": 7.756788431947326e-05,
      "loss": 0.1871,
      "num_input_tokens_seen": 112320,
      "step": 140
    },
    {
      "epoch": 1.782874617737003,
      "grad_norm": 1.3950036764144897,
      "learning_rate": 7.131967672889101e-05,
      "loss": 0.2289,
      "num_input_tokens_seen": 116272,
      "step": 145
    },
    {
      "epoch": 1.8440366972477065,
      "grad_norm": 1.182417392730713,
      "learning_rate": 6.519127002059095e-05,
      "loss": 0.2889,
      "num_input_tokens_seen": 120224,
      "step": 150
    },
    {
      "epoch": 1.9051987767584098,
      "grad_norm": 1.020156741142273,
      "learning_rate": 5.920826322718165e-05,
      "loss": 0.2724,
      "num_input_tokens_seen": 124176,
      "step": 155
    },
    {
      "epoch": 1.9663608562691133,
      "grad_norm": 0.8830191493034363,
      "learning_rate": 5.339564802974615e-05,
      "loss": 0.2359,
      "num_input_tokens_seen": 128256,
      "step": 160
    },
    {
      "epoch": 2.036697247706422,
      "grad_norm": 0.8336369395256042,
      "learning_rate": 4.777770436482617e-05,
      "loss": 0.1864,
      "num_input_tokens_seen": 132576,
      "step": 165
    },
    {
      "epoch": 2.0978593272171255,
      "grad_norm": 0.583479106426239,
      "learning_rate": 4.2377899004441966e-05,
      "loss": 0.0671,
      "num_input_tokens_seen": 136608,
      "step": 170
    },
    {
      "epoch": 2.159021406727829,
      "grad_norm": 0.686152994632721,
      "learning_rate": 3.721878753279017e-05,
      "loss": 0.1465,
      "num_input_tokens_seen": 140416,
      "step": 175
    },
    {
      "epoch": 2.220183486238532,
      "grad_norm": 0.6811323761940002,
      "learning_rate": 3.2321920129073816e-05,
      "loss": 0.091,
      "num_input_tokens_seen": 144336,
      "step": 180
    },
    {
      "epoch": 2.2813455657492354,
      "grad_norm": 0.5875197052955627,
      "learning_rate": 2.770775155002071e-05,
      "loss": 0.114,
      "num_input_tokens_seen": 148256,
      "step": 185
    },
    {
      "epoch": 2.3425076452599387,
      "grad_norm": 0.6630216836929321,
      "learning_rate": 2.339555568810221e-05,
      "loss": 0.1381,
      "num_input_tokens_seen": 152416,
      "step": 190
    },
    {
      "epoch": 2.4036697247706424,
      "grad_norm": 0.7730478644371033,
      "learning_rate": 1.9403345062352573e-05,
      "loss": 0.0799,
      "num_input_tokens_seen": 156400,
      "step": 195
    },
    {
      "epoch": 2.4648318042813457,
      "grad_norm": 0.7855842709541321,
      "learning_rate": 1.5747795578085046e-05,
      "loss": 0.0722,
      "num_input_tokens_seen": 160528,
      "step": 200
    },
    {
      "epoch": 2.525993883792049,
      "grad_norm": 0.7239196300506592,
      "learning_rate": 1.2444176869790925e-05,
      "loss": 0.1003,
      "num_input_tokens_seen": 164720,
      "step": 205
    },
    {
      "epoch": 2.5871559633027523,
      "grad_norm": 0.8815809488296509,
      "learning_rate": 9.506288518187467e-06,
      "loss": 0.1155,
      "num_input_tokens_seen": 168560,
      "step": 210
    },
    {
      "epoch": 2.6483180428134556,
      "grad_norm": 1.0388275384902954,
      "learning_rate": 6.946402407843155e-06,
      "loss": 0.0962,
      "num_input_tokens_seen": 172608,
      "step": 215
    },
    {
      "epoch": 2.709480122324159,
      "grad_norm": 0.9285043478012085,
      "learning_rate": 4.775211466158469e-06,
      "loss": 0.0808,
      "num_input_tokens_seen": 176656,
      "step": 220
    },
    {
      "epoch": 2.770642201834862,
      "grad_norm": 0.9785643815994263,
      "learning_rate": 3.0017849978256516e-06,
      "loss": 0.1206,
      "num_input_tokens_seen": 180608,
      "step": 225
    },
    {
      "epoch": 2.8318042813455655,
      "grad_norm": 0.9847484827041626,
      "learning_rate": 1.6335308013398886e-06,
      "loss": 0.1259,
      "num_input_tokens_seen": 184592,
      "step": 230
    },
    {
      "epoch": 2.8929663608562692,
      "grad_norm": 1.2911163568496704,
      "learning_rate": 6.761642258056978e-07,
      "loss": 0.158,
      "num_input_tokens_seen": 188496,
      "step": 235
    },
    {
      "epoch": 2.9541284403669725,
      "grad_norm": 0.7206448912620544,
      "learning_rate": 1.3368429729168076e-07,
      "loss": 0.062,
      "num_input_tokens_seen": 192368,
      "step": 240
    },
    {
      "epoch": 2.9908256880733948,
      "num_input_tokens_seen": 194864,
      "step": 243,
      "total_flos": 7611007265931264.0,
      "train_loss": 0.34003226710445106,
      "train_runtime": 5382.7216,
      "train_samples_per_second": 0.728,
      "train_steps_per_second": 0.045
    }
  ],
  "logging_steps": 5,
  "max_steps": 243,
  "num_input_tokens_seen": 194864,
  "num_train_epochs": 3,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 7611007265931264.0,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}