|
{ |
|
"best_metric": 2.6344850063323975, |
|
"best_model_checkpoint": "/home/p318482/babyLM_controlled/models_trained_last/de_mlm/de_childes_42/checkpoint-88000", |
|
"epoch": 66.1010138941044, |
|
"eval_steps": 2000, |
|
"global_step": 88000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 1.5020653398422832, |
|
"eval_loss": 7.505205154418945, |
|
"eval_runtime": 4.9012, |
|
"eval_samples_per_second": 677.383, |
|
"eval_steps_per_second": 42.438, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 3.0041306796845664, |
|
"grad_norm": 1.3569484949111938, |
|
"learning_rate": 1e-05, |
|
"loss": 7.4063, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 3.0041306796845664, |
|
"eval_loss": 6.433775901794434, |
|
"eval_runtime": 4.7916, |
|
"eval_samples_per_second": 692.873, |
|
"eval_steps_per_second": 43.409, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 4.50619601952685, |
|
"eval_loss": 6.305135250091553, |
|
"eval_runtime": 4.8722, |
|
"eval_samples_per_second": 681.421, |
|
"eval_steps_per_second": 42.691, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 6.008261359369133, |
|
"grad_norm": 1.548327088356018, |
|
"learning_rate": 2e-05, |
|
"loss": 6.0672, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 6.008261359369133, |
|
"eval_loss": 6.1662516593933105, |
|
"eval_runtime": 4.8572, |
|
"eval_samples_per_second": 683.527, |
|
"eval_steps_per_second": 42.823, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 7.510326699211416, |
|
"eval_loss": 6.095215797424316, |
|
"eval_runtime": 4.7932, |
|
"eval_samples_per_second": 692.655, |
|
"eval_steps_per_second": 43.395, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 9.0123920390537, |
|
"grad_norm": 1.639375925064087, |
|
"learning_rate": 3e-05, |
|
"loss": 5.8691, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 9.0123920390537, |
|
"eval_loss": 6.0121965408325195, |
|
"eval_runtime": 4.8217, |
|
"eval_samples_per_second": 688.56, |
|
"eval_steps_per_second": 43.139, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 10.514457378895981, |
|
"eval_loss": 5.951181888580322, |
|
"eval_runtime": 4.7668, |
|
"eval_samples_per_second": 696.478, |
|
"eval_steps_per_second": 43.635, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 12.016522718738265, |
|
"grad_norm": 1.6758147478103638, |
|
"learning_rate": 4e-05, |
|
"loss": 5.7209, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 12.016522718738265, |
|
"eval_loss": 5.869147300720215, |
|
"eval_runtime": 4.6953, |
|
"eval_samples_per_second": 707.092, |
|
"eval_steps_per_second": 44.3, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 13.518588058580548, |
|
"eval_loss": 5.852941513061523, |
|
"eval_runtime": 4.8028, |
|
"eval_samples_per_second": 691.267, |
|
"eval_steps_per_second": 43.308, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 15.020653398422832, |
|
"grad_norm": 2.3887548446655273, |
|
"learning_rate": 5e-05, |
|
"loss": 5.6105, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 15.020653398422832, |
|
"eval_loss": 5.796474933624268, |
|
"eval_runtime": 4.7859, |
|
"eval_samples_per_second": 693.704, |
|
"eval_steps_per_second": 43.461, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 16.522718738265116, |
|
"eval_loss": 5.740424156188965, |
|
"eval_runtime": 4.6987, |
|
"eval_samples_per_second": 706.584, |
|
"eval_steps_per_second": 44.268, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 18.0247840781074, |
|
"grad_norm": 4.52785587310791, |
|
"learning_rate": 6e-05, |
|
"loss": 5.5302, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 18.0247840781074, |
|
"eval_loss": 5.742419719696045, |
|
"eval_runtime": 3.8111, |
|
"eval_samples_per_second": 871.137, |
|
"eval_steps_per_second": 54.577, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 19.52684941794968, |
|
"eval_loss": 5.7255754470825195, |
|
"eval_runtime": 2.6268, |
|
"eval_samples_per_second": 1263.889, |
|
"eval_steps_per_second": 79.183, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 21.028914757791963, |
|
"grad_norm": 4.69941520690918, |
|
"learning_rate": 7e-05, |
|
"loss": 5.4587, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 21.028914757791963, |
|
"eval_loss": 5.6831231117248535, |
|
"eval_runtime": 2.5816, |
|
"eval_samples_per_second": 1286.034, |
|
"eval_steps_per_second": 80.571, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 22.53098009763425, |
|
"eval_loss": 5.396602153778076, |
|
"eval_runtime": 2.6907, |
|
"eval_samples_per_second": 1233.859, |
|
"eval_steps_per_second": 77.302, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 24.03304543747653, |
|
"grad_norm": 5.43873929977417, |
|
"learning_rate": 8e-05, |
|
"loss": 5.0899, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 24.03304543747653, |
|
"eval_loss": 4.704195022583008, |
|
"eval_runtime": 2.6893, |
|
"eval_samples_per_second": 1234.521, |
|
"eval_steps_per_second": 77.343, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 25.535110777318813, |
|
"eval_loss": 4.2316813468933105, |
|
"eval_runtime": 4.2325, |
|
"eval_samples_per_second": 784.414, |
|
"eval_steps_per_second": 49.144, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 27.037176117161096, |
|
"grad_norm": 4.492648124694824, |
|
"learning_rate": 9e-05, |
|
"loss": 4.0988, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 27.037176117161096, |
|
"eval_loss": 3.90928316116333, |
|
"eval_runtime": 4.8525, |
|
"eval_samples_per_second": 684.177, |
|
"eval_steps_per_second": 42.864, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 28.549380398047315, |
|
"eval_loss": 3.749643564224243, |
|
"eval_runtime": 4.8877, |
|
"eval_samples_per_second": 679.256, |
|
"eval_steps_per_second": 42.556, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 30.051445737889598, |
|
"grad_norm": 4.555140018463135, |
|
"learning_rate": 0.0001, |
|
"loss": 3.555, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 30.051445737889598, |
|
"eval_loss": 3.5960817337036133, |
|
"eval_runtime": 4.777, |
|
"eval_samples_per_second": 695.0, |
|
"eval_steps_per_second": 43.542, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 31.55351107773188, |
|
"eval_loss": 3.454169988632202, |
|
"eval_runtime": 4.8006, |
|
"eval_samples_per_second": 691.578, |
|
"eval_steps_per_second": 43.328, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 33.055576417574166, |
|
"grad_norm": 4.173329830169678, |
|
"learning_rate": 9.333333333333334e-05, |
|
"loss": 3.2522, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 33.055576417574166, |
|
"eval_loss": 3.3299686908721924, |
|
"eval_runtime": 4.544, |
|
"eval_samples_per_second": 730.638, |
|
"eval_steps_per_second": 45.775, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 34.55764175741645, |
|
"eval_loss": 3.2829904556274414, |
|
"eval_runtime": 4.8645, |
|
"eval_samples_per_second": 682.495, |
|
"eval_steps_per_second": 42.759, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 36.05970709725873, |
|
"grad_norm": 4.994029521942139, |
|
"learning_rate": 8.666666666666667e-05, |
|
"loss": 3.0484, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 36.05970709725873, |
|
"eval_loss": 3.1864311695098877, |
|
"eval_runtime": 4.5673, |
|
"eval_samples_per_second": 726.907, |
|
"eval_steps_per_second": 45.541, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 37.56177243710101, |
|
"eval_loss": 3.118920087814331, |
|
"eval_runtime": 4.2694, |
|
"eval_samples_per_second": 777.63, |
|
"eval_steps_per_second": 48.719, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 39.063837776943295, |
|
"grad_norm": 4.800024509429932, |
|
"learning_rate": 8e-05, |
|
"loss": 2.9026, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 39.063837776943295, |
|
"eval_loss": 3.0475075244903564, |
|
"eval_runtime": 4.6498, |
|
"eval_samples_per_second": 714.014, |
|
"eval_steps_per_second": 44.733, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 40.56590311678558, |
|
"eval_loss": 2.993332624435425, |
|
"eval_runtime": 4.9159, |
|
"eval_samples_per_second": 675.36, |
|
"eval_steps_per_second": 42.312, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 42.06796845662787, |
|
"grad_norm": 4.461485385894775, |
|
"learning_rate": 7.333333333333333e-05, |
|
"loss": 2.7874, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 42.06796845662787, |
|
"eval_loss": 2.9410743713378906, |
|
"eval_runtime": 4.855, |
|
"eval_samples_per_second": 683.83, |
|
"eval_steps_per_second": 42.842, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 43.57003379647015, |
|
"eval_loss": 2.935495138168335, |
|
"eval_runtime": 4.8864, |
|
"eval_samples_per_second": 679.431, |
|
"eval_steps_per_second": 42.567, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 45.07209913631243, |
|
"grad_norm": 4.862215518951416, |
|
"learning_rate": 6.666666666666667e-05, |
|
"loss": 2.7001, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 45.07209913631243, |
|
"eval_loss": 2.891265630722046, |
|
"eval_runtime": 4.7977, |
|
"eval_samples_per_second": 691.997, |
|
"eval_steps_per_second": 43.354, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 46.574164476154714, |
|
"eval_loss": 2.860131025314331, |
|
"eval_runtime": 4.8035, |
|
"eval_samples_per_second": 691.17, |
|
"eval_steps_per_second": 43.302, |
|
"step": 62000 |
|
}, |
|
{ |
|
"epoch": 48.076229815996996, |
|
"grad_norm": 4.176759719848633, |
|
"learning_rate": 6e-05, |
|
"loss": 2.6298, |
|
"step": 64000 |
|
}, |
|
{ |
|
"epoch": 48.076229815996996, |
|
"eval_loss": 2.8227334022521973, |
|
"eval_runtime": 4.8405, |
|
"eval_samples_per_second": 685.878, |
|
"eval_steps_per_second": 42.971, |
|
"step": 64000 |
|
}, |
|
{ |
|
"epoch": 49.57829515583928, |
|
"eval_loss": 2.8202409744262695, |
|
"eval_runtime": 4.9107, |
|
"eval_samples_per_second": 676.074, |
|
"eval_steps_per_second": 42.356, |
|
"step": 66000 |
|
}, |
|
{ |
|
"epoch": 51.08036049568156, |
|
"grad_norm": 4.324464321136475, |
|
"learning_rate": 5.333333333333333e-05, |
|
"loss": 2.5722, |
|
"step": 68000 |
|
}, |
|
{ |
|
"epoch": 51.08036049568156, |
|
"eval_loss": 2.7873833179473877, |
|
"eval_runtime": 4.9614, |
|
"eval_samples_per_second": 669.165, |
|
"eval_steps_per_second": 41.924, |
|
"step": 68000 |
|
}, |
|
{ |
|
"epoch": 52.58242583552384, |
|
"eval_loss": 2.7716081142425537, |
|
"eval_runtime": 4.9279, |
|
"eval_samples_per_second": 673.719, |
|
"eval_steps_per_second": 42.209, |
|
"step": 70000 |
|
}, |
|
{ |
|
"epoch": 54.084491175366125, |
|
"grad_norm": 4.540297985076904, |
|
"learning_rate": 4.666666666666667e-05, |
|
"loss": 2.523, |
|
"step": 72000 |
|
}, |
|
{ |
|
"epoch": 54.084491175366125, |
|
"eval_loss": 2.736281633377075, |
|
"eval_runtime": 5.0438, |
|
"eval_samples_per_second": 658.231, |
|
"eval_steps_per_second": 41.239, |
|
"step": 72000 |
|
}, |
|
{ |
|
"epoch": 55.586556515208414, |
|
"eval_loss": 2.721235513687134, |
|
"eval_runtime": 4.9084, |
|
"eval_samples_per_second": 676.391, |
|
"eval_steps_per_second": 42.376, |
|
"step": 74000 |
|
}, |
|
{ |
|
"epoch": 57.0886218550507, |
|
"grad_norm": 4.172296524047852, |
|
"learning_rate": 4e-05, |
|
"loss": 2.4788, |
|
"step": 76000 |
|
}, |
|
{ |
|
"epoch": 57.0886218550507, |
|
"eval_loss": 2.6943581104278564, |
|
"eval_runtime": 4.972, |
|
"eval_samples_per_second": 667.739, |
|
"eval_steps_per_second": 41.834, |
|
"step": 76000 |
|
}, |
|
{ |
|
"epoch": 58.59068719489298, |
|
"eval_loss": 2.6760997772216797, |
|
"eval_runtime": 4.8781, |
|
"eval_samples_per_second": 680.594, |
|
"eval_steps_per_second": 42.64, |
|
"step": 78000 |
|
}, |
|
{ |
|
"epoch": 60.09275253473526, |
|
"grad_norm": 4.14224910736084, |
|
"learning_rate": 3.3333333333333335e-05, |
|
"loss": 2.4466, |
|
"step": 80000 |
|
}, |
|
{ |
|
"epoch": 60.09275253473526, |
|
"eval_loss": 2.670499563217163, |
|
"eval_runtime": 4.6467, |
|
"eval_samples_per_second": 714.491, |
|
"eval_steps_per_second": 44.763, |
|
"step": 80000 |
|
}, |
|
{ |
|
"epoch": 61.594817874577544, |
|
"eval_loss": 2.6551034450531006, |
|
"eval_runtime": 4.9275, |
|
"eval_samples_per_second": 673.77, |
|
"eval_steps_per_second": 42.212, |
|
"step": 82000 |
|
}, |
|
{ |
|
"epoch": 63.096883214419826, |
|
"grad_norm": 4.495992183685303, |
|
"learning_rate": 2.6666666666666667e-05, |
|
"loss": 2.4122, |
|
"step": 84000 |
|
}, |
|
{ |
|
"epoch": 63.096883214419826, |
|
"eval_loss": 2.6368117332458496, |
|
"eval_runtime": 4.8647, |
|
"eval_samples_per_second": 682.471, |
|
"eval_steps_per_second": 42.757, |
|
"step": 84000 |
|
}, |
|
{ |
|
"epoch": 64.59894855426211, |
|
"eval_loss": 2.6423897743225098, |
|
"eval_runtime": 4.8657, |
|
"eval_samples_per_second": 682.327, |
|
"eval_steps_per_second": 42.748, |
|
"step": 86000 |
|
}, |
|
{ |
|
"epoch": 66.1010138941044, |
|
"grad_norm": 3.998427152633667, |
|
"learning_rate": 2e-05, |
|
"loss": 2.3832, |
|
"step": 88000 |
|
}, |
|
{ |
|
"epoch": 66.1010138941044, |
|
"eval_loss": 2.6344850063323975, |
|
"eval_runtime": 4.8088, |
|
"eval_samples_per_second": 690.4, |
|
"eval_steps_per_second": 43.254, |
|
"step": 88000 |
|
} |
|
], |
|
"logging_steps": 4000, |
|
"max_steps": 100000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 76, |
|
"save_steps": 4000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.7445280879149056e+16, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|