|
{ |
|
"best_metric": 3.3299686908721924, |
|
"best_model_checkpoint": "/home/p318482/babyLM_controlled/models_trained_last/de_mlm/de_childes_42/checkpoint-44000", |
|
"epoch": 33.055576417574166, |
|
"eval_steps": 2000, |
|
"global_step": 44000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 1.5020653398422832, |
|
"eval_loss": 7.505205154418945, |
|
"eval_runtime": 4.9012, |
|
"eval_samples_per_second": 677.383, |
|
"eval_steps_per_second": 42.438, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 3.0041306796845664, |
|
"grad_norm": 1.3569484949111938, |
|
"learning_rate": 1e-05, |
|
"loss": 7.4063, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 3.0041306796845664, |
|
"eval_loss": 6.433775901794434, |
|
"eval_runtime": 4.7916, |
|
"eval_samples_per_second": 692.873, |
|
"eval_steps_per_second": 43.409, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 4.50619601952685, |
|
"eval_loss": 6.305135250091553, |
|
"eval_runtime": 4.8722, |
|
"eval_samples_per_second": 681.421, |
|
"eval_steps_per_second": 42.691, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 6.008261359369133, |
|
"grad_norm": 1.548327088356018, |
|
"learning_rate": 2e-05, |
|
"loss": 6.0672, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 6.008261359369133, |
|
"eval_loss": 6.1662516593933105, |
|
"eval_runtime": 4.8572, |
|
"eval_samples_per_second": 683.527, |
|
"eval_steps_per_second": 42.823, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 7.510326699211416, |
|
"eval_loss": 6.095215797424316, |
|
"eval_runtime": 4.7932, |
|
"eval_samples_per_second": 692.655, |
|
"eval_steps_per_second": 43.395, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 9.0123920390537, |
|
"grad_norm": 1.639375925064087, |
|
"learning_rate": 3e-05, |
|
"loss": 5.8691, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 9.0123920390537, |
|
"eval_loss": 6.0121965408325195, |
|
"eval_runtime": 4.8217, |
|
"eval_samples_per_second": 688.56, |
|
"eval_steps_per_second": 43.139, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 10.514457378895981, |
|
"eval_loss": 5.951181888580322, |
|
"eval_runtime": 4.7668, |
|
"eval_samples_per_second": 696.478, |
|
"eval_steps_per_second": 43.635, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 12.016522718738265, |
|
"grad_norm": 1.6758147478103638, |
|
"learning_rate": 4e-05, |
|
"loss": 5.7209, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 12.016522718738265, |
|
"eval_loss": 5.869147300720215, |
|
"eval_runtime": 4.6953, |
|
"eval_samples_per_second": 707.092, |
|
"eval_steps_per_second": 44.3, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 13.518588058580548, |
|
"eval_loss": 5.852941513061523, |
|
"eval_runtime": 4.8028, |
|
"eval_samples_per_second": 691.267, |
|
"eval_steps_per_second": 43.308, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 15.020653398422832, |
|
"grad_norm": 2.3887548446655273, |
|
"learning_rate": 5e-05, |
|
"loss": 5.6105, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 15.020653398422832, |
|
"eval_loss": 5.796474933624268, |
|
"eval_runtime": 4.7859, |
|
"eval_samples_per_second": 693.704, |
|
"eval_steps_per_second": 43.461, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 16.522718738265116, |
|
"eval_loss": 5.740424156188965, |
|
"eval_runtime": 4.6987, |
|
"eval_samples_per_second": 706.584, |
|
"eval_steps_per_second": 44.268, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 18.0247840781074, |
|
"grad_norm": 4.52785587310791, |
|
"learning_rate": 6e-05, |
|
"loss": 5.5302, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 18.0247840781074, |
|
"eval_loss": 5.742419719696045, |
|
"eval_runtime": 3.8111, |
|
"eval_samples_per_second": 871.137, |
|
"eval_steps_per_second": 54.577, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 19.52684941794968, |
|
"eval_loss": 5.7255754470825195, |
|
"eval_runtime": 2.6268, |
|
"eval_samples_per_second": 1263.889, |
|
"eval_steps_per_second": 79.183, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 21.028914757791963, |
|
"grad_norm": 4.69941520690918, |
|
"learning_rate": 7e-05, |
|
"loss": 5.4587, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 21.028914757791963, |
|
"eval_loss": 5.6831231117248535, |
|
"eval_runtime": 2.5816, |
|
"eval_samples_per_second": 1286.034, |
|
"eval_steps_per_second": 80.571, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 22.53098009763425, |
|
"eval_loss": 5.396602153778076, |
|
"eval_runtime": 2.6907, |
|
"eval_samples_per_second": 1233.859, |
|
"eval_steps_per_second": 77.302, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 24.03304543747653, |
|
"grad_norm": 5.43873929977417, |
|
"learning_rate": 8e-05, |
|
"loss": 5.0899, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 24.03304543747653, |
|
"eval_loss": 4.704195022583008, |
|
"eval_runtime": 2.6893, |
|
"eval_samples_per_second": 1234.521, |
|
"eval_steps_per_second": 77.343, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 25.535110777318813, |
|
"eval_loss": 4.2316813468933105, |
|
"eval_runtime": 4.2325, |
|
"eval_samples_per_second": 784.414, |
|
"eval_steps_per_second": 49.144, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 27.037176117161096, |
|
"grad_norm": 4.492648124694824, |
|
"learning_rate": 9e-05, |
|
"loss": 4.0988, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 27.037176117161096, |
|
"eval_loss": 3.90928316116333, |
|
"eval_runtime": 4.8525, |
|
"eval_samples_per_second": 684.177, |
|
"eval_steps_per_second": 42.864, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 28.549380398047315, |
|
"eval_loss": 3.749643564224243, |
|
"eval_runtime": 4.8877, |
|
"eval_samples_per_second": 679.256, |
|
"eval_steps_per_second": 42.556, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 30.051445737889598, |
|
"grad_norm": 4.555140018463135, |
|
"learning_rate": 0.0001, |
|
"loss": 3.555, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 30.051445737889598, |
|
"eval_loss": 3.5960817337036133, |
|
"eval_runtime": 4.777, |
|
"eval_samples_per_second": 695.0, |
|
"eval_steps_per_second": 43.542, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 31.55351107773188, |
|
"eval_loss": 3.454169988632202, |
|
"eval_runtime": 4.8006, |
|
"eval_samples_per_second": 691.578, |
|
"eval_steps_per_second": 43.328, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 33.055576417574166, |
|
"grad_norm": 4.173329830169678, |
|
"learning_rate": 9.333333333333334e-05, |
|
"loss": 3.2522, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 33.055576417574166, |
|
"eval_loss": 3.3299686908721924, |
|
"eval_runtime": 4.544, |
|
"eval_samples_per_second": 730.638, |
|
"eval_steps_per_second": 45.775, |
|
"step": 44000 |
|
} |
|
], |
|
"logging_steps": 4000, |
|
"max_steps": 100000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 76, |
|
"save_steps": 4000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.3722640439574528e+16, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|