|
{ |
|
"best_metric": 4.99446439743042, |
|
"best_model_checkpoint": "/home/p318482/babyLM_controlled/models_trained_last/de_mlm/de_childes_30/checkpoint-32000", |
|
"epoch": 24.03304543747653, |
|
"eval_steps": 2000, |
|
"global_step": 32000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 1.5020653398422832, |
|
"eval_loss": 7.472109317779541, |
|
"eval_runtime": 2.5707, |
|
"eval_samples_per_second": 1291.498, |
|
"eval_steps_per_second": 80.913, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 3.0041306796845664, |
|
"grad_norm": 0.9529591798782349, |
|
"learning_rate": 1e-05, |
|
"loss": 7.3851, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 3.0041306796845664, |
|
"eval_loss": 6.4175896644592285, |
|
"eval_runtime": 2.5761, |
|
"eval_samples_per_second": 1288.76, |
|
"eval_steps_per_second": 80.742, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 4.50619601952685, |
|
"eval_loss": 6.3026041984558105, |
|
"eval_runtime": 2.5876, |
|
"eval_samples_per_second": 1283.023, |
|
"eval_steps_per_second": 80.382, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 6.008261359369133, |
|
"grad_norm": 1.5569523572921753, |
|
"learning_rate": 2e-05, |
|
"loss": 6.0706, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 6.008261359369133, |
|
"eval_loss": 6.200850486755371, |
|
"eval_runtime": 2.6505, |
|
"eval_samples_per_second": 1252.599, |
|
"eval_steps_per_second": 78.476, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 7.510326699211416, |
|
"eval_loss": 6.105844497680664, |
|
"eval_runtime": 2.5888, |
|
"eval_samples_per_second": 1282.425, |
|
"eval_steps_per_second": 80.345, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 9.0123920390537, |
|
"grad_norm": 2.3049111366271973, |
|
"learning_rate": 3e-05, |
|
"loss": 5.8733, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 9.0123920390537, |
|
"eval_loss": 6.0330352783203125, |
|
"eval_runtime": 2.5874, |
|
"eval_samples_per_second": 1283.132, |
|
"eval_steps_per_second": 80.389, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 10.514457378895981, |
|
"eval_loss": 5.968105792999268, |
|
"eval_runtime": 2.6066, |
|
"eval_samples_per_second": 1273.702, |
|
"eval_steps_per_second": 79.798, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 12.016522718738265, |
|
"grad_norm": 2.316237688064575, |
|
"learning_rate": 4e-05, |
|
"loss": 5.723, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 12.016522718738265, |
|
"eval_loss": 5.878941535949707, |
|
"eval_runtime": 2.5723, |
|
"eval_samples_per_second": 1290.696, |
|
"eval_steps_per_second": 80.863, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 13.518588058580548, |
|
"eval_loss": 5.8198137283325195, |
|
"eval_runtime": 2.5819, |
|
"eval_samples_per_second": 1285.857, |
|
"eval_steps_per_second": 80.56, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 15.020653398422832, |
|
"grad_norm": 2.299781322479248, |
|
"learning_rate": 5e-05, |
|
"loss": 5.6127, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 15.020653398422832, |
|
"eval_loss": 5.802139759063721, |
|
"eval_runtime": 2.6545, |
|
"eval_samples_per_second": 1250.713, |
|
"eval_steps_per_second": 78.358, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 16.522718738265116, |
|
"eval_loss": 5.7662129402160645, |
|
"eval_runtime": 2.5877, |
|
"eval_samples_per_second": 1282.983, |
|
"eval_steps_per_second": 80.38, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 18.0247840781074, |
|
"grad_norm": 2.8674566745758057, |
|
"learning_rate": 6e-05, |
|
"loss": 5.5325, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 18.0247840781074, |
|
"eval_loss": 5.731865406036377, |
|
"eval_runtime": 2.5813, |
|
"eval_samples_per_second": 1286.16, |
|
"eval_steps_per_second": 80.579, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 19.52684941794968, |
|
"eval_loss": 5.703192234039307, |
|
"eval_runtime": 2.5846, |
|
"eval_samples_per_second": 1284.544, |
|
"eval_steps_per_second": 80.477, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 21.028914757791963, |
|
"grad_norm": 4.573985576629639, |
|
"learning_rate": 7e-05, |
|
"loss": 5.4632, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 21.028914757791963, |
|
"eval_loss": 5.70460319519043, |
|
"eval_runtime": 2.5824, |
|
"eval_samples_per_second": 1285.631, |
|
"eval_steps_per_second": 80.546, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 22.53098009763425, |
|
"eval_loss": 5.570806503295898, |
|
"eval_runtime": 2.5775, |
|
"eval_samples_per_second": 1288.087, |
|
"eval_steps_per_second": 80.699, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 24.03304543747653, |
|
"grad_norm": 5.407034397125244, |
|
"learning_rate": 8e-05, |
|
"loss": 5.2633, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 24.03304543747653, |
|
"eval_loss": 4.99446439743042, |
|
"eval_runtime": 2.5969, |
|
"eval_samples_per_second": 1278.448, |
|
"eval_steps_per_second": 80.096, |
|
"step": 32000 |
|
} |
|
], |
|
"logging_steps": 4000, |
|
"max_steps": 100000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 76, |
|
"save_steps": 4000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 9980102137872384.0, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|