{ "best_metric": 4.99446439743042, "best_model_checkpoint": "/home/p318482/babyLM_controlled/models_trained_last/de_mlm/de_childes_30/checkpoint-32000", "epoch": 24.03304543747653, "eval_steps": 2000, "global_step": 32000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1.5020653398422832, "eval_loss": 7.472109317779541, "eval_runtime": 2.5707, "eval_samples_per_second": 1291.498, "eval_steps_per_second": 80.913, "step": 2000 }, { "epoch": 3.0041306796845664, "grad_norm": 0.9529591798782349, "learning_rate": 1e-05, "loss": 7.3851, "step": 4000 }, { "epoch": 3.0041306796845664, "eval_loss": 6.4175896644592285, "eval_runtime": 2.5761, "eval_samples_per_second": 1288.76, "eval_steps_per_second": 80.742, "step": 4000 }, { "epoch": 4.50619601952685, "eval_loss": 6.3026041984558105, "eval_runtime": 2.5876, "eval_samples_per_second": 1283.023, "eval_steps_per_second": 80.382, "step": 6000 }, { "epoch": 6.008261359369133, "grad_norm": 1.5569523572921753, "learning_rate": 2e-05, "loss": 6.0706, "step": 8000 }, { "epoch": 6.008261359369133, "eval_loss": 6.200850486755371, "eval_runtime": 2.6505, "eval_samples_per_second": 1252.599, "eval_steps_per_second": 78.476, "step": 8000 }, { "epoch": 7.510326699211416, "eval_loss": 6.105844497680664, "eval_runtime": 2.5888, "eval_samples_per_second": 1282.425, "eval_steps_per_second": 80.345, "step": 10000 }, { "epoch": 9.0123920390537, "grad_norm": 2.3049111366271973, "learning_rate": 3e-05, "loss": 5.8733, "step": 12000 }, { "epoch": 9.0123920390537, "eval_loss": 6.0330352783203125, "eval_runtime": 2.5874, "eval_samples_per_second": 1283.132, "eval_steps_per_second": 80.389, "step": 12000 }, { "epoch": 10.514457378895981, "eval_loss": 5.968105792999268, "eval_runtime": 2.6066, "eval_samples_per_second": 1273.702, "eval_steps_per_second": 79.798, "step": 14000 }, { "epoch": 12.016522718738265, "grad_norm": 2.316237688064575, "learning_rate": 4e-05, "loss": 5.723, "step": 16000 }, { "epoch": 12.016522718738265, "eval_loss": 5.878941535949707, "eval_runtime": 2.5723, "eval_samples_per_second": 1290.696, "eval_steps_per_second": 80.863, "step": 16000 }, { "epoch": 13.518588058580548, "eval_loss": 5.8198137283325195, "eval_runtime": 2.5819, "eval_samples_per_second": 1285.857, "eval_steps_per_second": 80.56, "step": 18000 }, { "epoch": 15.020653398422832, "grad_norm": 2.299781322479248, "learning_rate": 5e-05, "loss": 5.6127, "step": 20000 }, { "epoch": 15.020653398422832, "eval_loss": 5.802139759063721, "eval_runtime": 2.6545, "eval_samples_per_second": 1250.713, "eval_steps_per_second": 78.358, "step": 20000 }, { "epoch": 16.522718738265116, "eval_loss": 5.7662129402160645, "eval_runtime": 2.5877, "eval_samples_per_second": 1282.983, "eval_steps_per_second": 80.38, "step": 22000 }, { "epoch": 18.0247840781074, "grad_norm": 2.8674566745758057, "learning_rate": 6e-05, "loss": 5.5325, "step": 24000 }, { "epoch": 18.0247840781074, "eval_loss": 5.731865406036377, "eval_runtime": 2.5813, "eval_samples_per_second": 1286.16, "eval_steps_per_second": 80.579, "step": 24000 }, { "epoch": 19.52684941794968, "eval_loss": 5.703192234039307, "eval_runtime": 2.5846, "eval_samples_per_second": 1284.544, "eval_steps_per_second": 80.477, "step": 26000 }, { "epoch": 21.028914757791963, "grad_norm": 4.573985576629639, "learning_rate": 7e-05, "loss": 5.4632, "step": 28000 }, { "epoch": 21.028914757791963, "eval_loss": 5.70460319519043, "eval_runtime": 2.5824, "eval_samples_per_second": 1285.631, "eval_steps_per_second": 80.546, "step": 28000 }, { "epoch": 22.53098009763425, "eval_loss": 5.570806503295898, "eval_runtime": 2.5775, "eval_samples_per_second": 1288.087, "eval_steps_per_second": 80.699, "step": 30000 }, { "epoch": 24.03304543747653, "grad_norm": 5.407034397125244, "learning_rate": 8e-05, "loss": 5.2633, "step": 32000 }, { "epoch": 24.03304543747653, "eval_loss": 4.99446439743042, "eval_runtime": 2.5969, "eval_samples_per_second": 1278.448, "eval_steps_per_second": 80.096, "step": 32000 } ], "logging_steps": 4000, "max_steps": 100000, "num_input_tokens_seen": 0, "num_train_epochs": 76, "save_steps": 4000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 9980102137872384.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }