{ "best_metric": 2.7392032146453857, "best_model_checkpoint": "/home/p318482/babyLM_controlled/models_trained_last/de_mlm/de_childes_30/checkpoint-76000", "epoch": 57.078482914006756, "eval_steps": 2000, "global_step": 76000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1.5020653398422832, "eval_loss": 7.472109317779541, "eval_runtime": 2.5707, "eval_samples_per_second": 1291.498, "eval_steps_per_second": 80.913, "step": 2000 }, { "epoch": 3.0041306796845664, "grad_norm": 0.9529591798782349, "learning_rate": 1e-05, "loss": 7.3851, "step": 4000 }, { "epoch": 3.0041306796845664, "eval_loss": 6.4175896644592285, "eval_runtime": 2.5761, "eval_samples_per_second": 1288.76, "eval_steps_per_second": 80.742, "step": 4000 }, { "epoch": 4.50619601952685, "eval_loss": 6.3026041984558105, "eval_runtime": 2.5876, "eval_samples_per_second": 1283.023, "eval_steps_per_second": 80.382, "step": 6000 }, { "epoch": 6.008261359369133, "grad_norm": 1.5569523572921753, "learning_rate": 2e-05, "loss": 6.0706, "step": 8000 }, { "epoch": 6.008261359369133, "eval_loss": 6.200850486755371, "eval_runtime": 2.6505, "eval_samples_per_second": 1252.599, "eval_steps_per_second": 78.476, "step": 8000 }, { "epoch": 7.510326699211416, "eval_loss": 6.105844497680664, "eval_runtime": 2.5888, "eval_samples_per_second": 1282.425, "eval_steps_per_second": 80.345, "step": 10000 }, { "epoch": 9.0123920390537, "grad_norm": 2.3049111366271973, "learning_rate": 3e-05, "loss": 5.8733, "step": 12000 }, { "epoch": 9.0123920390537, "eval_loss": 6.0330352783203125, "eval_runtime": 2.5874, "eval_samples_per_second": 1283.132, "eval_steps_per_second": 80.389, "step": 12000 }, { "epoch": 10.514457378895981, "eval_loss": 5.968105792999268, "eval_runtime": 2.6066, "eval_samples_per_second": 1273.702, "eval_steps_per_second": 79.798, "step": 14000 }, { "epoch": 12.016522718738265, "grad_norm": 2.316237688064575, "learning_rate": 4e-05, "loss": 5.723, "step": 16000 }, { "epoch": 12.016522718738265, "eval_loss": 5.878941535949707, "eval_runtime": 2.5723, "eval_samples_per_second": 1290.696, "eval_steps_per_second": 80.863, "step": 16000 }, { "epoch": 13.518588058580548, "eval_loss": 5.8198137283325195, "eval_runtime": 2.5819, "eval_samples_per_second": 1285.857, "eval_steps_per_second": 80.56, "step": 18000 }, { "epoch": 15.020653398422832, "grad_norm": 2.299781322479248, "learning_rate": 5e-05, "loss": 5.6127, "step": 20000 }, { "epoch": 15.020653398422832, "eval_loss": 5.802139759063721, "eval_runtime": 2.6545, "eval_samples_per_second": 1250.713, "eval_steps_per_second": 78.358, "step": 20000 }, { "epoch": 16.522718738265116, "eval_loss": 5.7662129402160645, "eval_runtime": 2.5877, "eval_samples_per_second": 1282.983, "eval_steps_per_second": 80.38, "step": 22000 }, { "epoch": 18.0247840781074, "grad_norm": 2.8674566745758057, "learning_rate": 6e-05, "loss": 5.5325, "step": 24000 }, { "epoch": 18.0247840781074, "eval_loss": 5.731865406036377, "eval_runtime": 2.5813, "eval_samples_per_second": 1286.16, "eval_steps_per_second": 80.579, "step": 24000 }, { "epoch": 19.52684941794968, "eval_loss": 5.703192234039307, "eval_runtime": 2.5846, "eval_samples_per_second": 1284.544, "eval_steps_per_second": 80.477, "step": 26000 }, { "epoch": 21.028914757791963, "grad_norm": 4.573985576629639, "learning_rate": 7e-05, "loss": 5.4632, "step": 28000 }, { "epoch": 21.028914757791963, "eval_loss": 5.70460319519043, "eval_runtime": 2.5824, "eval_samples_per_second": 1285.631, "eval_steps_per_second": 80.546, "step": 28000 }, { "epoch": 22.53098009763425, "eval_loss": 5.570806503295898, "eval_runtime": 2.5775, "eval_samples_per_second": 1288.087, "eval_steps_per_second": 80.699, "step": 30000 }, { "epoch": 24.03304543747653, "grad_norm": 5.407034397125244, "learning_rate": 8e-05, "loss": 5.2633, "step": 32000 }, { "epoch": 24.03304543747653, "eval_loss": 4.99446439743042, "eval_runtime": 2.5969, "eval_samples_per_second": 1278.448, "eval_steps_per_second": 80.096, "step": 32000 }, { "epoch": 25.535110777318813, "eval_loss": 4.523237705230713, "eval_runtime": 2.5888, "eval_samples_per_second": 1282.462, "eval_steps_per_second": 80.347, "step": 34000 }, { "epoch": 27.037176117161096, "grad_norm": 5.287095069885254, "learning_rate": 9e-05, "loss": 4.3386, "step": 36000 }, { "epoch": 27.037176117161096, "eval_loss": 4.140077590942383, "eval_runtime": 2.5851, "eval_samples_per_second": 1284.299, "eval_steps_per_second": 80.462, "step": 36000 }, { "epoch": 28.539241457003378, "eval_loss": 3.857847213745117, "eval_runtime": 2.6038, "eval_samples_per_second": 1275.044, "eval_steps_per_second": 79.882, "step": 38000 }, { "epoch": 30.041306796845664, "grad_norm": 4.30466890335083, "learning_rate": 0.0001, "loss": 3.6823, "step": 40000 }, { "epoch": 30.041306796845664, "eval_loss": 3.6678993701934814, "eval_runtime": 2.7074, "eval_samples_per_second": 1226.275, "eval_steps_per_second": 76.827, "step": 40000 }, { "epoch": 31.543372136687946, "eval_loss": 3.5396695137023926, "eval_runtime": 4.9347, "eval_samples_per_second": 672.792, "eval_steps_per_second": 42.151, "step": 42000 }, { "epoch": 33.04543747653023, "grad_norm": 4.037199974060059, "learning_rate": 9.333333333333334e-05, "loss": 3.3424, "step": 44000 }, { "epoch": 33.04543747653023, "eval_loss": 3.4012534618377686, "eval_runtime": 4.8721, "eval_samples_per_second": 681.437, "eval_steps_per_second": 42.692, "step": 44000 }, { "epoch": 34.547502816372514, "eval_loss": 3.315351963043213, "eval_runtime": 4.9228, "eval_samples_per_second": 674.419, "eval_steps_per_second": 42.253, "step": 46000 }, { "epoch": 36.0495681562148, "grad_norm": 5.304887771606445, "learning_rate": 8.666666666666667e-05, "loss": 3.1266, "step": 48000 }, { "epoch": 36.0495681562148, "eval_loss": 3.2563161849975586, "eval_runtime": 4.958, "eval_samples_per_second": 669.632, "eval_steps_per_second": 41.953, "step": 48000 }, { "epoch": 37.55163349605708, "eval_loss": 3.1665167808532715, "eval_runtime": 4.695, "eval_samples_per_second": 707.142, "eval_steps_per_second": 44.303, "step": 50000 }, { "epoch": 39.05369883589936, "grad_norm": 4.668087005615234, "learning_rate": 8e-05, "loss": 2.9693, "step": 52000 }, { "epoch": 39.05369883589936, "eval_loss": 3.1341300010681152, "eval_runtime": 3.8879, "eval_samples_per_second": 853.931, "eval_steps_per_second": 53.499, "step": 52000 }, { "epoch": 40.55576417574164, "eval_loss": 3.0564303398132324, "eval_runtime": 4.2699, "eval_samples_per_second": 777.528, "eval_steps_per_second": 48.713, "step": 54000 }, { "epoch": 42.057829515583926, "grad_norm": 4.504222393035889, "learning_rate": 7.333333333333333e-05, "loss": 2.8544, "step": 56000 }, { "epoch": 42.057829515583926, "eval_loss": 3.0329349040985107, "eval_runtime": 4.8343, "eval_samples_per_second": 686.764, "eval_steps_per_second": 43.026, "step": 56000 }, { "epoch": 43.55989485542621, "eval_loss": 2.9552295207977295, "eval_runtime": 4.8628, "eval_samples_per_second": 682.729, "eval_steps_per_second": 42.773, "step": 58000 }, { "epoch": 45.0619601952685, "grad_norm": 4.24030065536499, "learning_rate": 6.666666666666667e-05, "loss": 2.758, "step": 60000 }, { "epoch": 45.0619601952685, "eval_loss": 2.9491591453552246, "eval_runtime": 4.8121, "eval_samples_per_second": 689.932, "eval_steps_per_second": 43.225, "step": 60000 }, { "epoch": 46.56402553511078, "eval_loss": 2.8936784267425537, "eval_runtime": 4.9154, "eval_samples_per_second": 675.43, "eval_steps_per_second": 42.316, "step": 62000 }, { "epoch": 48.06609087495306, "grad_norm": 4.06797456741333, "learning_rate": 6e-05, "loss": 2.684, "step": 64000 }, { "epoch": 48.06609087495306, "eval_loss": 2.866178512573242, "eval_runtime": 4.8779, "eval_samples_per_second": 680.615, "eval_steps_per_second": 42.641, "step": 64000 }, { "epoch": 49.568156214795344, "eval_loss": 2.8494906425476074, "eval_runtime": 4.8964, "eval_samples_per_second": 678.05, "eval_steps_per_second": 42.48, "step": 66000 }, { "epoch": 51.07022155463763, "grad_norm": 4.346592903137207, "learning_rate": 5.333333333333333e-05, "loss": 2.6223, "step": 68000 }, { "epoch": 51.07022155463763, "eval_loss": 2.825268507003784, "eval_runtime": 4.8304, "eval_samples_per_second": 687.313, "eval_steps_per_second": 43.061, "step": 68000 }, { "epoch": 52.57228689447991, "eval_loss": 2.78464937210083, "eval_runtime": 4.904, "eval_samples_per_second": 676.993, "eval_steps_per_second": 42.414, "step": 70000 }, { "epoch": 54.07435223432219, "grad_norm": 4.131275177001953, "learning_rate": 4.666666666666667e-05, "loss": 2.5704, "step": 72000 }, { "epoch": 54.07435223432219, "eval_loss": 2.7910172939300537, "eval_runtime": 4.8745, "eval_samples_per_second": 681.097, "eval_steps_per_second": 42.671, "step": 72000 }, { "epoch": 55.57641757416447, "eval_loss": 2.750303030014038, "eval_runtime": 2.5866, "eval_samples_per_second": 1283.529, "eval_steps_per_second": 80.414, "step": 74000 }, { "epoch": 57.078482914006756, "grad_norm": 4.2763261795043945, "learning_rate": 4e-05, "loss": 2.5235, "step": 76000 }, { "epoch": 57.078482914006756, "eval_loss": 2.7392032146453857, "eval_runtime": 4.7672, "eval_samples_per_second": 696.429, "eval_steps_per_second": 43.632, "step": 76000 } ], "logging_steps": 4000, "max_steps": 100000, "num_input_tokens_seen": 0, "num_train_epochs": 76, "save_steps": 4000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.370274257744691e+16, "train_batch_size": 16, "trial_name": null, "trial_params": null }