|
{ |
|
"best_metric": 2.6637983322143555, |
|
"best_model_checkpoint": "/home/p318482/babyLM_controlled/models_trained_last/de_mlm/de_childes_30/checkpoint-88000", |
|
"epoch": 66.09087495306046, |
|
"eval_steps": 2000, |
|
"global_step": 88000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 1.5020653398422832, |
|
"eval_loss": 7.472109317779541, |
|
"eval_runtime": 2.5707, |
|
"eval_samples_per_second": 1291.498, |
|
"eval_steps_per_second": 80.913, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 3.0041306796845664, |
|
"grad_norm": 0.9529591798782349, |
|
"learning_rate": 1e-05, |
|
"loss": 7.3851, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 3.0041306796845664, |
|
"eval_loss": 6.4175896644592285, |
|
"eval_runtime": 2.5761, |
|
"eval_samples_per_second": 1288.76, |
|
"eval_steps_per_second": 80.742, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 4.50619601952685, |
|
"eval_loss": 6.3026041984558105, |
|
"eval_runtime": 2.5876, |
|
"eval_samples_per_second": 1283.023, |
|
"eval_steps_per_second": 80.382, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 6.008261359369133, |
|
"grad_norm": 1.5569523572921753, |
|
"learning_rate": 2e-05, |
|
"loss": 6.0706, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 6.008261359369133, |
|
"eval_loss": 6.200850486755371, |
|
"eval_runtime": 2.6505, |
|
"eval_samples_per_second": 1252.599, |
|
"eval_steps_per_second": 78.476, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 7.510326699211416, |
|
"eval_loss": 6.105844497680664, |
|
"eval_runtime": 2.5888, |
|
"eval_samples_per_second": 1282.425, |
|
"eval_steps_per_second": 80.345, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 9.0123920390537, |
|
"grad_norm": 2.3049111366271973, |
|
"learning_rate": 3e-05, |
|
"loss": 5.8733, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 9.0123920390537, |
|
"eval_loss": 6.0330352783203125, |
|
"eval_runtime": 2.5874, |
|
"eval_samples_per_second": 1283.132, |
|
"eval_steps_per_second": 80.389, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 10.514457378895981, |
|
"eval_loss": 5.968105792999268, |
|
"eval_runtime": 2.6066, |
|
"eval_samples_per_second": 1273.702, |
|
"eval_steps_per_second": 79.798, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 12.016522718738265, |
|
"grad_norm": 2.316237688064575, |
|
"learning_rate": 4e-05, |
|
"loss": 5.723, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 12.016522718738265, |
|
"eval_loss": 5.878941535949707, |
|
"eval_runtime": 2.5723, |
|
"eval_samples_per_second": 1290.696, |
|
"eval_steps_per_second": 80.863, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 13.518588058580548, |
|
"eval_loss": 5.8198137283325195, |
|
"eval_runtime": 2.5819, |
|
"eval_samples_per_second": 1285.857, |
|
"eval_steps_per_second": 80.56, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 15.020653398422832, |
|
"grad_norm": 2.299781322479248, |
|
"learning_rate": 5e-05, |
|
"loss": 5.6127, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 15.020653398422832, |
|
"eval_loss": 5.802139759063721, |
|
"eval_runtime": 2.6545, |
|
"eval_samples_per_second": 1250.713, |
|
"eval_steps_per_second": 78.358, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 16.522718738265116, |
|
"eval_loss": 5.7662129402160645, |
|
"eval_runtime": 2.5877, |
|
"eval_samples_per_second": 1282.983, |
|
"eval_steps_per_second": 80.38, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 18.0247840781074, |
|
"grad_norm": 2.8674566745758057, |
|
"learning_rate": 6e-05, |
|
"loss": 5.5325, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 18.0247840781074, |
|
"eval_loss": 5.731865406036377, |
|
"eval_runtime": 2.5813, |
|
"eval_samples_per_second": 1286.16, |
|
"eval_steps_per_second": 80.579, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 19.52684941794968, |
|
"eval_loss": 5.703192234039307, |
|
"eval_runtime": 2.5846, |
|
"eval_samples_per_second": 1284.544, |
|
"eval_steps_per_second": 80.477, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 21.028914757791963, |
|
"grad_norm": 4.573985576629639, |
|
"learning_rate": 7e-05, |
|
"loss": 5.4632, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 21.028914757791963, |
|
"eval_loss": 5.70460319519043, |
|
"eval_runtime": 2.5824, |
|
"eval_samples_per_second": 1285.631, |
|
"eval_steps_per_second": 80.546, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 22.53098009763425, |
|
"eval_loss": 5.570806503295898, |
|
"eval_runtime": 2.5775, |
|
"eval_samples_per_second": 1288.087, |
|
"eval_steps_per_second": 80.699, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 24.03304543747653, |
|
"grad_norm": 5.407034397125244, |
|
"learning_rate": 8e-05, |
|
"loss": 5.2633, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 24.03304543747653, |
|
"eval_loss": 4.99446439743042, |
|
"eval_runtime": 2.5969, |
|
"eval_samples_per_second": 1278.448, |
|
"eval_steps_per_second": 80.096, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 25.535110777318813, |
|
"eval_loss": 4.523237705230713, |
|
"eval_runtime": 2.5888, |
|
"eval_samples_per_second": 1282.462, |
|
"eval_steps_per_second": 80.347, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 27.037176117161096, |
|
"grad_norm": 5.287095069885254, |
|
"learning_rate": 9e-05, |
|
"loss": 4.3386, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 27.037176117161096, |
|
"eval_loss": 4.140077590942383, |
|
"eval_runtime": 2.5851, |
|
"eval_samples_per_second": 1284.299, |
|
"eval_steps_per_second": 80.462, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 28.539241457003378, |
|
"eval_loss": 3.857847213745117, |
|
"eval_runtime": 2.6038, |
|
"eval_samples_per_second": 1275.044, |
|
"eval_steps_per_second": 79.882, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 30.041306796845664, |
|
"grad_norm": 4.30466890335083, |
|
"learning_rate": 0.0001, |
|
"loss": 3.6823, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 30.041306796845664, |
|
"eval_loss": 3.6678993701934814, |
|
"eval_runtime": 2.7074, |
|
"eval_samples_per_second": 1226.275, |
|
"eval_steps_per_second": 76.827, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 31.543372136687946, |
|
"eval_loss": 3.5396695137023926, |
|
"eval_runtime": 4.9347, |
|
"eval_samples_per_second": 672.792, |
|
"eval_steps_per_second": 42.151, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 33.04543747653023, |
|
"grad_norm": 4.037199974060059, |
|
"learning_rate": 9.333333333333334e-05, |
|
"loss": 3.3424, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 33.04543747653023, |
|
"eval_loss": 3.4012534618377686, |
|
"eval_runtime": 4.8721, |
|
"eval_samples_per_second": 681.437, |
|
"eval_steps_per_second": 42.692, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 34.547502816372514, |
|
"eval_loss": 3.315351963043213, |
|
"eval_runtime": 4.9228, |
|
"eval_samples_per_second": 674.419, |
|
"eval_steps_per_second": 42.253, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 36.0495681562148, |
|
"grad_norm": 5.304887771606445, |
|
"learning_rate": 8.666666666666667e-05, |
|
"loss": 3.1266, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 36.0495681562148, |
|
"eval_loss": 3.2563161849975586, |
|
"eval_runtime": 4.958, |
|
"eval_samples_per_second": 669.632, |
|
"eval_steps_per_second": 41.953, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 37.55163349605708, |
|
"eval_loss": 3.1665167808532715, |
|
"eval_runtime": 4.695, |
|
"eval_samples_per_second": 707.142, |
|
"eval_steps_per_second": 44.303, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 39.05369883589936, |
|
"grad_norm": 4.668087005615234, |
|
"learning_rate": 8e-05, |
|
"loss": 2.9693, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 39.05369883589936, |
|
"eval_loss": 3.1341300010681152, |
|
"eval_runtime": 3.8879, |
|
"eval_samples_per_second": 853.931, |
|
"eval_steps_per_second": 53.499, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 40.55576417574164, |
|
"eval_loss": 3.0564303398132324, |
|
"eval_runtime": 4.2699, |
|
"eval_samples_per_second": 777.528, |
|
"eval_steps_per_second": 48.713, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 42.057829515583926, |
|
"grad_norm": 4.504222393035889, |
|
"learning_rate": 7.333333333333333e-05, |
|
"loss": 2.8544, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 42.057829515583926, |
|
"eval_loss": 3.0329349040985107, |
|
"eval_runtime": 4.8343, |
|
"eval_samples_per_second": 686.764, |
|
"eval_steps_per_second": 43.026, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 43.55989485542621, |
|
"eval_loss": 2.9552295207977295, |
|
"eval_runtime": 4.8628, |
|
"eval_samples_per_second": 682.729, |
|
"eval_steps_per_second": 42.773, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 45.0619601952685, |
|
"grad_norm": 4.24030065536499, |
|
"learning_rate": 6.666666666666667e-05, |
|
"loss": 2.758, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 45.0619601952685, |
|
"eval_loss": 2.9491591453552246, |
|
"eval_runtime": 4.8121, |
|
"eval_samples_per_second": 689.932, |
|
"eval_steps_per_second": 43.225, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 46.56402553511078, |
|
"eval_loss": 2.8936784267425537, |
|
"eval_runtime": 4.9154, |
|
"eval_samples_per_second": 675.43, |
|
"eval_steps_per_second": 42.316, |
|
"step": 62000 |
|
}, |
|
{ |
|
"epoch": 48.06609087495306, |
|
"grad_norm": 4.06797456741333, |
|
"learning_rate": 6e-05, |
|
"loss": 2.684, |
|
"step": 64000 |
|
}, |
|
{ |
|
"epoch": 48.06609087495306, |
|
"eval_loss": 2.866178512573242, |
|
"eval_runtime": 4.8779, |
|
"eval_samples_per_second": 680.615, |
|
"eval_steps_per_second": 42.641, |
|
"step": 64000 |
|
}, |
|
{ |
|
"epoch": 49.568156214795344, |
|
"eval_loss": 2.8494906425476074, |
|
"eval_runtime": 4.8964, |
|
"eval_samples_per_second": 678.05, |
|
"eval_steps_per_second": 42.48, |
|
"step": 66000 |
|
}, |
|
{ |
|
"epoch": 51.07022155463763, |
|
"grad_norm": 4.346592903137207, |
|
"learning_rate": 5.333333333333333e-05, |
|
"loss": 2.6223, |
|
"step": 68000 |
|
}, |
|
{ |
|
"epoch": 51.07022155463763, |
|
"eval_loss": 2.825268507003784, |
|
"eval_runtime": 4.8304, |
|
"eval_samples_per_second": 687.313, |
|
"eval_steps_per_second": 43.061, |
|
"step": 68000 |
|
}, |
|
{ |
|
"epoch": 52.57228689447991, |
|
"eval_loss": 2.78464937210083, |
|
"eval_runtime": 4.904, |
|
"eval_samples_per_second": 676.993, |
|
"eval_steps_per_second": 42.414, |
|
"step": 70000 |
|
}, |
|
{ |
|
"epoch": 54.07435223432219, |
|
"grad_norm": 4.131275177001953, |
|
"learning_rate": 4.666666666666667e-05, |
|
"loss": 2.5704, |
|
"step": 72000 |
|
}, |
|
{ |
|
"epoch": 54.07435223432219, |
|
"eval_loss": 2.7910172939300537, |
|
"eval_runtime": 4.8745, |
|
"eval_samples_per_second": 681.097, |
|
"eval_steps_per_second": 42.671, |
|
"step": 72000 |
|
}, |
|
{ |
|
"epoch": 55.57641757416447, |
|
"eval_loss": 2.750303030014038, |
|
"eval_runtime": 2.5866, |
|
"eval_samples_per_second": 1283.529, |
|
"eval_steps_per_second": 80.414, |
|
"step": 74000 |
|
}, |
|
{ |
|
"epoch": 57.078482914006756, |
|
"grad_norm": 4.2763261795043945, |
|
"learning_rate": 4e-05, |
|
"loss": 2.5235, |
|
"step": 76000 |
|
}, |
|
{ |
|
"epoch": 57.078482914006756, |
|
"eval_loss": 2.7392032146453857, |
|
"eval_runtime": 4.7672, |
|
"eval_samples_per_second": 696.429, |
|
"eval_steps_per_second": 43.632, |
|
"step": 76000 |
|
}, |
|
{ |
|
"epoch": 58.580548253849045, |
|
"eval_loss": 2.722296714782715, |
|
"eval_runtime": 2.5739, |
|
"eval_samples_per_second": 1289.861, |
|
"eval_steps_per_second": 80.811, |
|
"step": 78000 |
|
}, |
|
{ |
|
"epoch": 60.08261359369133, |
|
"grad_norm": 4.225383281707764, |
|
"learning_rate": 3.3333333333333335e-05, |
|
"loss": 2.4865, |
|
"step": 80000 |
|
}, |
|
{ |
|
"epoch": 60.08261359369133, |
|
"eval_loss": 2.714205503463745, |
|
"eval_runtime": 2.5794, |
|
"eval_samples_per_second": 1287.128, |
|
"eval_steps_per_second": 80.639, |
|
"step": 80000 |
|
}, |
|
{ |
|
"epoch": 61.58467893353361, |
|
"eval_loss": 2.7068099975585938, |
|
"eval_runtime": 2.5894, |
|
"eval_samples_per_second": 1282.152, |
|
"eval_steps_per_second": 80.328, |
|
"step": 82000 |
|
}, |
|
{ |
|
"epoch": 63.08674427337589, |
|
"grad_norm": 4.279160976409912, |
|
"learning_rate": 2.6666666666666667e-05, |
|
"loss": 2.4549, |
|
"step": 84000 |
|
}, |
|
{ |
|
"epoch": 63.08674427337589, |
|
"eval_loss": 2.7050325870513916, |
|
"eval_runtime": 2.5878, |
|
"eval_samples_per_second": 1282.952, |
|
"eval_steps_per_second": 80.378, |
|
"step": 84000 |
|
}, |
|
{ |
|
"epoch": 64.58880961321817, |
|
"eval_loss": 2.674473524093628, |
|
"eval_runtime": 2.5772, |
|
"eval_samples_per_second": 1288.232, |
|
"eval_steps_per_second": 80.709, |
|
"step": 86000 |
|
}, |
|
{ |
|
"epoch": 66.09087495306046, |
|
"grad_norm": 5.214581489562988, |
|
"learning_rate": 2e-05, |
|
"loss": 2.427, |
|
"step": 88000 |
|
}, |
|
{ |
|
"epoch": 66.09087495306046, |
|
"eval_loss": 2.6637983322143555, |
|
"eval_runtime": 2.6044, |
|
"eval_samples_per_second": 1274.757, |
|
"eval_steps_per_second": 79.864, |
|
"step": 88000 |
|
} |
|
], |
|
"logging_steps": 4000, |
|
"max_steps": 100000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 76, |
|
"save_steps": 4000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.7445280879149056e+16, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|