Training in progress, step 23000, checkpoint
{
"best_metric": 0.9653898477554321,
"best_model_checkpoint": "chessgpt2-medium-m/checkpoint-23000",
"epoch": 2.9434348605067826,
"eval_steps": 1000,
"global_step": 23000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.1279754287176862,
"grad_norm": 1.6428743600845337,
"learning_rate": 4.9959725443266765e-05,
"loss": 2.4607,
"step": 1000
},
{
"epoch": 0.1279754287176862,
"eval_loss": 1.8105345964431763,
"eval_runtime": 376.5583,
"eval_samples_per_second": 73.781,
"eval_steps_per_second": 9.223,
"step": 1000
},
{
"epoch": 0.2559508574353724,
"grad_norm": 1.0423442125320435,
"learning_rate": 4.952992634500169e-05,
"loss": 1.6726,
"step": 2000
},
{
"epoch": 0.2559508574353724,
"eval_loss": 1.4938699007034302,
"eval_runtime": 376.7393,
"eval_samples_per_second": 73.746,
"eval_steps_per_second": 9.219,
"step": 2000
},
{
"epoch": 0.3839262861530586,
"grad_norm": 1.234433889389038,
"learning_rate": 4.863737385256343e-05,
"loss": 1.4686,
"step": 3000
},
{
"epoch": 0.3839262861530586,
"eval_loss": 1.3617429733276367,
"eval_runtime": 379.1693,
"eval_samples_per_second": 73.273,
"eval_steps_per_second": 9.159,
"step": 3000
},
{
"epoch": 0.5119017148707448,
"grad_norm": 1.1198837757110596,
"learning_rate": 4.729890583531792e-05,
"loss": 1.3601,
"step": 4000
},
{
"epoch": 0.5119017148707448,
"eval_loss": 1.2697532176971436,
"eval_runtime": 376.7562,
"eval_samples_per_second": 73.743,
"eval_steps_per_second": 9.218,
"step": 4000
},
{
"epoch": 0.6398771435884311,
"grad_norm": 0.9552892446517944,
"learning_rate": 4.5539772292358576e-05,
"loss": 1.2875,
"step": 5000
},
{
"epoch": 0.6398771435884311,
"eval_loss": 1.2163749933242798,
"eval_runtime": 378.6404,
"eval_samples_per_second": 73.376,
"eval_steps_per_second": 9.172,
"step": 5000
},
{
"epoch": 0.7678525723061173,
"grad_norm": 1.001042127609253,
"learning_rate": 4.3393159015047314e-05,
"loss": 1.2366,
"step": 6000
},
{
"epoch": 0.7678525723061173,
"eval_loss": 1.1716755628585815,
"eval_runtime": 376.0934,
"eval_samples_per_second": 73.873,
"eval_steps_per_second": 9.234,
"step": 6000
},
{
"epoch": 0.8958280010238034,
"grad_norm": 1.0304712057113647,
"learning_rate": 4.089956154202057e-05,
"loss": 1.1992,
"step": 7000
},
{
"epoch": 0.8958280010238034,
"eval_loss": 1.1404258012771606,
"eval_runtime": 377.955,
"eval_samples_per_second": 73.509,
"eval_steps_per_second": 9.189,
"step": 7000
},
{
"epoch": 1.0238034297414895,
"grad_norm": 0.7832671403884888,
"learning_rate": 3.810602121690553e-05,
"loss": 1.1646,
"step": 8000
},
{
"epoch": 1.0238034297414895,
"eval_loss": 1.1096745729446411,
"eval_runtime": 377.602,
"eval_samples_per_second": 73.577,
"eval_steps_per_second": 9.198,
"step": 8000
},
{
"epoch": 1.151778858459176,
"grad_norm": 0.7852849364280701,
"learning_rate": 3.5065237760403566e-05,
"loss": 1.126,
"step": 9000
},
{
"epoch": 1.151778858459176,
"eval_loss": 1.0933990478515625,
"eval_runtime": 378.2151,
"eval_samples_per_second": 73.458,
"eval_steps_per_second": 9.183,
"step": 9000
},
{
"epoch": 1.279754287176862,
"grad_norm": 0.7889285087585449,
"learning_rate": 3.183457509793587e-05,
"loss": 1.1085,
"step": 10000
},
{
"epoch": 1.279754287176862,
"eval_loss": 1.0705418586730957,
"eval_runtime": 376.2258,
"eval_samples_per_second": 73.847,
"eval_steps_per_second": 9.231,
"step": 10000
},
{
"epoch": 1.4077297158945483,
"grad_norm": 0.8414849638938904,
"learning_rate": 2.8478382244815133e-05,
"loss": 1.0875,
"step": 11000
},
{
"epoch": 1.4077297158945483,
"eval_loss": 1.0541125535964966,
"eval_runtime": 378.6403,
"eval_samples_per_second": 73.376,
"eval_steps_per_second": 9.172,
"step": 11000
},
{
"epoch": 1.5357051446122343,
"grad_norm": 0.8442362546920776,
"learning_rate": 2.5053264766232426e-05,
"loss": 1.0733,
"step": 12000
},
{
"epoch": 1.5357051446122343,
"eval_loss": 1.0388243198394775,
"eval_runtime": 376.572,
"eval_samples_per_second": 73.779,
"eval_steps_per_second": 9.223,
"step": 12000
},
{
"epoch": 1.6636805733299207,
"grad_norm": 0.9030175805091858,
"learning_rate": 2.1627142455795886e-05,
"loss": 1.059,
"step": 13000
},
{
"epoch": 1.6636805733299207,
"eval_loss": 1.0245987176895142,
"eval_runtime": 378.3137,
"eval_samples_per_second": 73.439,
"eval_steps_per_second": 9.18,
"step": 13000
},
{
"epoch": 1.7916560020476069,
"grad_norm": 0.8050616979598999,
"learning_rate": 1.8264648598812123e-05,
"loss": 1.0451,
"step": 14000
},
{
"epoch": 1.7916560020476069,
"eval_loss": 1.0108946561813354,
"eval_runtime": 376.6418,
"eval_samples_per_second": 73.765,
"eval_steps_per_second": 9.221,
"step": 14000
},
{
"epoch": 1.919631430765293,
"grad_norm": 0.7604677677154541,
"learning_rate": 1.503551911234875e-05,
"loss": 1.0327,
"step": 15000
},
{
"epoch": 1.919631430765293,
"eval_loss": 1.0017954111099243,
"eval_runtime": 378.423,
"eval_samples_per_second": 73.418,
"eval_steps_per_second": 9.178,
"step": 15000
},
{
"epoch": 2.047606859482979,
"grad_norm": 0.8110759258270264,
"learning_rate": 1.1987749064346765e-05,
"loss": 1.0118,
"step": 16000
},
{
"epoch": 2.047606859482979,
"eval_loss": 0.991495668888092,
"eval_runtime": 377.8976,
"eval_samples_per_second": 73.52,
"eval_steps_per_second": 9.19,
"step": 16000
},
{
"epoch": 2.1755822882006655,
"grad_norm": 0.8434863090515137,
"learning_rate": 9.185453185391116e-06,
"loss": 0.9862,
"step": 17000
},
{
"epoch": 2.1755822882006655,
"eval_loss": 0.9861236810684204,
"eval_runtime": 378.3771,
"eval_samples_per_second": 73.427,
"eval_steps_per_second": 9.179,
"step": 17000
},
{
"epoch": 2.303557716918352,
"grad_norm": 0.8797647356987,
"learning_rate": 6.681496368397716e-06,
"loss": 0.9806,
"step": 18000
},
{
"epoch": 2.303557716918352,
"eval_loss": 0.9783245325088501,
"eval_runtime": 376.5913,
"eval_samples_per_second": 73.775,
"eval_steps_per_second": 9.222,
"step": 18000
},
{
"epoch": 2.431533145636038,
"grad_norm": 0.8292227983474731,
"learning_rate": 4.5231153844693594e-06,
"loss": 0.9757,
"step": 19000
},
{
"epoch": 2.431533145636038,
"eval_loss": 0.9735883474349976,
"eval_runtime": 372.1389,
"eval_samples_per_second": 74.658,
"eval_steps_per_second": 9.333,
"step": 19000
},
{
"epoch": 2.559508574353724,
"grad_norm": 0.8722350597381592,
"learning_rate": 2.75102776826896e-06,
"loss": 0.9713,
"step": 20000
},
{
"epoch": 2.559508574353724,
"eval_loss": 0.969468891620636,
"eval_runtime": 372.7114,
"eval_samples_per_second": 74.543,
"eval_steps_per_second": 9.318,
"step": 20000
},
{
"epoch": 2.68748400307141,
"grad_norm": 0.8890399932861328,
"learning_rate": 1.3986636878139808e-06,
"loss": 0.9675,
"step": 21000
},
{
"epoch": 2.68748400307141,
"eval_loss": 0.967136025428772,
"eval_runtime": 373.3491,
"eval_samples_per_second": 74.416,
"eval_steps_per_second": 9.302,
"step": 21000
},
{
"epoch": 2.8154594317890966,
"grad_norm": 0.8660491704940796,
"learning_rate": 4.915352893533093e-07,
"loss": 0.9659,
"step": 22000
},
{
"epoch": 2.8154594317890966,
"eval_loss": 0.9657524228096008,
"eval_runtime": 372.6975,
"eval_samples_per_second": 74.546,
"eval_steps_per_second": 9.319,
"step": 22000
},
{
"epoch": 2.9434348605067826,
"grad_norm": 0.8297032713890076,
"learning_rate": 4.675541454145782e-08,
"loss": 0.965,
"step": 23000
},
{
"epoch": 2.9434348605067826,
"eval_loss": 0.9653898477554321,
"eval_runtime": 373.8133,
"eval_samples_per_second": 74.323,
"eval_steps_per_second": 9.291,
"step": 23000
}
],
"logging_steps": 1000,
"max_steps": 23442,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 1000,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 3,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 9.049023909679596e+17,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}
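
The JSON above is the `trainer_state.json` that the Hugging Face `Trainer` writes inside each checkpoint directory. A minimal sketch of reading it back is shown below; the file path is an assumption (adjust it to the actual checkpoint folder, e.g. `chessgpt2-medium-m/checkpoint-23000/trainer_state.json`), and it only uses the fields present in the state above.

```python
import json

# Path is an assumption; point it at the checkpoint directory you downloaded.
with open("trainer_state.json") as f:
    state = json.load(f)

# log_history interleaves training records (with "loss") and evaluation
# records (with "eval_loss"); split them by which key each entry carries.
train_log = [e for e in state["log_history"] if "loss" in e]
eval_log = [e for e in state["log_history"] if "eval_loss" in e]

for entry in eval_log:
    print(f"step {entry['step']:>6}: eval_loss = {entry['eval_loss']:.4f}")

print("best eval_loss:", state["best_metric"])
print("best checkpoint:", state["best_model_checkpoint"])
```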