|
{ |
|
"best_metric": 0.9653898477554321, |
|
"best_model_checkpoint": "chessgpt2-medium-m/checkpoint-23000", |
|
"epoch": 2.9434348605067826, |
|
"eval_steps": 1000, |
|
"global_step": 23000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.1279754287176862, |
|
"grad_norm": 1.6428743600845337, |
|
"learning_rate": 4.9959725443266765e-05, |
|
"loss": 2.4607, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.1279754287176862, |
|
"eval_loss": 1.8105345964431763, |
|
"eval_runtime": 376.5583, |
|
"eval_samples_per_second": 73.781, |
|
"eval_steps_per_second": 9.223, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.2559508574353724, |
|
"grad_norm": 1.0423442125320435, |
|
"learning_rate": 4.952992634500169e-05, |
|
"loss": 1.6726, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.2559508574353724, |
|
"eval_loss": 1.4938699007034302, |
|
"eval_runtime": 376.7393, |
|
"eval_samples_per_second": 73.746, |
|
"eval_steps_per_second": 9.219, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.3839262861530586, |
|
"grad_norm": 1.234433889389038, |
|
"learning_rate": 4.863737385256343e-05, |
|
"loss": 1.4686, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.3839262861530586, |
|
"eval_loss": 1.3617429733276367, |
|
"eval_runtime": 379.1693, |
|
"eval_samples_per_second": 73.273, |
|
"eval_steps_per_second": 9.159, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.5119017148707448, |
|
"grad_norm": 1.1198837757110596, |
|
"learning_rate": 4.729890583531792e-05, |
|
"loss": 1.3601, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.5119017148707448, |
|
"eval_loss": 1.2697532176971436, |
|
"eval_runtime": 376.7562, |
|
"eval_samples_per_second": 73.743, |
|
"eval_steps_per_second": 9.218, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.6398771435884311, |
|
"grad_norm": 0.9552892446517944, |
|
"learning_rate": 4.5539772292358576e-05, |
|
"loss": 1.2875, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.6398771435884311, |
|
"eval_loss": 1.2163749933242798, |
|
"eval_runtime": 378.6404, |
|
"eval_samples_per_second": 73.376, |
|
"eval_steps_per_second": 9.172, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.7678525723061173, |
|
"grad_norm": 1.001042127609253, |
|
"learning_rate": 4.3393159015047314e-05, |
|
"loss": 1.2366, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.7678525723061173, |
|
"eval_loss": 1.1716755628585815, |
|
"eval_runtime": 376.0934, |
|
"eval_samples_per_second": 73.873, |
|
"eval_steps_per_second": 9.234, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.8958280010238034, |
|
"grad_norm": 1.0304712057113647, |
|
"learning_rate": 4.089956154202057e-05, |
|
"loss": 1.1992, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.8958280010238034, |
|
"eval_loss": 1.1404258012771606, |
|
"eval_runtime": 377.955, |
|
"eval_samples_per_second": 73.509, |
|
"eval_steps_per_second": 9.189, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 1.0238034297414895, |
|
"grad_norm": 0.7832671403884888, |
|
"learning_rate": 3.810602121690553e-05, |
|
"loss": 1.1646, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 1.0238034297414895, |
|
"eval_loss": 1.1096745729446411, |
|
"eval_runtime": 377.602, |
|
"eval_samples_per_second": 73.577, |
|
"eval_steps_per_second": 9.198, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 1.151778858459176, |
|
"grad_norm": 0.7852849364280701, |
|
"learning_rate": 3.5065237760403566e-05, |
|
"loss": 1.126, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 1.151778858459176, |
|
"eval_loss": 1.0933990478515625, |
|
"eval_runtime": 378.2151, |
|
"eval_samples_per_second": 73.458, |
|
"eval_steps_per_second": 9.183, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 1.279754287176862, |
|
"grad_norm": 0.7889285087585449, |
|
"learning_rate": 3.183457509793587e-05, |
|
"loss": 1.1085, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 1.279754287176862, |
|
"eval_loss": 1.0705418586730957, |
|
"eval_runtime": 376.2258, |
|
"eval_samples_per_second": 73.847, |
|
"eval_steps_per_second": 9.231, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 1.4077297158945483, |
|
"grad_norm": 0.8414849638938904, |
|
"learning_rate": 2.8478382244815133e-05, |
|
"loss": 1.0875, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 1.4077297158945483, |
|
"eval_loss": 1.0541125535964966, |
|
"eval_runtime": 378.6403, |
|
"eval_samples_per_second": 73.376, |
|
"eval_steps_per_second": 9.172, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 1.5357051446122343, |
|
"grad_norm": 0.8442362546920776, |
|
"learning_rate": 2.5053264766232426e-05, |
|
"loss": 1.0733, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 1.5357051446122343, |
|
"eval_loss": 1.0388243198394775, |
|
"eval_runtime": 376.572, |
|
"eval_samples_per_second": 73.779, |
|
"eval_steps_per_second": 9.223, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 1.6636805733299207, |
|
"grad_norm": 0.9030175805091858, |
|
"learning_rate": 2.1627142455795886e-05, |
|
"loss": 1.059, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 1.6636805733299207, |
|
"eval_loss": 1.0245987176895142, |
|
"eval_runtime": 378.3137, |
|
"eval_samples_per_second": 73.439, |
|
"eval_steps_per_second": 9.18, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 1.7916560020476069, |
|
"grad_norm": 0.8050616979598999, |
|
"learning_rate": 1.8264648598812123e-05, |
|
"loss": 1.0451, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 1.7916560020476069, |
|
"eval_loss": 1.0108946561813354, |
|
"eval_runtime": 376.6418, |
|
"eval_samples_per_second": 73.765, |
|
"eval_steps_per_second": 9.221, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 1.919631430765293, |
|
"grad_norm": 0.7604677677154541, |
|
"learning_rate": 1.503551911234875e-05, |
|
"loss": 1.0327, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 1.919631430765293, |
|
"eval_loss": 1.0017954111099243, |
|
"eval_runtime": 378.423, |
|
"eval_samples_per_second": 73.418, |
|
"eval_steps_per_second": 9.178, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 2.047606859482979, |
|
"grad_norm": 0.8110759258270264, |
|
"learning_rate": 1.1987749064346765e-05, |
|
"loss": 1.0118, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 2.047606859482979, |
|
"eval_loss": 0.991495668888092, |
|
"eval_runtime": 377.8976, |
|
"eval_samples_per_second": 73.52, |
|
"eval_steps_per_second": 9.19, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 2.1755822882006655, |
|
"grad_norm": 0.8434863090515137, |
|
"learning_rate": 9.185453185391116e-06, |
|
"loss": 0.9862, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 2.1755822882006655, |
|
"eval_loss": 0.9861236810684204, |
|
"eval_runtime": 378.3771, |
|
"eval_samples_per_second": 73.427, |
|
"eval_steps_per_second": 9.179, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 2.303557716918352, |
|
"grad_norm": 0.8797647356987, |
|
"learning_rate": 6.681496368397716e-06, |
|
"loss": 0.9806, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 2.303557716918352, |
|
"eval_loss": 0.9783245325088501, |
|
"eval_runtime": 376.5913, |
|
"eval_samples_per_second": 73.775, |
|
"eval_steps_per_second": 9.222, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 2.431533145636038, |
|
"grad_norm": 0.8292227983474731, |
|
"learning_rate": 4.5231153844693594e-06, |
|
"loss": 0.9757, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 2.431533145636038, |
|
"eval_loss": 0.9735883474349976, |
|
"eval_runtime": 372.1389, |
|
"eval_samples_per_second": 74.658, |
|
"eval_steps_per_second": 9.333, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 2.559508574353724, |
|
"grad_norm": 0.8722350597381592, |
|
"learning_rate": 2.75102776826896e-06, |
|
"loss": 0.9713, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 2.559508574353724, |
|
"eval_loss": 0.969468891620636, |
|
"eval_runtime": 372.7114, |
|
"eval_samples_per_second": 74.543, |
|
"eval_steps_per_second": 9.318, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 2.68748400307141, |
|
"grad_norm": 0.8890399932861328, |
|
"learning_rate": 1.3986636878139808e-06, |
|
"loss": 0.9675, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 2.68748400307141, |
|
"eval_loss": 0.967136025428772, |
|
"eval_runtime": 373.3491, |
|
"eval_samples_per_second": 74.416, |
|
"eval_steps_per_second": 9.302, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 2.8154594317890966, |
|
"grad_norm": 0.8660491704940796, |
|
"learning_rate": 4.915352893533093e-07, |
|
"loss": 0.9659, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 2.8154594317890966, |
|
"eval_loss": 0.9657524228096008, |
|
"eval_runtime": 372.6975, |
|
"eval_samples_per_second": 74.546, |
|
"eval_steps_per_second": 9.319, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 2.9434348605067826, |
|
"grad_norm": 0.8297032713890076, |
|
"learning_rate": 4.675541454145782e-08, |
|
"loss": 0.965, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 2.9434348605067826, |
|
"eval_loss": 0.9653898477554321, |
|
"eval_runtime": 373.8133, |
|
"eval_samples_per_second": 74.323, |
|
"eval_steps_per_second": 9.291, |
|
"step": 23000 |
|
} |
|
], |
|
"logging_steps": 1000, |
|
"max_steps": 23442, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 1000, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 3, |
|
"early_stopping_threshold": 0.0 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 0 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 9.049023909679596e+17, |
|
"train_batch_size": 32, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|