lesso12's picture
Training in progress, step 500, checkpoint
f1aba76 verified
{
"best_metric": 0.06628672778606415,
"best_model_checkpoint": "miner_id_24/checkpoint-500",
"epoch": 0.22381378692927484,
"eval_steps": 50,
"global_step": 500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0004476275738585497,
"eval_loss": 0.7005916237831116,
"eval_runtime": 56.7667,
"eval_samples_per_second": 16.577,
"eval_steps_per_second": 4.157,
"step": 1
},
{
"epoch": 0.004476275738585497,
"grad_norm": 4.427389621734619,
"learning_rate": 4.24e-05,
"loss": 0.778,
"step": 10
},
{
"epoch": 0.008952551477170993,
"grad_norm": 1.9455581903457642,
"learning_rate": 8.48e-05,
"loss": 0.2889,
"step": 20
},
{
"epoch": 0.01342882721575649,
"grad_norm": 2.1114604473114014,
"learning_rate": 0.0001272,
"loss": 0.2659,
"step": 30
},
{
"epoch": 0.017905102954341987,
"grad_norm": 1.851693034172058,
"learning_rate": 0.0001696,
"loss": 0.2646,
"step": 40
},
{
"epoch": 0.022381378692927483,
"grad_norm": 3.312408208847046,
"learning_rate": 0.000212,
"loss": 0.3177,
"step": 50
},
{
"epoch": 0.022381378692927483,
"eval_loss": 0.22718137502670288,
"eval_runtime": 56.5677,
"eval_samples_per_second": 16.635,
"eval_steps_per_second": 4.172,
"step": 50
},
{
"epoch": 0.02685765443151298,
"grad_norm": 2.3215768337249756,
"learning_rate": 0.00021174178932754136,
"loss": 0.3498,
"step": 60
},
{
"epoch": 0.03133393017009848,
"grad_norm": 1.7366793155670166,
"learning_rate": 0.00021096841528660647,
"loss": 0.2283,
"step": 70
},
{
"epoch": 0.03581020590868397,
"grad_norm": 2.2174038887023926,
"learning_rate": 0.0002096836456777834,
"loss": 0.238,
"step": 80
},
{
"epoch": 0.04028648164726947,
"grad_norm": 1.7410643100738525,
"learning_rate": 0.00020789373976946182,
"loss": 0.1995,
"step": 90
},
{
"epoch": 0.044762757385854966,
"grad_norm": 2.036144733428955,
"learning_rate": 0.0002056074178033063,
"loss": 0.5017,
"step": 100
},
{
"epoch": 0.044762757385854966,
"eval_loss": 0.20847293734550476,
"eval_runtime": 56.567,
"eval_samples_per_second": 16.635,
"eval_steps_per_second": 4.172,
"step": 100
},
{
"epoch": 0.049239033124440466,
"grad_norm": 4.98552942276001,
"learning_rate": 0.00020283581851011567,
"loss": 0.427,
"step": 110
},
{
"epoch": 0.05371530886302596,
"grad_norm": 1.9600409269332886,
"learning_rate": 0.00019959244484304625,
"loss": 0.2183,
"step": 120
},
{
"epoch": 0.05819158460161146,
"grad_norm": 2.0890045166015625,
"learning_rate": 0.00019589309819258114,
"loss": 0.2125,
"step": 130
},
{
"epoch": 0.06266786034019696,
"grad_norm": 1.7276463508605957,
"learning_rate": 0.00019175580140374444,
"loss": 0.1975,
"step": 140
},
{
"epoch": 0.06714413607878246,
"grad_norm": 4.4569268226623535,
"learning_rate": 0.00018720071097061167,
"loss": 0.3504,
"step": 150
},
{
"epoch": 0.06714413607878246,
"eval_loss": 0.1679604947566986,
"eval_runtime": 56.7224,
"eval_samples_per_second": 16.59,
"eval_steps_per_second": 4.161,
"step": 150
},
{
"epoch": 0.07162041181736795,
"grad_norm": 1.5052324533462524,
"learning_rate": 0.00018225001883589702,
"loss": 0.2832,
"step": 160
},
{
"epoch": 0.07609668755595345,
"grad_norm": 1.6397607326507568,
"learning_rate": 0.00017692784427403898,
"loss": 0.1907,
"step": 170
},
{
"epoch": 0.08057296329453895,
"grad_norm": 1.8498625755310059,
"learning_rate": 0.00017126011638451976,
"loss": 0.2125,
"step": 180
},
{
"epoch": 0.08504923903312445,
"grad_norm": 1.0760177373886108,
"learning_rate": 0.00016527444776789915,
"loss": 0.1955,
"step": 190
},
{
"epoch": 0.08952551477170993,
"grad_norm": 3.997589111328125,
"learning_rate": 0.00015900000000000002,
"loss": 0.2774,
"step": 200
},
{
"epoch": 0.08952551477170993,
"eval_loss": 0.15109291672706604,
"eval_runtime": 56.7556,
"eval_samples_per_second": 16.58,
"eval_steps_per_second": 4.158,
"step": 200
},
{
"epoch": 0.09400179051029543,
"grad_norm": 0.8544609546661377,
"learning_rate": 0.0001524673415596422,
"loss": 0.2676,
"step": 210
},
{
"epoch": 0.09847806624888093,
"grad_norm": 8.311532974243164,
"learning_rate": 0.00014570829890208668,
"loss": 0.2037,
"step": 220
},
{
"epoch": 0.10295434198746643,
"grad_norm": 0.8494248390197754,
"learning_rate": 0.00013875580140374443,
"loss": 0.188,
"step": 230
},
{
"epoch": 0.10743061772605192,
"grad_norm": 1.2756179571151733,
"learning_rate": 0.00013164372093356477,
"loss": 0.1783,
"step": 240
},
{
"epoch": 0.11190689346463742,
"grad_norm": 1.4278101921081543,
"learning_rate": 0.00012440670683269464,
"loss": 0.2238,
"step": 250
},
{
"epoch": 0.11190689346463742,
"eval_loss": 0.13769862055778503,
"eval_runtime": 56.6904,
"eval_samples_per_second": 16.599,
"eval_steps_per_second": 4.163,
"step": 250
},
{
"epoch": 0.11638316920322292,
"grad_norm": 1.384413242340088,
"learning_rate": 0.00011708001710637128,
"loss": 0.2849,
"step": 260
},
{
"epoch": 0.12085944494180842,
"grad_norm": 1.2294275760650635,
"learning_rate": 0.00010969934665046512,
"loss": 0.1591,
"step": 270
},
{
"epoch": 0.12533572068039392,
"grad_norm": 1.8471887111663818,
"learning_rate": 0.00010230065334953492,
"loss": 0.206,
"step": 280
},
{
"epoch": 0.12981199641897942,
"grad_norm": 0.8707221746444702,
"learning_rate": 9.491998289362875e-05,
"loss": 0.1345,
"step": 290
},
{
"epoch": 0.13428827215756492,
"grad_norm": 2.1860179901123047,
"learning_rate": 8.759329316730539e-05,
"loss": 0.2196,
"step": 300
},
{
"epoch": 0.13428827215756492,
"eval_loss": 0.10647980868816376,
"eval_runtime": 56.7117,
"eval_samples_per_second": 16.593,
"eval_steps_per_second": 4.161,
"step": 300
},
{
"epoch": 0.1387645478961504,
"grad_norm": 0.8854772448539734,
"learning_rate": 8.035627906643523e-05,
"loss": 0.2294,
"step": 310
},
{
"epoch": 0.1432408236347359,
"grad_norm": 0.7382193803787231,
"learning_rate": 7.324419859625559e-05,
"loss": 0.1443,
"step": 320
},
{
"epoch": 0.1477170993733214,
"grad_norm": 0.7851634621620178,
"learning_rate": 6.629170109791332e-05,
"loss": 0.1584,
"step": 330
},
{
"epoch": 0.1521933751119069,
"grad_norm": 0.8075612187385559,
"learning_rate": 5.9532658440357784e-05,
"loss": 0.1667,
"step": 340
},
{
"epoch": 0.1566696508504924,
"grad_norm": 1.8064610958099365,
"learning_rate": 5.300000000000002e-05,
"loss": 0.1634,
"step": 350
},
{
"epoch": 0.1566696508504924,
"eval_loss": 0.08859492838382721,
"eval_runtime": 56.9347,
"eval_samples_per_second": 16.528,
"eval_steps_per_second": 4.145,
"step": 350
},
{
"epoch": 0.1611459265890779,
"grad_norm": 0.7478625774383545,
"learning_rate": 4.672555223210085e-05,
"loss": 0.1664,
"step": 360
},
{
"epoch": 0.1656222023276634,
"grad_norm": 0.8703305721282959,
"learning_rate": 4.073988361548022e-05,
"loss": 0.117,
"step": 370
},
{
"epoch": 0.1700984780662489,
"grad_norm": 1.1630523204803467,
"learning_rate": 3.507215572596106e-05,
"loss": 0.1343,
"step": 380
},
{
"epoch": 0.17457475380483437,
"grad_norm": 0.8155301809310913,
"learning_rate": 2.9749981164102997e-05,
"loss": 0.1345,
"step": 390
},
{
"epoch": 0.17905102954341987,
"grad_norm": 2.1470611095428467,
"learning_rate": 2.479928902938834e-05,
"loss": 0.1763,
"step": 400
},
{
"epoch": 0.17905102954341987,
"eval_loss": 0.07403463125228882,
"eval_runtime": 56.9196,
"eval_samples_per_second": 16.532,
"eval_steps_per_second": 4.146,
"step": 400
},
{
"epoch": 0.18352730528200537,
"grad_norm": 0.6138676404953003,
"learning_rate": 2.024419859625558e-05,
"loss": 0.1626,
"step": 410
},
{
"epoch": 0.18800358102059087,
"grad_norm": 0.581366240978241,
"learning_rate": 1.610690180741885e-05,
"loss": 0.1102,
"step": 420
},
{
"epoch": 0.19247985675917637,
"grad_norm": 0.7073346376419067,
"learning_rate": 1.240755515695374e-05,
"loss": 0.1405,
"step": 430
},
{
"epoch": 0.19695613249776187,
"grad_norm": 0.80199134349823,
"learning_rate": 9.164181489884296e-06,
"loss": 0.1096,
"step": 440
},
{
"epoch": 0.20143240823634737,
"grad_norm": 1.6628501415252686,
"learning_rate": 6.392582196693718e-06,
"loss": 0.1787,
"step": 450
},
{
"epoch": 0.20143240823634737,
"eval_loss": 0.06836262345314026,
"eval_runtime": 56.6998,
"eval_samples_per_second": 16.596,
"eval_steps_per_second": 4.162,
"step": 450
},
{
"epoch": 0.20590868397493287,
"grad_norm": 0.6762747168540955,
"learning_rate": 4.106260230538197e-06,
"loss": 0.1374,
"step": 460
},
{
"epoch": 0.21038495971351837,
"grad_norm": 0.873978853225708,
"learning_rate": 2.316354322216597e-06,
"loss": 0.1061,
"step": 470
},
{
"epoch": 0.21486123545210384,
"grad_norm": 0.9010151028633118,
"learning_rate": 1.0315847133935416e-06,
"loss": 0.1034,
"step": 480
},
{
"epoch": 0.21933751119068934,
"grad_norm": 0.5987867712974548,
"learning_rate": 2.582106724586351e-07,
"loss": 0.1206,
"step": 490
},
{
"epoch": 0.22381378692927484,
"grad_norm": 1.7397042512893677,
"learning_rate": 0.0,
"loss": 0.1681,
"step": 500
},
{
"epoch": 0.22381378692927484,
"eval_loss": 0.06628672778606415,
"eval_runtime": 56.633,
"eval_samples_per_second": 16.616,
"eval_steps_per_second": 4.167,
"step": 500
}
],
"logging_steps": 10,
"max_steps": 500,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 3,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 9.033185098727424e+16,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}