|
{ |
|
"best_metric": 0.06628672778606415, |
|
"best_model_checkpoint": "miner_id_24/checkpoint-500", |
|
"epoch": 0.22381378692927484, |
|
"eval_steps": 50, |
|
"global_step": 500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0004476275738585497, |
|
"eval_loss": 0.7005916237831116, |
|
"eval_runtime": 56.7667, |
|
"eval_samples_per_second": 16.577, |
|
"eval_steps_per_second": 4.157, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.004476275738585497, |
|
"grad_norm": 4.427389621734619, |
|
"learning_rate": 4.24e-05, |
|
"loss": 0.778, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.008952551477170993, |
|
"grad_norm": 1.9455581903457642, |
|
"learning_rate": 8.48e-05, |
|
"loss": 0.2889, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.01342882721575649, |
|
"grad_norm": 2.1114604473114014, |
|
"learning_rate": 0.0001272, |
|
"loss": 0.2659, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.017905102954341987, |
|
"grad_norm": 1.851693034172058, |
|
"learning_rate": 0.0001696, |
|
"loss": 0.2646, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.022381378692927483, |
|
"grad_norm": 3.312408208847046, |
|
"learning_rate": 0.000212, |
|
"loss": 0.3177, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.022381378692927483, |
|
"eval_loss": 0.22718137502670288, |
|
"eval_runtime": 56.5677, |
|
"eval_samples_per_second": 16.635, |
|
"eval_steps_per_second": 4.172, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.02685765443151298, |
|
"grad_norm": 2.3215768337249756, |
|
"learning_rate": 0.00021174178932754136, |
|
"loss": 0.3498, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.03133393017009848, |
|
"grad_norm": 1.7366793155670166, |
|
"learning_rate": 0.00021096841528660647, |
|
"loss": 0.2283, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.03581020590868397, |
|
"grad_norm": 2.2174038887023926, |
|
"learning_rate": 0.0002096836456777834, |
|
"loss": 0.238, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.04028648164726947, |
|
"grad_norm": 1.7410643100738525, |
|
"learning_rate": 0.00020789373976946182, |
|
"loss": 0.1995, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.044762757385854966, |
|
"grad_norm": 2.036144733428955, |
|
"learning_rate": 0.0002056074178033063, |
|
"loss": 0.5017, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.044762757385854966, |
|
"eval_loss": 0.20847293734550476, |
|
"eval_runtime": 56.567, |
|
"eval_samples_per_second": 16.635, |
|
"eval_steps_per_second": 4.172, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.049239033124440466, |
|
"grad_norm": 4.98552942276001, |
|
"learning_rate": 0.00020283581851011567, |
|
"loss": 0.427, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.05371530886302596, |
|
"grad_norm": 1.9600409269332886, |
|
"learning_rate": 0.00019959244484304625, |
|
"loss": 0.2183, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.05819158460161146, |
|
"grad_norm": 2.0890045166015625, |
|
"learning_rate": 0.00019589309819258114, |
|
"loss": 0.2125, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.06266786034019696, |
|
"grad_norm": 1.7276463508605957, |
|
"learning_rate": 0.00019175580140374444, |
|
"loss": 0.1975, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.06714413607878246, |
|
"grad_norm": 4.4569268226623535, |
|
"learning_rate": 0.00018720071097061167, |
|
"loss": 0.3504, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.06714413607878246, |
|
"eval_loss": 0.1679604947566986, |
|
"eval_runtime": 56.7224, |
|
"eval_samples_per_second": 16.59, |
|
"eval_steps_per_second": 4.161, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.07162041181736795, |
|
"grad_norm": 1.5052324533462524, |
|
"learning_rate": 0.00018225001883589702, |
|
"loss": 0.2832, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.07609668755595345, |
|
"grad_norm": 1.6397607326507568, |
|
"learning_rate": 0.00017692784427403898, |
|
"loss": 0.1907, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.08057296329453895, |
|
"grad_norm": 1.8498625755310059, |
|
"learning_rate": 0.00017126011638451976, |
|
"loss": 0.2125, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.08504923903312445, |
|
"grad_norm": 1.0760177373886108, |
|
"learning_rate": 0.00016527444776789915, |
|
"loss": 0.1955, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.08952551477170993, |
|
"grad_norm": 3.997589111328125, |
|
"learning_rate": 0.00015900000000000002, |
|
"loss": 0.2774, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.08952551477170993, |
|
"eval_loss": 0.15109291672706604, |
|
"eval_runtime": 56.7556, |
|
"eval_samples_per_second": 16.58, |
|
"eval_steps_per_second": 4.158, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.09400179051029543, |
|
"grad_norm": 0.8544609546661377, |
|
"learning_rate": 0.0001524673415596422, |
|
"loss": 0.2676, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.09847806624888093, |
|
"grad_norm": 8.311532974243164, |
|
"learning_rate": 0.00014570829890208668, |
|
"loss": 0.2037, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.10295434198746643, |
|
"grad_norm": 0.8494248390197754, |
|
"learning_rate": 0.00013875580140374443, |
|
"loss": 0.188, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.10743061772605192, |
|
"grad_norm": 1.2756179571151733, |
|
"learning_rate": 0.00013164372093356477, |
|
"loss": 0.1783, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.11190689346463742, |
|
"grad_norm": 1.4278101921081543, |
|
"learning_rate": 0.00012440670683269464, |
|
"loss": 0.2238, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.11190689346463742, |
|
"eval_loss": 0.13769862055778503, |
|
"eval_runtime": 56.6904, |
|
"eval_samples_per_second": 16.599, |
|
"eval_steps_per_second": 4.163, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.11638316920322292, |
|
"grad_norm": 1.384413242340088, |
|
"learning_rate": 0.00011708001710637128, |
|
"loss": 0.2849, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.12085944494180842, |
|
"grad_norm": 1.2294275760650635, |
|
"learning_rate": 0.00010969934665046512, |
|
"loss": 0.1591, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.12533572068039392, |
|
"grad_norm": 1.8471887111663818, |
|
"learning_rate": 0.00010230065334953492, |
|
"loss": 0.206, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.12981199641897942, |
|
"grad_norm": 0.8707221746444702, |
|
"learning_rate": 9.491998289362875e-05, |
|
"loss": 0.1345, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.13428827215756492, |
|
"grad_norm": 2.1860179901123047, |
|
"learning_rate": 8.759329316730539e-05, |
|
"loss": 0.2196, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.13428827215756492, |
|
"eval_loss": 0.10647980868816376, |
|
"eval_runtime": 56.7117, |
|
"eval_samples_per_second": 16.593, |
|
"eval_steps_per_second": 4.161, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.1387645478961504, |
|
"grad_norm": 0.8854772448539734, |
|
"learning_rate": 8.035627906643523e-05, |
|
"loss": 0.2294, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.1432408236347359, |
|
"grad_norm": 0.7382193803787231, |
|
"learning_rate": 7.324419859625559e-05, |
|
"loss": 0.1443, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.1477170993733214, |
|
"grad_norm": 0.7851634621620178, |
|
"learning_rate": 6.629170109791332e-05, |
|
"loss": 0.1584, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.1521933751119069, |
|
"grad_norm": 0.8075612187385559, |
|
"learning_rate": 5.9532658440357784e-05, |
|
"loss": 0.1667, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.1566696508504924, |
|
"grad_norm": 1.8064610958099365, |
|
"learning_rate": 5.300000000000002e-05, |
|
"loss": 0.1634, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.1566696508504924, |
|
"eval_loss": 0.08859492838382721, |
|
"eval_runtime": 56.9347, |
|
"eval_samples_per_second": 16.528, |
|
"eval_steps_per_second": 4.145, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.1611459265890779, |
|
"grad_norm": 0.7478625774383545, |
|
"learning_rate": 4.672555223210085e-05, |
|
"loss": 0.1664, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.1656222023276634, |
|
"grad_norm": 0.8703305721282959, |
|
"learning_rate": 4.073988361548022e-05, |
|
"loss": 0.117, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.1700984780662489, |
|
"grad_norm": 1.1630523204803467, |
|
"learning_rate": 3.507215572596106e-05, |
|
"loss": 0.1343, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.17457475380483437, |
|
"grad_norm": 0.8155301809310913, |
|
"learning_rate": 2.9749981164102997e-05, |
|
"loss": 0.1345, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.17905102954341987, |
|
"grad_norm": 2.1470611095428467, |
|
"learning_rate": 2.479928902938834e-05, |
|
"loss": 0.1763, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.17905102954341987, |
|
"eval_loss": 0.07403463125228882, |
|
"eval_runtime": 56.9196, |
|
"eval_samples_per_second": 16.532, |
|
"eval_steps_per_second": 4.146, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.18352730528200537, |
|
"grad_norm": 0.6138676404953003, |
|
"learning_rate": 2.024419859625558e-05, |
|
"loss": 0.1626, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.18800358102059087, |
|
"grad_norm": 0.581366240978241, |
|
"learning_rate": 1.610690180741885e-05, |
|
"loss": 0.1102, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.19247985675917637, |
|
"grad_norm": 0.7073346376419067, |
|
"learning_rate": 1.240755515695374e-05, |
|
"loss": 0.1405, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.19695613249776187, |
|
"grad_norm": 0.80199134349823, |
|
"learning_rate": 9.164181489884296e-06, |
|
"loss": 0.1096, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.20143240823634737, |
|
"grad_norm": 1.6628501415252686, |
|
"learning_rate": 6.392582196693718e-06, |
|
"loss": 0.1787, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.20143240823634737, |
|
"eval_loss": 0.06836262345314026, |
|
"eval_runtime": 56.6998, |
|
"eval_samples_per_second": 16.596, |
|
"eval_steps_per_second": 4.162, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.20590868397493287, |
|
"grad_norm": 0.6762747168540955, |
|
"learning_rate": 4.106260230538197e-06, |
|
"loss": 0.1374, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.21038495971351837, |
|
"grad_norm": 0.873978853225708, |
|
"learning_rate": 2.316354322216597e-06, |
|
"loss": 0.1061, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.21486123545210384, |
|
"grad_norm": 0.9010151028633118, |
|
"learning_rate": 1.0315847133935416e-06, |
|
"loss": 0.1034, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.21933751119068934, |
|
"grad_norm": 0.5987867712974548, |
|
"learning_rate": 2.582106724586351e-07, |
|
"loss": 0.1206, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.22381378692927484, |
|
"grad_norm": 1.7397042512893677, |
|
"learning_rate": 0.0, |
|
"loss": 0.1681, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.22381378692927484, |
|
"eval_loss": 0.06628672778606415, |
|
"eval_runtime": 56.633, |
|
"eval_samples_per_second": 16.616, |
|
"eval_steps_per_second": 4.167, |
|
"step": 500 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 500, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 50, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 3, |
|
"early_stopping_threshold": 0.0 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 0 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 9.033185098727424e+16, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|