{ "best_metric": 0.06628672778606415, "best_model_checkpoint": "miner_id_24/checkpoint-500", "epoch": 0.22381378692927484, "eval_steps": 50, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0004476275738585497, "eval_loss": 0.7005916237831116, "eval_runtime": 56.7667, "eval_samples_per_second": 16.577, "eval_steps_per_second": 4.157, "step": 1 }, { "epoch": 0.004476275738585497, "grad_norm": 4.427389621734619, "learning_rate": 4.24e-05, "loss": 0.778, "step": 10 }, { "epoch": 0.008952551477170993, "grad_norm": 1.9455581903457642, "learning_rate": 8.48e-05, "loss": 0.2889, "step": 20 }, { "epoch": 0.01342882721575649, "grad_norm": 2.1114604473114014, "learning_rate": 0.0001272, "loss": 0.2659, "step": 30 }, { "epoch": 0.017905102954341987, "grad_norm": 1.851693034172058, "learning_rate": 0.0001696, "loss": 0.2646, "step": 40 }, { "epoch": 0.022381378692927483, "grad_norm": 3.312408208847046, "learning_rate": 0.000212, "loss": 0.3177, "step": 50 }, { "epoch": 0.022381378692927483, "eval_loss": 0.22718137502670288, "eval_runtime": 56.5677, "eval_samples_per_second": 16.635, "eval_steps_per_second": 4.172, "step": 50 }, { "epoch": 0.02685765443151298, "grad_norm": 2.3215768337249756, "learning_rate": 0.00021174178932754136, "loss": 0.3498, "step": 60 }, { "epoch": 0.03133393017009848, "grad_norm": 1.7366793155670166, "learning_rate": 0.00021096841528660647, "loss": 0.2283, "step": 70 }, { "epoch": 0.03581020590868397, "grad_norm": 2.2174038887023926, "learning_rate": 0.0002096836456777834, "loss": 0.238, "step": 80 }, { "epoch": 0.04028648164726947, "grad_norm": 1.7410643100738525, "learning_rate": 0.00020789373976946182, "loss": 0.1995, "step": 90 }, { "epoch": 0.044762757385854966, "grad_norm": 2.036144733428955, "learning_rate": 0.0002056074178033063, "loss": 0.5017, "step": 100 }, { "epoch": 0.044762757385854966, "eval_loss": 0.20847293734550476, "eval_runtime": 56.567, "eval_samples_per_second": 16.635, "eval_steps_per_second": 4.172, "step": 100 }, { "epoch": 0.049239033124440466, "grad_norm": 4.98552942276001, "learning_rate": 0.00020283581851011567, "loss": 0.427, "step": 110 }, { "epoch": 0.05371530886302596, "grad_norm": 1.9600409269332886, "learning_rate": 0.00019959244484304625, "loss": 0.2183, "step": 120 }, { "epoch": 0.05819158460161146, "grad_norm": 2.0890045166015625, "learning_rate": 0.00019589309819258114, "loss": 0.2125, "step": 130 }, { "epoch": 0.06266786034019696, "grad_norm": 1.7276463508605957, "learning_rate": 0.00019175580140374444, "loss": 0.1975, "step": 140 }, { "epoch": 0.06714413607878246, "grad_norm": 4.4569268226623535, "learning_rate": 0.00018720071097061167, "loss": 0.3504, "step": 150 }, { "epoch": 0.06714413607878246, "eval_loss": 0.1679604947566986, "eval_runtime": 56.7224, "eval_samples_per_second": 16.59, "eval_steps_per_second": 4.161, "step": 150 }, { "epoch": 0.07162041181736795, "grad_norm": 1.5052324533462524, "learning_rate": 0.00018225001883589702, "loss": 0.2832, "step": 160 }, { "epoch": 0.07609668755595345, "grad_norm": 1.6397607326507568, "learning_rate": 0.00017692784427403898, "loss": 0.1907, "step": 170 }, { "epoch": 0.08057296329453895, "grad_norm": 1.8498625755310059, "learning_rate": 0.00017126011638451976, "loss": 0.2125, "step": 180 }, { "epoch": 0.08504923903312445, "grad_norm": 1.0760177373886108, "learning_rate": 0.00016527444776789915, "loss": 0.1955, "step": 190 }, { "epoch": 0.08952551477170993, "grad_norm": 3.997589111328125, "learning_rate": 0.00015900000000000002, "loss": 0.2774, "step": 200 }, { "epoch": 0.08952551477170993, "eval_loss": 0.15109291672706604, "eval_runtime": 56.7556, "eval_samples_per_second": 16.58, "eval_steps_per_second": 4.158, "step": 200 }, { "epoch": 0.09400179051029543, "grad_norm": 0.8544609546661377, "learning_rate": 0.0001524673415596422, "loss": 0.2676, "step": 210 }, { "epoch": 0.09847806624888093, "grad_norm": 8.311532974243164, "learning_rate": 0.00014570829890208668, "loss": 0.2037, "step": 220 }, { "epoch": 0.10295434198746643, "grad_norm": 0.8494248390197754, "learning_rate": 0.00013875580140374443, "loss": 0.188, "step": 230 }, { "epoch": 0.10743061772605192, "grad_norm": 1.2756179571151733, "learning_rate": 0.00013164372093356477, "loss": 0.1783, "step": 240 }, { "epoch": 0.11190689346463742, "grad_norm": 1.4278101921081543, "learning_rate": 0.00012440670683269464, "loss": 0.2238, "step": 250 }, { "epoch": 0.11190689346463742, "eval_loss": 0.13769862055778503, "eval_runtime": 56.6904, "eval_samples_per_second": 16.599, "eval_steps_per_second": 4.163, "step": 250 }, { "epoch": 0.11638316920322292, "grad_norm": 1.384413242340088, "learning_rate": 0.00011708001710637128, "loss": 0.2849, "step": 260 }, { "epoch": 0.12085944494180842, "grad_norm": 1.2294275760650635, "learning_rate": 0.00010969934665046512, "loss": 0.1591, "step": 270 }, { "epoch": 0.12533572068039392, "grad_norm": 1.8471887111663818, "learning_rate": 0.00010230065334953492, "loss": 0.206, "step": 280 }, { "epoch": 0.12981199641897942, "grad_norm": 0.8707221746444702, "learning_rate": 9.491998289362875e-05, "loss": 0.1345, "step": 290 }, { "epoch": 0.13428827215756492, "grad_norm": 2.1860179901123047, "learning_rate": 8.759329316730539e-05, "loss": 0.2196, "step": 300 }, { "epoch": 0.13428827215756492, "eval_loss": 0.10647980868816376, "eval_runtime": 56.7117, "eval_samples_per_second": 16.593, "eval_steps_per_second": 4.161, "step": 300 }, { "epoch": 0.1387645478961504, "grad_norm": 0.8854772448539734, "learning_rate": 8.035627906643523e-05, "loss": 0.2294, "step": 310 }, { "epoch": 0.1432408236347359, "grad_norm": 0.7382193803787231, "learning_rate": 7.324419859625559e-05, "loss": 0.1443, "step": 320 }, { "epoch": 0.1477170993733214, "grad_norm": 0.7851634621620178, "learning_rate": 6.629170109791332e-05, "loss": 0.1584, "step": 330 }, { "epoch": 0.1521933751119069, "grad_norm": 0.8075612187385559, "learning_rate": 5.9532658440357784e-05, "loss": 0.1667, "step": 340 }, { "epoch": 0.1566696508504924, "grad_norm": 1.8064610958099365, "learning_rate": 5.300000000000002e-05, "loss": 0.1634, "step": 350 }, { "epoch": 0.1566696508504924, "eval_loss": 0.08859492838382721, "eval_runtime": 56.9347, "eval_samples_per_second": 16.528, "eval_steps_per_second": 4.145, "step": 350 }, { "epoch": 0.1611459265890779, "grad_norm": 0.7478625774383545, "learning_rate": 4.672555223210085e-05, "loss": 0.1664, "step": 360 }, { "epoch": 0.1656222023276634, "grad_norm": 0.8703305721282959, "learning_rate": 4.073988361548022e-05, "loss": 0.117, "step": 370 }, { "epoch": 0.1700984780662489, "grad_norm": 1.1630523204803467, "learning_rate": 3.507215572596106e-05, "loss": 0.1343, "step": 380 }, { "epoch": 0.17457475380483437, "grad_norm": 0.8155301809310913, "learning_rate": 2.9749981164102997e-05, "loss": 0.1345, "step": 390 }, { "epoch": 0.17905102954341987, "grad_norm": 2.1470611095428467, "learning_rate": 2.479928902938834e-05, "loss": 0.1763, "step": 400 }, { "epoch": 0.17905102954341987, "eval_loss": 0.07403463125228882, "eval_runtime": 56.9196, "eval_samples_per_second": 16.532, "eval_steps_per_second": 4.146, "step": 400 }, { "epoch": 0.18352730528200537, "grad_norm": 0.6138676404953003, "learning_rate": 2.024419859625558e-05, "loss": 0.1626, "step": 410 }, { "epoch": 0.18800358102059087, "grad_norm": 0.581366240978241, "learning_rate": 1.610690180741885e-05, "loss": 0.1102, "step": 420 }, { "epoch": 0.19247985675917637, "grad_norm": 0.7073346376419067, "learning_rate": 1.240755515695374e-05, "loss": 0.1405, "step": 430 }, { "epoch": 0.19695613249776187, "grad_norm": 0.80199134349823, "learning_rate": 9.164181489884296e-06, "loss": 0.1096, "step": 440 }, { "epoch": 0.20143240823634737, "grad_norm": 1.6628501415252686, "learning_rate": 6.392582196693718e-06, "loss": 0.1787, "step": 450 }, { "epoch": 0.20143240823634737, "eval_loss": 0.06836262345314026, "eval_runtime": 56.6998, "eval_samples_per_second": 16.596, "eval_steps_per_second": 4.162, "step": 450 }, { "epoch": 0.20590868397493287, "grad_norm": 0.6762747168540955, "learning_rate": 4.106260230538197e-06, "loss": 0.1374, "step": 460 }, { "epoch": 0.21038495971351837, "grad_norm": 0.873978853225708, "learning_rate": 2.316354322216597e-06, "loss": 0.1061, "step": 470 }, { "epoch": 0.21486123545210384, "grad_norm": 0.9010151028633118, "learning_rate": 1.0315847133935416e-06, "loss": 0.1034, "step": 480 }, { "epoch": 0.21933751119068934, "grad_norm": 0.5987867712974548, "learning_rate": 2.582106724586351e-07, "loss": 0.1206, "step": 490 }, { "epoch": 0.22381378692927484, "grad_norm": 1.7397042512893677, "learning_rate": 0.0, "loss": 0.1681, "step": 500 }, { "epoch": 0.22381378692927484, "eval_loss": 0.06628672778606415, "eval_runtime": 56.633, "eval_samples_per_second": 16.616, "eval_steps_per_second": 4.167, "step": 500 } ], "logging_steps": 10, "max_steps": 500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9.033185098727424e+16, "train_batch_size": 4, "trial_name": null, "trial_params": null }