lesso08's picture
Training in progress, step 500, checkpoint
02fa523 verified
{
"best_metric": 2.206617832183838,
"best_model_checkpoint": "miner_id_24/checkpoint-500",
"epoch": 0.0768344218209758,
"eval_steps": 50,
"global_step": 500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0001536688436419516,
"eval_loss": 3.802302360534668,
"eval_runtime": 176.2084,
"eval_samples_per_second": 15.55,
"eval_steps_per_second": 3.887,
"step": 1
},
{
"epoch": 0.001536688436419516,
"grad_norm": 29.49568748474121,
"learning_rate": 4.16e-05,
"loss": 5.6992,
"step": 10
},
{
"epoch": 0.003073376872839032,
"grad_norm": 11.658048629760742,
"learning_rate": 8.32e-05,
"loss": 6.3089,
"step": 20
},
{
"epoch": 0.004610065309258548,
"grad_norm": 14.371132850646973,
"learning_rate": 0.0001248,
"loss": 5.7372,
"step": 30
},
{
"epoch": 0.006146753745678064,
"grad_norm": 18.474430084228516,
"learning_rate": 0.0001664,
"loss": 6.365,
"step": 40
},
{
"epoch": 0.00768344218209758,
"grad_norm": 42.71783447265625,
"learning_rate": 0.000208,
"loss": 7.0134,
"step": 50
},
{
"epoch": 0.00768344218209758,
"eval_loss": 3.467595338821411,
"eval_runtime": 175.6638,
"eval_samples_per_second": 15.598,
"eval_steps_per_second": 3.899,
"step": 50
},
{
"epoch": 0.009220130618517095,
"grad_norm": 8.05245590209961,
"learning_rate": 0.0002077466612270217,
"loss": 5.4029,
"step": 60
},
{
"epoch": 0.010756819054936612,
"grad_norm": 7.458392143249512,
"learning_rate": 0.0002069878791491233,
"loss": 5.7947,
"step": 70
},
{
"epoch": 0.012293507491356128,
"grad_norm": 27.035818099975586,
"learning_rate": 0.00020572735047631578,
"loss": 5.5678,
"step": 80
},
{
"epoch": 0.013830195927775643,
"grad_norm": 17.315340042114258,
"learning_rate": 0.00020397121637758515,
"loss": 6.5471,
"step": 90
},
{
"epoch": 0.01536688436419516,
"grad_norm": 33.20652770996094,
"learning_rate": 0.00020172803256173445,
"loss": 7.03,
"step": 100
},
{
"epoch": 0.01536688436419516,
"eval_loss": 3.3676817417144775,
"eval_runtime": 176.2178,
"eval_samples_per_second": 15.549,
"eval_steps_per_second": 3.887,
"step": 100
},
{
"epoch": 0.016903572800614674,
"grad_norm": 6.50840425491333,
"learning_rate": 0.00019900872759483047,
"loss": 5.6033,
"step": 110
},
{
"epoch": 0.01844026123703419,
"grad_norm": 6.7641825675964355,
"learning_rate": 0.0001958265496573284,
"loss": 5.4601,
"step": 120
},
{
"epoch": 0.019976949673453707,
"grad_norm": 14.109837532043457,
"learning_rate": 0.00019219700200026827,
"loss": 5.4273,
"step": 130
},
{
"epoch": 0.021513638109873223,
"grad_norm": 11.948671340942383,
"learning_rate": 0.0001881377674149945,
"loss": 6.1093,
"step": 140
},
{
"epoch": 0.02305032654629274,
"grad_norm": 21.177404403686523,
"learning_rate": 0.00018366862208437368,
"loss": 6.2444,
"step": 150
},
{
"epoch": 0.02305032654629274,
"eval_loss": 3.1878161430358887,
"eval_runtime": 175.8544,
"eval_samples_per_second": 15.581,
"eval_steps_per_second": 3.895,
"step": 150
},
{
"epoch": 0.024587014982712256,
"grad_norm": 6.276320457458496,
"learning_rate": 0.00017881133923521971,
"loss": 5.5092,
"step": 160
},
{
"epoch": 0.026123703419131773,
"grad_norm": 6.799064636230469,
"learning_rate": 0.00017358958306132124,
"loss": 5.1537,
"step": 170
},
{
"epoch": 0.027660391855551286,
"grad_norm": 9.907474517822266,
"learning_rate": 0.00016802879343386844,
"loss": 5.5056,
"step": 180
},
{
"epoch": 0.029197080291970802,
"grad_norm": 13.780465126037598,
"learning_rate": 0.00016215606196095766,
"loss": 5.6722,
"step": 190
},
{
"epoch": 0.03073376872839032,
"grad_norm": 17.128713607788086,
"learning_rate": 0.000156,
"loss": 5.9664,
"step": 200
},
{
"epoch": 0.03073376872839032,
"eval_loss": 3.0006656646728516,
"eval_runtime": 175.6129,
"eval_samples_per_second": 15.602,
"eval_steps_per_second": 3.901,
"step": 200
},
{
"epoch": 0.032270457164809835,
"grad_norm": 4.835846900939941,
"learning_rate": 0.00014959059926606403,
"loss": 5.1,
"step": 210
},
{
"epoch": 0.03380714560122935,
"grad_norm": 5.523816108703613,
"learning_rate": 0.00014295908571525487,
"loss": 4.9443,
"step": 220
},
{
"epoch": 0.03534383403764887,
"grad_norm": 9.212928771972656,
"learning_rate": 0.00013613776741499452,
"loss": 4.8833,
"step": 230
},
{
"epoch": 0.03688052247406838,
"grad_norm": 10.690314292907715,
"learning_rate": 0.00012915987714236542,
"loss": 5.425,
"step": 240
},
{
"epoch": 0.0384172109104879,
"grad_norm": 16.973690032958984,
"learning_rate": 0.00012205941047736077,
"loss": 5.6566,
"step": 250
},
{
"epoch": 0.0384172109104879,
"eval_loss": 2.7571074962615967,
"eval_runtime": 175.489,
"eval_samples_per_second": 15.614,
"eval_steps_per_second": 3.903,
"step": 250
},
{
"epoch": 0.039953899346907414,
"grad_norm": 4.981348514556885,
"learning_rate": 0.00011487096017983597,
"loss": 4.7851,
"step": 260
},
{
"epoch": 0.041490587783326933,
"grad_norm": 6.857975482940674,
"learning_rate": 0.00010762954765706012,
"loss": 4.6581,
"step": 270
},
{
"epoch": 0.043027276219746446,
"grad_norm": 8.468694686889648,
"learning_rate": 0.00010037045234293992,
"loss": 4.5606,
"step": 280
},
{
"epoch": 0.04456396465616596,
"grad_norm": 11.279480934143066,
"learning_rate": 9.312903982016405e-05,
"loss": 5.3172,
"step": 290
},
{
"epoch": 0.04610065309258548,
"grad_norm": 19.332422256469727,
"learning_rate": 8.594058952263925e-05,
"loss": 5.3847,
"step": 300
},
{
"epoch": 0.04610065309258548,
"eval_loss": 2.572946310043335,
"eval_runtime": 175.6968,
"eval_samples_per_second": 15.595,
"eval_steps_per_second": 3.899,
"step": 300
},
{
"epoch": 0.04763734152900499,
"grad_norm": 4.437800407409668,
"learning_rate": 7.884012285763457e-05,
"loss": 4.644,
"step": 310
},
{
"epoch": 0.04917402996542451,
"grad_norm": 4.734920024871826,
"learning_rate": 7.186223258500548e-05,
"loss": 4.554,
"step": 320
},
{
"epoch": 0.050710718401844025,
"grad_norm": 7.473479747772217,
"learning_rate": 6.504091428474514e-05,
"loss": 4.4863,
"step": 330
},
{
"epoch": 0.052247406838263545,
"grad_norm": 8.95118236541748,
"learning_rate": 5.840940073393593e-05,
"loss": 5.0514,
"step": 340
},
{
"epoch": 0.05378409527468306,
"grad_norm": 15.974135398864746,
"learning_rate": 5.200000000000002e-05,
"loss": 5.0546,
"step": 350
},
{
"epoch": 0.05378409527468306,
"eval_loss": 2.400266170501709,
"eval_runtime": 178.4074,
"eval_samples_per_second": 15.358,
"eval_steps_per_second": 3.84,
"step": 350
},
{
"epoch": 0.05532078371110257,
"grad_norm": 3.9929511547088623,
"learning_rate": 4.5843938039042344e-05,
"loss": 4.4327,
"step": 360
},
{
"epoch": 0.05685747214752209,
"grad_norm": 5.959095478057861,
"learning_rate": 3.997120656613154e-05,
"loss": 4.501,
"step": 370
},
{
"epoch": 0.058394160583941604,
"grad_norm": 7.357118606567383,
"learning_rate": 3.441041693867878e-05,
"loss": 4.2774,
"step": 380
},
{
"epoch": 0.059930849020361124,
"grad_norm": 10.237184524536133,
"learning_rate": 2.9188660764780296e-05,
"loss": 4.8857,
"step": 390
},
{
"epoch": 0.06146753745678064,
"grad_norm": 14.245146751403809,
"learning_rate": 2.4331377915626298e-05,
"loss": 4.9947,
"step": 400
},
{
"epoch": 0.06146753745678064,
"eval_loss": 2.4386484622955322,
"eval_runtime": 175.8523,
"eval_samples_per_second": 15.581,
"eval_steps_per_second": 3.895,
"step": 400
},
{
"epoch": 0.06300422589320015,
"grad_norm": 3.9970903396606445,
"learning_rate": 1.9862232585005475e-05,
"loss": 4.2686,
"step": 410
},
{
"epoch": 0.06454091432961967,
"grad_norm": 6.5729546546936035,
"learning_rate": 1.58029979997317e-05,
"loss": 4.3436,
"step": 420
},
{
"epoch": 0.06607760276603919,
"grad_norm": 7.991576194763184,
"learning_rate": 1.2173450342671593e-05,
"loss": 3.9325,
"step": 430
},
{
"epoch": 0.0676142912024587,
"grad_norm": 11.29770278930664,
"learning_rate": 8.991272405169498e-06,
"loss": 4.787,
"step": 440
},
{
"epoch": 0.06915097963887822,
"grad_norm": 15.63181209564209,
"learning_rate": 6.271967438265535e-06,
"loss": 5.0258,
"step": 450
},
{
"epoch": 0.06915097963887822,
"eval_loss": 2.2233057022094727,
"eval_runtime": 176.0184,
"eval_samples_per_second": 15.567,
"eval_steps_per_second": 3.892,
"step": 450
},
{
"epoch": 0.07068766807529774,
"grad_norm": 4.114557266235352,
"learning_rate": 4.028783622414835e-06,
"loss": 4.1326,
"step": 460
},
{
"epoch": 0.07222435651171726,
"grad_norm": 4.679042339324951,
"learning_rate": 2.272649523684208e-06,
"loss": 4.2644,
"step": 470
},
{
"epoch": 0.07376104494813676,
"grad_norm": 6.639362335205078,
"learning_rate": 1.0121208508766823e-06,
"loss": 4.0476,
"step": 480
},
{
"epoch": 0.07529773338455628,
"grad_norm": 8.920401573181152,
"learning_rate": 2.533387729782834e-07,
"loss": 4.6659,
"step": 490
},
{
"epoch": 0.0768344218209758,
"grad_norm": 13.285958290100098,
"learning_rate": 0.0,
"loss": 5.0228,
"step": 500
},
{
"epoch": 0.0768344218209758,
"eval_loss": 2.206617832183838,
"eval_runtime": 175.6859,
"eval_samples_per_second": 15.596,
"eval_steps_per_second": 3.899,
"step": 500
}
],
"logging_steps": 10,
"max_steps": 500,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 3,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 9.829167574233907e+16,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}