|
{ |
|
"best_metric": 2.206617832183838, |
|
"best_model_checkpoint": "miner_id_24/checkpoint-500", |
|
"epoch": 0.0768344218209758, |
|
"eval_steps": 50, |
|
"global_step": 500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0001536688436419516, |
|
"eval_loss": 3.802302360534668, |
|
"eval_runtime": 176.2084, |
|
"eval_samples_per_second": 15.55, |
|
"eval_steps_per_second": 3.887, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.001536688436419516, |
|
"grad_norm": 29.49568748474121, |
|
"learning_rate": 4.16e-05, |
|
"loss": 5.6992, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.003073376872839032, |
|
"grad_norm": 11.658048629760742, |
|
"learning_rate": 8.32e-05, |
|
"loss": 6.3089, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.004610065309258548, |
|
"grad_norm": 14.371132850646973, |
|
"learning_rate": 0.0001248, |
|
"loss": 5.7372, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.006146753745678064, |
|
"grad_norm": 18.474430084228516, |
|
"learning_rate": 0.0001664, |
|
"loss": 6.365, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.00768344218209758, |
|
"grad_norm": 42.71783447265625, |
|
"learning_rate": 0.000208, |
|
"loss": 7.0134, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.00768344218209758, |
|
"eval_loss": 3.467595338821411, |
|
"eval_runtime": 175.6638, |
|
"eval_samples_per_second": 15.598, |
|
"eval_steps_per_second": 3.899, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.009220130618517095, |
|
"grad_norm": 8.05245590209961, |
|
"learning_rate": 0.0002077466612270217, |
|
"loss": 5.4029, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.010756819054936612, |
|
"grad_norm": 7.458392143249512, |
|
"learning_rate": 0.0002069878791491233, |
|
"loss": 5.7947, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.012293507491356128, |
|
"grad_norm": 27.035818099975586, |
|
"learning_rate": 0.00020572735047631578, |
|
"loss": 5.5678, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.013830195927775643, |
|
"grad_norm": 17.315340042114258, |
|
"learning_rate": 0.00020397121637758515, |
|
"loss": 6.5471, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.01536688436419516, |
|
"grad_norm": 33.20652770996094, |
|
"learning_rate": 0.00020172803256173445, |
|
"loss": 7.03, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.01536688436419516, |
|
"eval_loss": 3.3676817417144775, |
|
"eval_runtime": 176.2178, |
|
"eval_samples_per_second": 15.549, |
|
"eval_steps_per_second": 3.887, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.016903572800614674, |
|
"grad_norm": 6.50840425491333, |
|
"learning_rate": 0.00019900872759483047, |
|
"loss": 5.6033, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.01844026123703419, |
|
"grad_norm": 6.7641825675964355, |
|
"learning_rate": 0.0001958265496573284, |
|
"loss": 5.4601, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.019976949673453707, |
|
"grad_norm": 14.109837532043457, |
|
"learning_rate": 0.00019219700200026827, |
|
"loss": 5.4273, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.021513638109873223, |
|
"grad_norm": 11.948671340942383, |
|
"learning_rate": 0.0001881377674149945, |
|
"loss": 6.1093, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.02305032654629274, |
|
"grad_norm": 21.177404403686523, |
|
"learning_rate": 0.00018366862208437368, |
|
"loss": 6.2444, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.02305032654629274, |
|
"eval_loss": 3.1878161430358887, |
|
"eval_runtime": 175.8544, |
|
"eval_samples_per_second": 15.581, |
|
"eval_steps_per_second": 3.895, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.024587014982712256, |
|
"grad_norm": 6.276320457458496, |
|
"learning_rate": 0.00017881133923521971, |
|
"loss": 5.5092, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.026123703419131773, |
|
"grad_norm": 6.799064636230469, |
|
"learning_rate": 0.00017358958306132124, |
|
"loss": 5.1537, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.027660391855551286, |
|
"grad_norm": 9.907474517822266, |
|
"learning_rate": 0.00016802879343386844, |
|
"loss": 5.5056, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.029197080291970802, |
|
"grad_norm": 13.780465126037598, |
|
"learning_rate": 0.00016215606196095766, |
|
"loss": 5.6722, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.03073376872839032, |
|
"grad_norm": 17.128713607788086, |
|
"learning_rate": 0.000156, |
|
"loss": 5.9664, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.03073376872839032, |
|
"eval_loss": 3.0006656646728516, |
|
"eval_runtime": 175.6129, |
|
"eval_samples_per_second": 15.602, |
|
"eval_steps_per_second": 3.901, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.032270457164809835, |
|
"grad_norm": 4.835846900939941, |
|
"learning_rate": 0.00014959059926606403, |
|
"loss": 5.1, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.03380714560122935, |
|
"grad_norm": 5.523816108703613, |
|
"learning_rate": 0.00014295908571525487, |
|
"loss": 4.9443, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.03534383403764887, |
|
"grad_norm": 9.212928771972656, |
|
"learning_rate": 0.00013613776741499452, |
|
"loss": 4.8833, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.03688052247406838, |
|
"grad_norm": 10.690314292907715, |
|
"learning_rate": 0.00012915987714236542, |
|
"loss": 5.425, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.0384172109104879, |
|
"grad_norm": 16.973690032958984, |
|
"learning_rate": 0.00012205941047736077, |
|
"loss": 5.6566, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.0384172109104879, |
|
"eval_loss": 2.7571074962615967, |
|
"eval_runtime": 175.489, |
|
"eval_samples_per_second": 15.614, |
|
"eval_steps_per_second": 3.903, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.039953899346907414, |
|
"grad_norm": 4.981348514556885, |
|
"learning_rate": 0.00011487096017983597, |
|
"loss": 4.7851, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.041490587783326933, |
|
"grad_norm": 6.857975482940674, |
|
"learning_rate": 0.00010762954765706012, |
|
"loss": 4.6581, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.043027276219746446, |
|
"grad_norm": 8.468694686889648, |
|
"learning_rate": 0.00010037045234293992, |
|
"loss": 4.5606, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.04456396465616596, |
|
"grad_norm": 11.279480934143066, |
|
"learning_rate": 9.312903982016405e-05, |
|
"loss": 5.3172, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.04610065309258548, |
|
"grad_norm": 19.332422256469727, |
|
"learning_rate": 8.594058952263925e-05, |
|
"loss": 5.3847, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.04610065309258548, |
|
"eval_loss": 2.572946310043335, |
|
"eval_runtime": 175.6968, |
|
"eval_samples_per_second": 15.595, |
|
"eval_steps_per_second": 3.899, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.04763734152900499, |
|
"grad_norm": 4.437800407409668, |
|
"learning_rate": 7.884012285763457e-05, |
|
"loss": 4.644, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.04917402996542451, |
|
"grad_norm": 4.734920024871826, |
|
"learning_rate": 7.186223258500548e-05, |
|
"loss": 4.554, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.050710718401844025, |
|
"grad_norm": 7.473479747772217, |
|
"learning_rate": 6.504091428474514e-05, |
|
"loss": 4.4863, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.052247406838263545, |
|
"grad_norm": 8.95118236541748, |
|
"learning_rate": 5.840940073393593e-05, |
|
"loss": 5.0514, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.05378409527468306, |
|
"grad_norm": 15.974135398864746, |
|
"learning_rate": 5.200000000000002e-05, |
|
"loss": 5.0546, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.05378409527468306, |
|
"eval_loss": 2.400266170501709, |
|
"eval_runtime": 178.4074, |
|
"eval_samples_per_second": 15.358, |
|
"eval_steps_per_second": 3.84, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.05532078371110257, |
|
"grad_norm": 3.9929511547088623, |
|
"learning_rate": 4.5843938039042344e-05, |
|
"loss": 4.4327, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.05685747214752209, |
|
"grad_norm": 5.959095478057861, |
|
"learning_rate": 3.997120656613154e-05, |
|
"loss": 4.501, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.058394160583941604, |
|
"grad_norm": 7.357118606567383, |
|
"learning_rate": 3.441041693867878e-05, |
|
"loss": 4.2774, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.059930849020361124, |
|
"grad_norm": 10.237184524536133, |
|
"learning_rate": 2.9188660764780296e-05, |
|
"loss": 4.8857, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.06146753745678064, |
|
"grad_norm": 14.245146751403809, |
|
"learning_rate": 2.4331377915626298e-05, |
|
"loss": 4.9947, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.06146753745678064, |
|
"eval_loss": 2.4386484622955322, |
|
"eval_runtime": 175.8523, |
|
"eval_samples_per_second": 15.581, |
|
"eval_steps_per_second": 3.895, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.06300422589320015, |
|
"grad_norm": 3.9970903396606445, |
|
"learning_rate": 1.9862232585005475e-05, |
|
"loss": 4.2686, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.06454091432961967, |
|
"grad_norm": 6.5729546546936035, |
|
"learning_rate": 1.58029979997317e-05, |
|
"loss": 4.3436, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.06607760276603919, |
|
"grad_norm": 7.991576194763184, |
|
"learning_rate": 1.2173450342671593e-05, |
|
"loss": 3.9325, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.0676142912024587, |
|
"grad_norm": 11.29770278930664, |
|
"learning_rate": 8.991272405169498e-06, |
|
"loss": 4.787, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.06915097963887822, |
|
"grad_norm": 15.63181209564209, |
|
"learning_rate": 6.271967438265535e-06, |
|
"loss": 5.0258, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.06915097963887822, |
|
"eval_loss": 2.2233057022094727, |
|
"eval_runtime": 176.0184, |
|
"eval_samples_per_second": 15.567, |
|
"eval_steps_per_second": 3.892, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.07068766807529774, |
|
"grad_norm": 4.114557266235352, |
|
"learning_rate": 4.028783622414835e-06, |
|
"loss": 4.1326, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.07222435651171726, |
|
"grad_norm": 4.679042339324951, |
|
"learning_rate": 2.272649523684208e-06, |
|
"loss": 4.2644, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.07376104494813676, |
|
"grad_norm": 6.639362335205078, |
|
"learning_rate": 1.0121208508766823e-06, |
|
"loss": 4.0476, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.07529773338455628, |
|
"grad_norm": 8.920401573181152, |
|
"learning_rate": 2.533387729782834e-07, |
|
"loss": 4.6659, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.0768344218209758, |
|
"grad_norm": 13.285958290100098, |
|
"learning_rate": 0.0, |
|
"loss": 5.0228, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.0768344218209758, |
|
"eval_loss": 2.206617832183838, |
|
"eval_runtime": 175.6859, |
|
"eval_samples_per_second": 15.596, |
|
"eval_steps_per_second": 3.899, |
|
"step": 500 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 500, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 50, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 3, |
|
"early_stopping_threshold": 0.0 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 0 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 9.829167574233907e+16, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|