innovation-hacking2's picture
Upload folder using huggingface_hub
ad5fb8e verified
{
"best_metric": 2.9402894973754883,
"best_model_checkpoint": "./models/lora-finetuning/LLaMmlein_1B/checkpoint-26000",
"epoch": 0.25068939583855604,
"eval_steps": 1000,
"global_step": 30000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.016712626389237067,
"grad_norm": 25.078365325927734,
"learning_rate": 4.93e-05,
"loss": 4.0094,
"step": 500
},
{
"epoch": 0.033425252778474135,
"grad_norm": 13.957054138183594,
"learning_rate": 4.916204915525037e-05,
"loss": 3.8688,
"step": 1000
},
{
"epoch": 0.033425252778474135,
"eval_loss": 3.781656265258789,
"eval_runtime": 249.2772,
"eval_samples_per_second": 17.9,
"eval_steps_per_second": 2.238,
"step": 1000
},
{
"epoch": 0.05013787916771121,
"grad_norm": 13.748096466064453,
"learning_rate": 4.8313900125777615e-05,
"loss": 3.7362,
"step": 1500
},
{
"epoch": 0.06685050555694827,
"grad_norm": 11.765251159667969,
"learning_rate": 4.746405139885101e-05,
"loss": 3.7171,
"step": 2000
},
{
"epoch": 0.06685050555694827,
"eval_loss": 3.6495721340179443,
"eval_runtime": 249.1863,
"eval_samples_per_second": 17.906,
"eval_steps_per_second": 2.239,
"step": 2000
},
{
"epoch": 0.08356313194618534,
"grad_norm": 14.349141120910645,
"learning_rate": 4.66142026719244e-05,
"loss": 3.6213,
"step": 2500
},
{
"epoch": 0.10027575833542242,
"grad_norm": 12.409707069396973,
"learning_rate": 4.576435394499779e-05,
"loss": 3.5643,
"step": 3000
},
{
"epoch": 0.10027575833542242,
"eval_loss": 3.565227508544922,
"eval_runtime": 249.1942,
"eval_samples_per_second": 17.906,
"eval_steps_per_second": 2.239,
"step": 3000
},
{
"epoch": 0.11698838472465949,
"grad_norm": 8.590898513793945,
"learning_rate": 4.4914505218071186e-05,
"loss": 3.5126,
"step": 3500
},
{
"epoch": 0.13370101111389654,
"grad_norm": 14.560601234436035,
"learning_rate": 4.406465649114458e-05,
"loss": 3.4777,
"step": 4000
},
{
"epoch": 0.13370101111389654,
"eval_loss": 3.48999285697937,
"eval_runtime": 249.1782,
"eval_samples_per_second": 17.907,
"eval_steps_per_second": 2.239,
"step": 4000
},
{
"epoch": 0.1504136375031336,
"grad_norm": 10.846818923950195,
"learning_rate": 4.321480776421797e-05,
"loss": 3.4575,
"step": 4500
},
{
"epoch": 0.16712626389237067,
"grad_norm": 10.20279598236084,
"learning_rate": 4.2364959037291364e-05,
"loss": 3.3958,
"step": 5000
},
{
"epoch": 0.16712626389237067,
"eval_loss": 3.433652639389038,
"eval_runtime": 249.1687,
"eval_samples_per_second": 17.908,
"eval_steps_per_second": 2.239,
"step": 5000
},
{
"epoch": 0.18383889028160774,
"grad_norm": 11.423223495483398,
"learning_rate": 4.1515110310364756e-05,
"loss": 3.3574,
"step": 5500
},
{
"epoch": 0.20055151667084484,
"grad_norm": 10.610285758972168,
"learning_rate": 4.066526158343815e-05,
"loss": 3.3517,
"step": 6000
},
{
"epoch": 0.20055151667084484,
"eval_loss": 3.387014150619507,
"eval_runtime": 249.1821,
"eval_samples_per_second": 17.907,
"eval_steps_per_second": 2.239,
"step": 6000
},
{
"epoch": 0.2172641430600819,
"grad_norm": 8.165976524353027,
"learning_rate": 3.981541285651154e-05,
"loss": 3.3294,
"step": 6500
},
{
"epoch": 0.23397676944931897,
"grad_norm": 10.196443557739258,
"learning_rate": 3.8965564129584935e-05,
"loss": 3.2805,
"step": 7000
},
{
"epoch": 0.23397676944931897,
"eval_loss": 3.3461484909057617,
"eval_runtime": 249.1887,
"eval_samples_per_second": 17.906,
"eval_steps_per_second": 2.239,
"step": 7000
},
{
"epoch": 0.25068939583855604,
"grad_norm": 9.405224800109863,
"learning_rate": 3.8115715402658334e-05,
"loss": 3.2766,
"step": 7500
},
{
"epoch": 0.2674020222277931,
"grad_norm": 8.490914344787598,
"learning_rate": 3.726586667573172e-05,
"loss": 3.2408,
"step": 8000
},
{
"epoch": 0.2674020222277931,
"eval_loss": 3.315063714981079,
"eval_runtime": 249.1918,
"eval_samples_per_second": 17.906,
"eval_steps_per_second": 2.239,
"step": 8000
},
{
"epoch": 0.2841146486170302,
"grad_norm": 11.21275806427002,
"learning_rate": 3.641601794880511e-05,
"loss": 3.2381,
"step": 8500
},
{
"epoch": 0.3008272750062672,
"grad_norm": 10.82959270477295,
"learning_rate": 3.556616922187851e-05,
"loss": 3.1716,
"step": 9000
},
{
"epoch": 0.3008272750062672,
"eval_loss": 3.2550790309906006,
"eval_runtime": 249.1754,
"eval_samples_per_second": 17.907,
"eval_steps_per_second": 2.239,
"step": 9000
},
{
"epoch": 0.3175399013955043,
"grad_norm": 11.301346778869629,
"learning_rate": 3.47163204949519e-05,
"loss": 3.176,
"step": 9500
},
{
"epoch": 0.33425252778474135,
"grad_norm": 10.199508666992188,
"learning_rate": 3.386647176802529e-05,
"loss": 3.1645,
"step": 10000
},
{
"epoch": 0.33425252778474135,
"eval_loss": 3.2413389682769775,
"eval_runtime": 249.1913,
"eval_samples_per_second": 17.906,
"eval_steps_per_second": 2.239,
"step": 10000
},
{
"epoch": 0.35096515417397844,
"grad_norm": 7.239902019500732,
"learning_rate": 3.301662304109869e-05,
"loss": 3.1457,
"step": 10500
},
{
"epoch": 0.3676777805632155,
"grad_norm": 10.78636360168457,
"learning_rate": 3.2166774314172076e-05,
"loss": 3.1221,
"step": 11000
},
{
"epoch": 0.3676777805632155,
"eval_loss": 3.1978135108947754,
"eval_runtime": 249.1932,
"eval_samples_per_second": 17.906,
"eval_steps_per_second": 2.239,
"step": 11000
},
{
"epoch": 0.3843904069524526,
"grad_norm": 8.892194747924805,
"learning_rate": 3.131692558724547e-05,
"loss": 3.1435,
"step": 11500
},
{
"epoch": 0.4011030333416897,
"grad_norm": 9.634190559387207,
"learning_rate": 3.0467076860318865e-05,
"loss": 3.0882,
"step": 12000
},
{
"epoch": 0.4011030333416897,
"eval_loss": 3.1803853511810303,
"eval_runtime": 249.2264,
"eval_samples_per_second": 17.903,
"eval_steps_per_second": 2.239,
"step": 12000
},
{
"epoch": 0.4178156597309267,
"grad_norm": 8.388688087463379,
"learning_rate": 2.9617228133392254e-05,
"loss": 3.0468,
"step": 12500
},
{
"epoch": 0.4345282861201638,
"grad_norm": 7.932670593261719,
"learning_rate": 2.876737940646565e-05,
"loss": 3.0627,
"step": 13000
},
{
"epoch": 0.4345282861201638,
"eval_loss": 3.1499178409576416,
"eval_runtime": 249.1889,
"eval_samples_per_second": 17.906,
"eval_steps_per_second": 2.239,
"step": 13000
},
{
"epoch": 0.45124091250940085,
"grad_norm": 10.718411445617676,
"learning_rate": 2.7917530679539043e-05,
"loss": 3.0438,
"step": 13500
},
{
"epoch": 0.46795353889863794,
"grad_norm": 10.233518600463867,
"learning_rate": 2.7072781044973994e-05,
"loss": 3.0231,
"step": 14000
},
{
"epoch": 0.46795353889863794,
"eval_loss": 3.1285717487335205,
"eval_runtime": 249.2148,
"eval_samples_per_second": 17.904,
"eval_steps_per_second": 2.239,
"step": 14000
},
{
"epoch": 0.484666165287875,
"grad_norm": 10.557904243469238,
"learning_rate": 2.6222932318047387e-05,
"loss": 3.0278,
"step": 14500
},
{
"epoch": 0.5013787916771121,
"grad_norm": 6.9997334480285645,
"learning_rate": 2.5373083591120783e-05,
"loss": 3.019,
"step": 15000
},
{
"epoch": 0.5013787916771121,
"eval_loss": 3.1056880950927734,
"eval_runtime": 249.1758,
"eval_samples_per_second": 17.907,
"eval_steps_per_second": 2.239,
"step": 15000
},
{
"epoch": 0.5180914180663492,
"grad_norm": 8.922798156738281,
"learning_rate": 2.4523234864194176e-05,
"loss": 2.9907,
"step": 15500
},
{
"epoch": 0.5348040444555862,
"grad_norm": 9.043310165405273,
"learning_rate": 2.367338613726757e-05,
"loss": 2.9945,
"step": 16000
},
{
"epoch": 0.5348040444555862,
"eval_loss": 3.0864944458007812,
"eval_runtime": 249.2,
"eval_samples_per_second": 17.905,
"eval_steps_per_second": 2.239,
"step": 16000
},
{
"epoch": 0.5515166708448233,
"grad_norm": 24.114103317260742,
"learning_rate": 2.2830336200156375e-05,
"loss": 3.8624,
"step": 16500
},
{
"epoch": 0.5682292972340603,
"grad_norm": 15.645467758178711,
"learning_rate": 2.1983886868137474e-05,
"loss": 3.5355,
"step": 17000
},
{
"epoch": 0.5682292972340603,
"eval_loss": 3.449280261993408,
"eval_runtime": 249.5095,
"eval_samples_per_second": 17.883,
"eval_steps_per_second": 2.236,
"step": 17000
},
{
"epoch": 0.5849419236232974,
"grad_norm": 12.026097297668457,
"learning_rate": 2.113743753611857e-05,
"loss": 3.4294,
"step": 17500
},
{
"epoch": 0.6016545500125344,
"grad_norm": 15.589037895202637,
"learning_rate": 2.0287588809191966e-05,
"loss": 3.3682,
"step": 18000
},
{
"epoch": 0.6016545500125344,
"eval_loss": 3.334568500518799,
"eval_runtime": 249.295,
"eval_samples_per_second": 17.898,
"eval_steps_per_second": 2.238,
"step": 18000
},
{
"epoch": 0.6183671764017715,
"grad_norm": 18.867961883544922,
"learning_rate": 1.9437740082265356e-05,
"loss": 3.3126,
"step": 18500
},
{
"epoch": 0.6350798027910086,
"grad_norm": 14.147026062011719,
"learning_rate": 1.8587891355338752e-05,
"loss": 3.2718,
"step": 19000
},
{
"epoch": 0.6350798027910086,
"eval_loss": 3.245798349380493,
"eval_runtime": 249.2433,
"eval_samples_per_second": 17.902,
"eval_steps_per_second": 2.239,
"step": 19000
},
{
"epoch": 0.6517924291802457,
"grad_norm": 15.220115661621094,
"learning_rate": 1.7738042628412145e-05,
"loss": 3.1792,
"step": 19500
},
{
"epoch": 0.6685050555694827,
"grad_norm": 17.45062255859375,
"learning_rate": 1.6888193901485537e-05,
"loss": 3.1603,
"step": 20000
},
{
"epoch": 0.6685050555694827,
"eval_loss": 3.180778741836548,
"eval_runtime": 249.2332,
"eval_samples_per_second": 17.903,
"eval_steps_per_second": 2.239,
"step": 20000
},
{
"epoch": 0.6852176819587198,
"grad_norm": 16.632200241088867,
"learning_rate": 1.603834517455893e-05,
"loss": 3.1225,
"step": 20500
},
{
"epoch": 0.7019303083479569,
"grad_norm": 13.793306350708008,
"learning_rate": 1.5188496447632323e-05,
"loss": 3.0798,
"step": 21000
},
{
"epoch": 0.7019303083479569,
"eval_loss": 3.1489596366882324,
"eval_runtime": 249.2624,
"eval_samples_per_second": 17.901,
"eval_steps_per_second": 2.239,
"step": 21000
},
{
"epoch": 0.718642934737194,
"grad_norm": 18.920320510864258,
"learning_rate": 1.4338647720705714e-05,
"loss": 3.0602,
"step": 21500
},
{
"epoch": 0.735355561126431,
"grad_norm": 16.346004486083984,
"learning_rate": 1.3488798993779108e-05,
"loss": 3.0281,
"step": 22000
},
{
"epoch": 0.735355561126431,
"eval_loss": 3.083583354949951,
"eval_runtime": 249.1953,
"eval_samples_per_second": 17.906,
"eval_steps_per_second": 2.239,
"step": 22000
},
{
"epoch": 0.7520681875156681,
"grad_norm": 12.671531677246094,
"learning_rate": 1.26389502668525e-05,
"loss": 3.0254,
"step": 22500
},
{
"epoch": 0.7687808139049052,
"grad_norm": 14.962254524230957,
"learning_rate": 1.1789101539925894e-05,
"loss": 3.023,
"step": 23000
},
{
"epoch": 0.7687808139049052,
"eval_loss": 3.0390138626098633,
"eval_runtime": 249.2133,
"eval_samples_per_second": 17.904,
"eval_steps_per_second": 2.239,
"step": 23000
},
{
"epoch": 0.7854934402941423,
"grad_norm": 16.1228084564209,
"learning_rate": 1.0939252812999288e-05,
"loss": 2.9347,
"step": 23500
},
{
"epoch": 0.8022060666833793,
"grad_norm": 13.979843139648438,
"learning_rate": 1.0089404086072679e-05,
"loss": 2.9604,
"step": 24000
},
{
"epoch": 0.8022060666833793,
"eval_loss": 3.0071020126342773,
"eval_runtime": 249.2612,
"eval_samples_per_second": 17.901,
"eval_steps_per_second": 2.239,
"step": 24000
},
{
"epoch": 0.8189186930726163,
"grad_norm": 17.260498046875,
"learning_rate": 9.239555359146072e-06,
"loss": 2.899,
"step": 24500
},
{
"epoch": 0.8356313194618534,
"grad_norm": 13.376078605651855,
"learning_rate": 8.389706632219466e-06,
"loss": 2.9291,
"step": 25000
},
{
"epoch": 0.8356313194618534,
"eval_loss": 2.9687299728393555,
"eval_runtime": 249.2378,
"eval_samples_per_second": 17.903,
"eval_steps_per_second": 2.239,
"step": 25000
},
{
"epoch": 0.8523439458510905,
"grad_norm": 20.349105834960938,
"learning_rate": 7.539857905292859e-06,
"loss": 2.8853,
"step": 25500
},
{
"epoch": 0.8690565722403276,
"grad_norm": 17.05868911743164,
"learning_rate": 6.690009178366251e-06,
"loss": 2.8542,
"step": 26000
},
{
"epoch": 0.8690565722403276,
"eval_loss": 2.9402894973754883,
"eval_runtime": 249.3105,
"eval_samples_per_second": 17.897,
"eval_steps_per_second": 2.238,
"step": 26000
},
{
"epoch": 0.8857691986295646,
"grad_norm": 28.65497589111328,
"learning_rate": 5.846959241255057e-06,
"loss": 3.7512,
"step": 26500
},
{
"epoch": 0.9024818250188017,
"grad_norm": 41.53907775878906,
"learning_rate": 4.99711051432845e-06,
"loss": 3.4169,
"step": 27000
},
{
"epoch": 0.9024818250188017,
"eval_loss": 3.276771068572998,
"eval_runtime": 249.323,
"eval_samples_per_second": 17.896,
"eval_steps_per_second": 2.238,
"step": 27000
},
{
"epoch": 0.9191944514080388,
"grad_norm": 26.49399757385254,
"learning_rate": 4.147261787401842e-06,
"loss": 3.3055,
"step": 27500
},
{
"epoch": 0.9359070777972759,
"grad_norm": 30.56415557861328,
"learning_rate": 3.297413060475236e-06,
"loss": 3.2131,
"step": 28000
},
{
"epoch": 0.9359070777972759,
"eval_loss": 3.140591859817505,
"eval_runtime": 249.261,
"eval_samples_per_second": 17.901,
"eval_steps_per_second": 2.239,
"step": 28000
},
{
"epoch": 0.9526197041865129,
"grad_norm": 33.53205871582031,
"learning_rate": 2.4509637284563352e-06,
"loss": 3.1509,
"step": 28500
},
{
"epoch": 0.96933233057575,
"grad_norm": 26.73455810546875,
"learning_rate": 1.6011150015297277e-06,
"loss": 3.0998,
"step": 29000
},
{
"epoch": 0.96933233057575,
"eval_loss": 3.0688536167144775,
"eval_runtime": 249.2871,
"eval_samples_per_second": 17.899,
"eval_steps_per_second": 2.238,
"step": 29000
},
{
"epoch": 0.9860449569649871,
"grad_norm": 26.864349365234375,
"learning_rate": 7.512662746031207e-07,
"loss": 3.0876,
"step": 29500
},
{
"epoch": 0.25068939583855604,
"grad_norm": 65.23794555664062,
"learning_rate": 4.794667744453116e-05,
"loss": 4.6117,
"step": 30000
},
{
"epoch": 0.25068939583855604,
"eval_loss": 4.784034252166748,
"eval_runtime": 253.6816,
"eval_samples_per_second": 17.589,
"eval_steps_per_second": 2.2,
"step": 30000
}
],
"logging_steps": 500,
"max_steps": 718020,
"num_input_tokens_seen": 0,
"num_train_epochs": 6,
"save_steps": 2000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3.063185677140099e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}