|
{ |
|
"best_metric": 2.9402894973754883, |
|
"best_model_checkpoint": "./models/lora-finetuning/LLaMmlein_1B/checkpoint-26000", |
|
"epoch": 0.25068939583855604, |
|
"eval_steps": 1000, |
|
"global_step": 30000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.016712626389237067, |
|
"grad_norm": 25.078365325927734, |
|
"learning_rate": 4.93e-05, |
|
"loss": 4.0094, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.033425252778474135, |
|
"grad_norm": 13.957054138183594, |
|
"learning_rate": 4.916204915525037e-05, |
|
"loss": 3.8688, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.033425252778474135, |
|
"eval_loss": 3.781656265258789, |
|
"eval_runtime": 249.2772, |
|
"eval_samples_per_second": 17.9, |
|
"eval_steps_per_second": 2.238, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.05013787916771121, |
|
"grad_norm": 13.748096466064453, |
|
"learning_rate": 4.8313900125777615e-05, |
|
"loss": 3.7362, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.06685050555694827, |
|
"grad_norm": 11.765251159667969, |
|
"learning_rate": 4.746405139885101e-05, |
|
"loss": 3.7171, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.06685050555694827, |
|
"eval_loss": 3.6495721340179443, |
|
"eval_runtime": 249.1863, |
|
"eval_samples_per_second": 17.906, |
|
"eval_steps_per_second": 2.239, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.08356313194618534, |
|
"grad_norm": 14.349141120910645, |
|
"learning_rate": 4.66142026719244e-05, |
|
"loss": 3.6213, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.10027575833542242, |
|
"grad_norm": 12.409707069396973, |
|
"learning_rate": 4.576435394499779e-05, |
|
"loss": 3.5643, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.10027575833542242, |
|
"eval_loss": 3.565227508544922, |
|
"eval_runtime": 249.1942, |
|
"eval_samples_per_second": 17.906, |
|
"eval_steps_per_second": 2.239, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.11698838472465949, |
|
"grad_norm": 8.590898513793945, |
|
"learning_rate": 4.4914505218071186e-05, |
|
"loss": 3.5126, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.13370101111389654, |
|
"grad_norm": 14.560601234436035, |
|
"learning_rate": 4.406465649114458e-05, |
|
"loss": 3.4777, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.13370101111389654, |
|
"eval_loss": 3.48999285697937, |
|
"eval_runtime": 249.1782, |
|
"eval_samples_per_second": 17.907, |
|
"eval_steps_per_second": 2.239, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.1504136375031336, |
|
"grad_norm": 10.846818923950195, |
|
"learning_rate": 4.321480776421797e-05, |
|
"loss": 3.4575, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.16712626389237067, |
|
"grad_norm": 10.20279598236084, |
|
"learning_rate": 4.2364959037291364e-05, |
|
"loss": 3.3958, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.16712626389237067, |
|
"eval_loss": 3.433652639389038, |
|
"eval_runtime": 249.1687, |
|
"eval_samples_per_second": 17.908, |
|
"eval_steps_per_second": 2.239, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.18383889028160774, |
|
"grad_norm": 11.423223495483398, |
|
"learning_rate": 4.1515110310364756e-05, |
|
"loss": 3.3574, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.20055151667084484, |
|
"grad_norm": 10.610285758972168, |
|
"learning_rate": 4.066526158343815e-05, |
|
"loss": 3.3517, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.20055151667084484, |
|
"eval_loss": 3.387014150619507, |
|
"eval_runtime": 249.1821, |
|
"eval_samples_per_second": 17.907, |
|
"eval_steps_per_second": 2.239, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.2172641430600819, |
|
"grad_norm": 8.165976524353027, |
|
"learning_rate": 3.981541285651154e-05, |
|
"loss": 3.3294, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.23397676944931897, |
|
"grad_norm": 10.196443557739258, |
|
"learning_rate": 3.8965564129584935e-05, |
|
"loss": 3.2805, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.23397676944931897, |
|
"eval_loss": 3.3461484909057617, |
|
"eval_runtime": 249.1887, |
|
"eval_samples_per_second": 17.906, |
|
"eval_steps_per_second": 2.239, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.25068939583855604, |
|
"grad_norm": 9.405224800109863, |
|
"learning_rate": 3.8115715402658334e-05, |
|
"loss": 3.2766, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.2674020222277931, |
|
"grad_norm": 8.490914344787598, |
|
"learning_rate": 3.726586667573172e-05, |
|
"loss": 3.2408, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.2674020222277931, |
|
"eval_loss": 3.315063714981079, |
|
"eval_runtime": 249.1918, |
|
"eval_samples_per_second": 17.906, |
|
"eval_steps_per_second": 2.239, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.2841146486170302, |
|
"grad_norm": 11.21275806427002, |
|
"learning_rate": 3.641601794880511e-05, |
|
"loss": 3.2381, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 0.3008272750062672, |
|
"grad_norm": 10.82959270477295, |
|
"learning_rate": 3.556616922187851e-05, |
|
"loss": 3.1716, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.3008272750062672, |
|
"eval_loss": 3.2550790309906006, |
|
"eval_runtime": 249.1754, |
|
"eval_samples_per_second": 17.907, |
|
"eval_steps_per_second": 2.239, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.3175399013955043, |
|
"grad_norm": 11.301346778869629, |
|
"learning_rate": 3.47163204949519e-05, |
|
"loss": 3.176, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 0.33425252778474135, |
|
"grad_norm": 10.199508666992188, |
|
"learning_rate": 3.386647176802529e-05, |
|
"loss": 3.1645, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.33425252778474135, |
|
"eval_loss": 3.2413389682769775, |
|
"eval_runtime": 249.1913, |
|
"eval_samples_per_second": 17.906, |
|
"eval_steps_per_second": 2.239, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.35096515417397844, |
|
"grad_norm": 7.239902019500732, |
|
"learning_rate": 3.301662304109869e-05, |
|
"loss": 3.1457, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 0.3676777805632155, |
|
"grad_norm": 10.78636360168457, |
|
"learning_rate": 3.2166774314172076e-05, |
|
"loss": 3.1221, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.3676777805632155, |
|
"eval_loss": 3.1978135108947754, |
|
"eval_runtime": 249.1932, |
|
"eval_samples_per_second": 17.906, |
|
"eval_steps_per_second": 2.239, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.3843904069524526, |
|
"grad_norm": 8.892194747924805, |
|
"learning_rate": 3.131692558724547e-05, |
|
"loss": 3.1435, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 0.4011030333416897, |
|
"grad_norm": 9.634190559387207, |
|
"learning_rate": 3.0467076860318865e-05, |
|
"loss": 3.0882, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.4011030333416897, |
|
"eval_loss": 3.1803853511810303, |
|
"eval_runtime": 249.2264, |
|
"eval_samples_per_second": 17.903, |
|
"eval_steps_per_second": 2.239, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.4178156597309267, |
|
"grad_norm": 8.388688087463379, |
|
"learning_rate": 2.9617228133392254e-05, |
|
"loss": 3.0468, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 0.4345282861201638, |
|
"grad_norm": 7.932670593261719, |
|
"learning_rate": 2.876737940646565e-05, |
|
"loss": 3.0627, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 0.4345282861201638, |
|
"eval_loss": 3.1499178409576416, |
|
"eval_runtime": 249.1889, |
|
"eval_samples_per_second": 17.906, |
|
"eval_steps_per_second": 2.239, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 0.45124091250940085, |
|
"grad_norm": 10.718411445617676, |
|
"learning_rate": 2.7917530679539043e-05, |
|
"loss": 3.0438, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 0.46795353889863794, |
|
"grad_norm": 10.233518600463867, |
|
"learning_rate": 2.7072781044973994e-05, |
|
"loss": 3.0231, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 0.46795353889863794, |
|
"eval_loss": 3.1285717487335205, |
|
"eval_runtime": 249.2148, |
|
"eval_samples_per_second": 17.904, |
|
"eval_steps_per_second": 2.239, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 0.484666165287875, |
|
"grad_norm": 10.557904243469238, |
|
"learning_rate": 2.6222932318047387e-05, |
|
"loss": 3.0278, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 0.5013787916771121, |
|
"grad_norm": 6.9997334480285645, |
|
"learning_rate": 2.5373083591120783e-05, |
|
"loss": 3.019, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 0.5013787916771121, |
|
"eval_loss": 3.1056880950927734, |
|
"eval_runtime": 249.1758, |
|
"eval_samples_per_second": 17.907, |
|
"eval_steps_per_second": 2.239, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 0.5180914180663492, |
|
"grad_norm": 8.922798156738281, |
|
"learning_rate": 2.4523234864194176e-05, |
|
"loss": 2.9907, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 0.5348040444555862, |
|
"grad_norm": 9.043310165405273, |
|
"learning_rate": 2.367338613726757e-05, |
|
"loss": 2.9945, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 0.5348040444555862, |
|
"eval_loss": 3.0864944458007812, |
|
"eval_runtime": 249.2, |
|
"eval_samples_per_second": 17.905, |
|
"eval_steps_per_second": 2.239, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 0.5515166708448233, |
|
"grad_norm": 24.114103317260742, |
|
"learning_rate": 2.2830336200156375e-05, |
|
"loss": 3.8624, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 0.5682292972340603, |
|
"grad_norm": 15.645467758178711, |
|
"learning_rate": 2.1983886868137474e-05, |
|
"loss": 3.5355, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 0.5682292972340603, |
|
"eval_loss": 3.449280261993408, |
|
"eval_runtime": 249.5095, |
|
"eval_samples_per_second": 17.883, |
|
"eval_steps_per_second": 2.236, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 0.5849419236232974, |
|
"grad_norm": 12.026097297668457, |
|
"learning_rate": 2.113743753611857e-05, |
|
"loss": 3.4294, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 0.6016545500125344, |
|
"grad_norm": 15.589037895202637, |
|
"learning_rate": 2.0287588809191966e-05, |
|
"loss": 3.3682, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 0.6016545500125344, |
|
"eval_loss": 3.334568500518799, |
|
"eval_runtime": 249.295, |
|
"eval_samples_per_second": 17.898, |
|
"eval_steps_per_second": 2.238, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 0.6183671764017715, |
|
"grad_norm": 18.867961883544922, |
|
"learning_rate": 1.9437740082265356e-05, |
|
"loss": 3.3126, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 0.6350798027910086, |
|
"grad_norm": 14.147026062011719, |
|
"learning_rate": 1.8587891355338752e-05, |
|
"loss": 3.2718, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 0.6350798027910086, |
|
"eval_loss": 3.245798349380493, |
|
"eval_runtime": 249.2433, |
|
"eval_samples_per_second": 17.902, |
|
"eval_steps_per_second": 2.239, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 0.6517924291802457, |
|
"grad_norm": 15.220115661621094, |
|
"learning_rate": 1.7738042628412145e-05, |
|
"loss": 3.1792, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 0.6685050555694827, |
|
"grad_norm": 17.45062255859375, |
|
"learning_rate": 1.6888193901485537e-05, |
|
"loss": 3.1603, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 0.6685050555694827, |
|
"eval_loss": 3.180778741836548, |
|
"eval_runtime": 249.2332, |
|
"eval_samples_per_second": 17.903, |
|
"eval_steps_per_second": 2.239, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 0.6852176819587198, |
|
"grad_norm": 16.632200241088867, |
|
"learning_rate": 1.603834517455893e-05, |
|
"loss": 3.1225, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 0.7019303083479569, |
|
"grad_norm": 13.793306350708008, |
|
"learning_rate": 1.5188496447632323e-05, |
|
"loss": 3.0798, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 0.7019303083479569, |
|
"eval_loss": 3.1489596366882324, |
|
"eval_runtime": 249.2624, |
|
"eval_samples_per_second": 17.901, |
|
"eval_steps_per_second": 2.239, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 0.718642934737194, |
|
"grad_norm": 18.920320510864258, |
|
"learning_rate": 1.4338647720705714e-05, |
|
"loss": 3.0602, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 0.735355561126431, |
|
"grad_norm": 16.346004486083984, |
|
"learning_rate": 1.3488798993779108e-05, |
|
"loss": 3.0281, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 0.735355561126431, |
|
"eval_loss": 3.083583354949951, |
|
"eval_runtime": 249.1953, |
|
"eval_samples_per_second": 17.906, |
|
"eval_steps_per_second": 2.239, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 0.7520681875156681, |
|
"grad_norm": 12.671531677246094, |
|
"learning_rate": 1.26389502668525e-05, |
|
"loss": 3.0254, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 0.7687808139049052, |
|
"grad_norm": 14.962254524230957, |
|
"learning_rate": 1.1789101539925894e-05, |
|
"loss": 3.023, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 0.7687808139049052, |
|
"eval_loss": 3.0390138626098633, |
|
"eval_runtime": 249.2133, |
|
"eval_samples_per_second": 17.904, |
|
"eval_steps_per_second": 2.239, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 0.7854934402941423, |
|
"grad_norm": 16.1228084564209, |
|
"learning_rate": 1.0939252812999288e-05, |
|
"loss": 2.9347, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 0.8022060666833793, |
|
"grad_norm": 13.979843139648438, |
|
"learning_rate": 1.0089404086072679e-05, |
|
"loss": 2.9604, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 0.8022060666833793, |
|
"eval_loss": 3.0071020126342773, |
|
"eval_runtime": 249.2612, |
|
"eval_samples_per_second": 17.901, |
|
"eval_steps_per_second": 2.239, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 0.8189186930726163, |
|
"grad_norm": 17.260498046875, |
|
"learning_rate": 9.239555359146072e-06, |
|
"loss": 2.899, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 0.8356313194618534, |
|
"grad_norm": 13.376078605651855, |
|
"learning_rate": 8.389706632219466e-06, |
|
"loss": 2.9291, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 0.8356313194618534, |
|
"eval_loss": 2.9687299728393555, |
|
"eval_runtime": 249.2378, |
|
"eval_samples_per_second": 17.903, |
|
"eval_steps_per_second": 2.239, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 0.8523439458510905, |
|
"grad_norm": 20.349105834960938, |
|
"learning_rate": 7.539857905292859e-06, |
|
"loss": 2.8853, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 0.8690565722403276, |
|
"grad_norm": 17.05868911743164, |
|
"learning_rate": 6.690009178366251e-06, |
|
"loss": 2.8542, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 0.8690565722403276, |
|
"eval_loss": 2.9402894973754883, |
|
"eval_runtime": 249.3105, |
|
"eval_samples_per_second": 17.897, |
|
"eval_steps_per_second": 2.238, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 0.8857691986295646, |
|
"grad_norm": 28.65497589111328, |
|
"learning_rate": 5.846959241255057e-06, |
|
"loss": 3.7512, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 0.9024818250188017, |
|
"grad_norm": 41.53907775878906, |
|
"learning_rate": 4.99711051432845e-06, |
|
"loss": 3.4169, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 0.9024818250188017, |
|
"eval_loss": 3.276771068572998, |
|
"eval_runtime": 249.323, |
|
"eval_samples_per_second": 17.896, |
|
"eval_steps_per_second": 2.238, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 0.9191944514080388, |
|
"grad_norm": 26.49399757385254, |
|
"learning_rate": 4.147261787401842e-06, |
|
"loss": 3.3055, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 0.9359070777972759, |
|
"grad_norm": 30.56415557861328, |
|
"learning_rate": 3.297413060475236e-06, |
|
"loss": 3.2131, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 0.9359070777972759, |
|
"eval_loss": 3.140591859817505, |
|
"eval_runtime": 249.261, |
|
"eval_samples_per_second": 17.901, |
|
"eval_steps_per_second": 2.239, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 0.9526197041865129, |
|
"grad_norm": 33.53205871582031, |
|
"learning_rate": 2.4509637284563352e-06, |
|
"loss": 3.1509, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 0.96933233057575, |
|
"grad_norm": 26.73455810546875, |
|
"learning_rate": 1.6011150015297277e-06, |
|
"loss": 3.0998, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 0.96933233057575, |
|
"eval_loss": 3.0688536167144775, |
|
"eval_runtime": 249.2871, |
|
"eval_samples_per_second": 17.899, |
|
"eval_steps_per_second": 2.238, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 0.9860449569649871, |
|
"grad_norm": 26.864349365234375, |
|
"learning_rate": 7.512662746031207e-07, |
|
"loss": 3.0876, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 0.25068939583855604, |
|
"grad_norm": 65.23794555664062, |
|
"learning_rate": 4.794667744453116e-05, |
|
"loss": 4.6117, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 0.25068939583855604, |
|
"eval_loss": 4.784034252166748, |
|
"eval_runtime": 253.6816, |
|
"eval_samples_per_second": 17.589, |
|
"eval_steps_per_second": 2.2, |
|
"step": 30000 |
|
} |
|
], |
|
"logging_steps": 500, |
|
"max_steps": 718020, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 6, |
|
"save_steps": 2000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 3.063185677140099e+18, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|