{ "best_metric": 2.9402894973754883, "best_model_checkpoint": "./models/lora-finetuning/LLaMmlein_1B/checkpoint-26000", "epoch": 0.25068939583855604, "eval_steps": 1000, "global_step": 30000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.016712626389237067, "grad_norm": 25.078365325927734, "learning_rate": 4.93e-05, "loss": 4.0094, "step": 500 }, { "epoch": 0.033425252778474135, "grad_norm": 13.957054138183594, "learning_rate": 4.916204915525037e-05, "loss": 3.8688, "step": 1000 }, { "epoch": 0.033425252778474135, "eval_loss": 3.781656265258789, "eval_runtime": 249.2772, "eval_samples_per_second": 17.9, "eval_steps_per_second": 2.238, "step": 1000 }, { "epoch": 0.05013787916771121, "grad_norm": 13.748096466064453, "learning_rate": 4.8313900125777615e-05, "loss": 3.7362, "step": 1500 }, { "epoch": 0.06685050555694827, "grad_norm": 11.765251159667969, "learning_rate": 4.746405139885101e-05, "loss": 3.7171, "step": 2000 }, { "epoch": 0.06685050555694827, "eval_loss": 3.6495721340179443, "eval_runtime": 249.1863, "eval_samples_per_second": 17.906, "eval_steps_per_second": 2.239, "step": 2000 }, { "epoch": 0.08356313194618534, "grad_norm": 14.349141120910645, "learning_rate": 4.66142026719244e-05, "loss": 3.6213, "step": 2500 }, { "epoch": 0.10027575833542242, "grad_norm": 12.409707069396973, "learning_rate": 4.576435394499779e-05, "loss": 3.5643, "step": 3000 }, { "epoch": 0.10027575833542242, "eval_loss": 3.565227508544922, "eval_runtime": 249.1942, "eval_samples_per_second": 17.906, "eval_steps_per_second": 2.239, "step": 3000 }, { "epoch": 0.11698838472465949, "grad_norm": 8.590898513793945, "learning_rate": 4.4914505218071186e-05, "loss": 3.5126, "step": 3500 }, { "epoch": 0.13370101111389654, "grad_norm": 14.560601234436035, "learning_rate": 4.406465649114458e-05, "loss": 3.4777, "step": 4000 }, { "epoch": 0.13370101111389654, "eval_loss": 3.48999285697937, "eval_runtime": 249.1782, "eval_samples_per_second": 17.907, "eval_steps_per_second": 2.239, "step": 4000 }, { "epoch": 0.1504136375031336, "grad_norm": 10.846818923950195, "learning_rate": 4.321480776421797e-05, "loss": 3.4575, "step": 4500 }, { "epoch": 0.16712626389237067, "grad_norm": 10.20279598236084, "learning_rate": 4.2364959037291364e-05, "loss": 3.3958, "step": 5000 }, { "epoch": 0.16712626389237067, "eval_loss": 3.433652639389038, "eval_runtime": 249.1687, "eval_samples_per_second": 17.908, "eval_steps_per_second": 2.239, "step": 5000 }, { "epoch": 0.18383889028160774, "grad_norm": 11.423223495483398, "learning_rate": 4.1515110310364756e-05, "loss": 3.3574, "step": 5500 }, { "epoch": 0.20055151667084484, "grad_norm": 10.610285758972168, "learning_rate": 4.066526158343815e-05, "loss": 3.3517, "step": 6000 }, { "epoch": 0.20055151667084484, "eval_loss": 3.387014150619507, "eval_runtime": 249.1821, "eval_samples_per_second": 17.907, "eval_steps_per_second": 2.239, "step": 6000 }, { "epoch": 0.2172641430600819, "grad_norm": 8.165976524353027, "learning_rate": 3.981541285651154e-05, "loss": 3.3294, "step": 6500 }, { "epoch": 0.23397676944931897, "grad_norm": 10.196443557739258, "learning_rate": 3.8965564129584935e-05, "loss": 3.2805, "step": 7000 }, { "epoch": 0.23397676944931897, "eval_loss": 3.3461484909057617, "eval_runtime": 249.1887, "eval_samples_per_second": 17.906, "eval_steps_per_second": 2.239, "step": 7000 }, { "epoch": 0.25068939583855604, "grad_norm": 9.405224800109863, "learning_rate": 3.8115715402658334e-05, "loss": 3.2766, "step": 7500 
}, { "epoch": 0.2674020222277931, "grad_norm": 8.490914344787598, "learning_rate": 3.726586667573172e-05, "loss": 3.2408, "step": 8000 }, { "epoch": 0.2674020222277931, "eval_loss": 3.315063714981079, "eval_runtime": 249.1918, "eval_samples_per_second": 17.906, "eval_steps_per_second": 2.239, "step": 8000 }, { "epoch": 0.2841146486170302, "grad_norm": 11.21275806427002, "learning_rate": 3.641601794880511e-05, "loss": 3.2381, "step": 8500 }, { "epoch": 0.3008272750062672, "grad_norm": 10.82959270477295, "learning_rate": 3.556616922187851e-05, "loss": 3.1716, "step": 9000 }, { "epoch": 0.3008272750062672, "eval_loss": 3.2550790309906006, "eval_runtime": 249.1754, "eval_samples_per_second": 17.907, "eval_steps_per_second": 2.239, "step": 9000 }, { "epoch": 0.3175399013955043, "grad_norm": 11.301346778869629, "learning_rate": 3.47163204949519e-05, "loss": 3.176, "step": 9500 }, { "epoch": 0.33425252778474135, "grad_norm": 10.199508666992188, "learning_rate": 3.386647176802529e-05, "loss": 3.1645, "step": 10000 }, { "epoch": 0.33425252778474135, "eval_loss": 3.2413389682769775, "eval_runtime": 249.1913, "eval_samples_per_second": 17.906, "eval_steps_per_second": 2.239, "step": 10000 }, { "epoch": 0.35096515417397844, "grad_norm": 7.239902019500732, "learning_rate": 3.301662304109869e-05, "loss": 3.1457, "step": 10500 }, { "epoch": 0.3676777805632155, "grad_norm": 10.78636360168457, "learning_rate": 3.2166774314172076e-05, "loss": 3.1221, "step": 11000 }, { "epoch": 0.3676777805632155, "eval_loss": 3.1978135108947754, "eval_runtime": 249.1932, "eval_samples_per_second": 17.906, "eval_steps_per_second": 2.239, "step": 11000 }, { "epoch": 0.3843904069524526, "grad_norm": 8.892194747924805, "learning_rate": 3.131692558724547e-05, "loss": 3.1435, "step": 11500 }, { "epoch": 0.4011030333416897, "grad_norm": 9.634190559387207, "learning_rate": 3.0467076860318865e-05, "loss": 3.0882, "step": 12000 }, { "epoch": 0.4011030333416897, "eval_loss": 3.1803853511810303, "eval_runtime": 249.2264, "eval_samples_per_second": 17.903, "eval_steps_per_second": 2.239, "step": 12000 }, { "epoch": 0.4178156597309267, "grad_norm": 8.388688087463379, "learning_rate": 2.9617228133392254e-05, "loss": 3.0468, "step": 12500 }, { "epoch": 0.4345282861201638, "grad_norm": 7.932670593261719, "learning_rate": 2.876737940646565e-05, "loss": 3.0627, "step": 13000 }, { "epoch": 0.4345282861201638, "eval_loss": 3.1499178409576416, "eval_runtime": 249.1889, "eval_samples_per_second": 17.906, "eval_steps_per_second": 2.239, "step": 13000 }, { "epoch": 0.45124091250940085, "grad_norm": 10.718411445617676, "learning_rate": 2.7917530679539043e-05, "loss": 3.0438, "step": 13500 }, { "epoch": 0.46795353889863794, "grad_norm": 10.233518600463867, "learning_rate": 2.7072781044973994e-05, "loss": 3.0231, "step": 14000 }, { "epoch": 0.46795353889863794, "eval_loss": 3.1285717487335205, "eval_runtime": 249.2148, "eval_samples_per_second": 17.904, "eval_steps_per_second": 2.239, "step": 14000 }, { "epoch": 0.484666165287875, "grad_norm": 10.557904243469238, "learning_rate": 2.6222932318047387e-05, "loss": 3.0278, "step": 14500 }, { "epoch": 0.5013787916771121, "grad_norm": 6.9997334480285645, "learning_rate": 2.5373083591120783e-05, "loss": 3.019, "step": 15000 }, { "epoch": 0.5013787916771121, "eval_loss": 3.1056880950927734, "eval_runtime": 249.1758, "eval_samples_per_second": 17.907, "eval_steps_per_second": 2.239, "step": 15000 }, { "epoch": 0.5180914180663492, "grad_norm": 8.922798156738281, "learning_rate": 2.4523234864194176e-05, "loss": 
2.9907, "step": 15500 }, { "epoch": 0.5348040444555862, "grad_norm": 9.043310165405273, "learning_rate": 2.367338613726757e-05, "loss": 2.9945, "step": 16000 }, { "epoch": 0.5348040444555862, "eval_loss": 3.0864944458007812, "eval_runtime": 249.2, "eval_samples_per_second": 17.905, "eval_steps_per_second": 2.239, "step": 16000 }, { "epoch": 0.5515166708448233, "grad_norm": 24.114103317260742, "learning_rate": 2.2830336200156375e-05, "loss": 3.8624, "step": 16500 }, { "epoch": 0.5682292972340603, "grad_norm": 15.645467758178711, "learning_rate": 2.1983886868137474e-05, "loss": 3.5355, "step": 17000 }, { "epoch": 0.5682292972340603, "eval_loss": 3.449280261993408, "eval_runtime": 249.5095, "eval_samples_per_second": 17.883, "eval_steps_per_second": 2.236, "step": 17000 }, { "epoch": 0.5849419236232974, "grad_norm": 12.026097297668457, "learning_rate": 2.113743753611857e-05, "loss": 3.4294, "step": 17500 }, { "epoch": 0.6016545500125344, "grad_norm": 15.589037895202637, "learning_rate": 2.0287588809191966e-05, "loss": 3.3682, "step": 18000 }, { "epoch": 0.6016545500125344, "eval_loss": 3.334568500518799, "eval_runtime": 249.295, "eval_samples_per_second": 17.898, "eval_steps_per_second": 2.238, "step": 18000 }, { "epoch": 0.6183671764017715, "grad_norm": 18.867961883544922, "learning_rate": 1.9437740082265356e-05, "loss": 3.3126, "step": 18500 }, { "epoch": 0.6350798027910086, "grad_norm": 14.147026062011719, "learning_rate": 1.8587891355338752e-05, "loss": 3.2718, "step": 19000 }, { "epoch": 0.6350798027910086, "eval_loss": 3.245798349380493, "eval_runtime": 249.2433, "eval_samples_per_second": 17.902, "eval_steps_per_second": 2.239, "step": 19000 }, { "epoch": 0.6517924291802457, "grad_norm": 15.220115661621094, "learning_rate": 1.7738042628412145e-05, "loss": 3.1792, "step": 19500 }, { "epoch": 0.6685050555694827, "grad_norm": 17.45062255859375, "learning_rate": 1.6888193901485537e-05, "loss": 3.1603, "step": 20000 }, { "epoch": 0.6685050555694827, "eval_loss": 3.180778741836548, "eval_runtime": 249.2332, "eval_samples_per_second": 17.903, "eval_steps_per_second": 2.239, "step": 20000 }, { "epoch": 0.6852176819587198, "grad_norm": 16.632200241088867, "learning_rate": 1.603834517455893e-05, "loss": 3.1225, "step": 20500 }, { "epoch": 0.7019303083479569, "grad_norm": 13.793306350708008, "learning_rate": 1.5188496447632323e-05, "loss": 3.0798, "step": 21000 }, { "epoch": 0.7019303083479569, "eval_loss": 3.1489596366882324, "eval_runtime": 249.2624, "eval_samples_per_second": 17.901, "eval_steps_per_second": 2.239, "step": 21000 }, { "epoch": 0.718642934737194, "grad_norm": 18.920320510864258, "learning_rate": 1.4338647720705714e-05, "loss": 3.0602, "step": 21500 }, { "epoch": 0.735355561126431, "grad_norm": 16.346004486083984, "learning_rate": 1.3488798993779108e-05, "loss": 3.0281, "step": 22000 }, { "epoch": 0.735355561126431, "eval_loss": 3.083583354949951, "eval_runtime": 249.1953, "eval_samples_per_second": 17.906, "eval_steps_per_second": 2.239, "step": 22000 }, { "epoch": 0.7520681875156681, "grad_norm": 12.671531677246094, "learning_rate": 1.26389502668525e-05, "loss": 3.0254, "step": 22500 }, { "epoch": 0.7687808139049052, "grad_norm": 14.962254524230957, "learning_rate": 1.1789101539925894e-05, "loss": 3.023, "step": 23000 }, { "epoch": 0.7687808139049052, "eval_loss": 3.0390138626098633, "eval_runtime": 249.2133, "eval_samples_per_second": 17.904, "eval_steps_per_second": 2.239, "step": 23000 }, { "epoch": 0.7854934402941423, "grad_norm": 16.1228084564209, "learning_rate": 
1.0939252812999288e-05, "loss": 2.9347, "step": 23500 }, { "epoch": 0.8022060666833793, "grad_norm": 13.979843139648438, "learning_rate": 1.0089404086072679e-05, "loss": 2.9604, "step": 24000 }, { "epoch": 0.8022060666833793, "eval_loss": 3.0071020126342773, "eval_runtime": 249.2612, "eval_samples_per_second": 17.901, "eval_steps_per_second": 2.239, "step": 24000 }, { "epoch": 0.8189186930726163, "grad_norm": 17.260498046875, "learning_rate": 9.239555359146072e-06, "loss": 2.899, "step": 24500 }, { "epoch": 0.8356313194618534, "grad_norm": 13.376078605651855, "learning_rate": 8.389706632219466e-06, "loss": 2.9291, "step": 25000 }, { "epoch": 0.8356313194618534, "eval_loss": 2.9687299728393555, "eval_runtime": 249.2378, "eval_samples_per_second": 17.903, "eval_steps_per_second": 2.239, "step": 25000 }, { "epoch": 0.8523439458510905, "grad_norm": 20.349105834960938, "learning_rate": 7.539857905292859e-06, "loss": 2.8853, "step": 25500 }, { "epoch": 0.8690565722403276, "grad_norm": 17.05868911743164, "learning_rate": 6.690009178366251e-06, "loss": 2.8542, "step": 26000 }, { "epoch": 0.8690565722403276, "eval_loss": 2.9402894973754883, "eval_runtime": 249.3105, "eval_samples_per_second": 17.897, "eval_steps_per_second": 2.238, "step": 26000 }, { "epoch": 0.8857691986295646, "grad_norm": 28.65497589111328, "learning_rate": 5.846959241255057e-06, "loss": 3.7512, "step": 26500 }, { "epoch": 0.9024818250188017, "grad_norm": 41.53907775878906, "learning_rate": 4.99711051432845e-06, "loss": 3.4169, "step": 27000 }, { "epoch": 0.9024818250188017, "eval_loss": 3.276771068572998, "eval_runtime": 249.323, "eval_samples_per_second": 17.896, "eval_steps_per_second": 2.238, "step": 27000 }, { "epoch": 0.9191944514080388, "grad_norm": 26.49399757385254, "learning_rate": 4.147261787401842e-06, "loss": 3.3055, "step": 27500 }, { "epoch": 0.9359070777972759, "grad_norm": 30.56415557861328, "learning_rate": 3.297413060475236e-06, "loss": 3.2131, "step": 28000 }, { "epoch": 0.9359070777972759, "eval_loss": 3.140591859817505, "eval_runtime": 249.261, "eval_samples_per_second": 17.901, "eval_steps_per_second": 2.239, "step": 28000 }, { "epoch": 0.9526197041865129, "grad_norm": 33.53205871582031, "learning_rate": 2.4509637284563352e-06, "loss": 3.1509, "step": 28500 }, { "epoch": 0.96933233057575, "grad_norm": 26.73455810546875, "learning_rate": 1.6011150015297277e-06, "loss": 3.0998, "step": 29000 }, { "epoch": 0.96933233057575, "eval_loss": 3.0688536167144775, "eval_runtime": 249.2871, "eval_samples_per_second": 17.899, "eval_steps_per_second": 2.238, "step": 29000 }, { "epoch": 0.9860449569649871, "grad_norm": 26.864349365234375, "learning_rate": 7.512662746031207e-07, "loss": 3.0876, "step": 29500 }, { "epoch": 0.25068939583855604, "grad_norm": 65.23794555664062, "learning_rate": 4.794667744453116e-05, "loss": 4.6117, "step": 30000 }, { "epoch": 0.25068939583855604, "eval_loss": 4.784034252166748, "eval_runtime": 253.6816, "eval_samples_per_second": 17.589, "eval_steps_per_second": 2.2, "step": 30000 } ], "logging_steps": 500, "max_steps": 718020, "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.063185677140099e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }
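The state above is the trainer_state.json that Hugging Face's Trainer writes alongside each checkpoint: log_history interleaves per-500-step training records (with "loss") and evaluation records (with "eval_loss"), and best_metric / best_model_checkpoint point at the lowest evaluation loss seen so far (2.9403 at step 26000 here). Below is a minimal sketch for inspecting such a file offline; the local path "trainer_state.json" and the use of the plain json module are assumptions for illustration, not part of the training run itself.

    import json

    # Assumed local copy of the Trainer state; adjust the path as needed,
    # e.g. ./models/lora-finetuning/LLaMmlein_1B/checkpoint-30000/trainer_state.json
    STATE_PATH = "trainer_state.json"

    with open(STATE_PATH, encoding="utf-8") as f:
        state = json.load(f)

    # log_history mixes training logs ("loss") and evaluation logs ("eval_loss");
    # keep only the evaluation entries.
    eval_logs = [entry for entry in state["log_history"] if "eval_loss" in entry]

    for entry in eval_logs:
        print(f'step {entry["step"]:>6}: eval_loss = {entry["eval_loss"]:.4f}')

    # Lowest evaluation loss found in the log, compared against what the
    # Trainer itself recorded as the best checkpoint.
    best = min(eval_logs, key=lambda entry: entry["eval_loss"])
    print(f'\nbest eval_loss {best["eval_loss"]:.4f} at step {best["step"]}')
    print(f'trainer-reported best_metric {state["best_metric"]:.4f} '
          f'({state["best_model_checkpoint"]})')

Filtering on the presence of the "eval_loss" key is the simplest way to separate the two record types, since both share the "epoch" and "step" fields; for this file the script's minimum should agree with best_metric and best_model_checkpoint (checkpoint-26000).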