|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 3.0, |
|
"eval_steps": 10, |
|
"global_step": 66, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.046511627906976744, |
|
"grad_norm": 3.328749895095825, |
|
"learning_rate": 0.0, |
|
"loss": 4.5983, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.09302325581395349, |
|
"grad_norm": 3.354735851287842, |
|
"learning_rate": 2.857142857142857e-05, |
|
"loss": 4.5836, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.13953488372093023, |
|
"grad_norm": 2.9163668155670166, |
|
"learning_rate": 5.714285714285714e-05, |
|
"loss": 4.397, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.18604651162790697, |
|
"grad_norm": 2.2333481311798096, |
|
"learning_rate": 8.571428571428571e-05, |
|
"loss": 4.1457, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.23255813953488372, |
|
"grad_norm": 1.991350769996643, |
|
"learning_rate": 0.00011428571428571428, |
|
"loss": 3.9002, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.27906976744186046, |
|
"grad_norm": 1.85317063331604, |
|
"learning_rate": 0.00014285714285714287, |
|
"loss": 3.5969, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.32558139534883723, |
|
"grad_norm": 1.9501370191574097, |
|
"learning_rate": 0.00017142857142857143, |
|
"loss": 3.2651, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.37209302325581395, |
|
"grad_norm": 2.216587543487549, |
|
"learning_rate": 0.0002, |
|
"loss": 2.8473, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.4186046511627907, |
|
"grad_norm": 2.0312318801879883, |
|
"learning_rate": 0.0001998582695676762, |
|
"loss": 2.3923, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.46511627906976744, |
|
"grad_norm": 1.8019684553146362, |
|
"learning_rate": 0.00019943348002101371, |
|
"loss": 2.0528, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.46511627906976744, |
|
"eval_loss": 1.894411325454712, |
|
"eval_runtime": 145.4733, |
|
"eval_samples_per_second": 33.601, |
|
"eval_steps_per_second": 0.137, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.5116279069767442, |
|
"grad_norm": 1.951332449913025, |
|
"learning_rate": 0.00019872683547213446, |
|
"loss": 1.7962, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.5581395348837209, |
|
"grad_norm": 1.5264594554901123, |
|
"learning_rate": 0.00019774033898178667, |
|
"loss": 1.5795, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.6046511627906976, |
|
"grad_norm": 1.0167055130004883, |
|
"learning_rate": 0.0001964767868814516, |
|
"loss": 1.4307, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.6511627906976745, |
|
"grad_norm": 1.1928244829177856, |
|
"learning_rate": 0.00019493976084683813, |
|
"loss": 1.3048, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.6976744186046512, |
|
"grad_norm": 1.513029932975769, |
|
"learning_rate": 0.00019313361774523385, |
|
"loss": 1.1708, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.7441860465116279, |
|
"grad_norm": 1.6432111263275146, |
|
"learning_rate": 0.00019106347728549135, |
|
"loss": 1.0173, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.7906976744186046, |
|
"grad_norm": 1.4672132730484009, |
|
"learning_rate": 0.00018873520750565718, |
|
"loss": 0.8399, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.8372093023255814, |
|
"grad_norm": 1.3380573987960815, |
|
"learning_rate": 0.0001861554081393806, |
|
"loss": 0.6508, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.8837209302325582, |
|
"grad_norm": 1.26139235496521, |
|
"learning_rate": 0.0001833313919082515, |
|
"loss": 0.4701, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.9302325581395349, |
|
"grad_norm": 1.0803511142730713, |
|
"learning_rate": 0.00018027116379309638, |
|
"loss": 0.316, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.9302325581395349, |
|
"eval_loss": 0.19302108883857727, |
|
"eval_runtime": 145.4827, |
|
"eval_samples_per_second": 33.598, |
|
"eval_steps_per_second": 0.137, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.9767441860465116, |
|
"grad_norm": 0.8388907313346863, |
|
"learning_rate": 0.00017698339834299061, |
|
"loss": 0.1865, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.6994265913963318, |
|
"learning_rate": 0.00017347741508630672, |
|
"loss": 0.1083, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 1.0465116279069768, |
|
"grad_norm": 0.266382098197937, |
|
"learning_rate": 0.0001697631521134985, |
|
"loss": 0.0619, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 1.0930232558139534, |
|
"grad_norm": 0.17738769948482513, |
|
"learning_rate": 0.00016585113790650388, |
|
"loss": 0.0463, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 1.1395348837209303, |
|
"grad_norm": 0.12172795832157135, |
|
"learning_rate": 0.0001617524614946192, |
|
"loss": 0.039, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 1.1860465116279069, |
|
"grad_norm": 0.1264028549194336, |
|
"learning_rate": 0.0001574787410214407, |
|
"loss": 0.0372, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 1.2325581395348837, |
|
"grad_norm": 0.108629010617733, |
|
"learning_rate": 0.00015304209081197425, |
|
"loss": 0.0332, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 1.2790697674418605, |
|
"grad_norm": 0.0759243443608284, |
|
"learning_rate": 0.00014845508703326504, |
|
"loss": 0.0323, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 1.3255813953488373, |
|
"grad_norm": 0.06167895719408989, |
|
"learning_rate": 0.00014373073204588556, |
|
"loss": 0.0292, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 1.372093023255814, |
|
"grad_norm": 0.05063502490520477, |
|
"learning_rate": 0.00013888241754733208, |
|
"loss": 0.0322, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 1.372093023255814, |
|
"eval_loss": 0.03178785368800163, |
|
"eval_runtime": 144.6511, |
|
"eval_samples_per_second": 33.792, |
|
"eval_steps_per_second": 0.138, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 1.4186046511627908, |
|
"grad_norm": 0.05390379950404167, |
|
"learning_rate": 0.00013392388661180303, |
|
"loss": 0.0317, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 1.4651162790697674, |
|
"grad_norm": 0.05356408655643463, |
|
"learning_rate": 0.0001288691947339621, |
|
"loss": 0.0261, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 1.5116279069767442, |
|
"grad_norm": 0.039639439433813095, |
|
"learning_rate": 0.0001237326699871115, |
|
"loss": 0.0286, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 1.558139534883721, |
|
"grad_norm": 0.03714418411254883, |
|
"learning_rate": 0.00011852887240871145, |
|
"loss": 0.0279, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 1.6046511627906976, |
|
"grad_norm": 0.04030028358101845, |
|
"learning_rate": 0.00011327255272837221, |
|
"loss": 0.0265, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 1.6511627906976745, |
|
"grad_norm": 0.035650961101055145, |
|
"learning_rate": 0.00010797861055530831, |
|
"loss": 0.0253, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 1.697674418604651, |
|
"grad_norm": 0.041386678814888, |
|
"learning_rate": 0.00010266205214377748, |
|
"loss": 0.0261, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 1.744186046511628, |
|
"grad_norm": 0.03318718075752258, |
|
"learning_rate": 9.733794785622253e-05, |
|
"loss": 0.0253, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 1.7906976744186047, |
|
"grad_norm": 0.03519133850932121, |
|
"learning_rate": 9.202138944469168e-05, |
|
"loss": 0.0259, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 1.8372093023255816, |
|
"grad_norm": 0.03141423687338829, |
|
"learning_rate": 8.672744727162781e-05, |
|
"loss": 0.0229, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 1.8372093023255816, |
|
"eval_loss": 0.028911028057336807, |
|
"eval_runtime": 145.5272, |
|
"eval_samples_per_second": 33.588, |
|
"eval_steps_per_second": 0.137, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 1.8837209302325582, |
|
"grad_norm": 0.04335688054561615, |
|
"learning_rate": 8.147112759128859e-05, |
|
"loss": 0.0246, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 1.9302325581395348, |
|
"grad_norm": 0.031896017491817474, |
|
"learning_rate": 7.626733001288851e-05, |
|
"loss": 0.0235, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 1.9767441860465116, |
|
"grad_norm": 0.03081543557345867, |
|
"learning_rate": 7.113080526603792e-05, |
|
"loss": 0.0247, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.04796084016561508, |
|
"learning_rate": 6.607611338819697e-05, |
|
"loss": 0.0266, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 2.046511627906977, |
|
"grad_norm": 0.032496869564056396, |
|
"learning_rate": 6.111758245266794e-05, |
|
"loss": 0.0217, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 2.0930232558139537, |
|
"grad_norm": 0.03214440867304802, |
|
"learning_rate": 5.626926795411447e-05, |
|
"loss": 0.022, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 2.13953488372093, |
|
"grad_norm": 0.026299171149730682, |
|
"learning_rate": 5.1544912966734994e-05, |
|
"loss": 0.0206, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 2.186046511627907, |
|
"grad_norm": 0.02710675820708275, |
|
"learning_rate": 4.695790918802576e-05, |
|
"loss": 0.0241, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 2.2325581395348837, |
|
"grad_norm": 0.027937039732933044, |
|
"learning_rate": 4.252125897855932e-05, |
|
"loss": 0.0213, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 2.2790697674418605, |
|
"grad_norm": 0.03069477155804634, |
|
"learning_rate": 3.824753850538082e-05, |
|
"loss": 0.0206, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 2.2790697674418605, |
|
"eval_loss": 0.02796892449259758, |
|
"eval_runtime": 144.1386, |
|
"eval_samples_per_second": 33.912, |
|
"eval_steps_per_second": 0.139, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 2.3255813953488373, |
|
"grad_norm": 0.027339283376932144, |
|
"learning_rate": 3.414886209349615e-05, |
|
"loss": 0.0202, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 2.3720930232558137, |
|
"grad_norm": 0.02695722132921219, |
|
"learning_rate": 3.0236847886501542e-05, |
|
"loss": 0.0214, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 2.4186046511627906, |
|
"grad_norm": 0.022607291117310524, |
|
"learning_rate": 2.6522584913693294e-05, |
|
"loss": 0.0202, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 2.4651162790697674, |
|
"grad_norm": 0.02922969125211239, |
|
"learning_rate": 2.301660165700936e-05, |
|
"loss": 0.0225, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 2.511627906976744, |
|
"grad_norm": 0.02776559814810753, |
|
"learning_rate": 1.9728836206903656e-05, |
|
"loss": 0.0188, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 2.558139534883721, |
|
"grad_norm": 0.027599385008215904, |
|
"learning_rate": 1.6668608091748495e-05, |
|
"loss": 0.0215, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 2.604651162790698, |
|
"grad_norm": 0.02333148941397667, |
|
"learning_rate": 1.3844591860619383e-05, |
|
"loss": 0.0197, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 2.6511627906976747, |
|
"grad_norm": 0.02748979814350605, |
|
"learning_rate": 1.1264792494342857e-05, |
|
"loss": 0.023, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 2.697674418604651, |
|
"grad_norm": 0.023095615208148956, |
|
"learning_rate": 8.936522714508678e-06, |
|
"loss": 0.021, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 2.744186046511628, |
|
"grad_norm": 0.030044227838516235, |
|
"learning_rate": 6.866382254766157e-06, |
|
"loss": 0.0238, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 2.744186046511628, |
|
"eval_loss": 0.027968447655439377, |
|
"eval_runtime": 144.4837, |
|
"eval_samples_per_second": 33.831, |
|
"eval_steps_per_second": 0.138, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 2.7906976744186047, |
|
"grad_norm": 0.026822634041309357, |
|
"learning_rate": 5.060239153161872e-06, |
|
"loss": 0.022, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 2.8372093023255816, |
|
"grad_norm": 0.02589753456413746, |
|
"learning_rate": 3.5232131185484076e-06, |
|
"loss": 0.0193, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 2.883720930232558, |
|
"grad_norm": 0.025665169581770897, |
|
"learning_rate": 2.259661018213333e-06, |
|
"loss": 0.0211, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 2.9302325581395348, |
|
"grad_norm": 0.025219907984137535, |
|
"learning_rate": 1.2731645278655445e-06, |
|
"loss": 0.0205, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 2.9767441860465116, |
|
"grad_norm": 0.02233767695724964, |
|
"learning_rate": 5.665199789862907e-07, |
|
"loss": 0.0211, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 0.03987536579370499, |
|
"learning_rate": 1.4173043232380557e-07, |
|
"loss": 0.0221, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"step": 66, |
|
"total_flos": 4.4321989464330076e+18, |
|
"train_loss": 0.7241760994674582, |
|
"train_runtime": 13313.2011, |
|
"train_samples_per_second": 9.912, |
|
"train_steps_per_second": 0.005 |
|
} |
|
], |
|
"logging_steps": 1.0, |
|
"max_steps": 66, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 50, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 4.4321989464330076e+18, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|