PumeTu's picture
Add files using upload-large-folder tool
2f08073 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 10,
"global_step": 66,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.046511627906976744,
"grad_norm": 3.328749895095825,
"learning_rate": 0.0,
"loss": 4.5983,
"step": 1
},
{
"epoch": 0.09302325581395349,
"grad_norm": 3.354735851287842,
"learning_rate": 2.857142857142857e-05,
"loss": 4.5836,
"step": 2
},
{
"epoch": 0.13953488372093023,
"grad_norm": 2.9163668155670166,
"learning_rate": 5.714285714285714e-05,
"loss": 4.397,
"step": 3
},
{
"epoch": 0.18604651162790697,
"grad_norm": 2.2333481311798096,
"learning_rate": 8.571428571428571e-05,
"loss": 4.1457,
"step": 4
},
{
"epoch": 0.23255813953488372,
"grad_norm": 1.991350769996643,
"learning_rate": 0.00011428571428571428,
"loss": 3.9002,
"step": 5
},
{
"epoch": 0.27906976744186046,
"grad_norm": 1.85317063331604,
"learning_rate": 0.00014285714285714287,
"loss": 3.5969,
"step": 6
},
{
"epoch": 0.32558139534883723,
"grad_norm": 1.9501370191574097,
"learning_rate": 0.00017142857142857143,
"loss": 3.2651,
"step": 7
},
{
"epoch": 0.37209302325581395,
"grad_norm": 2.216587543487549,
"learning_rate": 0.0002,
"loss": 2.8473,
"step": 8
},
{
"epoch": 0.4186046511627907,
"grad_norm": 2.0312318801879883,
"learning_rate": 0.0001998582695676762,
"loss": 2.3923,
"step": 9
},
{
"epoch": 0.46511627906976744,
"grad_norm": 1.8019684553146362,
"learning_rate": 0.00019943348002101371,
"loss": 2.0528,
"step": 10
},
{
"epoch": 0.46511627906976744,
"eval_loss": 1.894411325454712,
"eval_runtime": 145.4733,
"eval_samples_per_second": 33.601,
"eval_steps_per_second": 0.137,
"step": 10
},
{
"epoch": 0.5116279069767442,
"grad_norm": 1.951332449913025,
"learning_rate": 0.00019872683547213446,
"loss": 1.7962,
"step": 11
},
{
"epoch": 0.5581395348837209,
"grad_norm": 1.5264594554901123,
"learning_rate": 0.00019774033898178667,
"loss": 1.5795,
"step": 12
},
{
"epoch": 0.6046511627906976,
"grad_norm": 1.0167055130004883,
"learning_rate": 0.0001964767868814516,
"loss": 1.4307,
"step": 13
},
{
"epoch": 0.6511627906976745,
"grad_norm": 1.1928244829177856,
"learning_rate": 0.00019493976084683813,
"loss": 1.3048,
"step": 14
},
{
"epoch": 0.6976744186046512,
"grad_norm": 1.513029932975769,
"learning_rate": 0.00019313361774523385,
"loss": 1.1708,
"step": 15
},
{
"epoch": 0.7441860465116279,
"grad_norm": 1.6432111263275146,
"learning_rate": 0.00019106347728549135,
"loss": 1.0173,
"step": 16
},
{
"epoch": 0.7906976744186046,
"grad_norm": 1.4672132730484009,
"learning_rate": 0.00018873520750565718,
"loss": 0.8399,
"step": 17
},
{
"epoch": 0.8372093023255814,
"grad_norm": 1.3380573987960815,
"learning_rate": 0.0001861554081393806,
"loss": 0.6508,
"step": 18
},
{
"epoch": 0.8837209302325582,
"grad_norm": 1.26139235496521,
"learning_rate": 0.0001833313919082515,
"loss": 0.4701,
"step": 19
},
{
"epoch": 0.9302325581395349,
"grad_norm": 1.0803511142730713,
"learning_rate": 0.00018027116379309638,
"loss": 0.316,
"step": 20
},
{
"epoch": 0.9302325581395349,
"eval_loss": 0.19302108883857727,
"eval_runtime": 145.4827,
"eval_samples_per_second": 33.598,
"eval_steps_per_second": 0.137,
"step": 20
},
{
"epoch": 0.9767441860465116,
"grad_norm": 0.8388907313346863,
"learning_rate": 0.00017698339834299061,
"loss": 0.1865,
"step": 21
},
{
"epoch": 1.0,
"grad_norm": 0.6994265913963318,
"learning_rate": 0.00017347741508630672,
"loss": 0.1083,
"step": 22
},
{
"epoch": 1.0465116279069768,
"grad_norm": 0.266382098197937,
"learning_rate": 0.0001697631521134985,
"loss": 0.0619,
"step": 23
},
{
"epoch": 1.0930232558139534,
"grad_norm": 0.17738769948482513,
"learning_rate": 0.00016585113790650388,
"loss": 0.0463,
"step": 24
},
{
"epoch": 1.1395348837209303,
"grad_norm": 0.12172795832157135,
"learning_rate": 0.0001617524614946192,
"loss": 0.039,
"step": 25
},
{
"epoch": 1.1860465116279069,
"grad_norm": 0.1264028549194336,
"learning_rate": 0.0001574787410214407,
"loss": 0.0372,
"step": 26
},
{
"epoch": 1.2325581395348837,
"grad_norm": 0.108629010617733,
"learning_rate": 0.00015304209081197425,
"loss": 0.0332,
"step": 27
},
{
"epoch": 1.2790697674418605,
"grad_norm": 0.0759243443608284,
"learning_rate": 0.00014845508703326504,
"loss": 0.0323,
"step": 28
},
{
"epoch": 1.3255813953488373,
"grad_norm": 0.06167895719408989,
"learning_rate": 0.00014373073204588556,
"loss": 0.0292,
"step": 29
},
{
"epoch": 1.372093023255814,
"grad_norm": 0.05063502490520477,
"learning_rate": 0.00013888241754733208,
"loss": 0.0322,
"step": 30
},
{
"epoch": 1.372093023255814,
"eval_loss": 0.03178785368800163,
"eval_runtime": 144.6511,
"eval_samples_per_second": 33.792,
"eval_steps_per_second": 0.138,
"step": 30
},
{
"epoch": 1.4186046511627908,
"grad_norm": 0.05390379950404167,
"learning_rate": 0.00013392388661180303,
"loss": 0.0317,
"step": 31
},
{
"epoch": 1.4651162790697674,
"grad_norm": 0.05356408655643463,
"learning_rate": 0.0001288691947339621,
"loss": 0.0261,
"step": 32
},
{
"epoch": 1.5116279069767442,
"grad_norm": 0.039639439433813095,
"learning_rate": 0.0001237326699871115,
"loss": 0.0286,
"step": 33
},
{
"epoch": 1.558139534883721,
"grad_norm": 0.03714418411254883,
"learning_rate": 0.00011852887240871145,
"loss": 0.0279,
"step": 34
},
{
"epoch": 1.6046511627906976,
"grad_norm": 0.04030028358101845,
"learning_rate": 0.00011327255272837221,
"loss": 0.0265,
"step": 35
},
{
"epoch": 1.6511627906976745,
"grad_norm": 0.035650961101055145,
"learning_rate": 0.00010797861055530831,
"loss": 0.0253,
"step": 36
},
{
"epoch": 1.697674418604651,
"grad_norm": 0.041386678814888,
"learning_rate": 0.00010266205214377748,
"loss": 0.0261,
"step": 37
},
{
"epoch": 1.744186046511628,
"grad_norm": 0.03318718075752258,
"learning_rate": 9.733794785622253e-05,
"loss": 0.0253,
"step": 38
},
{
"epoch": 1.7906976744186047,
"grad_norm": 0.03519133850932121,
"learning_rate": 9.202138944469168e-05,
"loss": 0.0259,
"step": 39
},
{
"epoch": 1.8372093023255816,
"grad_norm": 0.03141423687338829,
"learning_rate": 8.672744727162781e-05,
"loss": 0.0229,
"step": 40
},
{
"epoch": 1.8372093023255816,
"eval_loss": 0.028911028057336807,
"eval_runtime": 145.5272,
"eval_samples_per_second": 33.588,
"eval_steps_per_second": 0.137,
"step": 40
},
{
"epoch": 1.8837209302325582,
"grad_norm": 0.04335688054561615,
"learning_rate": 8.147112759128859e-05,
"loss": 0.0246,
"step": 41
},
{
"epoch": 1.9302325581395348,
"grad_norm": 0.031896017491817474,
"learning_rate": 7.626733001288851e-05,
"loss": 0.0235,
"step": 42
},
{
"epoch": 1.9767441860465116,
"grad_norm": 0.03081543557345867,
"learning_rate": 7.113080526603792e-05,
"loss": 0.0247,
"step": 43
},
{
"epoch": 2.0,
"grad_norm": 0.04796084016561508,
"learning_rate": 6.607611338819697e-05,
"loss": 0.0266,
"step": 44
},
{
"epoch": 2.046511627906977,
"grad_norm": 0.032496869564056396,
"learning_rate": 6.111758245266794e-05,
"loss": 0.0217,
"step": 45
},
{
"epoch": 2.0930232558139537,
"grad_norm": 0.03214440867304802,
"learning_rate": 5.626926795411447e-05,
"loss": 0.022,
"step": 46
},
{
"epoch": 2.13953488372093,
"grad_norm": 0.026299171149730682,
"learning_rate": 5.1544912966734994e-05,
"loss": 0.0206,
"step": 47
},
{
"epoch": 2.186046511627907,
"grad_norm": 0.02710675820708275,
"learning_rate": 4.695790918802576e-05,
"loss": 0.0241,
"step": 48
},
{
"epoch": 2.2325581395348837,
"grad_norm": 0.027937039732933044,
"learning_rate": 4.252125897855932e-05,
"loss": 0.0213,
"step": 49
},
{
"epoch": 2.2790697674418605,
"grad_norm": 0.03069477155804634,
"learning_rate": 3.824753850538082e-05,
"loss": 0.0206,
"step": 50
},
{
"epoch": 2.2790697674418605,
"eval_loss": 0.02796892449259758,
"eval_runtime": 144.1386,
"eval_samples_per_second": 33.912,
"eval_steps_per_second": 0.139,
"step": 50
},
{
"epoch": 2.3255813953488373,
"grad_norm": 0.027339283376932144,
"learning_rate": 3.414886209349615e-05,
"loss": 0.0202,
"step": 51
},
{
"epoch": 2.3720930232558137,
"grad_norm": 0.02695722132921219,
"learning_rate": 3.0236847886501542e-05,
"loss": 0.0214,
"step": 52
},
{
"epoch": 2.4186046511627906,
"grad_norm": 0.022607291117310524,
"learning_rate": 2.6522584913693294e-05,
"loss": 0.0202,
"step": 53
},
{
"epoch": 2.4651162790697674,
"grad_norm": 0.02922969125211239,
"learning_rate": 2.301660165700936e-05,
"loss": 0.0225,
"step": 54
},
{
"epoch": 2.511627906976744,
"grad_norm": 0.02776559814810753,
"learning_rate": 1.9728836206903656e-05,
"loss": 0.0188,
"step": 55
},
{
"epoch": 2.558139534883721,
"grad_norm": 0.027599385008215904,
"learning_rate": 1.6668608091748495e-05,
"loss": 0.0215,
"step": 56
},
{
"epoch": 2.604651162790698,
"grad_norm": 0.02333148941397667,
"learning_rate": 1.3844591860619383e-05,
"loss": 0.0197,
"step": 57
},
{
"epoch": 2.6511627906976747,
"grad_norm": 0.02748979814350605,
"learning_rate": 1.1264792494342857e-05,
"loss": 0.023,
"step": 58
},
{
"epoch": 2.697674418604651,
"grad_norm": 0.023095615208148956,
"learning_rate": 8.936522714508678e-06,
"loss": 0.021,
"step": 59
},
{
"epoch": 2.744186046511628,
"grad_norm": 0.030044227838516235,
"learning_rate": 6.866382254766157e-06,
"loss": 0.0238,
"step": 60
},
{
"epoch": 2.744186046511628,
"eval_loss": 0.027968447655439377,
"eval_runtime": 144.4837,
"eval_samples_per_second": 33.831,
"eval_steps_per_second": 0.138,
"step": 60
},
{
"epoch": 2.7906976744186047,
"grad_norm": 0.026822634041309357,
"learning_rate": 5.060239153161872e-06,
"loss": 0.022,
"step": 61
},
{
"epoch": 2.8372093023255816,
"grad_norm": 0.02589753456413746,
"learning_rate": 3.5232131185484076e-06,
"loss": 0.0193,
"step": 62
},
{
"epoch": 2.883720930232558,
"grad_norm": 0.025665169581770897,
"learning_rate": 2.259661018213333e-06,
"loss": 0.0211,
"step": 63
},
{
"epoch": 2.9302325581395348,
"grad_norm": 0.025219907984137535,
"learning_rate": 1.2731645278655445e-06,
"loss": 0.0205,
"step": 64
},
{
"epoch": 2.9767441860465116,
"grad_norm": 0.02233767695724964,
"learning_rate": 5.665199789862907e-07,
"loss": 0.0211,
"step": 65
},
{
"epoch": 3.0,
"grad_norm": 0.03987536579370499,
"learning_rate": 1.4173043232380557e-07,
"loss": 0.0221,
"step": 66
},
{
"epoch": 3.0,
"step": 66,
"total_flos": 4.4321989464330076e+18,
"train_loss": 0.7241760994674582,
"train_runtime": 13313.2011,
"train_samples_per_second": 9.912,
"train_steps_per_second": 0.005
}
],
"logging_steps": 1.0,
"max_steps": 66,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 4.4321989464330076e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}