{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.0006535922081351315,
"eval_steps": 500,
"global_step": 500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 1.307184416270263e-06,
"grad_norm": 6.268255710601807,
"learning_rate": 0.0,
"loss": 7.0486,
"step": 1
},
{
"epoch": 2.614368832540526e-06,
"grad_norm": 5.648464202880859,
"learning_rate": 5e-06,
"loss": 6.6548,
"step": 2
},
{
"epoch": 3.921553248810789e-06,
"grad_norm": 3.917975902557373,
"learning_rate": 1e-05,
"loss": 5.9005,
"step": 3
},
{
"epoch": 5.228737665081052e-06,
"grad_norm": 5.040616989135742,
"learning_rate": 1.5e-05,
"loss": 6.4287,
"step": 4
},
{
"epoch": 6.535922081351315e-06,
"grad_norm": 4.18554162979126,
"learning_rate": 2e-05,
"loss": 5.8283,
"step": 5
},
{
"epoch": 7.843106497621578e-06,
"grad_norm": 3.837583303451538,
"learning_rate": 2.5e-05,
"loss": 5.8711,
"step": 6
},
{
"epoch": 9.150290913891842e-06,
"grad_norm": 3.9937901496887207,
"learning_rate": 3e-05,
"loss": 6.2397,
"step": 7
},
{
"epoch": 1.0457475330162104e-05,
"grad_norm": 6.323297023773193,
"learning_rate": 3.5e-05,
"loss": 6.0184,
"step": 8
},
{
"epoch": 1.1764659746432368e-05,
"grad_norm": 4.541579246520996,
"learning_rate": 4e-05,
"loss": 4.9289,
"step": 9
},
{
"epoch": 1.307184416270263e-05,
"grad_norm": 4.3712358474731445,
"learning_rate": 4.5e-05,
"loss": 5.5431,
"step": 10
},
{
"epoch": 1.4379028578972894e-05,
"grad_norm": 5.535464763641357,
"learning_rate": 5e-05,
"loss": 5.4152,
"step": 11
},
{
"epoch": 1.5686212995243156e-05,
"grad_norm": 4.411280632019043,
"learning_rate": 4.9994994994994995e-05,
"loss": 5.0227,
"step": 12
},
{
"epoch": 1.6993397411513418e-05,
"grad_norm": 8.777021408081055,
"learning_rate": 4.9989989989989995e-05,
"loss": 4.628,
"step": 13
},
{
"epoch": 1.8300581827783683e-05,
"grad_norm": 11.268546104431152,
"learning_rate": 4.998498498498499e-05,
"loss": 4.4529,
"step": 14
},
{
"epoch": 1.9607766244053945e-05,
"grad_norm": 11.081679344177246,
"learning_rate": 4.997997997997999e-05,
"loss": 3.7927,
"step": 15
},
{
"epoch": 2.0914950660324208e-05,
"grad_norm": 17.39815330505371,
"learning_rate": 4.9974974974974974e-05,
"loss": 3.8354,
"step": 16
},
{
"epoch": 2.222213507659447e-05,
"grad_norm": 19.696117401123047,
"learning_rate": 4.9969969969969974e-05,
"loss": 2.6296,
"step": 17
},
{
"epoch": 2.3529319492864735e-05,
"grad_norm": 9.491320610046387,
"learning_rate": 4.996496496496497e-05,
"loss": 2.34,
"step": 18
},
{
"epoch": 2.4836503909134997e-05,
"grad_norm": 8.10347843170166,
"learning_rate": 4.995995995995996e-05,
"loss": 2.8097,
"step": 19
},
{
"epoch": 2.614368832540526e-05,
"grad_norm": 10.142881393432617,
"learning_rate": 4.995495495495496e-05,
"loss": 2.418,
"step": 20
},
{
"epoch": 2.745087274167552e-05,
"grad_norm": 5.666007041931152,
"learning_rate": 4.994994994994995e-05,
"loss": 2.4814,
"step": 21
},
{
"epoch": 2.8758057157945787e-05,
"grad_norm": 2.6253504753112793,
"learning_rate": 4.994494494494495e-05,
"loss": 3.0029,
"step": 22
},
{
"epoch": 3.006524157421605e-05,
"grad_norm": 1.9413844347000122,
"learning_rate": 4.9939939939939945e-05,
"loss": 2.7044,
"step": 23
},
{
"epoch": 3.137242599048631e-05,
"grad_norm": 2.3702781200408936,
"learning_rate": 4.993493493493494e-05,
"loss": 3.2683,
"step": 24
},
{
"epoch": 3.267961040675658e-05,
"grad_norm": 1.8796427249908447,
"learning_rate": 4.992992992992993e-05,
"loss": 2.6591,
"step": 25
},
{
"epoch": 3.3986794823026836e-05,
"grad_norm": 1.8859280347824097,
"learning_rate": 4.9924924924924924e-05,
"loss": 2.6567,
"step": 26
},
{
"epoch": 3.52939792392971e-05,
"grad_norm": 1.8021355867385864,
"learning_rate": 4.9919919919919924e-05,
"loss": 2.0643,
"step": 27
},
{
"epoch": 3.660116365556737e-05,
"grad_norm": 3.3398263454437256,
"learning_rate": 4.991491491491492e-05,
"loss": 1.9806,
"step": 28
},
{
"epoch": 3.7908348071837625e-05,
"grad_norm": 1.7556695938110352,
"learning_rate": 4.9909909909909917e-05,
"loss": 2.5847,
"step": 29
},
{
"epoch": 3.921553248810789e-05,
"grad_norm": 2.361534833908081,
"learning_rate": 4.990490490490491e-05,
"loss": 3.1982,
"step": 30
},
{
"epoch": 4.052271690437815e-05,
"grad_norm": 2.1116466522216797,
"learning_rate": 4.98998998998999e-05,
"loss": 2.1447,
"step": 31
},
{
"epoch": 4.1829901320648415e-05,
"grad_norm": 2.76759672164917,
"learning_rate": 4.9894894894894896e-05,
"loss": 2.2138,
"step": 32
},
{
"epoch": 4.313708573691868e-05,
"grad_norm": 1.744004726409912,
"learning_rate": 4.988988988988989e-05,
"loss": 2.6433,
"step": 33
},
{
"epoch": 4.444427015318894e-05,
"grad_norm": 1.946826457977295,
"learning_rate": 4.988488488488489e-05,
"loss": 2.814,
"step": 34
},
{
"epoch": 4.5751454569459205e-05,
"grad_norm": 1.7588024139404297,
"learning_rate": 4.987987987987988e-05,
"loss": 1.9231,
"step": 35
},
{
"epoch": 4.705863898572947e-05,
"grad_norm": 1.920486330986023,
"learning_rate": 4.9874874874874874e-05,
"loss": 2.3058,
"step": 36
},
{
"epoch": 4.836582340199973e-05,
"grad_norm": 2.172233819961548,
"learning_rate": 4.9869869869869874e-05,
"loss": 1.9368,
"step": 37
},
{
"epoch": 4.9673007818269995e-05,
"grad_norm": 1.9940109252929688,
"learning_rate": 4.986486486486487e-05,
"loss": 2.4671,
"step": 38
},
{
"epoch": 5.0980192234540254e-05,
"grad_norm": 2.1106722354888916,
"learning_rate": 4.985985985985987e-05,
"loss": 1.2266,
"step": 39
},
{
"epoch": 5.228737665081052e-05,
"grad_norm": 1.9987304210662842,
"learning_rate": 4.985485485485486e-05,
"loss": 2.5097,
"step": 40
},
{
"epoch": 5.3594561067080785e-05,
"grad_norm": 4.258189678192139,
"learning_rate": 4.984984984984985e-05,
"loss": 2.7884,
"step": 41
},
{
"epoch": 5.490174548335104e-05,
"grad_norm": 3.9918763637542725,
"learning_rate": 4.9844844844844846e-05,
"loss": 1.8792,
"step": 42
},
{
"epoch": 5.620892989962131e-05,
"grad_norm": 2.13405442237854,
"learning_rate": 4.983983983983984e-05,
"loss": 2.4141,
"step": 43
},
{
"epoch": 5.7516114315891574e-05,
"grad_norm": 2.369387626647949,
"learning_rate": 4.983483483483484e-05,
"loss": 2.4503,
"step": 44
},
{
"epoch": 5.882329873216183e-05,
"grad_norm": 2.4659979343414307,
"learning_rate": 4.982982982982983e-05,
"loss": 1.6704,
"step": 45
},
{
"epoch": 6.01304831484321e-05,
"grad_norm": 2.8049533367156982,
"learning_rate": 4.982482482482483e-05,
"loss": 2.2134,
"step": 46
},
{
"epoch": 6.143766756470236e-05,
"grad_norm": 3.652784824371338,
"learning_rate": 4.9819819819819824e-05,
"loss": 2.4586,
"step": 47
},
{
"epoch": 6.274485198097262e-05,
"grad_norm": 2.76751971244812,
"learning_rate": 4.981481481481482e-05,
"loss": 2.6559,
"step": 48
},
{
"epoch": 6.405203639724289e-05,
"grad_norm": 2.337362051010132,
"learning_rate": 4.980980980980981e-05,
"loss": 2.6549,
"step": 49
},
{
"epoch": 6.535922081351315e-05,
"grad_norm": 2.5444376468658447,
"learning_rate": 4.98048048048048e-05,
"loss": 2.041,
"step": 50
},
{
"epoch": 6.666640522978342e-05,
"grad_norm": 2.3144052028656006,
"learning_rate": 4.97997997997998e-05,
"loss": 2.4423,
"step": 51
},
{
"epoch": 6.797358964605367e-05,
"grad_norm": 2.647606372833252,
"learning_rate": 4.9794794794794796e-05,
"loss": 2.0042,
"step": 52
},
{
"epoch": 6.928077406232394e-05,
"grad_norm": 2.2047841548919678,
"learning_rate": 4.9789789789789795e-05,
"loss": 2.7494,
"step": 53
},
{
"epoch": 7.05879584785942e-05,
"grad_norm": 2.5686540603637695,
"learning_rate": 4.978478478478479e-05,
"loss": 2.6443,
"step": 54
},
{
"epoch": 7.189514289486447e-05,
"grad_norm": 2.634040594100952,
"learning_rate": 4.977977977977978e-05,
"loss": 2.3489,
"step": 55
},
{
"epoch": 7.320232731113473e-05,
"grad_norm": 3.5904624462127686,
"learning_rate": 4.977477477477478e-05,
"loss": 2.325,
"step": 56
},
{
"epoch": 7.450951172740499e-05,
"grad_norm": 2.7417197227478027,
"learning_rate": 4.976976976976977e-05,
"loss": 2.109,
"step": 57
},
{
"epoch": 7.581669614367525e-05,
"grad_norm": 3.90405011177063,
"learning_rate": 4.976476476476477e-05,
"loss": 1.9777,
"step": 58
},
{
"epoch": 7.712388055994552e-05,
"grad_norm": 3.0904104709625244,
"learning_rate": 4.975975975975976e-05,
"loss": 2.964,
"step": 59
},
{
"epoch": 7.843106497621578e-05,
"grad_norm": 6.283293724060059,
"learning_rate": 4.975475475475476e-05,
"loss": 2.7203,
"step": 60
},
{
"epoch": 7.973824939248605e-05,
"grad_norm": 15.482819557189941,
"learning_rate": 4.974974974974975e-05,
"loss": 2.0104,
"step": 61
},
{
"epoch": 8.10454338087563e-05,
"grad_norm": 5.350860118865967,
"learning_rate": 4.9744744744744746e-05,
"loss": 2.4738,
"step": 62
},
{
"epoch": 8.235261822502656e-05,
"grad_norm": 2.5188300609588623,
"learning_rate": 4.9739739739739745e-05,
"loss": 2.3628,
"step": 63
},
{
"epoch": 8.365980264129683e-05,
"grad_norm": 3.6089985370635986,
"learning_rate": 4.973473473473474e-05,
"loss": 2.0068,
"step": 64
},
{
"epoch": 8.49669870575671e-05,
"grad_norm": 4.064090251922607,
"learning_rate": 4.972972972972974e-05,
"loss": 1.9134,
"step": 65
},
{
"epoch": 8.627417147383736e-05,
"grad_norm": 4.058968544006348,
"learning_rate": 4.9724724724724724e-05,
"loss": 2.654,
"step": 66
},
{
"epoch": 8.758135589010763e-05,
"grad_norm": 4.4842023849487305,
"learning_rate": 4.971971971971972e-05,
"loss": 2.0954,
"step": 67
},
{
"epoch": 8.888854030637788e-05,
"grad_norm": 4.771529674530029,
"learning_rate": 4.971471471471472e-05,
"loss": 2.3041,
"step": 68
},
{
"epoch": 9.019572472264814e-05,
"grad_norm": 5.2841267585754395,
"learning_rate": 4.970970970970971e-05,
"loss": 1.6284,
"step": 69
},
{
"epoch": 9.150290913891841e-05,
"grad_norm": 4.454013824462891,
"learning_rate": 4.970470470470471e-05,
"loss": 2.4258,
"step": 70
},
{
"epoch": 9.281009355518868e-05,
"grad_norm": 4.686120986938477,
"learning_rate": 4.96996996996997e-05,
"loss": 1.7159,
"step": 71
},
{
"epoch": 9.411727797145894e-05,
"grad_norm": 4.039860248565674,
"learning_rate": 4.9694694694694696e-05,
"loss": 2.4571,
"step": 72
},
{
"epoch": 9.542446238772919e-05,
"grad_norm": 6.6892805099487305,
"learning_rate": 4.9689689689689696e-05,
"loss": 1.5466,
"step": 73
},
{
"epoch": 9.673164680399946e-05,
"grad_norm": 4.6148362159729,
"learning_rate": 4.968468468468468e-05,
"loss": 2.1115,
"step": 74
},
{
"epoch": 9.803883122026972e-05,
"grad_norm": 4.417403697967529,
"learning_rate": 4.967967967967968e-05,
"loss": 2.0975,
"step": 75
},
{
"epoch": 9.934601563653999e-05,
"grad_norm": 6.658927917480469,
"learning_rate": 4.9674674674674674e-05,
"loss": 1.4255,
"step": 76
},
{
"epoch": 0.00010065320005281026,
"grad_norm": 5.162082195281982,
"learning_rate": 4.9669669669669674e-05,
"loss": 2.1406,
"step": 77
},
{
"epoch": 0.00010196038446908051,
"grad_norm": 3.9566612243652344,
"learning_rate": 4.966466466466467e-05,
"loss": 2.6316,
"step": 78
},
{
"epoch": 0.00010326756888535077,
"grad_norm": 5.431234836578369,
"learning_rate": 4.965965965965966e-05,
"loss": 1.9831,
"step": 79
},
{
"epoch": 0.00010457475330162104,
"grad_norm": 7.695541858673096,
"learning_rate": 4.965465465465466e-05,
"loss": 1.3602,
"step": 80
},
{
"epoch": 0.0001058819377178913,
"grad_norm": 5.668361663818359,
"learning_rate": 4.964964964964965e-05,
"loss": 1.2717,
"step": 81
},
{
"epoch": 0.00010718912213416157,
"grad_norm": 6.2884202003479,
"learning_rate": 4.9644644644644646e-05,
"loss": 2.3994,
"step": 82
},
{
"epoch": 0.00010849630655043183,
"grad_norm": 5.497302055358887,
"learning_rate": 4.963963963963964e-05,
"loss": 2.0307,
"step": 83
},
{
"epoch": 0.00010980349096670209,
"grad_norm": 6.879200458526611,
"learning_rate": 4.963463463463464e-05,
"loss": 1.4382,
"step": 84
},
{
"epoch": 0.00011111067538297235,
"grad_norm": 5.275783538818359,
"learning_rate": 4.962962962962963e-05,
"loss": 1.8855,
"step": 85
},
{
"epoch": 0.00011241785979924262,
"grad_norm": 5.1828460693359375,
"learning_rate": 4.9624624624624625e-05,
"loss": 2.1653,
"step": 86
},
{
"epoch": 0.00011372504421551288,
"grad_norm": 4.819062232971191,
"learning_rate": 4.9619619619619624e-05,
"loss": 2.6426,
"step": 87
},
{
"epoch": 0.00011503222863178315,
"grad_norm": 4.319160461425781,
"learning_rate": 4.961461461461462e-05,
"loss": 2.6315,
"step": 88
},
{
"epoch": 0.0001163394130480534,
"grad_norm": 5.244660377502441,
"learning_rate": 4.960960960960962e-05,
"loss": 2.4654,
"step": 89
},
{
"epoch": 0.00011764659746432367,
"grad_norm": 4.433946132659912,
"learning_rate": 4.96046046046046e-05,
"loss": 2.1795,
"step": 90
},
{
"epoch": 0.00011895378188059393,
"grad_norm": 5.009888648986816,
"learning_rate": 4.95995995995996e-05,
"loss": 1.3696,
"step": 91
},
{
"epoch": 0.0001202609662968642,
"grad_norm": 4.033069133758545,
"learning_rate": 4.9594594594594596e-05,
"loss": 2.3803,
"step": 92
},
{
"epoch": 0.00012156815071313446,
"grad_norm": 4.739905834197998,
"learning_rate": 4.958958958958959e-05,
"loss": 1.7027,
"step": 93
},
{
"epoch": 0.00012287533512940471,
"grad_norm": 5.726681232452393,
"learning_rate": 4.958458458458459e-05,
"loss": 1.0941,
"step": 94
},
{
"epoch": 0.00012418251954567498,
"grad_norm": 4.398964881896973,
"learning_rate": 4.957957957957958e-05,
"loss": 1.8081,
"step": 95
},
{
"epoch": 0.00012548970396194525,
"grad_norm": 3.442424774169922,
"learning_rate": 4.957457457457458e-05,
"loss": 2.0943,
"step": 96
},
{
"epoch": 0.0001267968883782155,
"grad_norm": 3.42536997795105,
"learning_rate": 4.9569569569569574e-05,
"loss": 2.3138,
"step": 97
},
{
"epoch": 0.00012810407279448578,
"grad_norm": 3.0850987434387207,
"learning_rate": 4.956456456456457e-05,
"loss": 2.1422,
"step": 98
},
{
"epoch": 0.00012941125721075604,
"grad_norm": 3.327355146408081,
"learning_rate": 4.955955955955956e-05,
"loss": 1.607,
"step": 99
},
{
"epoch": 0.0001307184416270263,
"grad_norm": 3.0526223182678223,
"learning_rate": 4.955455455455455e-05,
"loss": 1.5995,
"step": 100
},
{
"epoch": 0.00013202562604329657,
"grad_norm": 3.1441359519958496,
"learning_rate": 4.954954954954955e-05,
"loss": 2.2129,
"step": 101
},
{
"epoch": 0.00013333281045956684,
"grad_norm": 2.209366798400879,
"learning_rate": 4.9544544544544546e-05,
"loss": 2.2904,
"step": 102
},
{
"epoch": 0.00013463999487583708,
"grad_norm": 2.4867329597473145,
"learning_rate": 4.953953953953954e-05,
"loss": 0.7869,
"step": 103
},
{
"epoch": 0.00013594717929210734,
"grad_norm": 2.2363412380218506,
"learning_rate": 4.953453453453454e-05,
"loss": 1.4709,
"step": 104
},
{
"epoch": 0.0001372543637083776,
"grad_norm": 2.6723251342773438,
"learning_rate": 4.952952952952953e-05,
"loss": 2.2871,
"step": 105
},
{
"epoch": 0.00013856154812464787,
"grad_norm": 1.9363102912902832,
"learning_rate": 4.952452452452453e-05,
"loss": 1.985,
"step": 106
},
{
"epoch": 0.00013986873254091814,
"grad_norm": 1.7824273109436035,
"learning_rate": 4.951951951951952e-05,
"loss": 1.189,
"step": 107
},
{
"epoch": 0.0001411759169571884,
"grad_norm": 1.8107385635375977,
"learning_rate": 4.951451451451452e-05,
"loss": 1.8943,
"step": 108
},
{
"epoch": 0.00014248310137345867,
"grad_norm": 2.283268451690674,
"learning_rate": 4.950950950950951e-05,
"loss": 1.7242,
"step": 109
},
{
"epoch": 0.00014379028578972894,
"grad_norm": 1.7572216987609863,
"learning_rate": 4.95045045045045e-05,
"loss": 1.8113,
"step": 110
},
{
"epoch": 0.0001450974702059992,
"grad_norm": 2.1798532009124756,
"learning_rate": 4.94994994994995e-05,
"loss": 1.9223,
"step": 111
},
{
"epoch": 0.00014640465462226947,
"grad_norm": 1.4888501167297363,
"learning_rate": 4.9494494494494496e-05,
"loss": 1.9747,
"step": 112
},
{
"epoch": 0.0001477118390385397,
"grad_norm": 1.8382623195648193,
"learning_rate": 4.9489489489489496e-05,
"loss": 2.199,
"step": 113
},
{
"epoch": 0.00014901902345480997,
"grad_norm": 2.766442060470581,
"learning_rate": 4.948448448448449e-05,
"loss": 1.433,
"step": 114
},
{
"epoch": 0.00015032620787108024,
"grad_norm": 1.3955047130584717,
"learning_rate": 4.947947947947948e-05,
"loss": 0.8611,
"step": 115
},
{
"epoch": 0.0001516333922873505,
"grad_norm": 1.3797332048416138,
"learning_rate": 4.9474474474474475e-05,
"loss": 1.8795,
"step": 116
},
{
"epoch": 0.00015294057670362077,
"grad_norm": 2.196755886077881,
"learning_rate": 4.946946946946947e-05,
"loss": 1.4086,
"step": 117
},
{
"epoch": 0.00015424776111989103,
"grad_norm": 1.6287676095962524,
"learning_rate": 4.946446446446447e-05,
"loss": 1.0982,
"step": 118
},
{
"epoch": 0.0001555549455361613,
"grad_norm": 2.1588571071624756,
"learning_rate": 4.945945945945946e-05,
"loss": 2.2011,
"step": 119
},
{
"epoch": 0.00015686212995243156,
"grad_norm": 1.66507887840271,
"learning_rate": 4.945445445445446e-05,
"loss": 1.6342,
"step": 120
},
{
"epoch": 0.00015816931436870183,
"grad_norm": 2.086681365966797,
"learning_rate": 4.944944944944945e-05,
"loss": 2.4012,
"step": 121
},
{
"epoch": 0.0001594764987849721,
"grad_norm": 2.2638509273529053,
"learning_rate": 4.9444444444444446e-05,
"loss": 0.2397,
"step": 122
},
{
"epoch": 0.00016078368320124236,
"grad_norm": 1.6955903768539429,
"learning_rate": 4.9439439439439446e-05,
"loss": 2.0733,
"step": 123
},
{
"epoch": 0.0001620908676175126,
"grad_norm": 1.7107908725738525,
"learning_rate": 4.943443443443443e-05,
"loss": 1.6862,
"step": 124
},
{
"epoch": 0.00016339805203378286,
"grad_norm": 2.117300033569336,
"learning_rate": 4.942942942942943e-05,
"loss": 1.6494,
"step": 125
},
{
"epoch": 0.00016470523645005313,
"grad_norm": 1.9111508131027222,
"learning_rate": 4.9424424424424425e-05,
"loss": 0.932,
"step": 126
},
{
"epoch": 0.0001660124208663234,
"grad_norm": 1.934354305267334,
"learning_rate": 4.9419419419419425e-05,
"loss": 1.6937,
"step": 127
},
{
"epoch": 0.00016731960528259366,
"grad_norm": 2.0459141731262207,
"learning_rate": 4.941441441441442e-05,
"loss": 2.1459,
"step": 128
},
{
"epoch": 0.00016862678969886393,
"grad_norm": 1.9724762439727783,
"learning_rate": 4.940940940940941e-05,
"loss": 2.2388,
"step": 129
},
{
"epoch": 0.0001699339741151342,
"grad_norm": 1.900592565536499,
"learning_rate": 4.940440440440441e-05,
"loss": 2.3512,
"step": 130
},
{
"epoch": 0.00017124115853140446,
"grad_norm": 1.978319764137268,
"learning_rate": 4.93993993993994e-05,
"loss": 1.7366,
"step": 131
},
{
"epoch": 0.00017254834294767472,
"grad_norm": 1.5513819456100464,
"learning_rate": 4.9394394394394396e-05,
"loss": 1.0159,
"step": 132
},
{
"epoch": 0.000173855527363945,
"grad_norm": 2.037893772125244,
"learning_rate": 4.938938938938939e-05,
"loss": 1.4329,
"step": 133
},
{
"epoch": 0.00017516271178021525,
"grad_norm": 1.7013237476348877,
"learning_rate": 4.938438438438439e-05,
"loss": 0.7663,
"step": 134
},
{
"epoch": 0.0001764698961964855,
"grad_norm": 1.8744912147521973,
"learning_rate": 4.937937937937938e-05,
"loss": 1.9305,
"step": 135
},
{
"epoch": 0.00017777708061275576,
"grad_norm": 1.606590747833252,
"learning_rate": 4.9374374374374375e-05,
"loss": 2.1672,
"step": 136
},
{
"epoch": 0.00017908426502902602,
"grad_norm": 2.2351033687591553,
"learning_rate": 4.9369369369369375e-05,
"loss": 1.0281,
"step": 137
},
{
"epoch": 0.0001803914494452963,
"grad_norm": 2.2086477279663086,
"learning_rate": 4.936436436436437e-05,
"loss": 1.6579,
"step": 138
},
{
"epoch": 0.00018169863386156655,
"grad_norm": 1.6347638368606567,
"learning_rate": 4.935935935935936e-05,
"loss": 1.7343,
"step": 139
},
{
"epoch": 0.00018300581827783682,
"grad_norm": 1.6986087560653687,
"learning_rate": 4.9354354354354354e-05,
"loss": 2.2753,
"step": 140
},
{
"epoch": 0.00018431300269410709,
"grad_norm": 1.8067278861999512,
"learning_rate": 4.9349349349349347e-05,
"loss": 0.8229,
"step": 141
},
{
"epoch": 0.00018562018711037735,
"grad_norm": 1.9648691415786743,
"learning_rate": 4.9344344344344346e-05,
"loss": 1.8146,
"step": 142
},
{
"epoch": 0.00018692737152664762,
"grad_norm": 1.3609802722930908,
"learning_rate": 4.933933933933934e-05,
"loss": 1.4422,
"step": 143
},
{
"epoch": 0.00018823455594291788,
"grad_norm": 2.002868890762329,
"learning_rate": 4.933433433433434e-05,
"loss": 0.0777,
"step": 144
},
{
"epoch": 0.00018954174035918812,
"grad_norm": 1.5004322528839111,
"learning_rate": 4.932932932932933e-05,
"loss": 1.5963,
"step": 145
},
{
"epoch": 0.00019084892477545839,
"grad_norm": 1.5936583280563354,
"learning_rate": 4.9324324324324325e-05,
"loss": 0.765,
"step": 146
},
{
"epoch": 0.00019215610919172865,
"grad_norm": 1.2652479410171509,
"learning_rate": 4.9319319319319325e-05,
"loss": 0.9597,
"step": 147
},
{
"epoch": 0.00019346329360799892,
"grad_norm": 1.7863197326660156,
"learning_rate": 4.931431431431432e-05,
"loss": 0.9391,
"step": 148
},
{
"epoch": 0.00019477047802426918,
"grad_norm": 1.9188827276229858,
"learning_rate": 4.930930930930931e-05,
"loss": 2.2355,
"step": 149
},
{
"epoch": 0.00019607766244053945,
"grad_norm": 1.5928704738616943,
"learning_rate": 4.9304304304304304e-05,
"loss": 1.1829,
"step": 150
},
{
"epoch": 0.0001973848468568097,
"grad_norm": 2.145599603652954,
"learning_rate": 4.92992992992993e-05,
"loss": 1.4609,
"step": 151
},
{
"epoch": 0.00019869203127307998,
"grad_norm": 1.4171509742736816,
"learning_rate": 4.9294294294294296e-05,
"loss": 1.9975,
"step": 152
},
{
"epoch": 0.00019999921568935024,
"grad_norm": 1.3210326433181763,
"learning_rate": 4.928928928928929e-05,
"loss": 1.0295,
"step": 153
},
{
"epoch": 0.0002013064001056205,
"grad_norm": 2.013183832168579,
"learning_rate": 4.928428428428429e-05,
"loss": 1.5366,
"step": 154
},
{
"epoch": 0.00020261358452189078,
"grad_norm": 1.594954490661621,
"learning_rate": 4.927927927927928e-05,
"loss": 1.0073,
"step": 155
},
{
"epoch": 0.00020392076893816101,
"grad_norm": 2.206082344055176,
"learning_rate": 4.927427427427428e-05,
"loss": 0.9262,
"step": 156
},
{
"epoch": 0.00020522795335443128,
"grad_norm": 2.40515398979187,
"learning_rate": 4.926926926926927e-05,
"loss": 1.4697,
"step": 157
},
{
"epoch": 0.00020653513777070155,
"grad_norm": 1.8254011869430542,
"learning_rate": 4.926426426426427e-05,
"loss": 1.8146,
"step": 158
},
{
"epoch": 0.0002078423221869718,
"grad_norm": 2.0213472843170166,
"learning_rate": 4.925925925925926e-05,
"loss": 3.1616,
"step": 159
},
{
"epoch": 0.00020914950660324208,
"grad_norm": 1.8707919120788574,
"learning_rate": 4.9254254254254254e-05,
"loss": 1.6477,
"step": 160
},
{
"epoch": 0.00021045669101951234,
"grad_norm": 1.9849202632904053,
"learning_rate": 4.9249249249249253e-05,
"loss": 1.5499,
"step": 161
},
{
"epoch": 0.0002117638754357826,
"grad_norm": 1.8460415601730347,
"learning_rate": 4.9244244244244246e-05,
"loss": 1.4391,
"step": 162
},
{
"epoch": 0.00021307105985205287,
"grad_norm": 1.8488810062408447,
"learning_rate": 4.9239239239239246e-05,
"loss": 1.8629,
"step": 163
},
{
"epoch": 0.00021437824426832314,
"grad_norm": 1.8770229816436768,
"learning_rate": 4.923423423423424e-05,
"loss": 2.3753,
"step": 164
},
{
"epoch": 0.0002156854286845934,
"grad_norm": 1.472601056098938,
"learning_rate": 4.922922922922923e-05,
"loss": 0.7564,
"step": 165
},
{
"epoch": 0.00021699261310086367,
"grad_norm": 2.860130548477173,
"learning_rate": 4.9224224224224225e-05,
"loss": 0.7373,
"step": 166
},
{
"epoch": 0.0002182997975171339,
"grad_norm": 1.886745572090149,
"learning_rate": 4.921921921921922e-05,
"loss": 1.0192,
"step": 167
},
{
"epoch": 0.00021960698193340417,
"grad_norm": 1.2749582529067993,
"learning_rate": 4.921421421421422e-05,
"loss": 0.4704,
"step": 168
},
{
"epoch": 0.00022091416634967444,
"grad_norm": 1.645389199256897,
"learning_rate": 4.920920920920921e-05,
"loss": 1.8075,
"step": 169
},
{
"epoch": 0.0002222213507659447,
"grad_norm": 1.7299386262893677,
"learning_rate": 4.920420420420421e-05,
"loss": 2.4038,
"step": 170
},
{
"epoch": 0.00022352853518221497,
"grad_norm": 1.6160026788711548,
"learning_rate": 4.9199199199199204e-05,
"loss": 1.1931,
"step": 171
},
{
"epoch": 0.00022483571959848524,
"grad_norm": 1.9258168935775757,
"learning_rate": 4.9194194194194196e-05,
"loss": 2.2323,
"step": 172
},
{
"epoch": 0.0002261429040147555,
"grad_norm": 2.079798936843872,
"learning_rate": 4.9189189189189196e-05,
"loss": 1.9468,
"step": 173
},
{
"epoch": 0.00022745008843102577,
"grad_norm": 2.0454630851745605,
"learning_rate": 4.918418418418418e-05,
"loss": 1.7755,
"step": 174
},
{
"epoch": 0.00022875727284729603,
"grad_norm": 1.9173661470413208,
"learning_rate": 4.917917917917918e-05,
"loss": 1.0856,
"step": 175
},
{
"epoch": 0.0002300644572635663,
"grad_norm": 1.6535844802856445,
"learning_rate": 4.9174174174174175e-05,
"loss": 1.6214,
"step": 176
},
{
"epoch": 0.00023137164167983656,
"grad_norm": 2.402019739151001,
"learning_rate": 4.916916916916917e-05,
"loss": 1.7331,
"step": 177
},
{
"epoch": 0.0002326788260961068,
"grad_norm": 1.7244210243225098,
"learning_rate": 4.916416416416417e-05,
"loss": 1.4885,
"step": 178
},
{
"epoch": 0.00023398601051237707,
"grad_norm": 2.702523946762085,
"learning_rate": 4.915915915915916e-05,
"loss": 2.0403,
"step": 179
},
{
"epoch": 0.00023529319492864733,
"grad_norm": 2.1688618659973145,
"learning_rate": 4.915415415415416e-05,
"loss": 0.986,
"step": 180
},
{
"epoch": 0.0002366003793449176,
"grad_norm": 2.3919677734375,
"learning_rate": 4.9149149149149154e-05,
"loss": 1.3291,
"step": 181
},
{
"epoch": 0.00023790756376118786,
"grad_norm": 1.8715122938156128,
"learning_rate": 4.9144144144144147e-05,
"loss": 1.7018,
"step": 182
},
{
"epoch": 0.00023921474817745813,
"grad_norm": 2.152589797973633,
"learning_rate": 4.913913913913914e-05,
"loss": 1.8486,
"step": 183
},
{
"epoch": 0.0002405219325937284,
"grad_norm": 1.5322504043579102,
"learning_rate": 4.913413413413413e-05,
"loss": 1.6896,
"step": 184
},
{
"epoch": 0.00024182911700999866,
"grad_norm": 1.6208884716033936,
"learning_rate": 4.912912912912913e-05,
"loss": 1.9508,
"step": 185
},
{
"epoch": 0.00024313630142626893,
"grad_norm": 1.5740910768508911,
"learning_rate": 4.9124124124124125e-05,
"loss": 1.1913,
"step": 186
},
{
"epoch": 0.00024444348584253916,
"grad_norm": 2.652970790863037,
"learning_rate": 4.9119119119119125e-05,
"loss": 1.5786,
"step": 187
},
{
"epoch": 0.00024575067025880943,
"grad_norm": 1.5115643739700317,
"learning_rate": 4.911411411411412e-05,
"loss": 2.3382,
"step": 188
},
{
"epoch": 0.0002470578546750797,
"grad_norm": 2.009474754333496,
"learning_rate": 4.910910910910911e-05,
"loss": 2.2713,
"step": 189
},
{
"epoch": 0.00024836503909134996,
"grad_norm": 3.25408673286438,
"learning_rate": 4.9104104104104104e-05,
"loss": 1.767,
"step": 190
},
{
"epoch": 0.0002496722235076202,
"grad_norm": 1.6666245460510254,
"learning_rate": 4.90990990990991e-05,
"loss": 1.1057,
"step": 191
},
{
"epoch": 0.0002509794079238905,
"grad_norm": 2.5899922847747803,
"learning_rate": 4.90940940940941e-05,
"loss": 1.7851,
"step": 192
},
{
"epoch": 0.00025228659234016076,
"grad_norm": 1.9656989574432373,
"learning_rate": 4.908908908908909e-05,
"loss": 2.5767,
"step": 193
},
{
"epoch": 0.000253593776756431,
"grad_norm": 1.7756541967391968,
"learning_rate": 4.908408408408409e-05,
"loss": 1.7306,
"step": 194
},
{
"epoch": 0.0002549009611727013,
"grad_norm": 1.9504581689834595,
"learning_rate": 4.907907907907908e-05,
"loss": 1.5729,
"step": 195
},
{
"epoch": 0.00025620814558897155,
"grad_norm": 0.957115888595581,
"learning_rate": 4.9074074074074075e-05,
"loss": 0.277,
"step": 196
},
{
"epoch": 0.0002575153300052418,
"grad_norm": 1.4767944812774658,
"learning_rate": 4.9069069069069075e-05,
"loss": 1.5084,
"step": 197
},
{
"epoch": 0.0002588225144215121,
"grad_norm": 1.2738802433013916,
"learning_rate": 4.906406406406407e-05,
"loss": 0.9465,
"step": 198
},
{
"epoch": 0.00026012969883778235,
"grad_norm": 1.6053591966629028,
"learning_rate": 4.905905905905906e-05,
"loss": 1.3578,
"step": 199
},
{
"epoch": 0.0002614368832540526,
"grad_norm": 1.664510726928711,
"learning_rate": 4.9054054054054054e-05,
"loss": 2.4543,
"step": 200
},
{
"epoch": 0.0002627440676703229,
"grad_norm": 1.6944023370742798,
"learning_rate": 4.9049049049049054e-05,
"loss": 1.3043,
"step": 201
},
{
"epoch": 0.00026405125208659315,
"grad_norm": 1.387467861175537,
"learning_rate": 4.904404404404405e-05,
"loss": 1.4816,
"step": 202
},
{
"epoch": 0.0002653584365028634,
"grad_norm": 1.925346851348877,
"learning_rate": 4.903903903903904e-05,
"loss": 1.6951,
"step": 203
},
{
"epoch": 0.0002666656209191337,
"grad_norm": 1.4878056049346924,
"learning_rate": 4.903403403403404e-05,
"loss": 1.7805,
"step": 204
},
{
"epoch": 0.0002679728053354039,
"grad_norm": 2.018648862838745,
"learning_rate": 4.902902902902903e-05,
"loss": 1.4652,
"step": 205
},
{
"epoch": 0.00026927998975167415,
"grad_norm": 1.8136177062988281,
"learning_rate": 4.902402402402403e-05,
"loss": 1.5613,
"step": 206
},
{
"epoch": 0.0002705871741679444,
"grad_norm": 2.323335886001587,
"learning_rate": 4.901901901901902e-05,
"loss": 1.6988,
"step": 207
},
{
"epoch": 0.0002718943585842147,
"grad_norm": 1.7906171083450317,
"learning_rate": 4.901401401401402e-05,
"loss": 1.5934,
"step": 208
},
{
"epoch": 0.00027320154300048495,
"grad_norm": 1.5515409708023071,
"learning_rate": 4.900900900900901e-05,
"loss": 1.5874,
"step": 209
},
{
"epoch": 0.0002745087274167552,
"grad_norm": 1.9337890148162842,
"learning_rate": 4.9004004004004004e-05,
"loss": 1.7147,
"step": 210
},
{
"epoch": 0.0002758159118330255,
"grad_norm": 2.102532148361206,
"learning_rate": 4.8998998998999004e-05,
"loss": 1.7885,
"step": 211
},
{
"epoch": 0.00027712309624929575,
"grad_norm": 1.6877130270004272,
"learning_rate": 4.8993993993994e-05,
"loss": 1.5048,
"step": 212
},
{
"epoch": 0.000278430280665566,
"grad_norm": 1.8921681642532349,
"learning_rate": 4.898898898898899e-05,
"loss": 1.8371,
"step": 213
},
{
"epoch": 0.0002797374650818363,
"grad_norm": 1.810315728187561,
"learning_rate": 4.898398398398399e-05,
"loss": 1.7009,
"step": 214
},
{
"epoch": 0.00028104464949810654,
"grad_norm": 2.759514331817627,
"learning_rate": 4.8978978978978976e-05,
"loss": 3.1437,
"step": 215
},
{
"epoch": 0.0002823518339143768,
"grad_norm": 2.4991655349731445,
"learning_rate": 4.8973973973973975e-05,
"loss": 2.3588,
"step": 216
},
{
"epoch": 0.0002836590183306471,
"grad_norm": 2.547462224960327,
"learning_rate": 4.896896896896897e-05,
"loss": 1.5821,
"step": 217
},
{
"epoch": 0.00028496620274691734,
"grad_norm": 1.3106446266174316,
"learning_rate": 4.896396396396397e-05,
"loss": 1.4013,
"step": 218
},
{
"epoch": 0.0002862733871631876,
"grad_norm": 1.6845613718032837,
"learning_rate": 4.895895895895896e-05,
"loss": 1.5136,
"step": 219
},
{
"epoch": 0.00028758057157945787,
"grad_norm": 1.659991979598999,
"learning_rate": 4.8953953953953954e-05,
"loss": 2.2554,
"step": 220
},
{
"epoch": 0.00028888775599572814,
"grad_norm": 1.3590683937072754,
"learning_rate": 4.8948948948948954e-05,
"loss": 0.7868,
"step": 221
},
{
"epoch": 0.0002901949404119984,
"grad_norm": 1.8435338735580444,
"learning_rate": 4.894394394394395e-05,
"loss": 1.8352,
"step": 222
},
{
"epoch": 0.00029150212482826867,
"grad_norm": 1.6166387796401978,
"learning_rate": 4.893893893893894e-05,
"loss": 1.3868,
"step": 223
},
{
"epoch": 0.00029280930924453893,
"grad_norm": 2.057384967803955,
"learning_rate": 4.893393393393393e-05,
"loss": 1.898,
"step": 224
},
{
"epoch": 0.0002941164936608092,
"grad_norm": 1.695014238357544,
"learning_rate": 4.892892892892893e-05,
"loss": 2.2495,
"step": 225
},
{
"epoch": 0.0002954236780770794,
"grad_norm": 2.4186930656433105,
"learning_rate": 4.8923923923923926e-05,
"loss": 2.1074,
"step": 226
},
{
"epoch": 0.0002967308624933497,
"grad_norm": 1.5932652950286865,
"learning_rate": 4.891891891891892e-05,
"loss": 1.1245,
"step": 227
},
{
"epoch": 0.00029803804690961994,
"grad_norm": 1.7992708683013916,
"learning_rate": 4.891391391391392e-05,
"loss": 1.6981,
"step": 228
},
{
"epoch": 0.0002993452313258902,
"grad_norm": 1.6720223426818848,
"learning_rate": 4.890890890890891e-05,
"loss": 1.499,
"step": 229
},
{
"epoch": 0.00030065241574216047,
"grad_norm": 1.7174367904663086,
"learning_rate": 4.890390390390391e-05,
"loss": 1.4546,
"step": 230
},
{
"epoch": 0.00030195960015843074,
"grad_norm": 2.5477240085601807,
"learning_rate": 4.8898898898898904e-05,
"loss": 1.0959,
"step": 231
},
{
"epoch": 0.000303266784574701,
"grad_norm": 1.551347017288208,
"learning_rate": 4.88938938938939e-05,
"loss": 1.2192,
"step": 232
},
{
"epoch": 0.00030457396899097127,
"grad_norm": 4.913365364074707,
"learning_rate": 4.888888888888889e-05,
"loss": 1.4216,
"step": 233
},
{
"epoch": 0.00030588115340724153,
"grad_norm": 2.8688876628875732,
"learning_rate": 4.888388388388388e-05,
"loss": 1.2954,
"step": 234
},
{
"epoch": 0.0003071883378235118,
"grad_norm": 1.8472936153411865,
"learning_rate": 4.887887887887888e-05,
"loss": 1.6743,
"step": 235
},
{
"epoch": 0.00030849552223978207,
"grad_norm": 2.6009321212768555,
"learning_rate": 4.8873873873873876e-05,
"loss": 0.8994,
"step": 236
},
{
"epoch": 0.00030980270665605233,
"grad_norm": 3.2686116695404053,
"learning_rate": 4.8868868868868875e-05,
"loss": 1.89,
"step": 237
},
{
"epoch": 0.0003111098910723226,
"grad_norm": 2.842278003692627,
"learning_rate": 4.886386386386387e-05,
"loss": 1.3352,
"step": 238
},
{
"epoch": 0.00031241707548859286,
"grad_norm": 1.6975959539413452,
"learning_rate": 4.885885885885886e-05,
"loss": 2.5637,
"step": 239
},
{
"epoch": 0.00031372425990486313,
"grad_norm": 1.6205472946166992,
"learning_rate": 4.8853853853853854e-05,
"loss": 1.7702,
"step": 240
},
{
"epoch": 0.0003150314443211334,
"grad_norm": 1.9137065410614014,
"learning_rate": 4.884884884884885e-05,
"loss": 2.128,
"step": 241
},
{
"epoch": 0.00031633862873740366,
"grad_norm": 1.6640243530273438,
"learning_rate": 4.884384384384385e-05,
"loss": 2.798,
"step": 242
},
{
"epoch": 0.0003176458131536739,
"grad_norm": 1.5545439720153809,
"learning_rate": 4.883883883883884e-05,
"loss": 1.9383,
"step": 243
},
{
"epoch": 0.0003189529975699442,
"grad_norm": 2.110470771789551,
"learning_rate": 4.883383383383384e-05,
"loss": 2.7099,
"step": 244
},
{
"epoch": 0.00032026018198621446,
"grad_norm": 1.3277205228805542,
"learning_rate": 4.882882882882883e-05,
"loss": 1.6492,
"step": 245
},
{
"epoch": 0.0003215673664024847,
"grad_norm": 1.317232608795166,
"learning_rate": 4.8823823823823826e-05,
"loss": 1.3825,
"step": 246
},
{
"epoch": 0.00032287455081875493,
"grad_norm": 1.5808682441711426,
"learning_rate": 4.8818818818818825e-05,
"loss": 1.7761,
"step": 247
},
{
"epoch": 0.0003241817352350252,
"grad_norm": 1.576471209526062,
"learning_rate": 4.881381381381381e-05,
"loss": 1.6476,
"step": 248
},
{
"epoch": 0.00032548891965129546,
"grad_norm": 1.5742518901824951,
"learning_rate": 4.880880880880881e-05,
"loss": 1.3871,
"step": 249
},
{
"epoch": 0.00032679610406756573,
"grad_norm": 1.22970449924469,
"learning_rate": 4.8803803803803804e-05,
"loss": 0.9665,
"step": 250
},
{
"epoch": 0.000328103288483836,
"grad_norm": 1.9720549583435059,
"learning_rate": 4.87987987987988e-05,
"loss": 1.5671,
"step": 251
},
{
"epoch": 0.00032941047290010626,
"grad_norm": 1.6531200408935547,
"learning_rate": 4.87937937937938e-05,
"loss": 2.0848,
"step": 252
},
{
"epoch": 0.0003307176573163765,
"grad_norm": 0.06442175805568695,
"learning_rate": 4.878878878878879e-05,
"loss": 0.0011,
"step": 253
},
{
"epoch": 0.0003320248417326468,
"grad_norm": 1.4050544500350952,
"learning_rate": 4.878378378378379e-05,
"loss": 0.779,
"step": 254
},
{
"epoch": 0.00033333202614891706,
"grad_norm": 1.4547470808029175,
"learning_rate": 4.877877877877878e-05,
"loss": 1.2172,
"step": 255
},
{
"epoch": 0.0003346392105651873,
"grad_norm": 1.7619500160217285,
"learning_rate": 4.8773773773773776e-05,
"loss": 2.2073,
"step": 256
},
{
"epoch": 0.0003359463949814576,
"grad_norm": 1.3657172918319702,
"learning_rate": 4.876876876876877e-05,
"loss": 1.0345,
"step": 257
},
{
"epoch": 0.00033725357939772785,
"grad_norm": 1.9810845851898193,
"learning_rate": 4.876376376376376e-05,
"loss": 2.0455,
"step": 258
},
{
"epoch": 0.0003385607638139981,
"grad_norm": 1.3832188844680786,
"learning_rate": 4.875875875875876e-05,
"loss": 1.9951,
"step": 259
},
{
"epoch": 0.0003398679482302684,
"grad_norm": 1.5447360277175903,
"learning_rate": 4.8753753753753754e-05,
"loss": 2.2018,
"step": 260
},
{
"epoch": 0.00034117513264653865,
"grad_norm": 1.6113301515579224,
"learning_rate": 4.8748748748748754e-05,
"loss": 1.9734,
"step": 261
},
{
"epoch": 0.0003424823170628089,
"grad_norm": 1.8055421113967896,
"learning_rate": 4.874374374374375e-05,
"loss": 2.1845,
"step": 262
},
{
"epoch": 0.0003437895014790792,
"grad_norm": 1.568477749824524,
"learning_rate": 4.873873873873874e-05,
"loss": 1.8695,
"step": 263
},
{
"epoch": 0.00034509668589534945,
"grad_norm": 1.6943508386611938,
"learning_rate": 4.873373373373374e-05,
"loss": 1.8148,
"step": 264
},
{
"epoch": 0.0003464038703116197,
"grad_norm": 1.0157723426818848,
"learning_rate": 4.8728728728728726e-05,
"loss": 0.6497,
"step": 265
},
{
"epoch": 0.00034771105472789,
"grad_norm": 1.506629228591919,
"learning_rate": 4.8723723723723726e-05,
"loss": 2.0373,
"step": 266
},
{
"epoch": 0.00034901823914416024,
"grad_norm": 1.9490463733673096,
"learning_rate": 4.871871871871872e-05,
"loss": 1.37,
"step": 267
},
{
"epoch": 0.0003503254235604305,
"grad_norm": 1.2232673168182373,
"learning_rate": 4.871371371371372e-05,
"loss": 0.8621,
"step": 268
},
{
"epoch": 0.0003516326079767007,
"grad_norm": 1.1460094451904297,
"learning_rate": 4.870870870870871e-05,
"loss": 0.6316,
"step": 269
},
{
"epoch": 0.000352939792392971,
"grad_norm": 1.7706618309020996,
"learning_rate": 4.8703703703703704e-05,
"loss": 1.8774,
"step": 270
},
{
"epoch": 0.00035424697680924125,
"grad_norm": 1.5327508449554443,
"learning_rate": 4.8698698698698704e-05,
"loss": 1.669,
"step": 271
},
{
"epoch": 0.0003555541612255115,
"grad_norm": 2.183750629425049,
"learning_rate": 4.86936936936937e-05,
"loss": 1.7982,
"step": 272
},
{
"epoch": 0.0003568613456417818,
"grad_norm": 1.3268592357635498,
"learning_rate": 4.868868868868869e-05,
"loss": 1.6348,
"step": 273
},
{
"epoch": 0.00035816853005805205,
"grad_norm": 1.3289145231246948,
"learning_rate": 4.868368368368368e-05,
"loss": 1.0636,
"step": 274
},
{
"epoch": 0.0003594757144743223,
"grad_norm": 1.543196201324463,
"learning_rate": 4.867867867867868e-05,
"loss": 1.6589,
"step": 275
},
{
"epoch": 0.0003607828988905926,
"grad_norm": 1.8786145448684692,
"learning_rate": 4.8673673673673676e-05,
"loss": 2.2636,
"step": 276
},
{
"epoch": 0.00036209008330686284,
"grad_norm": 1.9315189123153687,
"learning_rate": 4.866866866866867e-05,
"loss": 1.4786,
"step": 277
},
{
"epoch": 0.0003633972677231331,
"grad_norm": 3.0136470794677734,
"learning_rate": 4.866366366366367e-05,
"loss": 1.1689,
"step": 278
},
{
"epoch": 0.0003647044521394034,
"grad_norm": 1.5813599824905396,
"learning_rate": 4.865865865865866e-05,
"loss": 1.7059,
"step": 279
},
{
"epoch": 0.00036601163655567364,
"grad_norm": 1.8223422765731812,
"learning_rate": 4.865365365365366e-05,
"loss": 1.2049,
"step": 280
},
{
"epoch": 0.0003673188209719439,
"grad_norm": 1.1571829319000244,
"learning_rate": 4.8648648648648654e-05,
"loss": 0.9757,
"step": 281
},
{
"epoch": 0.00036862600538821417,
"grad_norm": 2.1226696968078613,
"learning_rate": 4.864364364364364e-05,
"loss": 2.0413,
"step": 282
},
{
"epoch": 0.00036993318980448444,
"grad_norm": 1.8020603656768799,
"learning_rate": 4.863863863863864e-05,
"loss": 1.5693,
"step": 283
},
{
"epoch": 0.0003712403742207547,
"grad_norm": 1.799559235572815,
"learning_rate": 4.863363363363363e-05,
"loss": 2.1869,
"step": 284
},
{
"epoch": 0.00037254755863702497,
"grad_norm": 2.161092519760132,
"learning_rate": 4.862862862862863e-05,
"loss": 1.903,
"step": 285
},
{
"epoch": 0.00037385474305329523,
"grad_norm": 1.8967934846878052,
"learning_rate": 4.8623623623623626e-05,
"loss": 1.9437,
"step": 286
},
{
"epoch": 0.0003751619274695655,
"grad_norm": 1.6356936693191528,
"learning_rate": 4.861861861861862e-05,
"loss": 1.8212,
"step": 287
},
{
"epoch": 0.00037646911188583576,
"grad_norm": 2.0186562538146973,
"learning_rate": 4.861361361361362e-05,
"loss": 1.9657,
"step": 288
},
{
"epoch": 0.00037777629630210603,
"grad_norm": 2.0139613151550293,
"learning_rate": 4.860860860860861e-05,
"loss": 1.3713,
"step": 289
},
{
"epoch": 0.00037908348071837624,
"grad_norm": 1.9778642654418945,
"learning_rate": 4.8603603603603605e-05,
"loss": 2.1371,
"step": 290
},
{
"epoch": 0.0003803906651346465,
"grad_norm": 2.013936996459961,
"learning_rate": 4.85985985985986e-05,
"loss": 2.1338,
"step": 291
},
{
"epoch": 0.00038169784955091677,
"grad_norm": 1.480167031288147,
"learning_rate": 4.85935935935936e-05,
"loss": 1.4518,
"step": 292
},
{
"epoch": 0.00038300503396718704,
"grad_norm": 1.0584605932235718,
"learning_rate": 4.858858858858859e-05,
"loss": 0.9891,
"step": 293
},
{
"epoch": 0.0003843122183834573,
"grad_norm": 1.5172793865203857,
"learning_rate": 4.858358358358358e-05,
"loss": 1.1713,
"step": 294
},
{
"epoch": 0.00038561940279972757,
"grad_norm": 1.6944522857666016,
"learning_rate": 4.857857857857858e-05,
"loss": 2.72,
"step": 295
},
{
"epoch": 0.00038692658721599783,
"grad_norm": 1.6618627309799194,
"learning_rate": 4.8573573573573576e-05,
"loss": 1.3664,
"step": 296
},
{
"epoch": 0.0003882337716322681,
"grad_norm": 1.5239827632904053,
"learning_rate": 4.8568568568568576e-05,
"loss": 1.8734,
"step": 297
},
{
"epoch": 0.00038954095604853837,
"grad_norm": 1.4096382856369019,
"learning_rate": 4.856356356356356e-05,
"loss": 1.4521,
"step": 298
},
{
"epoch": 0.00039084814046480863,
"grad_norm": 1.4097378253936768,
"learning_rate": 4.855855855855856e-05,
"loss": 1.4607,
"step": 299
},
{
"epoch": 0.0003921553248810789,
"grad_norm": 1.5461453199386597,
"learning_rate": 4.8553553553553555e-05,
"loss": 1.744,
"step": 300
},
{
"epoch": 0.00039346250929734916,
"grad_norm": 1.9085536003112793,
"learning_rate": 4.854854854854855e-05,
"loss": 2.6346,
"step": 301
},
{
"epoch": 0.0003947696937136194,
"grad_norm": 1.5974091291427612,
"learning_rate": 4.854354354354355e-05,
"loss": 2.779,
"step": 302
},
{
"epoch": 0.0003960768781298897,
"grad_norm": 1.4183374643325806,
"learning_rate": 4.853853853853854e-05,
"loss": 1.1477,
"step": 303
},
{
"epoch": 0.00039738406254615996,
"grad_norm": 1.7596776485443115,
"learning_rate": 4.853353353353354e-05,
"loss": 1.3722,
"step": 304
},
{
"epoch": 0.0003986912469624302,
"grad_norm": 2.0469233989715576,
"learning_rate": 4.852852852852853e-05,
"loss": 2.532,
"step": 305
},
{
"epoch": 0.0003999984313787005,
"grad_norm": 1.7058496475219727,
"learning_rate": 4.8523523523523526e-05,
"loss": 1.8808,
"step": 306
},
{
"epoch": 0.00040130561579497075,
"grad_norm": 2.928122043609619,
"learning_rate": 4.851851851851852e-05,
"loss": 1.4133,
"step": 307
},
{
"epoch": 0.000402612800211241,
"grad_norm": 2.3317997455596924,
"learning_rate": 4.851351351351351e-05,
"loss": 1.9976,
"step": 308
},
{
"epoch": 0.0004039199846275113,
"grad_norm": 1.520419716835022,
"learning_rate": 4.850850850850851e-05,
"loss": 1.7102,
"step": 309
},
{
"epoch": 0.00040522716904378155,
"grad_norm": 1.5943193435668945,
"learning_rate": 4.8503503503503505e-05,
"loss": 1.9234,
"step": 310
},
{
"epoch": 0.0004065343534600518,
"grad_norm": 0.8694736361503601,
"learning_rate": 4.8498498498498504e-05,
"loss": 0.4603,
"step": 311
},
{
"epoch": 0.00040784153787632203,
"grad_norm": 1.2508904933929443,
"learning_rate": 4.84934934934935e-05,
"loss": 0.7948,
"step": 312
},
{
"epoch": 0.0004091487222925923,
"grad_norm": 2.01047420501709,
"learning_rate": 4.848848848848849e-05,
"loss": 1.8304,
"step": 313
},
{
"epoch": 0.00041045590670886256,
"grad_norm": 2.1941375732421875,
"learning_rate": 4.848348348348349e-05,
"loss": 1.7869,
"step": 314
},
{
"epoch": 0.0004117630911251328,
"grad_norm": 1.6790601015090942,
"learning_rate": 4.8478478478478476e-05,
"loss": 1.2441,
"step": 315
},
{
"epoch": 0.0004130702755414031,
"grad_norm": 1.6825275421142578,
"learning_rate": 4.8473473473473476e-05,
"loss": 1.7431,
"step": 316
},
{
"epoch": 0.00041437745995767336,
"grad_norm": 1.328930377960205,
"learning_rate": 4.846846846846847e-05,
"loss": 1.3589,
"step": 317
},
{
"epoch": 0.0004156846443739436,
"grad_norm": 1.519547939300537,
"learning_rate": 4.846346346346346e-05,
"loss": 0.5922,
"step": 318
},
{
"epoch": 0.0004169918287902139,
"grad_norm": 1.8517762422561646,
"learning_rate": 4.845845845845846e-05,
"loss": 2.0995,
"step": 319
},
{
"epoch": 0.00041829901320648415,
"grad_norm": 1.561224341392517,
"learning_rate": 4.8453453453453455e-05,
"loss": 2.1158,
"step": 320
},
{
"epoch": 0.0004196061976227544,
"grad_norm": 1.9378067255020142,
"learning_rate": 4.8448448448448455e-05,
"loss": 2.2775,
"step": 321
},
{
"epoch": 0.0004209133820390247,
"grad_norm": 1.6161144971847534,
"learning_rate": 4.844344344344345e-05,
"loss": 2.2261,
"step": 322
},
{
"epoch": 0.00042222056645529495,
"grad_norm": 0.9984459280967712,
"learning_rate": 4.843843843843844e-05,
"loss": 0.8248,
"step": 323
},
{
"epoch": 0.0004235277508715652,
"grad_norm": 1.2390950918197632,
"learning_rate": 4.8433433433433433e-05,
"loss": 0.8056,
"step": 324
},
{
"epoch": 0.0004248349352878355,
"grad_norm": 1.8470146656036377,
"learning_rate": 4.8428428428428426e-05,
"loss": 1.8804,
"step": 325
},
{
"epoch": 0.00042614211970410575,
"grad_norm": 1.5264325141906738,
"learning_rate": 4.8423423423423426e-05,
"loss": 1.4915,
"step": 326
},
{
"epoch": 0.000427449304120376,
"grad_norm": 1.4650579690933228,
"learning_rate": 4.841841841841842e-05,
"loss": 1.9906,
"step": 327
},
{
"epoch": 0.0004287564885366463,
"grad_norm": 1.6912672519683838,
"learning_rate": 4.841341341341342e-05,
"loss": 2.5512,
"step": 328
},
{
"epoch": 0.00043006367295291654,
"grad_norm": 1.419115424156189,
"learning_rate": 4.840840840840841e-05,
"loss": 1.254,
"step": 329
},
{
"epoch": 0.0004313708573691868,
"grad_norm": 1.3889597654342651,
"learning_rate": 4.8403403403403405e-05,
"loss": 1.5753,
"step": 330
},
{
"epoch": 0.00043267804178545707,
"grad_norm": 1.9419002532958984,
"learning_rate": 4.83983983983984e-05,
"loss": 1.363,
"step": 331
},
{
"epoch": 0.00043398522620172734,
"grad_norm": 2.337916374206543,
"learning_rate": 4.839339339339339e-05,
"loss": 1.8217,
"step": 332
},
{
"epoch": 0.00043529241061799755,
"grad_norm": 1.259263277053833,
"learning_rate": 4.838838838838839e-05,
"loss": 1.4361,
"step": 333
},
{
"epoch": 0.0004365995950342678,
"grad_norm": 1.728857159614563,
"learning_rate": 4.8383383383383384e-05,
"loss": 1.8216,
"step": 334
},
{
"epoch": 0.0004379067794505381,
"grad_norm": 1.126977562904358,
"learning_rate": 4.837837837837838e-05,
"loss": 0.6098,
"step": 335
},
{
"epoch": 0.00043921396386680835,
"grad_norm": 1.5760388374328613,
"learning_rate": 4.8373373373373376e-05,
"loss": 1.9721,
"step": 336
},
{
"epoch": 0.0004405211482830786,
"grad_norm": 1.6654877662658691,
"learning_rate": 4.836836836836837e-05,
"loss": 2.0176,
"step": 337
},
{
"epoch": 0.0004418283326993489,
"grad_norm": 1.5803656578063965,
"learning_rate": 4.836336336336337e-05,
"loss": 1.6791,
"step": 338
},
{
"epoch": 0.00044313551711561914,
"grad_norm": 1.6455450057983398,
"learning_rate": 4.835835835835836e-05,
"loss": 2.1326,
"step": 339
},
{
"epoch": 0.0004444427015318894,
"grad_norm": 1.302128553390503,
"learning_rate": 4.8353353353353355e-05,
"loss": 1.3892,
"step": 340
},
{
"epoch": 0.0004457498859481597,
"grad_norm": 1.2957465648651123,
"learning_rate": 4.834834834834835e-05,
"loss": 1.2624,
"step": 341
},
{
"epoch": 0.00044705707036442994,
"grad_norm": 1.56611168384552,
"learning_rate": 4.834334334334335e-05,
"loss": 1.2925,
"step": 342
},
{
"epoch": 0.0004483642547807002,
"grad_norm": 1.5972720384597778,
"learning_rate": 4.833833833833834e-05,
"loss": 2.2345,
"step": 343
},
{
"epoch": 0.00044967143919697047,
"grad_norm": 1.6864628791809082,
"learning_rate": 4.8333333333333334e-05,
"loss": 2.1596,
"step": 344
},
{
"epoch": 0.00045097862361324074,
"grad_norm": 1.3631922006607056,
"learning_rate": 4.832832832832833e-05,
"loss": 0.8551,
"step": 345
},
{
"epoch": 0.000452285808029511,
"grad_norm": 1.2245639562606812,
"learning_rate": 4.8323323323323326e-05,
"loss": 1.613,
"step": 346
},
{
"epoch": 0.00045359299244578127,
"grad_norm": 1.5226577520370483,
"learning_rate": 4.8318318318318326e-05,
"loss": 1.3359,
"step": 347
},
{
"epoch": 0.00045490017686205153,
"grad_norm": 1.371559739112854,
"learning_rate": 4.831331331331331e-05,
"loss": 2.1276,
"step": 348
},
{
"epoch": 0.0004562073612783218,
"grad_norm": 1.8305604457855225,
"learning_rate": 4.830830830830831e-05,
"loss": 2.6008,
"step": 349
},
{
"epoch": 0.00045751454569459206,
"grad_norm": 1.6536656618118286,
"learning_rate": 4.8303303303303305e-05,
"loss": 1.9033,
"step": 350
},
{
"epoch": 0.00045882173011086233,
"grad_norm": 1.682886004447937,
"learning_rate": 4.82982982982983e-05,
"loss": 1.7702,
"step": 351
},
{
"epoch": 0.0004601289145271326,
"grad_norm": 1.8235169649124146,
"learning_rate": 4.82932932932933e-05,
"loss": 2.0686,
"step": 352
},
{
"epoch": 0.00046143609894340286,
"grad_norm": 1.3382108211517334,
"learning_rate": 4.828828828828829e-05,
"loss": 1.3496,
"step": 353
},
{
"epoch": 0.0004627432833596731,
"grad_norm": 1.3107976913452148,
"learning_rate": 4.828328328328329e-05,
"loss": 0.7386,
"step": 354
},
{
"epoch": 0.00046405046777594334,
"grad_norm": 1.5658841133117676,
"learning_rate": 4.8278278278278283e-05,
"loss": 1.1042,
"step": 355
},
{
"epoch": 0.0004653576521922136,
"grad_norm": 1.51932692527771,
"learning_rate": 4.827327327327327e-05,
"loss": 0.7015,
"step": 356
},
{
"epoch": 0.00046666483660848387,
"grad_norm": 1.8380029201507568,
"learning_rate": 4.826826826826827e-05,
"loss": 1.8174,
"step": 357
},
{
"epoch": 0.00046797202102475413,
"grad_norm": 1.3767679929733276,
"learning_rate": 4.826326326326326e-05,
"loss": 1.3915,
"step": 358
},
{
"epoch": 0.0004692792054410244,
"grad_norm": 1.7605303525924683,
"learning_rate": 4.825825825825826e-05,
"loss": 0.9956,
"step": 359
},
{
"epoch": 0.00047058638985729466,
"grad_norm": 1.5122482776641846,
"learning_rate": 4.8253253253253255e-05,
"loss": 1.2155,
"step": 360
},
{
"epoch": 0.00047189357427356493,
"grad_norm": 1.5101649761199951,
"learning_rate": 4.824824824824825e-05,
"loss": 1.1057,
"step": 361
},
{
"epoch": 0.0004732007586898352,
"grad_norm": 1.0014289617538452,
"learning_rate": 4.824324324324325e-05,
"loss": 0.6814,
"step": 362
},
{
"epoch": 0.00047450794310610546,
"grad_norm": 1.7926838397979736,
"learning_rate": 4.823823823823824e-05,
"loss": 2.0843,
"step": 363
},
{
"epoch": 0.0004758151275223757,
"grad_norm": 1.632832407951355,
"learning_rate": 4.823323323323324e-05,
"loss": 2.0709,
"step": 364
},
{
"epoch": 0.000477122311938646,
"grad_norm": 1.414014458656311,
"learning_rate": 4.822822822822823e-05,
"loss": 1.8019,
"step": 365
},
{
"epoch": 0.00047842949635491626,
"grad_norm": 1.4348331689834595,
"learning_rate": 4.8223223223223226e-05,
"loss": 1.7161,
"step": 366
},
{
"epoch": 0.0004797366807711865,
"grad_norm": 1.537306547164917,
"learning_rate": 4.821821821821822e-05,
"loss": 0.8713,
"step": 367
},
{
"epoch": 0.0004810438651874568,
"grad_norm": 1.8898135423660278,
"learning_rate": 4.821321321321321e-05,
"loss": 2.1803,
"step": 368
},
{
"epoch": 0.00048235104960372705,
"grad_norm": 1.8930268287658691,
"learning_rate": 4.820820820820821e-05,
"loss": 1.5469,
"step": 369
},
{
"epoch": 0.0004836582340199973,
"grad_norm": 1.4677132368087769,
"learning_rate": 4.8203203203203205e-05,
"loss": 0.9854,
"step": 370
},
{
"epoch": 0.0004849654184362676,
"grad_norm": 1.2428200244903564,
"learning_rate": 4.8198198198198205e-05,
"loss": 1.503,
"step": 371
},
{
"epoch": 0.00048627260285253785,
"grad_norm": 1.542529582977295,
"learning_rate": 4.81931931931932e-05,
"loss": 1.7597,
"step": 372
},
{
"epoch": 0.0004875797872688081,
"grad_norm": 1.6563135385513306,
"learning_rate": 4.818818818818819e-05,
"loss": 1.2799,
"step": 373
},
{
"epoch": 0.0004888869716850783,
"grad_norm": 1.8432083129882812,
"learning_rate": 4.8183183183183184e-05,
"loss": 2.2825,
"step": 374
},
{
"epoch": 0.0004901941561013486,
"grad_norm": 1.8847230672836304,
"learning_rate": 4.817817817817818e-05,
"loss": 1.7107,
"step": 375
},
{
"epoch": 0.0004915013405176189,
"grad_norm": 1.4317970275878906,
"learning_rate": 4.8173173173173177e-05,
"loss": 1.2853,
"step": 376
},
{
"epoch": 0.0004928085249338892,
"grad_norm": 1.9522016048431396,
"learning_rate": 4.816816816816817e-05,
"loss": 2.0247,
"step": 377
},
{
"epoch": 0.0004941157093501594,
"grad_norm": 1.979780912399292,
"learning_rate": 4.816316316316317e-05,
"loss": 2.8973,
"step": 378
},
{
"epoch": 0.0004954228937664297,
"grad_norm": 1.2153147459030151,
"learning_rate": 4.815815815815816e-05,
"loss": 0.918,
"step": 379
},
{
"epoch": 0.0004967300781826999,
"grad_norm": 1.7080423831939697,
"learning_rate": 4.8153153153153155e-05,
"loss": 2.3176,
"step": 380
},
{
"epoch": 0.0004980372625989702,
"grad_norm": 1.4782633781433105,
"learning_rate": 4.814814814814815e-05,
"loss": 1.4934,
"step": 381
},
{
"epoch": 0.0004993444470152405,
"grad_norm": 1.3005950450897217,
"learning_rate": 4.814314314314314e-05,
"loss": 0.9541,
"step": 382
},
{
"epoch": 0.0005006516314315108,
"grad_norm": 1.6197017431259155,
"learning_rate": 4.813813813813814e-05,
"loss": 1.5796,
"step": 383
},
{
"epoch": 0.000501958815847781,
"grad_norm": 1.5193967819213867,
"learning_rate": 4.8133133133133134e-05,
"loss": 1.5869,
"step": 384
},
{
"epoch": 0.0005032660002640513,
"grad_norm": 2.8697612285614014,
"learning_rate": 4.8128128128128134e-05,
"loss": 1.4886,
"step": 385
},
{
"epoch": 0.0005045731846803215,
"grad_norm": 1.835699200630188,
"learning_rate": 4.812312312312313e-05,
"loss": 1.6965,
"step": 386
},
{
"epoch": 0.0005058803690965917,
"grad_norm": 1.51179039478302,
"learning_rate": 4.811811811811812e-05,
"loss": 1.5248,
"step": 387
},
{
"epoch": 0.000507187553512862,
"grad_norm": 1.3816337585449219,
"learning_rate": 4.811311311311312e-05,
"loss": 1.8567,
"step": 388
},
{
"epoch": 0.0005084947379291323,
"grad_norm": 1.3670501708984375,
"learning_rate": 4.810810810810811e-05,
"loss": 0.9172,
"step": 389
},
{
"epoch": 0.0005098019223454026,
"grad_norm": 1.5191140174865723,
"learning_rate": 4.8103103103103105e-05,
"loss": 2.2351,
"step": 390
},
{
"epoch": 0.0005111091067616728,
"grad_norm": 1.9390945434570312,
"learning_rate": 4.80980980980981e-05,
"loss": 1.5947,
"step": 391
},
{
"epoch": 0.0005124162911779431,
"grad_norm": 1.0936216115951538,
"learning_rate": 4.809309309309309e-05,
"loss": 0.8947,
"step": 392
},
{
"epoch": 0.0005137234755942133,
"grad_norm": 3.829148530960083,
"learning_rate": 4.808808808808809e-05,
"loss": 2.7547,
"step": 393
},
{
"epoch": 0.0005150306600104836,
"grad_norm": 2.89119815826416,
"learning_rate": 4.8083083083083084e-05,
"loss": 1.7366,
"step": 394
},
{
"epoch": 0.0005163378444267538,
"grad_norm": 1.6022891998291016,
"learning_rate": 4.8078078078078084e-05,
"loss": 1.786,
"step": 395
},
{
"epoch": 0.0005176450288430242,
"grad_norm": 1.2842696905136108,
"learning_rate": 4.807307307307308e-05,
"loss": 0.2522,
"step": 396
},
{
"epoch": 0.0005189522132592944,
"grad_norm": 1.976442813873291,
"learning_rate": 4.806806806806807e-05,
"loss": 1.9734,
"step": 397
},
{
"epoch": 0.0005202593976755647,
"grad_norm": 1.5915790796279907,
"learning_rate": 4.806306306306306e-05,
"loss": 1.9909,
"step": 398
},
{
"epoch": 0.0005215665820918349,
"grad_norm": 1.658768892288208,
"learning_rate": 4.8058058058058056e-05,
"loss": 1.6076,
"step": 399
},
{
"epoch": 0.0005228737665081052,
"grad_norm": 1.4726157188415527,
"learning_rate": 4.8053053053053055e-05,
"loss": 2.0277,
"step": 400
},
{
"epoch": 0.0005241809509243754,
"grad_norm": 2.232145309448242,
"learning_rate": 4.804804804804805e-05,
"loss": 1.7958,
"step": 401
},
{
"epoch": 0.0005254881353406458,
"grad_norm": 1.6406078338623047,
"learning_rate": 4.804304304304305e-05,
"loss": 1.7463,
"step": 402
},
{
"epoch": 0.000526795319756916,
"grad_norm": 1.1940925121307373,
"learning_rate": 4.803803803803804e-05,
"loss": 0.7934,
"step": 403
},
{
"epoch": 0.0005281025041731863,
"grad_norm": 1.7259567975997925,
"learning_rate": 4.8033033033033034e-05,
"loss": 0.9874,
"step": 404
},
{
"epoch": 0.0005294096885894565,
"grad_norm": 1.845804214477539,
"learning_rate": 4.8028028028028034e-05,
"loss": 0.4897,
"step": 405
},
{
"epoch": 0.0005307168730057268,
"grad_norm": 1.6611266136169434,
"learning_rate": 4.802302302302302e-05,
"loss": 1.7425,
"step": 406
},
{
"epoch": 0.000532024057421997,
"grad_norm": 1.487190842628479,
"learning_rate": 4.801801801801802e-05,
"loss": 2.0357,
"step": 407
},
{
"epoch": 0.0005333312418382674,
"grad_norm": 0.8890923261642456,
"learning_rate": 4.801301301301301e-05,
"loss": 0.5583,
"step": 408
},
{
"epoch": 0.0005346384262545376,
"grad_norm": 1.4603744745254517,
"learning_rate": 4.800800800800801e-05,
"loss": 1.6225,
"step": 409
},
{
"epoch": 0.0005359456106708078,
"grad_norm": 1.9771097898483276,
"learning_rate": 4.8003003003003005e-05,
"loss": 1.7477,
"step": 410
},
{
"epoch": 0.0005372527950870781,
"grad_norm": 1.5973646640777588,
"learning_rate": 4.7997997997998e-05,
"loss": 1.5643,
"step": 411
},
{
"epoch": 0.0005385599795033483,
"grad_norm": 1.9672987461090088,
"learning_rate": 4.7992992992993e-05,
"loss": 2.1034,
"step": 412
},
{
"epoch": 0.0005398671639196186,
"grad_norm": 1.4598102569580078,
"learning_rate": 4.798798798798799e-05,
"loss": 1.9834,
"step": 413
},
{
"epoch": 0.0005411743483358888,
"grad_norm": 1.8543391227722168,
"learning_rate": 4.798298298298299e-05,
"loss": 1.8818,
"step": 414
},
{
"epoch": 0.0005424815327521592,
"grad_norm": 1.3450524806976318,
"learning_rate": 4.797797797797798e-05,
"loss": 1.0332,
"step": 415
},
{
"epoch": 0.0005437887171684294,
"grad_norm": 1.323103904724121,
"learning_rate": 4.797297297297298e-05,
"loss": 1.4493,
"step": 416
},
{
"epoch": 0.0005450959015846997,
"grad_norm": 3.834731101989746,
"learning_rate": 4.796796796796797e-05,
"loss": 1.7237,
"step": 417
},
{
"epoch": 0.0005464030860009699,
"grad_norm": 1.6012156009674072,
"learning_rate": 4.796296296296296e-05,
"loss": 1.7705,
"step": 418
},
{
"epoch": 0.0005477102704172402,
"grad_norm": 1.6005125045776367,
"learning_rate": 4.795795795795796e-05,
"loss": 1.2692,
"step": 419
},
{
"epoch": 0.0005490174548335104,
"grad_norm": 0.04926226660609245,
"learning_rate": 4.7952952952952956e-05,
"loss": 0.001,
"step": 420
},
{
"epoch": 0.0005503246392497808,
"grad_norm": 1.9261682033538818,
"learning_rate": 4.7947947947947955e-05,
"loss": 1.3089,
"step": 421
},
{
"epoch": 0.000551631823666051,
"grad_norm": 1.4734810590744019,
"learning_rate": 4.794294294294295e-05,
"loss": 1.9636,
"step": 422
},
{
"epoch": 0.0005529390080823213,
"grad_norm": 1.5746914148330688,
"learning_rate": 4.793793793793794e-05,
"loss": 1.481,
"step": 423
},
{
"epoch": 0.0005542461924985915,
"grad_norm": 1.6360576152801514,
"learning_rate": 4.7932932932932934e-05,
"loss": 2.0989,
"step": 424
},
{
"epoch": 0.0005555533769148618,
"grad_norm": 1.9103704690933228,
"learning_rate": 4.792792792792793e-05,
"loss": 2.5168,
"step": 425
},
{
"epoch": 0.000556860561331132,
"grad_norm": 1.6170457601547241,
"learning_rate": 4.792292292292293e-05,
"loss": 2.1027,
"step": 426
},
{
"epoch": 0.0005581677457474023,
"grad_norm": 1.6519997119903564,
"learning_rate": 4.791791791791792e-05,
"loss": 1.8065,
"step": 427
},
{
"epoch": 0.0005594749301636726,
"grad_norm": 2.1365749835968018,
"learning_rate": 4.791291291291291e-05,
"loss": 2.1404,
"step": 428
},
{
"epoch": 0.0005607821145799429,
"grad_norm": 1.484369158744812,
"learning_rate": 4.790790790790791e-05,
"loss": 1.6508,
"step": 429
},
{
"epoch": 0.0005620892989962131,
"grad_norm": 2.2153050899505615,
"learning_rate": 4.7902902902902906e-05,
"loss": 1.9617,
"step": 430
},
{
"epoch": 0.0005633964834124833,
"grad_norm": 1.4849168062210083,
"learning_rate": 4.78978978978979e-05,
"loss": 1.6652,
"step": 431
},
{
"epoch": 0.0005647036678287536,
"grad_norm": 1.4296141862869263,
"learning_rate": 4.789289289289289e-05,
"loss": 1.2995,
"step": 432
},
{
"epoch": 0.0005660108522450238,
"grad_norm": 1.6376841068267822,
"learning_rate": 4.788788788788789e-05,
"loss": 2.2176,
"step": 433
},
{
"epoch": 0.0005673180366612942,
"grad_norm": 1.422378420829773,
"learning_rate": 4.7882882882882884e-05,
"loss": 1.4486,
"step": 434
},
{
"epoch": 0.0005686252210775644,
"grad_norm": 1.5652416944503784,
"learning_rate": 4.787787787787788e-05,
"loss": 1.8622,
"step": 435
},
{
"epoch": 0.0005699324054938347,
"grad_norm": 1.2181988954544067,
"learning_rate": 4.787287287287288e-05,
"loss": 0.7264,
"step": 436
},
{
"epoch": 0.0005712395899101049,
"grad_norm": 1.4485244750976562,
"learning_rate": 4.786786786786787e-05,
"loss": 1.0068,
"step": 437
},
{
"epoch": 0.0005725467743263752,
"grad_norm": 1.6365692615509033,
"learning_rate": 4.786286286286287e-05,
"loss": 1.8623,
"step": 438
},
{
"epoch": 0.0005738539587426454,
"grad_norm": 1.362836241722107,
"learning_rate": 4.785785785785786e-05,
"loss": 1.2694,
"step": 439
},
{
"epoch": 0.0005751611431589157,
"grad_norm": 1.5164575576782227,
"learning_rate": 4.7852852852852856e-05,
"loss": 1.9222,
"step": 440
},
{
"epoch": 0.000576468327575186,
"grad_norm": 1.7084901332855225,
"learning_rate": 4.784784784784785e-05,
"loss": 1.8659,
"step": 441
},
{
"epoch": 0.0005777755119914563,
"grad_norm": 2.0331461429595947,
"learning_rate": 4.784284284284284e-05,
"loss": 1.4633,
"step": 442
},
{
"epoch": 0.0005790826964077265,
"grad_norm": 1.529123306274414,
"learning_rate": 4.783783783783784e-05,
"loss": 1.4277,
"step": 443
},
{
"epoch": 0.0005803898808239968,
"grad_norm": 1.4111024141311646,
"learning_rate": 4.7832832832832834e-05,
"loss": 1.7261,
"step": 444
},
{
"epoch": 0.000581697065240267,
"grad_norm": 0.98598313331604,
"learning_rate": 4.7827827827827834e-05,
"loss": 0.4229,
"step": 445
},
{
"epoch": 0.0005830042496565373,
"grad_norm": 2.0867719650268555,
"learning_rate": 4.782282282282283e-05,
"loss": 1.9944,
"step": 446
},
{
"epoch": 0.0005843114340728075,
"grad_norm": 1.6054155826568604,
"learning_rate": 4.781781781781782e-05,
"loss": 2.2063,
"step": 447
},
{
"epoch": 0.0005856186184890779,
"grad_norm": 1.374302864074707,
"learning_rate": 4.781281281281281e-05,
"loss": 1.4218,
"step": 448
},
{
"epoch": 0.0005869258029053481,
"grad_norm": 1.8737014532089233,
"learning_rate": 4.7807807807807806e-05,
"loss": 2.0067,
"step": 449
},
{
"epoch": 0.0005882329873216184,
"grad_norm": 1.221209168434143,
"learning_rate": 4.7802802802802806e-05,
"loss": 1.1187,
"step": 450
},
{
"epoch": 0.0005895401717378886,
"grad_norm": 2.31976580619812,
"learning_rate": 4.77977977977978e-05,
"loss": 2.244,
"step": 451
},
{
"epoch": 0.0005908473561541588,
"grad_norm": 2.0758728981018066,
"learning_rate": 4.77927927927928e-05,
"loss": 2.37,
"step": 452
},
{
"epoch": 0.0005921545405704291,
"grad_norm": 1.5117443799972534,
"learning_rate": 4.778778778778779e-05,
"loss": 1.4368,
"step": 453
},
{
"epoch": 0.0005934617249866994,
"grad_norm": 2.0739381313323975,
"learning_rate": 4.7782782782782784e-05,
"loss": 2.3445,
"step": 454
},
{
"epoch": 0.0005947689094029697,
"grad_norm": 1.6032236814498901,
"learning_rate": 4.7777777777777784e-05,
"loss": 1.3034,
"step": 455
},
{
"epoch": 0.0005960760938192399,
"grad_norm": 1.8953197002410889,
"learning_rate": 4.777277277277277e-05,
"loss": 2.4014,
"step": 456
},
{
"epoch": 0.0005973832782355102,
"grad_norm": 1.1892727613449097,
"learning_rate": 4.776776776776777e-05,
"loss": 1.0365,
"step": 457
},
{
"epoch": 0.0005986904626517804,
"grad_norm": 1.3845131397247314,
"learning_rate": 4.776276276276276e-05,
"loss": 1.4982,
"step": 458
},
{
"epoch": 0.0005999976470680507,
"grad_norm": 2.0147531032562256,
"learning_rate": 4.775775775775776e-05,
"loss": 1.0432,
"step": 459
},
{
"epoch": 0.0006013048314843209,
"grad_norm": 1.3901069164276123,
"learning_rate": 4.7752752752752756e-05,
"loss": 1.1874,
"step": 460
},
{
"epoch": 0.0006026120159005913,
"grad_norm": 1.7510122060775757,
"learning_rate": 4.774774774774775e-05,
"loss": 1.7532,
"step": 461
},
{
"epoch": 0.0006039192003168615,
"grad_norm": 1.4255083799362183,
"learning_rate": 4.774274274274275e-05,
"loss": 1.2638,
"step": 462
},
{
"epoch": 0.0006052263847331318,
"grad_norm": 1.3789587020874023,
"learning_rate": 4.773773773773774e-05,
"loss": 1.2885,
"step": 463
},
{
"epoch": 0.000606533569149402,
"grad_norm": 1.8223754167556763,
"learning_rate": 4.7732732732732734e-05,
"loss": 1.846,
"step": 464
},
{
"epoch": 0.0006078407535656723,
"grad_norm": 1.9569519758224487,
"learning_rate": 4.772772772772773e-05,
"loss": 2.0224,
"step": 465
},
{
"epoch": 0.0006091479379819425,
"grad_norm": 1.589490532875061,
"learning_rate": 4.772272272272272e-05,
"loss": 1.4883,
"step": 466
},
{
"epoch": 0.0006104551223982129,
"grad_norm": 1.4150370359420776,
"learning_rate": 4.771771771771772e-05,
"loss": 1.1537,
"step": 467
},
{
"epoch": 0.0006117623068144831,
"grad_norm": 1.3750197887420654,
"learning_rate": 4.771271271271271e-05,
"loss": 1.6263,
"step": 468
},
{
"epoch": 0.0006130694912307534,
"grad_norm": 1.5440406799316406,
"learning_rate": 4.770770770770771e-05,
"loss": 1.6786,
"step": 469
},
{
"epoch": 0.0006143766756470236,
"grad_norm": 1.7556005716323853,
"learning_rate": 4.7702702702702706e-05,
"loss": 2.0755,
"step": 470
},
{
"epoch": 0.0006156838600632939,
"grad_norm": 1.2925258874893188,
"learning_rate": 4.76976976976977e-05,
"loss": 0.8731,
"step": 471
},
{
"epoch": 0.0006169910444795641,
"grad_norm": 1.463951826095581,
"learning_rate": 4.76926926926927e-05,
"loss": 1.7757,
"step": 472
},
{
"epoch": 0.0006182982288958343,
"grad_norm": 1.1145371198654175,
"learning_rate": 4.7687687687687685e-05,
"loss": 0.3717,
"step": 473
},
{
"epoch": 0.0006196054133121047,
"grad_norm": 1.656201720237732,
"learning_rate": 4.7682682682682685e-05,
"loss": 1.3816,
"step": 474
},
{
"epoch": 0.0006209125977283749,
"grad_norm": 1.6448125839233398,
"learning_rate": 4.767767767767768e-05,
"loss": 1.3373,
"step": 475
},
{
"epoch": 0.0006222197821446452,
"grad_norm": 1.5466721057891846,
"learning_rate": 4.767267267267268e-05,
"loss": 1.757,
"step": 476
},
{
"epoch": 0.0006235269665609154,
"grad_norm": 1.4172669649124146,
"learning_rate": 4.766766766766767e-05,
"loss": 1.3798,
"step": 477
},
{
"epoch": 0.0006248341509771857,
"grad_norm": 1.4851760864257812,
"learning_rate": 4.766266266266266e-05,
"loss": 1.5471,
"step": 478
},
{
"epoch": 0.0006261413353934559,
"grad_norm": 1.2650772333145142,
"learning_rate": 4.765765765765766e-05,
"loss": 0.5324,
"step": 479
},
{
"epoch": 0.0006274485198097263,
"grad_norm": 1.645494818687439,
"learning_rate": 4.7652652652652656e-05,
"loss": 1.422,
"step": 480
},
{
"epoch": 0.0006287557042259965,
"grad_norm": 1.4462095499038696,
"learning_rate": 4.764764764764765e-05,
"loss": 0.7855,
"step": 481
},
{
"epoch": 0.0006300628886422668,
"grad_norm": 1.6467872858047485,
"learning_rate": 4.764264264264264e-05,
"loss": 1.7378,
"step": 482
},
{
"epoch": 0.000631370073058537,
"grad_norm": 1.4757755994796753,
"learning_rate": 4.763763763763764e-05,
"loss": 1.2541,
"step": 483
},
{
"epoch": 0.0006326772574748073,
"grad_norm": 1.649613618850708,
"learning_rate": 4.7632632632632635e-05,
"loss": 1.9402,
"step": 484
},
{
"epoch": 0.0006339844418910775,
"grad_norm": 1.5895005464553833,
"learning_rate": 4.762762762762763e-05,
"loss": 1.6001,
"step": 485
},
{
"epoch": 0.0006352916263073478,
"grad_norm": 1.9634181261062622,
"learning_rate": 4.762262262262263e-05,
"loss": 1.9354,
"step": 486
},
{
"epoch": 0.0006365988107236181,
"grad_norm": 1.5781902074813843,
"learning_rate": 4.761761761761762e-05,
"loss": 1.7391,
"step": 487
},
{
"epoch": 0.0006379059951398884,
"grad_norm": 1.7558811902999878,
"learning_rate": 4.761261261261262e-05,
"loss": 1.3157,
"step": 488
},
{
"epoch": 0.0006392131795561586,
"grad_norm": 1.4790571928024292,
"learning_rate": 4.7607607607607606e-05,
"loss": 1.2889,
"step": 489
},
{
"epoch": 0.0006405203639724289,
"grad_norm": 1.9054185152053833,
"learning_rate": 4.7602602602602606e-05,
"loss": 1.9227,
"step": 490
},
{
"epoch": 0.0006418275483886991,
"grad_norm": 1.3396105766296387,
"learning_rate": 4.75975975975976e-05,
"loss": 0.966,
"step": 491
},
{
"epoch": 0.0006431347328049694,
"grad_norm": 1.6935501098632812,
"learning_rate": 4.759259259259259e-05,
"loss": 1.3237,
"step": 492
},
{
"epoch": 0.0006444419172212397,
"grad_norm": 1.58639395236969,
"learning_rate": 4.758758758758759e-05,
"loss": 1.8338,
"step": 493
},
{
"epoch": 0.0006457491016375099,
"grad_norm": 1.8331503868103027,
"learning_rate": 4.7582582582582585e-05,
"loss": 1.5735,
"step": 494
},
{
"epoch": 0.0006470562860537802,
"grad_norm": 2.0812811851501465,
"learning_rate": 4.7577577577577584e-05,
"loss": 2.0272,
"step": 495
},
{
"epoch": 0.0006483634704700504,
"grad_norm": 1.8232816457748413,
"learning_rate": 4.757257257257258e-05,
"loss": 1.52,
"step": 496
},
{
"epoch": 0.0006496706548863207,
"grad_norm": 1.5954536199569702,
"learning_rate": 4.756756756756757e-05,
"loss": 1.8882,
"step": 497
},
{
"epoch": 0.0006509778393025909,
"grad_norm": 2.2882657051086426,
"learning_rate": 4.756256256256256e-05,
"loss": 2.3967,
"step": 498
},
{
"epoch": 0.0006522850237188612,
"grad_norm": 1.4601846933364868,
"learning_rate": 4.7557557557557556e-05,
"loss": 1.7912,
"step": 499
},
{
"epoch": 0.0006535922081351315,
"grad_norm": 1.9598947763442993,
"learning_rate": 4.7552552552552556e-05,
"loss": 2.0452,
"step": 500
}
],
"logging_steps": 1,
"max_steps": 10000,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 439137192394752.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}