{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 4.0,
  "eval_steps": 500,
  "global_step": 924,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.008658008658008658,
      "grad_norm": 5.4204916523303837e-05,
      "learning_rate": 4.255319148936171e-06,
      "loss": 2.1959,
      "step": 2
    },
    {
      "epoch": 0.017316017316017316,
      "grad_norm": 4.602531771524809e-05,
      "learning_rate": 1.2765957446808511e-05,
      "loss": 1.982,
      "step": 4
    },
    {
      "epoch": 0.025974025974025976,
      "grad_norm": 3.824224040727131e-05,
      "learning_rate": 2.1276595744680852e-05,
      "loss": 2.2121,
      "step": 6
    },
    {
      "epoch": 0.03463203463203463,
      "grad_norm": 2.3534847059636377e-05,
      "learning_rate": 2.9787234042553192e-05,
      "loss": 2.0893,
      "step": 8
    },
    {
      "epoch": 0.04329004329004329,
      "grad_norm": 1.8666707546799444e-05,
      "learning_rate": 3.829787234042553e-05,
      "loss": 1.8373,
      "step": 10
    },
    {
      "epoch": 0.05194805194805195,
      "grad_norm": 1.5847288523218594e-05,
      "learning_rate": 4.680851063829788e-05,
      "loss": 2.0273,
      "step": 12
    },
    {
      "epoch": 0.06060606060606061,
      "grad_norm": 1.0995895536325406e-05,
      "learning_rate": 5.531914893617022e-05,
      "loss": 1.8151,
      "step": 14
    },
    {
      "epoch": 0.06926406926406926,
      "grad_norm": 1.6404985217377543e-05,
      "learning_rate": 6.382978723404256e-05,
      "loss": 2.0606,
      "step": 16
    },
    {
      "epoch": 0.07792207792207792,
      "grad_norm": 1.4904575436958112e-05,
      "learning_rate": 7.23404255319149e-05,
      "loss": 1.9488,
      "step": 18
    },
    {
      "epoch": 0.08658008658008658,
      "grad_norm": 1.3000194485357497e-05,
      "learning_rate": 8.085106382978723e-05,
      "loss": 2.6669,
      "step": 20
    },
    {
      "epoch": 0.09523809523809523,
      "grad_norm": 1.2479895303840749e-05,
      "learning_rate": 8.936170212765958e-05,
      "loss": 1.742,
      "step": 22
    },
    {
      "epoch": 0.1038961038961039,
      "grad_norm": 1.0467254469403997e-05,
      "learning_rate": 9.787234042553192e-05,
      "loss": 1.7994,
      "step": 24
    },
    {
      "epoch": 0.11255411255411256,
      "grad_norm": 1.3622248843603302e-05,
      "learning_rate": 0.00010638297872340425,
      "loss": 1.9219,
      "step": 26
    },
    {
      "epoch": 0.12121212121212122,
      "grad_norm": 1.3718491572944913e-05,
      "learning_rate": 0.00011489361702127661,
      "loss": 1.8505,
      "step": 28
    },
    {
      "epoch": 0.12987012987012986,
      "grad_norm": 1.3184439012547955e-05,
      "learning_rate": 0.00012340425531914893,
      "loss": 1.7647,
      "step": 30
    },
    {
      "epoch": 0.13852813852813853,
      "grad_norm": 1.2686439731623977e-05,
      "learning_rate": 0.00013191489361702127,
      "loss": 1.8865,
      "step": 32
    },
    {
      "epoch": 0.1471861471861472,
      "grad_norm": 3.454366378718987e-05,
      "learning_rate": 0.00014042553191489363,
      "loss": 1.8743,
      "step": 34
    },
    {
      "epoch": 0.15584415584415584,
      "grad_norm": 1.81365103344433e-05,
      "learning_rate": 0.00014893617021276596,
      "loss": 1.9598,
      "step": 36
    },
    {
      "epoch": 0.1645021645021645,
      "grad_norm": 1.4871564417262562e-05,
      "learning_rate": 0.00015744680851063832,
      "loss": 1.7956,
      "step": 38
    },
    {
      "epoch": 0.17316017316017315,
      "grad_norm": 1.3106016922392882e-05,
      "learning_rate": 0.00016595744680851065,
      "loss": 1.6608,
      "step": 40
    },
    {
      "epoch": 0.18181818181818182,
      "grad_norm": 1.4674634257971775e-05,
      "learning_rate": 0.00017446808510638298,
      "loss": 1.7878,
      "step": 42
    },
    {
      "epoch": 0.19047619047619047,
      "grad_norm": 2.0664912881329656e-05,
      "learning_rate": 0.00018297872340425532,
      "loss": 1.8639,
      "step": 44
    },
    {
      "epoch": 0.19913419913419914,
      "grad_norm": 1.741530650178902e-05,
      "learning_rate": 0.00019148936170212768,
      "loss": 1.5938,
      "step": 46
    },
    {
      "epoch": 0.2077922077922078,
      "grad_norm": 1.3442709132505115e-05,
      "learning_rate": 0.0002,
      "loss": 1.6691,
      "step": 48
    },
    {
      "epoch": 0.21645021645021645,
      "grad_norm": 2.1207506506470963e-05,
      "learning_rate": 0.00019999743357429378,
      "loss": 1.804,
      "step": 50
    },
    {
      "epoch": 0.22510822510822512,
      "grad_norm": 3.287912477389909e-05,
      "learning_rate": 0.00019998973442890598,
      "loss": 1.8404,
      "step": 52
    },
    {
      "epoch": 0.23376623376623376,
      "grad_norm": 2.17068190977443e-05,
      "learning_rate": 0.00019997690295902226,
      "loss": 1.7561,
      "step": 54
    },
    {
      "epoch": 0.24242424242424243,
      "grad_norm": 1.806787622626871e-05,
      "learning_rate": 0.00019995893982326286,
      "loss": 1.8094,
      "step": 56
    },
    {
      "epoch": 0.2510822510822511,
      "grad_norm": 1.9331200746819377e-05,
      "learning_rate": 0.00019993584594364894,
      "loss": 1.9647,
      "step": 58
    },
    {
      "epoch": 0.2597402597402597,
      "grad_norm": 1.799209712771699e-05,
      "learning_rate": 0.00019990762250555495,
      "loss": 1.6961,
      "step": 60
    },
    {
      "epoch": 0.2683982683982684,
      "grad_norm": 1.7485201169620268e-05,
      "learning_rate": 0.0001998742709576481,
      "loss": 1.8473,
      "step": 62
    },
    {
      "epoch": 0.27705627705627706,
      "grad_norm": 1.571387838339433e-05,
      "learning_rate": 0.00019983579301181373,
      "loss": 1.5972,
      "step": 64
    },
    {
      "epoch": 0.2857142857142857,
      "grad_norm": 1.9293982404633425e-05,
      "learning_rate": 0.00019979219064306762,
      "loss": 1.6437,
      "step": 66
    },
    {
      "epoch": 0.2943722943722944,
      "grad_norm": 1.444847475795541e-05,
      "learning_rate": 0.00019974346608945466,
      "loss": 1.6366,
      "step": 68
    },
    {
      "epoch": 0.30303030303030304,
      "grad_norm": 1.3742756891588215e-05,
      "learning_rate": 0.00019968962185193365,
      "loss": 1.5994,
      "step": 70
    },
    {
      "epoch": 0.3116883116883117,
      "grad_norm": 8.885600254870951e-05,
      "learning_rate": 0.00019963066069424943,
      "loss": 1.7003,
      "step": 72
    },
    {
      "epoch": 0.3203463203463203,
      "grad_norm": 2.5485469450359233e-05,
      "learning_rate": 0.0001995665856427905,
      "loss": 1.6381,
      "step": 74
    },
    {
      "epoch": 0.329004329004329,
      "grad_norm": 3.399766137590632e-05,
      "learning_rate": 0.00019949739998643414,
      "loss": 1.6196,
      "step": 76
    },
    {
      "epoch": 0.33766233766233766,
      "grad_norm": 3.7044861528556794e-05,
      "learning_rate": 0.00019942310727637724,
      "loss": 1.667,
      "step": 78
    },
    {
      "epoch": 0.3463203463203463,
      "grad_norm": 3.5706671042134985e-05,
      "learning_rate": 0.00019934371132595424,
      "loss": 1.8312,
      "step": 80
    },
    {
      "epoch": 0.354978354978355,
      "grad_norm": 2.0404797396622598e-05,
      "learning_rate": 0.00019925921621044129,
      "loss": 1.6482,
      "step": 82
    },
    {
      "epoch": 0.36363636363636365,
      "grad_norm": 1.7347399989375845e-05,
      "learning_rate": 0.00019916962626684707,
      "loss": 1.6175,
      "step": 84
    },
    {
      "epoch": 0.3722943722943723,
      "grad_norm": 3.3090138458646834e-05,
      "learning_rate": 0.00019907494609369035,
      "loss": 1.8201,
      "step": 86
    },
    {
      "epoch": 0.38095238095238093,
      "grad_norm": 2.85672413156135e-05,
      "learning_rate": 0.0001989751805507637,
      "loss": 1.7632,
      "step": 88
    },
    {
      "epoch": 0.38961038961038963,
      "grad_norm": 3.10399555019103e-05,
      "learning_rate": 0.00019887033475888419,
      "loss": 1.648,
      "step": 90
    },
    {
      "epoch": 0.39826839826839827,
      "grad_norm": 2.8758275220752694e-05,
      "learning_rate": 0.00019876041409963056,
      "loss": 1.6615,
      "step": 92
    },
    {
      "epoch": 0.4069264069264069,
      "grad_norm": 2.411049536021892e-05,
      "learning_rate": 0.00019864542421506686,
      "loss": 1.6684,
      "step": 94
    },
    {
      "epoch": 0.4155844155844156,
      "grad_norm": 2.1932239178568125e-05,
      "learning_rate": 0.00019852537100745307,
      "loss": 1.5879,
      "step": 96
    },
    {
      "epoch": 0.42424242424242425,
      "grad_norm": 2.419168049527798e-05,
      "learning_rate": 0.00019840026063894193,
      "loss": 1.6055,
      "step": 98
    },
    {
      "epoch": 0.4329004329004329,
      "grad_norm": 3.5360833862796426e-05,
      "learning_rate": 0.00019827009953126275,
      "loss": 1.6233,
      "step": 100
    },
    {
      "epoch": 0.44155844155844154,
      "grad_norm": 5.8022818848257884e-05,
      "learning_rate": 0.0001981348943653918,
      "loss": 1.5232,
      "step": 102
    },
    {
      "epoch": 0.45021645021645024,
      "grad_norm": 3.986386946053244e-05,
      "learning_rate": 0.0001979946520812093,
      "loss": 1.7553,
      "step": 104
    },
    {
      "epoch": 0.4588744588744589,
      "grad_norm": 5.403523027780466e-05,
      "learning_rate": 0.00019784937987714333,
      "loss": 1.6931,
      "step": 106
    },
    {
      "epoch": 0.4675324675324675,
      "grad_norm": 2.1774114429717883e-05,
      "learning_rate": 0.00019769908520980034,
      "loss": 1.5427,
      "step": 108
    },
    {
      "epoch": 0.47619047619047616,
      "grad_norm": 3.063875192310661e-05,
      "learning_rate": 0.0001975437757935822,
      "loss": 1.7336,
      "step": 110
    },
    {
      "epoch": 0.48484848484848486,
      "grad_norm": 2.7840827897307463e-05,
      "learning_rate": 0.0001973834596002905,
      "loss": 1.6536,
      "step": 112
    },
    {
      "epoch": 0.4935064935064935,
      "grad_norm": 2.68031708401395e-05,
      "learning_rate": 0.00019721814485871726,
      "loss": 1.7237,
      "step": 114
    },
    {
      "epoch": 0.5021645021645021,
      "grad_norm": 4.067725967615843e-05,
      "learning_rate": 0.0001970478400542225,
      "loss": 1.6127,
      "step": 116
    },
    {
      "epoch": 0.5108225108225108,
      "grad_norm": 3.361668132129125e-05,
      "learning_rate": 0.00019687255392829877,
      "loss": 1.5502,
      "step": 118
    },
    {
      "epoch": 0.5194805194805194,
      "grad_norm": 3.693327744258568e-05,
      "learning_rate": 0.00019669229547812249,
      "loss": 1.6483,
      "step": 120
    },
    {
      "epoch": 0.5281385281385281,
      "grad_norm": 3.008819476235658e-05,
      "learning_rate": 0.00019650707395609204,
      "loss": 1.6436,
      "step": 122
    },
    {
      "epoch": 0.5367965367965368,
      "grad_norm": 3.0448116376646794e-05,
      "learning_rate": 0.00019631689886935298,
      "loss": 1.4893,
      "step": 124
    },
    {
      "epoch": 0.5454545454545454,
      "grad_norm": 4.133440961595625e-05,
      "learning_rate": 0.00019612177997930987,
      "loss": 1.6093,
      "step": 126
    },
    {
      "epoch": 0.5541125541125541,
      "grad_norm": 8.941220585256815e-05,
      "learning_rate": 0.00019592172730112544,
      "loss": 1.737,
      "step": 128
    },
    {
      "epoch": 0.5627705627705628,
      "grad_norm": 4.473665831028484e-05,
      "learning_rate": 0.00019571675110320643,
      "loss": 1.6455,
      "step": 130
    },
    {
      "epoch": 0.5714285714285714,
      "grad_norm": 0.00013181190297473222,
      "learning_rate": 0.00019550686190667648,
      "loss": 1.5996,
      "step": 132
    },
    {
      "epoch": 0.5800865800865801,
      "grad_norm": 9.064975165529177e-05,
      "learning_rate": 0.0001952920704848362,
      "loss": 1.6947,
      "step": 134
    },
    {
      "epoch": 0.5887445887445888,
      "grad_norm": 3.242671664338559e-05,
      "learning_rate": 0.00019507238786261008,
      "loss": 1.6403,
      "step": 136
    },
    {
      "epoch": 0.5974025974025974,
      "grad_norm": 0.0001139045343734324,
      "learning_rate": 0.00019484782531598073,
      "loss": 1.6445,
      "step": 138
    },
    {
      "epoch": 0.6060606060606061,
      "grad_norm": 0.0001039160051732324,
      "learning_rate": 0.00019461839437141004,
      "loss": 1.7483,
      "step": 140
    },
    {
      "epoch": 0.6147186147186147,
      "grad_norm": 8.791654545348138e-05,
      "learning_rate": 0.0001943841068052474,
      "loss": 1.5357,
      "step": 142
    },
    {
      "epoch": 0.6233766233766234,
      "grad_norm": 4.35874389950186e-05,
      "learning_rate": 0.0001941449746431255,
      "loss": 1.7572,
      "step": 144
    },
    {
      "epoch": 0.6320346320346321,
      "grad_norm": 4.0783703298075125e-05,
      "learning_rate": 0.0001939010101593429,
      "loss": 1.7342,
      "step": 146
    },
    {
      "epoch": 0.6406926406926406,
      "grad_norm": 0.00014383271627593786,
      "learning_rate": 0.00019365222587623405,
      "loss": 1.6868,
      "step": 148
    },
    {
      "epoch": 0.6493506493506493,
      "grad_norm": 9.058301657205448e-05,
      "learning_rate": 0.00019339863456352657,
      "loss": 1.7322,
      "step": 150
    },
    {
      "epoch": 0.658008658008658,
      "grad_norm": 4.600944521371275e-05,
      "learning_rate": 0.0001931402492376857,
      "loss": 1.7196,
      "step": 152
    },
    {
      "epoch": 0.6666666666666666,
      "grad_norm": 0.00011446730059105903,
      "learning_rate": 0.0001928770831612463,
      "loss": 1.6174,
      "step": 154
    },
    {
      "epoch": 0.6753246753246753,
      "grad_norm": 8.582652662880719e-05,
      "learning_rate": 0.00019260914984213203,
      "loss": 1.6256,
      "step": 156
    },
    {
      "epoch": 0.683982683982684,
      "grad_norm": 4.044194065500051e-05,
      "learning_rate": 0.00019233646303296205,
      "loss": 1.6417,
      "step": 158
    },
    {
      "epoch": 0.6926406926406926,
      "grad_norm": 4.7330380766652524e-05,
      "learning_rate": 0.0001920590367303451,
      "loss": 1.6737,
      "step": 160
    },
    {
      "epoch": 0.7012987012987013,
      "grad_norm": 5.3459320042748004e-05,
      "learning_rate": 0.00019177688517416105,
      "loss": 1.7177,
      "step": 162
    },
    {
      "epoch": 0.70995670995671,
      "grad_norm": 9.026160842040554e-05,
      "learning_rate": 0.00019149002284683008,
      "loss": 1.7326,
      "step": 164
    },
    {
      "epoch": 0.7186147186147186,
      "grad_norm": 6.937263970030472e-05,
      "learning_rate": 0.0001911984644725692,
      "loss": 1.7971,
      "step": 166
    },
    {
      "epoch": 0.7272727272727273,
      "grad_norm": 8.01292626420036e-05,
      "learning_rate": 0.0001909022250166365,
      "loss": 1.6087,
      "step": 168
    },
    {
      "epoch": 0.7359307359307359,
      "grad_norm": 7.882928184699267e-05,
      "learning_rate": 0.00019060131968456312,
      "loss": 1.6477,
      "step": 170
    },
    {
      "epoch": 0.7445887445887446,
      "grad_norm": 0.00010625456343404949,
      "learning_rate": 0.00019029576392137263,
      "loss": 1.7002,
      "step": 172
    },
    {
      "epoch": 0.7532467532467533,
      "grad_norm": 0.00021590011601801962,
      "learning_rate": 0.00018998557341078835,
      "loss": 1.7764,
      "step": 174
    },
    {
      "epoch": 0.7619047619047619,
      "grad_norm": 6.014715472701937e-05,
      "learning_rate": 0.00018967076407442829,
      "loss": 1.6852,
      "step": 176
    },
    {
      "epoch": 0.7705627705627706,
      "grad_norm": 8.015201456146315e-05,
      "learning_rate": 0.00018935135207098785,
      "loss": 1.6184,
      "step": 178
    },
    {
      "epoch": 0.7792207792207793,
      "grad_norm": 6.944081542314962e-05,
      "learning_rate": 0.00018902735379541064,
      "loss": 1.6294,
      "step": 180
    },
    {
      "epoch": 0.7878787878787878,
      "grad_norm": 6.165856029838324e-05,
      "learning_rate": 0.0001886987858780467,
      "loss": 1.778,
      "step": 182
    },
    {
      "epoch": 0.7965367965367965,
      "grad_norm": 8.64676694618538e-05,
      "learning_rate": 0.000188365665183799,
      "loss": 1.5531,
      "step": 184
    },
    {
      "epoch": 0.8051948051948052,
      "grad_norm": 9.756162035046145e-05,
      "learning_rate": 0.00018802800881125784,
      "loss": 1.7175,
      "step": 186
    },
    {
      "epoch": 0.8138528138528138,
      "grad_norm": 0.00021862791618332267,
      "learning_rate": 0.00018768583409182305,
      "loss": 1.8573,
      "step": 188
    },
    {
      "epoch": 0.8225108225108225,
      "grad_norm": 0.0001369621604681015,
      "learning_rate": 0.00018733915858881462,
      "loss": 1.7662,
      "step": 190
    },
    {
      "epoch": 0.8311688311688312,
      "grad_norm": 0.00031745131127536297,
      "learning_rate": 0.00018698800009657094,
      "loss": 2.0091,
      "step": 192
    },
    {
      "epoch": 0.8398268398268398,
      "grad_norm": 0.00010971837764373049,
      "learning_rate": 0.00018663237663953567,
      "loss": 1.7376,
      "step": 194
    },
    {
      "epoch": 0.8484848484848485,
      "grad_norm": 0.00029516848735511303,
      "learning_rate": 0.0001862723064713324,
      "loss": 1.7701,
      "step": 196
    },
    {
      "epoch": 0.8571428571428571,
      "grad_norm": 0.00034297676756978035,
      "learning_rate": 0.0001859078080738279,
      "loss": 1.9447,
      "step": 198
    },
    {
      "epoch": 0.8658008658008658,
      "grad_norm": 0.0002530592610128224,
      "learning_rate": 0.00018553890015618333,
      "loss": 1.8612,
      "step": 200
    },
    {
      "epoch": 0.8744588744588745,
      "grad_norm": 0.00010490286513231695,
      "learning_rate": 0.00018516560165389388,
      "loss": 1.9922,
      "step": 202
    },
    {
      "epoch": 0.8831168831168831,
      "grad_norm": 0.00020270211098250002,
      "learning_rate": 0.00018478793172781708,
      "loss": 1.9509,
      "step": 204
    },
    {
      "epoch": 0.8917748917748918,
      "grad_norm": 9.309218876296654e-05,
      "learning_rate": 0.000184405909763189,
      "loss": 1.909,
      "step": 206
    },
    {
      "epoch": 0.9004329004329005,
      "grad_norm": 7.871213892940432e-05,
      "learning_rate": 0.00018401955536862948,
      "loss": 2.0207,
      "step": 208
    },
    {
      "epoch": 0.9090909090909091,
      "grad_norm": 8.371711010113358e-05,
      "learning_rate": 0.00018362888837513547,
      "loss": 2.0631,
      "step": 210
    },
    {
      "epoch": 0.9177489177489178,
      "grad_norm": 0.00014524892321787775,
      "learning_rate": 0.00018323392883506335,
      "loss": 2.2097,
      "step": 212
    },
    {
      "epoch": 0.9264069264069265,
      "grad_norm": 0.00010290888894814998,
      "learning_rate": 0.00018283469702109936,
      "loss": 2.1094,
      "step": 214
    },
    {
      "epoch": 0.935064935064935,
      "grad_norm": 9.359593968838453e-05,
      "learning_rate": 0.00018243121342521935,
      "loss": 2.4721,
      "step": 216
    },
    {
      "epoch": 0.9437229437229437,
      "grad_norm": 9.659567876951769e-05,
      "learning_rate": 0.0001820234987576368,
      "loss": 2.6673,
      "step": 218
    },
    {
      "epoch": 0.9523809523809523,
      "grad_norm": 0.00016237075033131987,
      "learning_rate": 0.0001816115739457397,
      "loss": 2.9459,
      "step": 220
    },
    {
      "epoch": 0.961038961038961,
      "grad_norm": 0.0001531827583676204,
      "learning_rate": 0.00018119546013301664,
      "loss": 3.2397,
      "step": 222
    },
    {
      "epoch": 0.9696969696969697,
      "grad_norm": 0.0001731283700792119,
      "learning_rate": 0.0001807751786779713,
      "loss": 3.62,
      "step": 224
    },
    {
      "epoch": 0.9783549783549783,
      "grad_norm": 8.82472813827917e-05,
      "learning_rate": 0.00018035075115302633,
      "loss": 3.9451,
      "step": 226
    },
    {
      "epoch": 0.987012987012987,
      "grad_norm": 0.00010907312389463186,
      "learning_rate": 0.0001799221993434159,
      "loss": 4.6026,
      "step": 228
    },
    {
      "epoch": 0.9956709956709957,
      "grad_norm": 0.00013694152585230768,
      "learning_rate": 0.00017948954524606763,
      "loss": 5.3581,
      "step": 230
    },
    {
      "epoch": 1.0043290043290043,
      "grad_norm": 0.00015375320799648762,
      "learning_rate": 0.00017905281106847344,
      "loss": 5.3862,
      "step": 232
    },
    {
      "epoch": 1.0129870129870129,
      "grad_norm": 0.000532141828443855,
      "learning_rate": 0.00017861201922754979,
      "loss": 6.0815,
      "step": 234
    },
    {
      "epoch": 1.0216450216450217,
      "grad_norm": NaN,
      "learning_rate": 0.00017839010874560733,
      "loss": 0.0,
      "step": 236
    },
    {
      "epoch": 1.0303030303030303,
      "grad_norm": NaN,
      "learning_rate": 0.00017839010874560733,
      "loss": 0.0,
      "step": 238
    },
    {
      "epoch": 1.0389610389610389,
      "grad_norm": NaN,
      "learning_rate": 0.00017839010874560733,
      "loss": 0.0,
      "step": 240
    },
    {
      "epoch": 1.0476190476190477,
      "grad_norm": NaN,
      "learning_rate": 0.00017839010874560733,
      "loss": 0.0,
      "step": 242
    },
    {
      "epoch": 1.0562770562770563,
      "grad_norm": NaN,
      "learning_rate": 0.00017839010874560733,
      "loss": 0.0,
      "step": 244
    },
    {
      "epoch": 1.0649350649350648,
      "grad_norm": 0.1457211822271347,
      "learning_rate": 0.0001781671923484869,
      "loss": 5.7907,
      "step": 246
    },
    {
      "epoch": 1.0735930735930737,
      "grad_norm": 0.4493274390697479,
      "learning_rate": 0.00017771835326358743,
      "loss": 7.5356,
      "step": 248
    },
    {
      "epoch": 1.0822510822510822,
      "grad_norm": 3.0114400386810303,
      "learning_rate": 0.00017726552501109478,
      "loss": 7.336,
      "step": 250
    },
    {
      "epoch": 1.0909090909090908,
      "grad_norm": 0.23199492692947388,
      "learning_rate": 0.0001768087308340103,
      "loss": 8.1303,
      "step": 252
    },
    {
      "epoch": 1.0995670995670996,
      "grad_norm": 0.23555096983909607,
      "learning_rate": 0.00017634799417890035,
      "loss": 9.2992,
      "step": 254
    },
    {
      "epoch": 1.1082251082251082,
      "grad_norm": 0.17370502650737762,
      "learning_rate": 0.0001758833386946928,
      "loss": 9.3926,
      "step": 256
    },
    {
      "epoch": 1.1168831168831168,
      "grad_norm": 0.25211918354034424,
      "learning_rate": 0.00017541478823146327,
      "loss": 9.3047,
      "step": 258
    },
    {
      "epoch": 1.1255411255411256,
      "grad_norm": 0.18103384971618652,
      "learning_rate": 0.00017494236683921084,
      "loss": 9.1649,
      "step": 260
    },
    {
      "epoch": 1.1341991341991342,
      "grad_norm": 0.05794016644358635,
      "learning_rate": 0.00017446609876662356,
      "loss": 8.9708,
      "step": 262
    },
    {
      "epoch": 1.1428571428571428,
      "grad_norm": 0.08229361474514008,
      "learning_rate": 0.000173986008459834,
      "loss": 9.7143,
      "step": 264
    },
    {
      "epoch": 1.1515151515151516,
      "grad_norm": 0.1594569981098175,
      "learning_rate": 0.00017350212056116418,
      "loss": 10.6715,
      "step": 266
    },
    {
      "epoch": 1.1601731601731602,
      "grad_norm": 0.19106224179267883,
      "learning_rate": 0.00017301445990786102,
      "loss": 11.408,
      "step": 268
    },
    {
      "epoch": 1.1688311688311688,
      "grad_norm": 0.2994789183139801,
      "learning_rate": 0.00017252305153082114,
      "loss": 12.2243,
      "step": 270
    },
    {
      "epoch": 1.1774891774891776,
      "grad_norm": 0.21744585037231445,
      "learning_rate": 0.00017202792065330646,
      "loss": 12.9086,
      "step": 272
    },
    {
      "epoch": 1.1861471861471862,
      "grad_norm": 0.16616225242614746,
      "learning_rate": 0.00017152909268964916,
      "loss": 13.5227,
      "step": 274
    },
    {
      "epoch": 1.1948051948051948,
      "grad_norm": 0.15657858550548553,
      "learning_rate": 0.00017102659324394747,
      "loss": 13.9467,
      "step": 276
    },
    {
      "epoch": 1.2034632034632033,
      "grad_norm": 0.16317234933376312,
      "learning_rate": 0.00017052044810875126,
      "loss": 14.6343,
      "step": 278
    },
    {
      "epoch": 1.2121212121212122,
      "grad_norm": 0.21492412686347961,
      "learning_rate": 0.00017001068326373827,
      "loss": 15.0887,
      "step": 280
    },
    {
      "epoch": 1.2207792207792207,
      "grad_norm": 0.1301400512456894,
      "learning_rate": 0.00016949732487438047,
      "loss": 15.6847,
      "step": 282
    },
    {
      "epoch": 1.2294372294372296,
      "grad_norm": 0.20310911536216736,
      "learning_rate": 0.00016898039929060129,
      "loss": 16.6419,
      "step": 284
    },
    {
      "epoch": 1.2380952380952381,
      "grad_norm": 0.05390779674053192,
      "learning_rate": 0.00016845993304542283,
      "loss": 15.3988,
      "step": 286
    },
    {
      "epoch": 1.2467532467532467,
      "grad_norm": 0.16172261536121368,
      "learning_rate": 0.0001679359528536041,
      "loss": 15.9208,
      "step": 288
    },
    {
      "epoch": 1.2554112554112553,
      "grad_norm": 0.13750512897968292,
      "learning_rate": 0.0001674084856102698,
      "loss": 16.2978,
      "step": 290
    },
    {
      "epoch": 1.2640692640692641,
      "grad_norm": 0.07984600216150284,
      "learning_rate": 0.00016687755838952972,
      "loss": 15.957,
      "step": 292
    },
    {
      "epoch": 1.2727272727272727,
      "grad_norm": 0.13223163783550262,
      "learning_rate": 0.00016634319844308925,
      "loss": 16.2106,
      "step": 294
    },
    {
      "epoch": 1.2813852813852815,
      "grad_norm": 0.08291322737932205,
      "learning_rate": 0.00016580543319885048,
      "loss": 16.14,
      "step": 296
    },
    {
      "epoch": 1.29004329004329,
      "grad_norm": 0.052721720188856125,
      "learning_rate": 0.00016526429025950424,
      "loss": 15.9756,
      "step": 298
    },
    {
      "epoch": 1.2987012987012987,
      "grad_norm": 0.08016930520534515,
      "learning_rate": 0.00016471979740111366,
      "loss": 16.2873,
      "step": 300
    },
    {
      "epoch": 1.3073593073593073,
      "grad_norm": 0.09582065045833588,
      "learning_rate": 0.00016417198257168803,
      "loss": 16.4316,
      "step": 302
    },
    {
      "epoch": 1.316017316017316,
      "grad_norm": 0.07704820483922958,
      "learning_rate": 0.00016362087388974863,
      "loss": 16.7252,
      "step": 304
    },
    {
      "epoch": 1.3246753246753247,
      "grad_norm": 0.06254423409700394,
      "learning_rate": 0.00016306649964288516,
      "loss": 16.1704,
      "step": 306
    },
    {
      "epoch": 1.3333333333333333,
      "grad_norm": 0.07229200750589371,
      "learning_rate": 0.000162508888286304,
      "loss": 16.0079,
      "step": 308
    },
    {
      "epoch": 1.341991341991342,
      "grad_norm": 0.052998676896095276,
      "learning_rate": 0.00016194806844136754,
      "loss": 16.0897,
      "step": 310
    },
    {
      "epoch": 1.3506493506493507,
      "grad_norm": 0.10584064573049545,
      "learning_rate": 0.00016138406889412512,
      "loss": 15.995,
      "step": 312
    },
    {
      "epoch": 1.3593073593073592,
      "grad_norm": 0.11045046895742416,
      "learning_rate": 0.00016081691859383545,
      "loss": 16.0464,
      "step": 314
    },
    {
      "epoch": 1.3679653679653678,
      "grad_norm": 0.06835480034351349,
      "learning_rate": 0.00016024664665148077,
      "loss": 15.842,
      "step": 316
    },
    {
      "epoch": 1.3766233766233766,
      "grad_norm": 0.05876036360859871,
      "learning_rate": 0.00015967328233827249,
      "loss": 15.5758,
      "step": 318
    },
    {
      "epoch": 1.3852813852813852,
      "grad_norm": 0.05429469048976898,
      "learning_rate": 0.00015909685508414884,
      "loss": 15.6389,
      "step": 320
    },
    {
      "epoch": 1.393939393939394,
      "grad_norm": 0.08564180880784988,
      "learning_rate": 0.00015851739447626434,
      "loss": 15.5351,
      "step": 322
    },
    {
      "epoch": 1.4025974025974026,
      "grad_norm": 0.08055119216442108,
      "learning_rate": 0.00015793493025747092,
      "loss": 15.4569,
      "step": 324
    },
    {
      "epoch": 1.4112554112554112,
      "grad_norm": 0.08411835134029388,
      "learning_rate": 0.00015734949232479152,
      "loss": 15.5636,
      "step": 326
    },
    {
      "epoch": 1.4199134199134198,
      "grad_norm": 0.15642018616199493,
      "learning_rate": 0.00015676111072788527,
      "loss": 15.7577,
      "step": 328
    },
    {
      "epoch": 1.4285714285714286,
      "grad_norm": 0.17614306509494781,
      "learning_rate": 0.00015616981566750538,
      "loss": 15.3891,
      "step": 330
    },
    {
      "epoch": 1.4372294372294372,
      "grad_norm": 0.10892828553915024,
      "learning_rate": 0.00015557563749394858,
      "loss": 15.0548,
      "step": 332
    },
    {
      "epoch": 1.445887445887446,
      "grad_norm": 0.09866315126419067,
      "learning_rate": 0.00015497860670549772,
      "loss": 14.8649,
      "step": 334
    },
    {
      "epoch": 1.4545454545454546,
      "grad_norm": 0.1259247213602066,
      "learning_rate": 0.00015437875394685606,
      "loss": 14.9433,
      "step": 336
    },
    {
      "epoch": 1.4632034632034632,
      "grad_norm": 0.1163773462176323,
      "learning_rate": 0.0001537761100075744,
      "loss": 15.0705,
      "step": 338
    },
    {
      "epoch": 1.4718614718614718,
      "grad_norm": 0.10914402455091476,
      "learning_rate": 0.00015317070582047065,
      "loss": 15.2118,
      "step": 340
    },
    {
      "epoch": 1.4805194805194806,
      "grad_norm": 0.15627121925354004,
      "learning_rate": 0.00015256257246004217,
      "loss": 15.6857,
      "step": 342
    },
    {
      "epoch": 1.4891774891774892,
      "grad_norm": 0.12600019574165344,
      "learning_rate": 0.00015195174114087078,
      "loss": 15.2044,
      "step": 344
    },
    {
      "epoch": 1.497835497835498,
      "grad_norm": 0.1444402039051056,
      "learning_rate": 0.00015133824321602045,
      "loss": 15.1016,
      "step": 346
    },
    {
      "epoch": 1.5064935064935066,
      "grad_norm": 0.07496843487024307,
      "learning_rate": 0.00015072211017542813,
      "loss": 14.8805,
      "step": 348
    },
    {
      "epoch": 1.5151515151515151,
      "grad_norm": 0.14399290084838867,
      "learning_rate": 0.0001501033736442872,
      "loss": 14.8005,
      "step": 350
    },
    {
      "epoch": 1.5238095238095237,
      "grad_norm": 0.10512608289718628,
      "learning_rate": 0.00014948206538142457,
      "loss": 14.4348,
      "step": 352
    },
    {
      "epoch": 1.5324675324675323,
      "grad_norm": 0.05682501569390297,
      "learning_rate": 0.00014885821727767006,
      "loss": 14.274,
      "step": 354
    },
    {
      "epoch": 1.5411255411255411,
      "grad_norm": 0.09160462766885757,
      "learning_rate": 0.00014823186135421994,
      "loss": 14.1816,
      "step": 356
    },
    {
      "epoch": 1.54978354978355,
      "grad_norm": 0.03531830757856369,
      "learning_rate": 0.00014760302976099304,
      "loss": 14.0882,
      "step": 358
    },
    {
      "epoch": 1.5584415584415585,
      "grad_norm": 0.039006441831588745,
      "learning_rate": 0.00014697175477498074,
      "loss": 14.1806,
      "step": 360
    },
    {
      "epoch": 1.567099567099567,
      "grad_norm": 0.07611044496297836,
      "learning_rate": 0.00014633806879859,
      "loss": 14.2338,
      "step": 362
    },
    {
      "epoch": 1.5757575757575757,
      "grad_norm": 0.05836552381515503,
      "learning_rate": 0.00014570200435798044,
      "loss": 14.2683,
      "step": 364
    },
    {
      "epoch": 1.5844155844155843,
      "grad_norm": 0.01268097199499607,
      "learning_rate": 0.0001450635941013947,
      "loss": 14.2289,
      "step": 366
    },
    {
      "epoch": 1.593073593073593,
      "grad_norm": 0.07083582133054733,
      "learning_rate": 0.00014442287079748263,
      "loss": 14.3293,
      "step": 368
    },
    {
      "epoch": 1.601731601731602,
      "grad_norm": 0.07286886125802994,
      "learning_rate": 0.0001437798673336194,
      "loss": 14.4253,
      "step": 370
    },
    {
      "epoch": 1.6103896103896105,
      "grad_norm": 0.014986686408519745,
      "learning_rate": 0.00014313461671421735,
      "loss": 14.2353,
      "step": 372
    },
    {
      "epoch": 1.619047619047619,
      "grad_norm": 0.06591885536909103,
      "learning_rate": 0.00014248715205903204,
      "loss": 14.1536,
      "step": 374
    },
    {
      "epoch": 1.6277056277056277,
      "grad_norm": 0.07969338446855545,
      "learning_rate": 0.0001418375066014622,
      "loss": 14.6147,
      "step": 376
    },
    {
      "epoch": 1.6363636363636362,
      "grad_norm": 0.06318385899066925,
      "learning_rate": 0.00014118571368684383,
      "loss": 14.345,
      "step": 378
    },
    {
      "epoch": 1.645021645021645,
      "grad_norm": 0.05836130306124687,
      "learning_rate": 0.00014053180677073876,
      "loss": 14.2053,
      "step": 380
    },
    {
      "epoch": 1.6536796536796536,
      "grad_norm": 0.06260073184967041,
      "learning_rate": 0.0001398758194172174,
      "loss": 14.1105,
      "step": 382
    },
    {
      "epoch": 1.6623376623376624,
      "grad_norm": 0.060454513877630234,
      "learning_rate": 0.00013921778529713582,
      "loss": 14.0394,
      "step": 384
    },
    {
      "epoch": 1.670995670995671,
      "grad_norm": 0.07355903834104538,
      "learning_rate": 0.00013855773818640773,
      "loss": 14.025,
      "step": 386
    },
    {
      "epoch": 1.6796536796536796,
      "grad_norm": 0.066501684486866,
      "learning_rate": 0.00013789571196427055,
      "loss": 14.6727,
      "step": 388
    },
    {
      "epoch": 1.6883116883116882,
      "grad_norm": 0.07008553296327591,
      "learning_rate": 0.0001372317406115465,
      "loss": 14.0043,
      "step": 390
    },
    {
      "epoch": 1.696969696969697,
      "grad_norm": 0.010356806218624115,
      "learning_rate": 0.00013656585820889867,
      "loss": 14.0629,
      "step": 392
    },
    {
      "epoch": 1.7056277056277056,
      "grad_norm": 0.06225749850273132,
      "learning_rate": 0.00013589809893508128,
      "loss": 13.9668,
      "step": 394
    },
    {
      "epoch": 1.7142857142857144,
      "grad_norm": 0.07188686728477478,
      "learning_rate": 0.00013522849706518566,
      "loss": 13.8812,
      "step": 396
    },
    {
      "epoch": 1.722943722943723,
      "grad_norm": 0.06185540184378624,
      "learning_rate": 0.00013455708696888085,
      "loss": 14.1862,
      "step": 398
    },
    {
      "epoch": 1.7316017316017316,
      "grad_norm": 0.0615684911608696,
      "learning_rate": 0.00013388390310864945,
      "loss": 14.4132,
      "step": 400
    },
    {
      "epoch": 1.7402597402597402,
      "grad_norm": 0.07077145576477051,
      "learning_rate": 0.00013320898003801879,
      "loss": 14.4692,
      "step": 402
    },
    {
      "epoch": 1.7489177489177488,
      "grad_norm": 0.05472411587834358,
      "learning_rate": 0.00013253235239978715,
      "loss": 13.9146,
      "step": 404
    },
    {
      "epoch": 1.7575757575757576,
      "grad_norm": 0.06190333142876625,
      "learning_rate": 0.00013185405492424588,
      "loss": 14.0298,
      "step": 406
    },
    {
      "epoch": 1.7662337662337664,
      "grad_norm": 0.06982716172933578,
      "learning_rate": 0.00013117412242739655,
      "loss": 14.0501,
      "step": 408
    },
    {
      "epoch": 1.774891774891775,
      "grad_norm": 0.05709117650985718,
      "learning_rate": 0.00013049258980916387,
      "loss": 14.0339,
      "step": 410
    },
    {
      "epoch": 1.7835497835497836,
      "grad_norm": 0.1394786387681961,
      "learning_rate": 0.00012980949205160448,
      "loss": 14.2314,
      "step": 412
    },
    {
      "epoch": 1.7922077922077921,
      "grad_norm": 0.09034628421068192,
      "learning_rate": 0.00012912486421711128,
      "loss": 14.1102,
      "step": 414
    },
    {
      "epoch": 1.8008658008658007,
      "grad_norm": 0.06254860013723373,
      "learning_rate": 0.00012843874144661372,
      "loss": 14.0066,
      "step": 416
    },
    {
      "epoch": 1.8095238095238095,
      "grad_norm": 0.09414557367563248,
      "learning_rate": 0.00012775115895777417,
      "loss": 14.0197,
      "step": 418
    },
    {
      "epoch": 1.8181818181818183,
      "grad_norm": 0.021693430840969086,
      "learning_rate": 0.00012706215204318007,
      "loss": 14.9639,
      "step": 420
    },
    {
      "epoch": 1.826839826839827,
      "grad_norm": 0.058936670422554016,
      "learning_rate": 0.00012637175606853264,
      "loss": 13.7909,
      "step": 422
    },
    {
      "epoch": 1.8354978354978355,
      "grad_norm": 0.04964934289455414,
      "learning_rate": 0.0001256800064708313,
      "loss": 13.8304,
      "step": 424
    },
    {
      "epoch": 1.844155844155844,
      "grad_norm": 0.04889138787984848,
      "learning_rate": 0.00012498693875655516,
      "loss": 13.9045,
      "step": 426
    },
    {
      "epoch": 1.8528138528138527,
      "grad_norm": 0.015483302064239979,
      "learning_rate": 0.00012429258849984014,
      "loss": 13.9397,
      "step": 428
    },
    {
      "epoch": 1.8614718614718615,
      "grad_norm": 0.009186900220811367,
      "learning_rate": 0.00012359699134065314,
      "loss": 13.7447,
      "step": 430
    },
    {
      "epoch": 1.87012987012987,
      "grad_norm": 0.05520382896065712,
      "learning_rate": 0.00012290018298296285,
      "loss": 13.8049,
      "step": 432
    },
    {
      "epoch": 1.878787878787879,
      "grad_norm": 0.05643709748983383,
      "learning_rate": 0.00012220219919290687,
      "loss": 14.3662,
      "step": 434
    },
    {
      "epoch": 1.8874458874458875,
      "grad_norm": 0.05961998179554939,
      "learning_rate": 0.00012150307579695601,
      "loss": 13.9983,
      "step": 436
    },
    {
      "epoch": 1.896103896103896,
      "grad_norm": 0.006795608904212713,
      "learning_rate": 0.00012080284868007541,
      "loss": 13.9398,
      "step": 438
    },
    {
      "epoch": 1.9047619047619047,
      "grad_norm": 0.007627868093550205,
      "learning_rate": 0.00012010155378388253,
      "loss": 13.9321,
      "step": 440
    },
    {
      "epoch": 1.9134199134199135,
      "grad_norm": 0.027596835047006607,
      "learning_rate": 0.00011939922710480229,
      "loss": 14.0445,
      "step": 442
    },
    {
      "epoch": 1.922077922077922,
      "grad_norm": 0.0442458800971508,
      "learning_rate": 0.00011869590469221965,
      "loss": 13.999,
      "step": 444
    },
    {
      "epoch": 1.9307359307359309,
      "grad_norm": 0.0031651495955884457,
      "learning_rate": 0.0001179916226466289,
      "loss": 14.3455,
      "step": 446
    },
    {
      "epoch": 1.9393939393939394,
      "grad_norm": 0.0655621886253357,
      "learning_rate": 0.00011728641711778103,
      "loss": 14.0114,
      "step": 448
    },
    {
      "epoch": 1.948051948051948,
      "grad_norm": 0.05245671793818474,
      "learning_rate": 0.000116580324302828,
      "loss": 14.1412,
      "step": 450
    },
    {
      "epoch": 1.9567099567099566,
      "grad_norm": 0.017913660034537315,
      "learning_rate": 0.00011587338044446476,
      "loss": 14.0516,
      "step": 452
    },
    {
      "epoch": 1.9653679653679652,
      "grad_norm": 0.005986363161355257,
      "learning_rate": 0.00011516562182906922,
      "loss": 14.0402,
      "step": 454
    },
    {
      "epoch": 1.974025974025974,
      "grad_norm": 0.06156434491276741,
      "learning_rate": 0.0001144570847848394,
      "loss": 13.9566,
      "step": 456
    },
    {
      "epoch": 1.9826839826839828,
      "grad_norm": 0.009826838970184326,
      "learning_rate": 0.000113747805679929,
      "loss": 13.9418,
      "step": 458
    },
    {
      "epoch": 1.9913419913419914,
      "grad_norm": 0.009458528831601143,
      "learning_rate": 0.00011303782092058061,
      "loss": 13.8423,
      "step": 460
    },
    {
      "epoch": 2.0,
      "grad_norm": 0.22517594695091248,
      "learning_rate": 0.00011232716694925693,
      "loss": 13.9573,
      "step": 462
    },
    {
      "epoch": 2.0086580086580086,
      "grad_norm": 0.003495636396110058,
      "learning_rate": 0.00011161588024277036,
      "loss": 14.1671,
      "step": 464
    },
    {
      "epoch": 2.017316017316017,
      "grad_norm": 0.013556540943682194,
      "learning_rate": 0.00011090399731041072,
      "loss": 14.503,
      "step": 466
    },
    {
      "epoch": 2.0259740259740258,
      "grad_norm": 0.05908797308802605,
      "learning_rate": 0.0001101915546920711,
      "loss": 14.613,
      "step": 468
    },
    {
      "epoch": 2.034632034632035,
      "grad_norm": 0.002222011098638177,
      "learning_rate": 0.00010947858895637255,
      "loss": 14.9046,
      "step": 470
    },
    {
      "epoch": 2.0432900432900434,
      "grad_norm": 0.005948877427726984,
      "learning_rate": 0.00010876513669878683,
      "loss": 15.0299,
      "step": 472
    },
    {
      "epoch": 2.051948051948052,
      "grad_norm": 0.0032331624533981085,
      "learning_rate": 0.0001080512345397583,
      "loss": 15.1622,
      "step": 474
    },
    {
      "epoch": 2.0606060606060606,
      "grad_norm": 0.04379121959209442,
      "learning_rate": 0.00010733691912282396,
      "loss": 15.2752,
      "step": 476
    },
    {
      "epoch": 2.069264069264069,
      "grad_norm": 0.003626056481152773,
      "learning_rate": 0.00010662222711273279,
      "loss": 15.4877,
      "step": 478
    },
    {
      "epoch": 2.0779220779220777,
      "grad_norm": 0.0034651593305170536,
      "learning_rate": 0.00010590719519356373,
      "loss": 15.7603,
      "step": 480
    },
    {
      "epoch": 2.0865800865800868,
      "grad_norm": 0.007096354383975267,
      "learning_rate": 0.00010519186006684277,
      "loss": 15.4634,
      "step": 482
    },
    {
      "epoch": 2.0952380952380953,
      "grad_norm": 0.005978535860776901,
      "learning_rate": 0.000104476258449659,
      "loss": 15.6005,
      "step": 484
    },
    {
      "epoch": 2.103896103896104,
      "grad_norm": 0.0036010430194437504,
      "learning_rate": 0.0001037604270727802,
      "loss": 15.6865,
      "step": 486
    },
    {
      "epoch": 2.1125541125541125,
      "grad_norm": 0.005017112474888563,
      "learning_rate": 0.00010304440267876727,
      "loss": 15.8028,
      "step": 488
    },
    {
      "epoch": 2.121212121212121,
      "grad_norm": 0.002776139648631215,
      "learning_rate": 0.00010232822202008844,
      "loss": 15.662,
      "step": 490
    },
    {
      "epoch": 2.1298701298701297,
      "grad_norm": 0.0025852976832538843,
      "learning_rate": 0.0001016119218572328,
      "loss": 16.5597,
      "step": 492
    },
    {
      "epoch": 2.1385281385281387,
      "grad_norm": 0.0041742087341845036,
      "learning_rate": 0.0001008955389568233,
      "loss": 15.6964,
      "step": 494
    },
    {
      "epoch": 2.1471861471861473,
      "grad_norm": 0.003886697581037879,
      "learning_rate": 0.00010017911008972982,
      "loss": 15.6254,
      "step": 496
    },
    {
      "epoch": 2.155844155844156,
      "grad_norm": 0.06273438781499863,
      "learning_rate": 9.946267202918157e-05,
      "loss": 15.7853,
      "step": 498
    },
    {
      "epoch": 2.1645021645021645,
      "grad_norm": 0.0052159507758915424,
      "learning_rate": 9.87462615488797e-05,
      "loss": 15.7882,
      "step": 500
    },
    {
      "epoch": 2.173160173160173,
      "grad_norm": 0.003929893020540476,
      "learning_rate": 9.802991542110958e-05,
      "loss": 16.0684,
      "step": 502
    },
    {
      "epoch": 2.1818181818181817,
      "grad_norm": 0.004246190190315247,
      "learning_rate": 9.731367041485359e-05,
      "loss": 15.7668,
      "step": 504
    },
    {
      "epoch": 2.1904761904761907,
      "grad_norm": 0.0031269013416022062,
      "learning_rate": 9.659756329390367e-05,
      "loss": 15.7715,
      "step": 506
    },
    {
      "epoch": 2.1991341991341993,
      "grad_norm": 0.001676430692896247,
      "learning_rate": 9.588163081497427e-05,
      "loss": 15.6645,
      "step": 508
    },
    {
      "epoch": 2.207792207792208,
      "grad_norm": 0.003888419596478343,
      "learning_rate": 9.516590972581578e-05,
      "loss": 15.6698,
      "step": 510
    },
    {
      "epoch": 2.2164502164502164,
      "grad_norm": 0.0032913736067712307,
      "learning_rate": 9.445043676332819e-05,
      "loss": 15.7315,
      "step": 512
    },
    {
      "epoch": 2.225108225108225,
      "grad_norm": 0.0013064603554084897,
      "learning_rate": 9.373524865167555e-05,
      "loss": 15.6309,
      "step": 514
    },
    {
      "epoch": 2.2337662337662336,
      "grad_norm": 0.0033070454373955727,
      "learning_rate": 9.302038210040099e-05,
      "loss": 15.8009,
      "step": 516
    },
    {
      "epoch": 2.242424242424242,
      "grad_norm": 0.002767252502962947,
      "learning_rate": 9.230587380254237e-05,
      "loss": 15.7214,
      "step": 518
    },
    {
      "epoch": 2.2510822510822512,
      "grad_norm": 0.0022966829128563404,
      "learning_rate": 9.159176043274895e-05,
      "loss": 15.6413,
      "step": 520
    },
    {
      "epoch": 2.25974025974026,
      "grad_norm": 0.002309858100488782,
      "learning_rate": 9.087807864539897e-05,
      "loss": 15.7846,
      "step": 522
    },
    {
      "epoch": 2.2683982683982684,
      "grad_norm": 0.0039366851560771465,
      "learning_rate": 9.016486507271803e-05,
      "loss": 15.8136,
      "step": 524
    },
    {
      "epoch": 2.277056277056277,
      "grad_norm": 0.004376592580229044,
      "learning_rate": 8.945215632289912e-05,
      "loss": 15.7964,
      "step": 526
    },
    {
      "epoch": 2.2857142857142856,
      "grad_norm": 0.002474917098879814,
      "learning_rate": 8.873998897822336e-05,
      "loss": 15.9371,
      "step": 528
    },
    {
      "epoch": 2.2943722943722946,
      "grad_norm": 0.0036089515779167414,
      "learning_rate": 8.802839959318239e-05,
      "loss": 15.7513,
      "step": 530
    },
    {
      "epoch": 2.303030303030303,
      "grad_norm": 0.0045991260558366776,
      "learning_rate": 8.731742469260201e-05,
      "loss": 15.7613,
      "step": 532
    },
    {
      "epoch": 2.311688311688312,
      "grad_norm": 0.0072856624610722065,
      "learning_rate": 8.66071007697674e-05,
      "loss": 16.4219,
      "step": 534
    },
    {
      "epoch": 2.3203463203463204,
      "grad_norm": 0.0017560477135702968,
      "learning_rate": 8.58974642845501e-05,
      "loss": 16.0309,
      "step": 536
    },
    {
      "epoch": 2.329004329004329,
      "grad_norm": 0.004301704466342926,
      "learning_rate": 8.518855166153644e-05,
      "loss": 27.1619,
      "step": 538
    },
    {
      "epoch": 2.3376623376623376,
      "grad_norm": 0.059041913598775864,
      "learning_rate": 8.448039928815804e-05,
      "loss": 16.0355,
      "step": 540
    },
    {
      "epoch": 2.346320346320346,
      "grad_norm": 0.002861848333850503,
      "learning_rate": 8.377304351282399e-05,
      "loss": 15.8526,
      "step": 542
    },
    {
      "epoch": 2.354978354978355,
      "grad_norm": 0.003043128876015544,
      "learning_rate": 8.306652064305517e-05,
      "loss": 15.7983,
      "step": 544
    },
    {
      "epoch": 2.3636363636363638,
      "grad_norm": 0.0002984661259688437,
      "learning_rate": 8.23608669436207e-05,
      "loss": 15.8203,
      "step": 546
    },
    {
      "epoch": 2.3722943722943723,
      "grad_norm": 0.0006706724525429308,
      "learning_rate": 8.165611863467644e-05,
      "loss": 15.7032,
      "step": 548
    },
    {
      "epoch": 2.380952380952381,
      "grad_norm": 0.0023594857193529606,
      "learning_rate": 8.095231188990597e-05,
      "loss": 15.8033,
      "step": 550
    },
    {
      "epoch": 2.3896103896103895,
      "grad_norm": 0.0,
      "learning_rate": 8.024948283466367e-05,
      "loss": 15.722,
      "step": 552
    },
    {
      "epoch": 2.398268398268398,
      "grad_norm": 0.0003026532067451626,
      "learning_rate": 7.954766754412066e-05,
      "loss": 15.7503,
      "step": 554
    },
    {
      "epoch": 2.4069264069264067,
      "grad_norm": 0.0020434176549315453,
      "learning_rate": 7.884690204141298e-05,
      "loss": 15.8143,
      "step": 556
    },
    {
      "epoch": 2.4155844155844157,
      "grad_norm": 0.03126376122236252,
      "learning_rate": 7.814722229579264e-05,
      "loss": 16.6621,
      "step": 558
    },
    {
      "epoch": 2.4242424242424243,
      "grad_norm": 0.0025495837908238173,
      "learning_rate": 7.744866422078133e-05,
      "loss": 15.757,
      "step": 560
    },
    {
      "epoch": 2.432900432900433,
      "grad_norm": 0.0023321521002799273,
      "learning_rate": 7.67512636723271e-05,
      "loss": 15.7909,
      "step": 562
    },
    {
      "epoch": 2.4415584415584415,
      "grad_norm": 0.0014553000219166279,
      "learning_rate": 7.605505644696387e-05,
      "loss": 15.7724,
      "step": 564
    },
    {
      "epoch": 2.45021645021645,
      "grad_norm": 0.0026701318565756083,
      "learning_rate": 7.536007827997397e-05,
      "loss": 15.7491,
      "step": 566
    },
    {
      "epoch": 2.458874458874459,
      "grad_norm": 0.0020363188814371824,
      "learning_rate": 7.46663648435541e-05,
      "loss": 15.7404,
      "step": 568
    },
    {
      "epoch": 2.4675324675324677,
      "grad_norm": 0.005381477996706963,
      "learning_rate": 7.397395174498417e-05,
      "loss": 15.7682,
      "step": 570
    },
    {
      "epoch": 2.4761904761904763,
      "grad_norm": 0.0010892607970163226,
      "learning_rate": 7.328287452479968e-05,
      "loss": 15.7591,
      "step": 572
    },
    {
      "epoch": 2.484848484848485,
      "grad_norm": 0.0046081384643912315,
      "learning_rate": 7.259316865496757e-05,
      "loss": 16.2354,
      "step": 574
    },
    {
      "epoch": 2.4935064935064934,
      "grad_norm": 0.0023402958177030087,
      "learning_rate": 7.19048695370652e-05,
      "loss": 15.8337,
      "step": 576
    },
    {
      "epoch": 2.502164502164502,
      "grad_norm": 0.0027834863867610693,
      "learning_rate": 7.121801250046363e-05,
      "loss": 15.9034,
      "step": 578
    },
    {
      "epoch": 2.5108225108225106,
      "grad_norm": 0.004065465647727251,
      "learning_rate": 7.053263280051394e-05,
      "loss": 15.717,
      "step": 580
    },
    {
      "epoch": 2.5194805194805197,
      "grad_norm": 0.0,
      "learning_rate": 6.984876561673776e-05,
      "loss": 15.7805,
      "step": 582
    },
    {
      "epoch": 2.5281385281385282,
      "grad_norm": 0.002813364379107952,
      "learning_rate": 6.91664460510215e-05,
      "loss": 15.7205,
      "step": 584
    },
    {
      "epoch": 2.536796536796537,
      "grad_norm": 0.0021334670018404722,
      "learning_rate": 6.848570912581463e-05,
      "loss": 15.8153,
      "step": 586
    },
    {
      "epoch": 2.5454545454545454,
      "grad_norm": 0.003558355150744319,
      "learning_rate": 6.780658978233199e-05,
      "loss": 15.7313,
      "step": 588
    },
    {
      "epoch": 2.554112554112554,
      "grad_norm": 0.001838353113271296,
      "learning_rate": 6.71291228787604e-05,
      "loss": 15.6268,
      "step": 590
    },
    {
      "epoch": 2.562770562770563,
      "grad_norm": 0.003100910922512412,
      "learning_rate": 6.64533431884694e-05,
      "loss": 15.7013,
      "step": 592
    },
    {
      "epoch": 2.571428571428571,
      "grad_norm": 0.0012829442275688052,
      "learning_rate": 6.57792853982264e-05,
      "loss": 15.832,
      "step": 594
    },
    {
      "epoch": 2.58008658008658,
      "grad_norm": 0.001535665593110025,
      "learning_rate": 6.51069841064162e-05,
      "loss": 15.7034,
      "step": 596
    },
    {
      "epoch": 2.588744588744589,
      "grad_norm": 0.010908816941082478,
      "learning_rate": 6.443647382126509e-05,
      "loss": 15.8574,
      "step": 598
    },
    {
      "epoch": 2.5974025974025974,
      "grad_norm": 0.0028520359192043543,
      "learning_rate": 6.376778895906976e-05,
      "loss": 15.8502,
      "step": 600
    },
    {
      "epoch": 2.606060606060606,
      "grad_norm": 0.011398903094232082,
      "learning_rate": 6.310096384243061e-05,
      "loss": 15.9701,
      "step": 602
    },
    {
      "epoch": 2.6147186147186146,
      "grad_norm": 0.0021338535007089376,
      "learning_rate": 6.243603269849003e-05,
      "loss": 15.7824,
      "step": 604
    },
    {
      "epoch": 2.6233766233766236,
      "grad_norm": 0.004288196098059416,
      "learning_rate": 6.177302965717566e-05,
      "loss": 15.7025,
      "step": 606
    },
    {
      "epoch": 2.632034632034632,
      "grad_norm": 0.0022099835332483053,
      "learning_rate": 6.111198874944845e-05,
      "loss": 15.8339,
      "step": 608
    },
    {
      "epoch": 2.6406926406926408,
      "grad_norm": 0.0,
      "learning_rate": 6.045294390555598e-05,
      "loss": 15.6495,
      "step": 610
    },
    {
      "epoch": 2.6493506493506493,
      "grad_norm": 0.06401393562555313,
      "learning_rate": 5.979592895329085e-05,
      "loss": 15.9235,
      "step": 612
    },
    {
      "epoch": 2.658008658008658,
      "grad_norm": 0.0,
      "learning_rate": 5.914097761625428e-05,
      "loss": 15.7719,
      "step": 614
    },
    {
      "epoch": 2.6666666666666665,
      "grad_norm": 0.0014950314071029425,
      "learning_rate": 5.848812351212522e-05,
      "loss": 15.8033,
      "step": 616
    },
    {
      "epoch": 2.675324675324675,
      "grad_norm": 0.003303313162177801,
      "learning_rate": 5.783740015093484e-05,
      "loss": 15.8201,
      "step": 618
    },
    {
      "epoch": 2.683982683982684,
      "grad_norm": 0.0021313403267413378,
      "learning_rate": 5.718884093334627e-05,
      "loss": 15.6961,
      "step": 620
    },
    {
      "epoch": 2.6926406926406927,
      "grad_norm": 0.004075351171195507,
      "learning_rate": 5.654247914894058e-05,
      "loss": 15.8514,
      "step": 622
    },
    {
      "epoch": 2.7012987012987013,
      "grad_norm": 0.0010667102178558707,
      "learning_rate": 5.589834797450764e-05,
      "loss": 15.9098,
      "step": 624
    },
    {
      "epoch": 2.70995670995671,
      "grad_norm": 0.0020862577948719263,
      "learning_rate": 5.525648047234364e-05,
      "loss": 15.8231,
      "step": 626
    },
    {
      "epoch": 2.7186147186147185,
      "grad_norm": 0.0027077968697994947,
      "learning_rate": 5.4616909588553674e-05,
      "loss": 15.7527,
      "step": 628
    },
    {
      "epoch": 2.7272727272727275,
      "grad_norm": 0.0037543477956205606,
      "learning_rate": 5.3979668151360905e-05,
      "loss": 15.8286,
      "step": 630
    },
    {
      "epoch": 2.7359307359307357,
      "grad_norm": 0.0034395060501992702,
      "learning_rate": 5.33447888694214e-05,
      "loss": 15.8517,
      "step": 632
    },
    {
      "epoch": 2.7445887445887447,
      "grad_norm": 0.0016386689385399222,
      "learning_rate": 5.271230433014542e-05,
      "loss": 15.6296,
      "step": 634
    },
    {
      "epoch": 2.7532467532467533,
      "grad_norm": 0.002882522065192461,
      "learning_rate": 5.2082246998024485e-05,
      "loss": 15.9626,
      "step": 636
    },
    {
      "epoch": 2.761904761904762,
      "grad_norm": 0.0016715782694518566,
      "learning_rate": 5.145464921296537e-05,
      "loss": 16.1011,
      "step": 638
    },
    {
      "epoch": 2.7705627705627704,
      "grad_norm": 0.003780187340453267,
      "learning_rate": 5.082954318862978e-05,
      "loss": 15.7561,
      "step": 640
    },
    {
      "epoch": 2.779220779220779,
      "grad_norm": 0.0015209164703264832,
      "learning_rate": 5.0206961010781085e-05,
      "loss": 15.7211,
      "step": 642
    },
    {
      "epoch": 2.787878787878788,
      "grad_norm": 0.003561074612662196,
      "learning_rate": 4.958693463563748e-05,
      "loss": 15.9192,
      "step": 644
    },
    {
      "epoch": 2.7965367965367967,
      "grad_norm": 0.0025618516374379396,
      "learning_rate": 4.8969495888231484e-05,
      "loss": 15.959,
      "step": 646
    },
    {
      "epoch": 2.8051948051948052,
      "grad_norm": 0.0,
      "learning_rate": 4.835467646077656e-05,
      "loss": 15.6299,
      "step": 648
    },
    {
      "epoch": 2.813852813852814,
      "grad_norm": 0.0032659226562827826,
      "learning_rate": 4.7742507911040325e-05,
      "loss": 15.8226,
      "step": 650
    },
    {
      "epoch": 2.8225108225108224,
      "grad_norm": 0.0029562402050942183,
      "learning_rate": 4.713302166072492e-05,
      "loss": 16.5945,
      "step": 652
    },
    {
      "epoch": 2.8311688311688314,
      "grad_norm": 0.003705563023686409,
      "learning_rate": 4.652624899385387e-05,
      "loss": 15.8928,
      "step": 654
    },
    {
      "epoch": 2.8398268398268396,
      "grad_norm": 0.002291543409228325,
      "learning_rate": 4.5922221055166656e-05,
      "loss": 15.6579,
      "step": 656
    },
    {
      "epoch": 2.8484848484848486,
      "grad_norm": 0.00401789927855134,
      "learning_rate": 4.532096884851978e-05,
      "loss": 15.9641,
      "step": 658
    },
    {
      "epoch": 2.857142857142857,
      "grad_norm": 0.003257931210100651,
      "learning_rate": 4.4722523235295745e-05,
      "loss": 15.6643,
      "step": 660
    },
    {
      "epoch": 2.865800865800866,
      "grad_norm": 0.0,
      "learning_rate": 4.41269149328185e-05,
      "loss": 15.7199,
      "step": 662
    },
    {
      "epoch": 2.8744588744588744,
      "grad_norm": 0.0021371468901634216,
      "learning_rate": 4.3534174512777324e-05,
      "loss": 15.8272,
      "step": 664
    },
    {
      "epoch": 2.883116883116883,
      "grad_norm": 0.0009620334021747112,
      "learning_rate": 4.2944332399657184e-05,
      "loss": 16.1842,
      "step": 666
    },
    {
      "epoch": 2.891774891774892,
      "grad_norm": 0.0020075475331395864,
      "learning_rate": 4.2357418869177354e-05,
      "loss": 15.6193,
      "step": 668
    },
    {
      "epoch": 2.9004329004329006,
      "grad_norm": 0.0,
      "learning_rate": 4.1773464046737276e-05,
      "loss": 15.6615,
      "step": 670
    },
    {
      "epoch": 2.909090909090909,
      "grad_norm": 0.000680964847560972,
      "learning_rate": 4.1192497905870276e-05,
      "loss": 15.7517,
      "step": 672
    },
    {
      "epoch": 2.9177489177489178,
      "grad_norm": 0.004122753627598286,
      "learning_rate": 4.061455026670509e-05,
      "loss": 16.6327,
      "step": 674
    },
    {
      "epoch": 2.9264069264069263,
      "grad_norm": 0.002312893746420741,
      "learning_rate": 4.0039650794435344e-05,
      "loss": 15.8981,
      "step": 676
    },
    {
      "epoch": 2.935064935064935,
      "grad_norm": 0.024043424054980278,
      "learning_rate": 3.946782899779667e-05,
      "loss": 15.8999,
      "step": 678
    },
    {
      "epoch": 2.9437229437229435,
      "grad_norm": 0.0012768743326887488,
      "learning_rate": 3.889911422755231e-05,
      "loss": 15.6874,
      "step": 680
    },
    {
      "epoch": 2.9523809523809526,
      "grad_norm": 0.0023466376587748528,
      "learning_rate": 3.8333535674986275e-05,
      "loss": 15.7274,
      "step": 682
    },
    {
      "epoch": 2.961038961038961,
      "grad_norm": 0.0034092010464519262,
      "learning_rate": 3.777112237040537e-05,
      "loss": 15.7464,
      "step": 684
    },
    {
      "epoch": 2.9696969696969697,
      "grad_norm": 0.0017824557144194841,
      "learning_rate": 3.721190318164877e-05,
      "loss": 15.7556,
      "step": 686
    },
    {
      "epoch": 2.9783549783549783,
      "grad_norm": 0.0014683044282719493,
      "learning_rate": 3.665590681260658e-05,
      "loss": 16.0187,
      "step": 688
    },
    {
      "epoch": 2.987012987012987,
      "grad_norm": 0.00010432133422000334,
      "learning_rate": 3.610316180174622e-05,
      "loss": 16.4009,
      "step": 690
    },
    {
      "epoch": 2.995670995670996,
      "grad_norm": 0.003945178352296352,
      "learning_rate": 3.555369652064787e-05,
      "loss": 15.7738,
      "step": 692
    },
    {
      "epoch": 3.0043290043290045,
      "grad_norm": 0.0048090131022036076,
      "learning_rate": 3.500753917254787e-05,
      "loss": 15.9384,
      "step": 694
    },
    {
      "epoch": 3.012987012987013,
      "grad_norm": 0.002933148993179202,
      "learning_rate": 3.446471779089144e-05,
      "loss": 15.8317,
      "step": 696
    },
    {
      "epoch": 3.0216450216450217,
      "grad_norm": 0.003420398337766528,
      "learning_rate": 3.392526023789349e-05,
      "loss": 15.536,
      "step": 698
    },
    {
      "epoch": 3.0303030303030303,
      "grad_norm": 0.0001099475848604925,
      "learning_rate": 3.338919420310871e-05,
      "loss": 15.5597,
      "step": 700
    },
    {
      "epoch": 3.038961038961039,
      "grad_norm": 0.00021568694501183927,
      "learning_rate": 3.28565472020101e-05,
      "loss": 15.603,
      "step": 702
    },
    {
      "epoch": 3.0476190476190474,
      "grad_norm": 0.003983495756983757,
      "learning_rate": 3.2327346574576753e-05,
      "loss": 15.5045,
      "step": 704
    },
    {
      "epoch": 3.0562770562770565,
      "grad_norm": 0.00216678692959249,
      "learning_rate": 3.180161948389062e-05,
      "loss": 15.5247,
      "step": 706
    },
    {
      "epoch": 3.064935064935065,
      "grad_norm": 0.031781021505594254,
      "learning_rate": 3.1279392914742046e-05,
      "loss": 15.4569,
      "step": 708
    },
    {
      "epoch": 3.0735930735930737,
      "grad_norm": 0.00032434993772767484,
      "learning_rate": 3.076069367224486e-05,
      "loss": 16.0896,
      "step": 710
    },
    {
      "epoch": 3.0822510822510822,
      "grad_norm": 0.0037430988159030676,
      "learning_rate": 3.0245548380460486e-05,
      "loss": 15.6038,
      "step": 712
    },
    {
      "epoch": 3.090909090909091,
      "grad_norm": 0.0002255355502711609,
      "learning_rate": 2.9733983481031302e-05,
      "loss": 15.3307,
      "step": 714
    },
    {
      "epoch": 3.0995670995670994,
      "grad_norm": 0.0028247262816876173,
      "learning_rate": 2.922602523182344e-05,
      "loss": 15.4397,
      "step": 716
    },
    {
      "epoch": 3.108225108225108,
      "grad_norm": 0.018052997067570686,
      "learning_rate": 2.872169970557913e-05,
      "loss": 15.599,
      "step": 718
    },
    {
      "epoch": 3.116883116883117,
      "grad_norm": 0.0025733679067343473,
      "learning_rate": 2.8221032788578205e-05,
      "loss": 15.4745,
      "step": 720
    },
    {
      "epoch": 3.1255411255411256,
      "grad_norm": 0.0,
      "learning_rate": 2.7724050179309646e-05,
      "loss": 15.9485,
      "step": 722
    },
    {
      "epoch": 3.134199134199134,
      "grad_norm": 0.0,
      "learning_rate": 2.7230777387152296e-05,
      "loss": 15.4227,
      "step": 724
    },
    {
      "epoch": 3.142857142857143,
      "grad_norm": 0.000987619161605835,
      "learning_rate": 2.6741239731065647e-05,
      "loss": 15.5741,
      "step": 726
    },
    {
      "epoch": 3.1515151515151514,
      "grad_norm": 0.0019787976052612066,
      "learning_rate": 2.625546233829016e-05,
      "loss": 16.6376,
      "step": 728
    },
    {
"epoch": 3.16017316017316, |
|
"grad_norm": 0.0037663017865270376, |
|
"learning_rate": 2.5773470143057655e-05, |
|
"loss": 15.5158, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 3.168831168831169, |
|
"grad_norm": 0.0, |
|
"learning_rate": 2.529528788531128e-05, |
|
"loss": 15.4871, |
|
"step": 732 |
|
}, |
|
{ |
|
"epoch": 3.1774891774891776, |
|
"grad_norm": 0.0023285530041903257, |
|
"learning_rate": 2.4820940109435885e-05, |
|
"loss": 15.4632, |
|
"step": 734 |
|
}, |
|
{ |
|
"epoch": 3.186147186147186, |
|
"grad_norm": 0.0020200940780341625, |
|
"learning_rate": 2.4350451162997877e-05, |
|
"loss": 15.5132, |
|
"step": 736 |
|
}, |
|
{ |
|
"epoch": 3.1948051948051948, |
|
"grad_norm": 0.003936352673918009, |
|
"learning_rate": 2.3883845195495878e-05, |
|
"loss": 15.5417, |
|
"step": 738 |
|
}, |
|
{ |
|
"epoch": 3.2034632034632033, |
|
"grad_norm": 0.0037546472158282995, |
|
"learning_rate": 2.342114615712081e-05, |
|
"loss": 15.9803, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 3.212121212121212, |
|
"grad_norm": 0.0005508167669177055, |
|
"learning_rate": 2.296237779752687e-05, |
|
"loss": 15.2634, |
|
"step": 742 |
|
}, |
|
{ |
|
"epoch": 3.220779220779221, |
|
"grad_norm": 0.0036611163523048162, |
|
"learning_rate": 2.2507563664612252e-05, |
|
"loss": 15.5582, |
|
"step": 744 |
|
}, |
|
{ |
|
"epoch": 3.2294372294372296, |
|
"grad_norm": 0.003752505173906684, |
|
"learning_rate": 2.205672710331059e-05, |
|
"loss": 15.5934, |
|
"step": 746 |
|
}, |
|
{ |
|
"epoch": 3.238095238095238, |
|
"grad_norm": 0.0008896394865587354, |
|
"learning_rate": 2.1609891254392678e-05, |
|
"loss": 15.4942, |
|
"step": 748 |
|
}, |
|
{ |
|
"epoch": 3.2467532467532467, |
|
"grad_norm": 0.0, |
|
"learning_rate": 2.1167079053278737e-05, |
|
"loss": 15.3308, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 3.2554112554112553, |
|
"grad_norm": 0.00010732792725320905, |
|
"learning_rate": 2.072831322886105e-05, |
|
"loss": 15.3753, |
|
"step": 752 |
|
}, |
|
{ |
|
"epoch": 3.264069264069264, |
|
"grad_norm": 0.003391178324818611, |
|
"learning_rate": 2.029361630233747e-05, |
|
"loss": 15.4389, |
|
"step": 754 |
|
}, |
|
{ |
|
"epoch": 3.2727272727272725, |
|
"grad_norm": 0.0003303180856164545, |
|
"learning_rate": 1.986301058605531e-05, |
|
"loss": 15.385, |
|
"step": 756 |
|
}, |
|
{ |
|
"epoch": 3.2813852813852815, |
|
"grad_norm": 0.0005520053091458976, |
|
"learning_rate": 1.9436518182366158e-05, |
|
"loss": 15.4051, |
|
"step": 758 |
|
}, |
|
{ |
|
"epoch": 3.29004329004329, |
|
"grad_norm": 0.003017255337908864, |
|
"learning_rate": 1.901416098249136e-05, |
|
"loss": 15.647, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 3.2987012987012987, |
|
"grad_norm": 0.021177958697080612, |
|
"learning_rate": 1.8595960665398458e-05, |
|
"loss": 15.6138, |
|
"step": 762 |
|
}, |
|
{ |
|
"epoch": 3.3073593073593073, |
|
"grad_norm": 0.0034116676542907953, |
|
"learning_rate": 1.8181938696688296e-05, |
|
"loss": 15.4569, |
|
"step": 764 |
|
}, |
|
{ |
|
"epoch": 3.316017316017316, |
|
"grad_norm": 0.00011423804244259372, |
|
"learning_rate": 1.7772116327493372e-05, |
|
"loss": 15.4194, |
|
"step": 766 |
|
}, |
|
{ |
|
"epoch": 3.324675324675325, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.736651459338695e-05, |
|
"loss": 15.4586, |
|
"step": 768 |
|
}, |
|
{ |
|
"epoch": 3.3333333333333335, |
|
"grad_norm": 0.001321481540799141, |
|
"learning_rate": 1.6965154313303368e-05, |
|
"loss": 15.613, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 3.341991341991342, |
|
"grad_norm": 0.0015613286523148417, |
|
"learning_rate": 1.6568056088469387e-05, |
|
"loss": 15.4395, |
|
"step": 772 |
|
}, |
|
{ |
|
"epoch": 3.3506493506493507, |
|
"grad_norm": 0.00010790528176585212, |
|
"learning_rate": 1.6175240301346906e-05, |
|
"loss": 15.4752, |
|
"step": 774 |
|
}, |
|
{ |
|
"epoch": 3.3593073593073592, |
|
"grad_norm": 0.003342802170664072, |
|
"learning_rate": 1.5786727114586586e-05, |
|
"loss": 15.4756, |
|
"step": 776 |
|
}, |
|
{ |
|
"epoch": 3.367965367965368, |
|
"grad_norm": 0.001664191484451294, |
|
"learning_rate": 1.540253646999299e-05, |
|
"loss": 15.4152, |
|
"step": 778 |
|
}, |
|
{ |
|
"epoch": 3.3766233766233764, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.5022688087501092e-05, |
|
"loss": 15.3462, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 3.3852813852813854, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.4647201464163906e-05, |
|
"loss": 15.4894, |
|
"step": 782 |
|
}, |
|
{ |
|
"epoch": 3.393939393939394, |
|
"grad_norm": 0.002181015908718109, |
|
"learning_rate": 1.4276095873151952e-05, |
|
"loss": 15.4537, |
|
"step": 784 |
|
}, |
|
{ |
|
"epoch": 3.4025974025974026, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.3909390362763752e-05, |
|
"loss": 15.4039, |
|
"step": 786 |
|
}, |
|
{ |
|
"epoch": 3.411255411255411, |
|
"grad_norm": 0.001633924781344831, |
|
"learning_rate": 1.3547103755448287e-05, |
|
"loss": 15.3927, |
|
"step": 788 |
|
}, |
|
{ |
|
"epoch": 3.41991341991342, |
|
"grad_norm": 0.0019662026315927505, |
|
"learning_rate": 1.3189254646838767e-05, |
|
"loss": 15.573, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 3.4285714285714284, |
|
"grad_norm": 0.0017789127305150032, |
|
"learning_rate": 1.2835861404798265e-05, |
|
"loss": 15.4386, |
|
"step": 792 |
|
}, |
|
{ |
|
"epoch": 3.4372294372294374, |
|
"grad_norm": 0.0037333201617002487, |
|
"learning_rate": 1.2486942168476756e-05, |
|
"loss": 15.4277, |
|
"step": 794 |
|
}, |
|
{ |
|
"epoch": 3.445887445887446, |
|
"grad_norm": 0.00033077617990784347, |
|
"learning_rate": 1.2142514847380237e-05, |
|
"loss": 15.43, |
|
"step": 796 |
|
}, |
|
{ |
|
"epoch": 3.4545454545454546, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.1802597120451286e-05, |
|
"loss": 15.5442, |
|
"step": 798 |
|
}, |
|
{ |
|
"epoch": 3.463203463203463, |
|
"grad_norm": 0.0014221479650586843, |
|
"learning_rate": 1.146720643516177e-05, |
|
"loss": 15.4446, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 3.4718614718614718, |
|
"grad_norm": 0.0004546408890746534, |
|
"learning_rate": 1.1136360006617185e-05, |
|
"loss": 16.2801, |
|
"step": 802 |
|
}, |
|
{ |
|
"epoch": 3.4805194805194803, |
|
"grad_norm": 0.0038407333195209503, |
|
"learning_rate": 1.0810074816673154e-05, |
|
"loss": 26.5808, |
|
"step": 804 |
|
}, |
|
{ |
|
"epoch": 3.4891774891774894, |
|
"grad_norm": 0.000989666790701449, |
|
"learning_rate": 1.048836761306361e-05, |
|
"loss": 15.379, |
|
"step": 806 |
|
}, |
|
{ |
|
"epoch": 3.497835497835498, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.0171254908541372e-05, |
|
"loss": 15.4584, |
|
"step": 808 |
|
}, |
|
{ |
|
"epoch": 3.5064935064935066, |
|
"grad_norm": 0.0, |
|
"learning_rate": 9.858752980030295e-06, |
|
"loss": 15.609, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 3.515151515151515, |
|
"grad_norm": 0.0013323862804099917, |
|
"learning_rate": 9.550877867790065e-06, |
|
"loss": 15.5316, |
|
"step": 812 |
|
}, |
|
{ |
|
"epoch": 3.5238095238095237, |
|
"grad_norm": 0.00010888870747294277, |
|
"learning_rate": 9.247645374592717e-06, |
|
"loss": 15.4492, |
|
"step": 814 |
|
}, |
|
{ |
|
"epoch": 3.5324675324675323, |
|
"grad_norm": 0.00010042625217465684, |
|
"learning_rate": 8.949071064911585e-06, |
|
"loss": 15.4255, |
|
"step": 816 |
|
}, |
|
{ |
|
"epoch": 3.541125541125541, |
|
"grad_norm": 0.0003212654555682093, |
|
"learning_rate": 8.655170264122303e-06, |
|
"loss": 15.9795, |
|
"step": 818 |
|
}, |
|
{ |
|
"epoch": 3.54978354978355, |
|
"grad_norm": 0.003979097120463848, |
|
"learning_rate": 8.365958057716338e-06, |
|
"loss": 15.5248, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 3.5584415584415585, |
|
"grad_norm": 0.0019642910920083523, |
|
"learning_rate": 8.081449290526432e-06, |
|
"loss": 15.4587, |
|
"step": 822 |
|
}, |
|
{ |
|
"epoch": 3.567099567099567, |
|
"grad_norm": 0.008268770761787891, |
|
"learning_rate": 7.80165856596492e-06, |
|
"loss": 15.7592, |
|
"step": 824 |
|
}, |
|
{ |
|
"epoch": 3.5757575757575757, |
|
"grad_norm": 0.0025506443344056606, |
|
"learning_rate": 7.526600245273918e-06, |
|
"loss": 15.8564, |
|
"step": 826 |
|
}, |
|
{ |
|
"epoch": 3.5844155844155843, |
|
"grad_norm": 0.0012161307968199253, |
|
"learning_rate": 7.256288446788362e-06, |
|
"loss": 15.3659, |
|
"step": 828 |
|
}, |
|
{ |
|
"epoch": 3.5930735930735933, |
|
"grad_norm": 0.004051461815834045, |
|
"learning_rate": 6.9907370452112046e-06, |
|
"loss": 15.9221, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 3.601731601731602, |
|
"grad_norm": 0.003108682343736291, |
|
"learning_rate": 6.729959670901309e-06, |
|
"loss": 15.5334, |
|
"step": 832 |
|
}, |
|
{ |
|
"epoch": 3.6103896103896105, |
|
"grad_norm": 0.0033409043680876493, |
|
"learning_rate": 6.4739697091738e-06, |
|
"loss": 15.426, |
|
"step": 834 |
|
}, |
|
{ |
|
"epoch": 3.619047619047619, |
|
"grad_norm": 0.0016489011468365788, |
|
"learning_rate": 6.222780299613074e-06, |
|
"loss": 15.3453, |
|
"step": 836 |
|
}, |
|
{ |
|
"epoch": 3.6277056277056277, |
|
"grad_norm": 0.00321365287527442, |
|
"learning_rate": 5.976404335398256e-06, |
|
"loss": 15.607, |
|
"step": 838 |
|
}, |
|
{ |
|
"epoch": 3.6363636363636362, |
|
"grad_norm": 0.00022275045921560377, |
|
"learning_rate": 5.734854462641548e-06, |
|
"loss": 15.4681, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 3.645021645021645, |
|
"grad_norm": 0.0018924333853647113, |
|
"learning_rate": 5.498143079738971e-06, |
|
"loss": 15.4828, |
|
"step": 842 |
|
}, |
|
{ |
|
"epoch": 3.653679653679654, |
|
"grad_norm": 0.0013136648340150714, |
|
"learning_rate": 5.2662823367340855e-06, |
|
"loss": 15.2643, |
|
"step": 844 |
|
}, |
|
{ |
|
"epoch": 3.6623376623376624, |
|
"grad_norm": 0.00011183915194123983, |
|
"learning_rate": 5.039284134694333e-06, |
|
"loss": 15.4385, |
|
"step": 846 |
|
}, |
|
{ |
|
"epoch": 3.670995670995671, |
|
"grad_norm": 0.0032093904446810484, |
|
"learning_rate": 4.817160125100106e-06, |
|
"loss": 15.4845, |
|
"step": 848 |
|
}, |
|
{ |
|
"epoch": 3.6796536796536796, |
|
"grad_norm": 0.001872088760137558, |
|
"learning_rate": 4.599921709246812e-06, |
|
"loss": 15.3745, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 3.688311688311688, |
|
"grad_norm": 0.002430541208013892, |
|
"learning_rate": 4.3875800376595e-06, |
|
"loss": 15.3891, |
|
"step": 852 |
|
}, |
|
{ |
|
"epoch": 3.6969696969696972, |
|
"grad_norm": 0.003143192734569311, |
|
"learning_rate": 4.180146009520702e-06, |
|
"loss": 15.6386, |
|
"step": 854 |
|
}, |
|
{ |
|
"epoch": 3.7056277056277054, |
|
"grad_norm": 0.003909118473529816, |
|
"learning_rate": 3.977630272110811e-06, |
|
"loss": 15.8103, |
|
"step": 856 |
|
}, |
|
{ |
|
"epoch": 3.7142857142857144, |
|
"grad_norm": 0.00011017678480129689, |
|
"learning_rate": 3.780043220261764e-06, |
|
"loss": 15.4262, |
|
"step": 858 |
|
}, |
|
{ |
|
"epoch": 3.722943722943723, |
|
"grad_norm": 0.002079217229038477, |
|
"learning_rate": 3.587394995823301e-06, |
|
"loss": 15.5448, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 3.7316017316017316, |
|
"grad_norm": 0.00011921657278435305, |
|
"learning_rate": 3.3996954871425845e-06, |
|
"loss": 15.3458, |
|
"step": 862 |
|
}, |
|
{ |
|
"epoch": 3.74025974025974, |
|
"grad_norm": 0.0003392777871340513, |
|
"learning_rate": 3.216954328556443e-06, |
|
"loss": 15.4115, |
|
"step": 864 |
|
}, |
|
{ |
|
"epoch": 3.7489177489177488, |
|
"grad_norm": 0.0011154324747622013, |
|
"learning_rate": 3.039180899897043e-06, |
|
"loss": 15.5333, |
|
"step": 866 |
|
}, |
|
{ |
|
"epoch": 3.757575757575758, |
|
"grad_norm": 0.0, |
|
"learning_rate": 2.8663843260103074e-06, |
|
"loss": 15.4002, |
|
"step": 868 |
|
}, |
|
{ |
|
"epoch": 3.7662337662337664, |
|
"grad_norm": 0.00021322118118405342, |
|
"learning_rate": 2.698573476287658e-06, |
|
"loss": 15.634, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 3.774891774891775, |
|
"grad_norm": 0.0, |
|
"learning_rate": 2.535756964210634e-06, |
|
"loss": 15.2512, |
|
"step": 872 |
|
}, |
|
{ |
|
"epoch": 3.7835497835497836, |
|
"grad_norm": 0.0, |
|
"learning_rate": 2.37794314690889e-06, |
|
"loss": 15.5646, |
|
"step": 874 |
|
}, |
|
{ |
|
"epoch": 3.792207792207792, |
|
"grad_norm": 0.0, |
|
"learning_rate": 2.225140124731151e-06, |
|
"loss": 15.436, |
|
"step": 876 |
|
}, |
|
{ |
|
"epoch": 3.8008658008658007, |
|
"grad_norm": 0.0018814082723110914, |
|
"learning_rate": 2.0773557408295343e-06, |
|
"loss": 15.5011, |
|
"step": 878 |
|
}, |
|
{ |
|
"epoch": 3.8095238095238093, |
|
"grad_norm": 0.0019670824985951185, |
|
"learning_rate": 1.9345975807568474e-06, |
|
"loss": 15.3244, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 3.8181818181818183, |
|
"grad_norm": 0.0035723568871617317, |
|
"learning_rate": 1.7968729720773459e-06, |
|
"loss": 15.6439, |
|
"step": 882 |
|
}, |
|
{ |
|
"epoch": 3.826839826839827, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.6641889839905445e-06, |
|
"loss": 15.3774, |
|
"step": 884 |
|
}, |
|
{ |
|
"epoch": 3.8354978354978355, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.536552426968396e-06, |
|
"loss": 15.3533, |
|
"step": 886 |
|
}, |
|
{ |
|
"epoch": 3.844155844155844, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.4139698524057165e-06, |
|
"loss": 16.2874, |
|
"step": 888 |
|
}, |
|
{ |
|
"epoch": 3.8528138528138527, |
|
"grad_norm": 0.00011134289525216445, |
|
"learning_rate": 1.2964475522839304e-06, |
|
"loss": 15.4719, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 3.8614718614718617, |
|
"grad_norm": 0.00011475420615170151, |
|
"learning_rate": 1.1839915588480743e-06, |
|
"loss": 15.2561, |
|
"step": 892 |
|
}, |
|
{ |
|
"epoch": 3.87012987012987, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.0766076442971895e-06, |
|
"loss": 15.4674, |
|
"step": 894 |
|
}, |
|
{ |
|
"epoch": 3.878787878787879, |
|
"grad_norm": 0.0022935173474252224, |
|
"learning_rate": 9.74301320488058e-07, |
|
"loss": 15.6186, |
|
"step": 896 |
|
}, |
|
{ |
|
"epoch": 3.8874458874458875, |
|
"grad_norm": 0.0, |
|
"learning_rate": 8.770778386522627e-07, |
|
"loss": 15.3455, |
|
"step": 898 |
|
}, |
|
{ |
|
"epoch": 3.896103896103896, |
|
"grad_norm": 0.0, |
|
"learning_rate": 7.849421891266584e-07, |
|
"loss": 15.4871, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 3.9047619047619047, |
|
"grad_norm": 0.0, |
|
"learning_rate": 6.978991010972547e-07, |
|
"loss": 16.203, |
|
"step": 902 |
|
}, |
|
{ |
|
"epoch": 3.9134199134199132, |
|
"grad_norm": 0.0, |
|
"learning_rate": 6.159530423563986e-07, |
|
"loss": 15.4409, |
|
"step": 904 |
|
}, |
|
{ |
|
"epoch": 3.9220779220779223, |
|
"grad_norm": 0.004353491589426994, |
|
"learning_rate": 5.391082190735252e-07, |
|
"loss": 15.4641, |
|
"step": 906 |
|
}, |
|
{ |
|
"epoch": 3.930735930735931, |
|
"grad_norm": 0.0022043841890990734, |
|
"learning_rate": 4.6736857557925227e-07, |
|
"loss": 15.527, |
|
"step": 908 |
|
}, |
|
{ |
|
"epoch": 3.9393939393939394, |
|
"grad_norm": 0.0, |
|
"learning_rate": 4.007377941628754e-07, |
|
"loss": 15.3882, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 3.948051948051948, |
|
"grad_norm": 0.0020064269192516804, |
|
"learning_rate": 3.392192948833861e-07, |
|
"loss": 15.5124, |
|
"step": 912 |
|
}, |
|
{ |
|
"epoch": 3.9567099567099566, |
|
"grad_norm": 0.00176339247263968, |
|
"learning_rate": 2.828162353939678e-07, |
|
"loss": 15.3776, |
|
"step": 914 |
|
}, |
|
{ |
|
"epoch": 3.965367965367965, |
|
"grad_norm": 0.0016750121721997857, |
|
"learning_rate": 2.315315107798366e-07, |
|
"loss": 15.4065, |
|
"step": 916 |
|
}, |
|
{ |
|
"epoch": 3.974025974025974, |
|
"grad_norm": 0.0027372916229069233, |
|
"learning_rate": 1.8536775340970425e-07, |
|
"loss": 15.5642, |
|
"step": 918 |
|
}, |
|
{ |
|
"epoch": 3.982683982683983, |
|
"grad_norm": 0.0, |
|
"learning_rate": 1.4432733280065335e-07, |
|
"loss": 15.3594, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 3.9913419913419914, |
|
"grad_norm": 0.0021978975273668766, |
|
"learning_rate": 1.0841235549648999e-07, |
|
"loss": 15.4857, |
|
"step": 922 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 0.25208330154418945, |
|
"learning_rate": 7.762466495964127e-08, |
|
"loss": 15.4448, |
|
"step": 924 |
|
} |
|
], |
|
"logging_steps": 2, |
|
"max_steps": 924, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 4, |
|
"save_steps": 200, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.3868198638796145e+18, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|