|
{ |
|
"best_global_step": null, |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.0, |
|
"eval_steps": 0, |
|
"global_step": 406, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0024630541871921183, |
|
"grad_norm": 11.540680885314941, |
|
"learning_rate": 1e-05, |
|
"loss": 0.9851, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0049261083743842365, |
|
"grad_norm": 4.776151657104492, |
|
"learning_rate": 9.999850312505222e-06, |
|
"loss": 0.9615, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.007389162561576354, |
|
"grad_norm": 3.0982847213745117, |
|
"learning_rate": 9.999401258983426e-06, |
|
"loss": 0.9231, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.009852216748768473, |
|
"grad_norm": 2.3222382068634033, |
|
"learning_rate": 9.998652866321688e-06, |
|
"loss": 0.7308, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.012315270935960592, |
|
"grad_norm": 1.997889757156372, |
|
"learning_rate": 9.997605179330018e-06, |
|
"loss": 0.7855, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.014778325123152709, |
|
"grad_norm": 1.7656501531600952, |
|
"learning_rate": 9.996258260738676e-06, |
|
"loss": 0.8678, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.017241379310344827, |
|
"grad_norm": 1.3362767696380615, |
|
"learning_rate": 9.994612191194407e-06, |
|
"loss": 0.7193, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.019704433497536946, |
|
"grad_norm": 2.2548017501831055, |
|
"learning_rate": 9.99266706925562e-06, |
|
"loss": 0.6996, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.022167487684729065, |
|
"grad_norm": 1.2499150037765503, |
|
"learning_rate": 9.990423011386489e-06, |
|
"loss": 0.7632, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.024630541871921183, |
|
"grad_norm": 1.1288288831710815, |
|
"learning_rate": 9.987880151949976e-06, |
|
"loss": 0.7409, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.027093596059113302, |
|
"grad_norm": 1.0057581663131714, |
|
"learning_rate": 9.98503864319978e-06, |
|
"loss": 0.7303, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.029556650246305417, |
|
"grad_norm": 0.968294084072113, |
|
"learning_rate": 9.981898655271237e-06, |
|
"loss": 0.6851, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.03201970443349754, |
|
"grad_norm": 1.1969448328018188, |
|
"learning_rate": 9.978460376171113e-06, |
|
"loss": 0.7647, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.034482758620689655, |
|
"grad_norm": 0.983593225479126, |
|
"learning_rate": 9.974724011766364e-06, |
|
"loss": 0.7308, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.03694581280788178, |
|
"grad_norm": 1.9946510791778564, |
|
"learning_rate": 9.970689785771798e-06, |
|
"loss": 0.6388, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.03940886699507389, |
|
"grad_norm": 1.070317029953003, |
|
"learning_rate": 9.966357939736692e-06, |
|
"loss": 0.6595, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.04187192118226601, |
|
"grad_norm": 1.241529941558838, |
|
"learning_rate": 9.961728733030318e-06, |
|
"loss": 0.8895, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.04433497536945813, |
|
"grad_norm": 1.0109727382659912, |
|
"learning_rate": 9.956802442826417e-06, |
|
"loss": 0.7378, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.046798029556650245, |
|
"grad_norm": 1.079637050628662, |
|
"learning_rate": 9.951579364086603e-06, |
|
"loss": 0.7178, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.04926108374384237, |
|
"grad_norm": 1.2305184602737427, |
|
"learning_rate": 9.946059809542706e-06, |
|
"loss": 0.5985, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.05172413793103448, |
|
"grad_norm": 11.067344665527344, |
|
"learning_rate": 9.940244109678043e-06, |
|
"loss": 0.681, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.054187192118226604, |
|
"grad_norm": 1.4613230228424072, |
|
"learning_rate": 9.934132612707631e-06, |
|
"loss": 0.6258, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.05665024630541872, |
|
"grad_norm": 1.041548728942871, |
|
"learning_rate": 9.927725684557339e-06, |
|
"loss": 0.5983, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.059113300492610835, |
|
"grad_norm": 1.0944949388504028, |
|
"learning_rate": 9.921023708841975e-06, |
|
"loss": 0.6303, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.06157635467980296, |
|
"grad_norm": 0.964139461517334, |
|
"learning_rate": 9.914027086842323e-06, |
|
"loss": 0.6394, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.06403940886699508, |
|
"grad_norm": 1.1896731853485107, |
|
"learning_rate": 9.90673623748111e-06, |
|
"loss": 0.6009, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.0665024630541872, |
|
"grad_norm": 1.1914570331573486, |
|
"learning_rate": 9.899151597297923e-06, |
|
"loss": 0.6132, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.06896551724137931, |
|
"grad_norm": 0.910982608795166, |
|
"learning_rate": 9.891273620423083e-06, |
|
"loss": 0.6051, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.07142857142857142, |
|
"grad_norm": 0.9912497401237488, |
|
"learning_rate": 9.883102778550434e-06, |
|
"loss": 0.5512, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.07389162561576355, |
|
"grad_norm": 1.612174153327942, |
|
"learning_rate": 9.874639560909118e-06, |
|
"loss": 0.5836, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.07635467980295567, |
|
"grad_norm": 0.967313289642334, |
|
"learning_rate": 9.865884474234275e-06, |
|
"loss": 0.6504, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.07881773399014778, |
|
"grad_norm": 4.314541339874268, |
|
"learning_rate": 9.856838042736698e-06, |
|
"loss": 0.654, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.0812807881773399, |
|
"grad_norm": 0.9903213977813721, |
|
"learning_rate": 9.847500808071458e-06, |
|
"loss": 0.5655, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.08374384236453201, |
|
"grad_norm": 0.9885880947113037, |
|
"learning_rate": 9.837873329305458e-06, |
|
"loss": 0.6559, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.08620689655172414, |
|
"grad_norm": 0.9196619987487793, |
|
"learning_rate": 9.82795618288397e-06, |
|
"loss": 0.5714, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.08866995073891626, |
|
"grad_norm": 0.9677406549453735, |
|
"learning_rate": 9.817749962596115e-06, |
|
"loss": 0.5687, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.09113300492610837, |
|
"grad_norm": 1.034245491027832, |
|
"learning_rate": 9.807255279539313e-06, |
|
"loss": 0.6626, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.09359605911330049, |
|
"grad_norm": 1.936618685722351, |
|
"learning_rate": 9.796472762082687e-06, |
|
"loss": 0.6928, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.0960591133004926, |
|
"grad_norm": 0.9693952202796936, |
|
"learning_rate": 9.78540305582945e-06, |
|
"loss": 0.5772, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.09852216748768473, |
|
"grad_norm": 1.231882929801941, |
|
"learning_rate": 9.77404682357824e-06, |
|
"loss": 0.6339, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.10098522167487685, |
|
"grad_norm": 0.9423834681510925, |
|
"learning_rate": 9.762404745283439e-06, |
|
"loss": 0.5313, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.10344827586206896, |
|
"grad_norm": 1.3421170711517334, |
|
"learning_rate": 9.75047751801446e-06, |
|
"loss": 0.6631, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.10591133004926108, |
|
"grad_norm": 1.0659502744674683, |
|
"learning_rate": 9.738265855914014e-06, |
|
"loss": 0.6041, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.10837438423645321, |
|
"grad_norm": 14.726941108703613, |
|
"learning_rate": 9.725770490155338e-06, |
|
"loss": 0.4993, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.11083743842364532, |
|
"grad_norm": 1.113370418548584, |
|
"learning_rate": 9.712992168898436e-06, |
|
"loss": 0.6032, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.11330049261083744, |
|
"grad_norm": 4.380392074584961, |
|
"learning_rate": 9.699931657245264e-06, |
|
"loss": 0.5649, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.11576354679802955, |
|
"grad_norm": 1.0713157653808594, |
|
"learning_rate": 9.686589737193929e-06, |
|
"loss": 0.5833, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.11822660098522167, |
|
"grad_norm": 0.79201340675354, |
|
"learning_rate": 9.67296720759187e-06, |
|
"loss": 0.6057, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.1206896551724138, |
|
"grad_norm": 0.8670496940612793, |
|
"learning_rate": 9.659064884088017e-06, |
|
"loss": 0.5203, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.12315270935960591, |
|
"grad_norm": 0.9468395709991455, |
|
"learning_rate": 9.644883599083959e-06, |
|
"loss": 0.6177, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.12561576354679804, |
|
"grad_norm": 1.1626007556915283, |
|
"learning_rate": 9.630424201684105e-06, |
|
"loss": 0.5259, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.12807881773399016, |
|
"grad_norm": 0.8885717391967773, |
|
"learning_rate": 9.615687557644848e-06, |
|
"loss": 0.5835, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.13054187192118227, |
|
"grad_norm": 0.9217110872268677, |
|
"learning_rate": 9.600674549322716e-06, |
|
"loss": 0.5965, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.1330049261083744, |
|
"grad_norm": 0.9233891367912292, |
|
"learning_rate": 9.585386075621553e-06, |
|
"loss": 0.5679, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.1354679802955665, |
|
"grad_norm": 1.285866141319275, |
|
"learning_rate": 9.569823051938689e-06, |
|
"loss": 0.5387, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.13793103448275862, |
|
"grad_norm": 0.8282471895217896, |
|
"learning_rate": 9.553986410110135e-06, |
|
"loss": 0.476, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.14039408866995073, |
|
"grad_norm": 0.9375513792037964, |
|
"learning_rate": 9.537877098354787e-06, |
|
"loss": 0.5456, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.14285714285714285, |
|
"grad_norm": 1.073931097984314, |
|
"learning_rate": 9.521496081217652e-06, |
|
"loss": 0.4895, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.14532019704433496, |
|
"grad_norm": 0.9736307263374329, |
|
"learning_rate": 9.504844339512096e-06, |
|
"loss": 0.6053, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.1477832512315271, |
|
"grad_norm": 1.0761698484420776, |
|
"learning_rate": 9.487922870261123e-06, |
|
"loss": 0.4654, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.15024630541871922, |
|
"grad_norm": 0.9952367544174194, |
|
"learning_rate": 9.470732686637665e-06, |
|
"loss": 0.5585, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.15270935960591134, |
|
"grad_norm": 0.8771824836730957, |
|
"learning_rate": 9.453274817903932e-06, |
|
"loss": 0.5827, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.15517241379310345, |
|
"grad_norm": 1.1052207946777344, |
|
"learning_rate": 9.435550309349776e-06, |
|
"loss": 0.5705, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.15763546798029557, |
|
"grad_norm": 0.8488280177116394, |
|
"learning_rate": 9.417560222230115e-06, |
|
"loss": 0.5058, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.16009852216748768, |
|
"grad_norm": 0.9430592656135559, |
|
"learning_rate": 9.399305633701372e-06, |
|
"loss": 0.506, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.1625615763546798, |
|
"grad_norm": 1.6385177373886108, |
|
"learning_rate": 9.380787636757002e-06, |
|
"loss": 0.6934, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.16502463054187191, |
|
"grad_norm": 0.9341111183166504, |
|
"learning_rate": 9.36200734016203e-06, |
|
"loss": 0.6388, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.16748768472906403, |
|
"grad_norm": 0.7698712944984436, |
|
"learning_rate": 9.342965868386674e-06, |
|
"loss": 0.4639, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.16995073891625614, |
|
"grad_norm": 0.8888192176818848, |
|
"learning_rate": 9.32366436153902e-06, |
|
"loss": 0.5409, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.1724137931034483, |
|
"grad_norm": 1.2890245914459229, |
|
"learning_rate": 9.30410397529675e-06, |
|
"loss": 0.6643, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.1748768472906404, |
|
"grad_norm": 0.8433436155319214, |
|
"learning_rate": 9.284285880837947e-06, |
|
"loss": 0.5345, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.17733990147783252, |
|
"grad_norm": 0.8337154388427734, |
|
"learning_rate": 9.264211264770977e-06, |
|
"loss": 0.4866, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.17980295566502463, |
|
"grad_norm": 0.8894972801208496, |
|
"learning_rate": 9.243881329063436e-06, |
|
"loss": 0.5478, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.18226600985221675, |
|
"grad_norm": 0.8818243741989136, |
|
"learning_rate": 9.22329729097018e-06, |
|
"loss": 0.5189, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.18472906403940886, |
|
"grad_norm": 1.0139381885528564, |
|
"learning_rate": 9.202460382960449e-06, |
|
"loss": 0.5413, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.18719211822660098, |
|
"grad_norm": 1.144716739654541, |
|
"learning_rate": 9.181371852644063e-06, |
|
"loss": 0.5689, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.1896551724137931, |
|
"grad_norm": 1.1229621171951294, |
|
"learning_rate": 9.160032962696734e-06, |
|
"loss": 0.5018, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.1921182266009852, |
|
"grad_norm": 1.20614755153656, |
|
"learning_rate": 9.138444990784455e-06, |
|
"loss": 0.5297, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.19458128078817735, |
|
"grad_norm": 1.7722558975219727, |
|
"learning_rate": 9.116609229486992e-06, |
|
"loss": 0.4937, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.19704433497536947, |
|
"grad_norm": 1.1039201021194458, |
|
"learning_rate": 9.094526986220513e-06, |
|
"loss": 0.5181, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.19950738916256158, |
|
"grad_norm": 1.0505284070968628, |
|
"learning_rate": 9.072199583159285e-06, |
|
"loss": 0.6021, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.2019704433497537, |
|
"grad_norm": 0.9036574959754944, |
|
"learning_rate": 9.049628357156522e-06, |
|
"loss": 0.5434, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.2044334975369458, |
|
"grad_norm": 1.5033870935440063, |
|
"learning_rate": 9.026814659664331e-06, |
|
"loss": 0.462, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.20689655172413793, |
|
"grad_norm": 0.9125608205795288, |
|
"learning_rate": 9.003759856652803e-06, |
|
"loss": 0.456, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.20935960591133004, |
|
"grad_norm": 3.8365941047668457, |
|
"learning_rate": 8.98046532852822e-06, |
|
"loss": 0.4463, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.21182266009852216, |
|
"grad_norm": 1.0002238750457764, |
|
"learning_rate": 8.956932470050405e-06, |
|
"loss": 0.5616, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.21428571428571427, |
|
"grad_norm": 0.807572603225708, |
|
"learning_rate": 8.93316269024921e-06, |
|
"loss": 0.4795, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.21674876847290642, |
|
"grad_norm": 0.7959713935852051, |
|
"learning_rate": 8.90915741234015e-06, |
|
"loss": 0.4098, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.21921182266009853, |
|
"grad_norm": 2.6254165172576904, |
|
"learning_rate": 8.88491807363919e-06, |
|
"loss": 0.6164, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.22167487684729065, |
|
"grad_norm": 0.8573901653289795, |
|
"learning_rate": 8.860446125476688e-06, |
|
"loss": 0.5426, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.22413793103448276, |
|
"grad_norm": 0.8792327046394348, |
|
"learning_rate": 8.835743033110482e-06, |
|
"loss": 0.5441, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.22660098522167488, |
|
"grad_norm": 1.1924870014190674, |
|
"learning_rate": 8.810810275638183e-06, |
|
"loss": 0.4542, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.229064039408867, |
|
"grad_norm": 0.7673595547676086, |
|
"learning_rate": 8.78564934590859e-06, |
|
"loss": 0.4864, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.2315270935960591, |
|
"grad_norm": 1.037055492401123, |
|
"learning_rate": 8.760261750432312e-06, |
|
"loss": 0.5286, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.23399014778325122, |
|
"grad_norm": 1.1282660961151123, |
|
"learning_rate": 8.734649009291586e-06, |
|
"loss": 0.5892, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.23645320197044334, |
|
"grad_norm": 0.9695258140563965, |
|
"learning_rate": 8.708812656049227e-06, |
|
"loss": 0.5851, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.23891625615763548, |
|
"grad_norm": 0.9574100971221924, |
|
"learning_rate": 8.68275423765683e-06, |
|
"loss": 0.5058, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.2413793103448276, |
|
"grad_norm": 0.8745037317276001, |
|
"learning_rate": 8.656475314362149e-06, |
|
"loss": 0.4157, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.2438423645320197, |
|
"grad_norm": 0.8950684070587158, |
|
"learning_rate": 8.629977459615655e-06, |
|
"loss": 0.4444, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.24630541871921183, |
|
"grad_norm": 1.0956140756607056, |
|
"learning_rate": 8.603262259976348e-06, |
|
"loss": 0.5102, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.24876847290640394, |
|
"grad_norm": 0.9333466291427612, |
|
"learning_rate": 8.576331315016753e-06, |
|
"loss": 0.5257, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.2512315270935961, |
|
"grad_norm": 0.9434043169021606, |
|
"learning_rate": 8.549186237227138e-06, |
|
"loss": 0.532, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.2536945812807882, |
|
"grad_norm": 1.0310450792312622, |
|
"learning_rate": 8.521828651918983e-06, |
|
"loss": 0.5588, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.2561576354679803, |
|
"grad_norm": 1.0325108766555786, |
|
"learning_rate": 8.49426019712765e-06, |
|
"loss": 0.4569, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.25862068965517243, |
|
"grad_norm": 1.030055284500122, |
|
"learning_rate": 8.46648252351431e-06, |
|
"loss": 0.5172, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.26108374384236455, |
|
"grad_norm": 0.8964868783950806, |
|
"learning_rate": 8.438497294267117e-06, |
|
"loss": 0.5143, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.26354679802955666, |
|
"grad_norm": 1.2119203805923462, |
|
"learning_rate": 8.41030618500161e-06, |
|
"loss": 0.4589, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.2660098522167488, |
|
"grad_norm": 0.880455732345581, |
|
"learning_rate": 8.3819108836604e-06, |
|
"loss": 0.4789, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.2684729064039409, |
|
"grad_norm": 1.026513695716858, |
|
"learning_rate": 8.353313090412093e-06, |
|
"loss": 0.4132, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.270935960591133, |
|
"grad_norm": 0.8435222506523132, |
|
"learning_rate": 8.3245145175495e-06, |
|
"loss": 0.4623, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.2733990147783251, |
|
"grad_norm": 0.8828105926513672, |
|
"learning_rate": 8.295516889387115e-06, |
|
"loss": 0.4267, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.27586206896551724, |
|
"grad_norm": 0.7920165061950684, |
|
"learning_rate": 8.26632194215786e-06, |
|
"loss": 0.4472, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.27832512315270935, |
|
"grad_norm": 1.1182401180267334, |
|
"learning_rate": 8.23693142390914e-06, |
|
"loss": 0.4812, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.28078817733990147, |
|
"grad_norm": 1.444700002670288, |
|
"learning_rate": 8.207347094398173e-06, |
|
"loss": 0.4815, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.2832512315270936, |
|
"grad_norm": 0.9819019436836243, |
|
"learning_rate": 8.177570724986627e-06, |
|
"loss": 0.47, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.2857142857142857, |
|
"grad_norm": 0.84306401014328, |
|
"learning_rate": 8.14760409853456e-06, |
|
"loss": 0.547, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.2881773399014778, |
|
"grad_norm": 0.9470686912536621, |
|
"learning_rate": 8.117449009293668e-06, |
|
"loss": 0.4874, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.29064039408866993, |
|
"grad_norm": 0.7772971987724304, |
|
"learning_rate": 8.087107262799856e-06, |
|
"loss": 0.5282, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.29310344827586204, |
|
"grad_norm": 0.9155469536781311, |
|
"learning_rate": 8.05658067576513e-06, |
|
"loss": 0.4475, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.2955665024630542, |
|
"grad_norm": 0.823267936706543, |
|
"learning_rate": 8.025871075968828e-06, |
|
"loss": 0.4273, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.29802955665024633, |
|
"grad_norm": 0.921981155872345, |
|
"learning_rate": 7.99498030214817e-06, |
|
"loss": 0.5247, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.30049261083743845, |
|
"grad_norm": 0.8674582839012146, |
|
"learning_rate": 7.963910203888177e-06, |
|
"loss": 0.4733, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.30295566502463056, |
|
"grad_norm": 0.895481526851654, |
|
"learning_rate": 7.932662641510915e-06, |
|
"loss": 0.5376, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.3054187192118227, |
|
"grad_norm": 0.7914291024208069, |
|
"learning_rate": 7.90123948596412e-06, |
|
"loss": 0.4572, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.3078817733990148, |
|
"grad_norm": 1.2109417915344238, |
|
"learning_rate": 7.869642618709162e-06, |
|
"loss": 0.4873, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.3103448275862069, |
|
"grad_norm": 0.9359944462776184, |
|
"learning_rate": 7.8378739316084e-06, |
|
"loss": 0.4277, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.312807881773399, |
|
"grad_norm": 0.8828278183937073, |
|
"learning_rate": 7.805935326811913e-06, |
|
"loss": 0.5501, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.31527093596059114, |
|
"grad_norm": 1.0584925413131714, |
|
"learning_rate": 7.773828716643592e-06, |
|
"loss": 0.4268, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.31773399014778325, |
|
"grad_norm": 1.5920400619506836, |
|
"learning_rate": 7.741556023486655e-06, |
|
"loss": 0.5066, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.32019704433497537, |
|
"grad_norm": 0.8200149536132812, |
|
"learning_rate": 7.709119179668538e-06, |
|
"loss": 0.419, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.3226600985221675, |
|
"grad_norm": 1.1859734058380127, |
|
"learning_rate": 7.676520127345198e-06, |
|
"loss": 0.496, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.3251231527093596, |
|
"grad_norm": 0.9637341499328613, |
|
"learning_rate": 7.64376081838482e-06, |
|
"loss": 0.5875, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.3275862068965517, |
|
"grad_norm": 0.9584794640541077, |
|
"learning_rate": 7.610843214250964e-06, |
|
"loss": 0.4876, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.33004926108374383, |
|
"grad_norm": 0.9975658059120178, |
|
"learning_rate": 7.57776928588511e-06, |
|
"loss": 0.4481, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.33251231527093594, |
|
"grad_norm": 1.1614975929260254, |
|
"learning_rate": 7.5445410135886455e-06, |
|
"loss": 0.5802, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.33497536945812806, |
|
"grad_norm": 2.236684799194336, |
|
"learning_rate": 7.511160386904306e-06, |
|
"loss": 0.4955, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.3374384236453202, |
|
"grad_norm": 0.8656754493713379, |
|
"learning_rate": 7.477629404497048e-06, |
|
"loss": 0.5312, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.3399014778325123, |
|
"grad_norm": 0.7941805124282837, |
|
"learning_rate": 7.4439500740343685e-06, |
|
"loss": 0.4797, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.34236453201970446, |
|
"grad_norm": 0.8105161190032959, |
|
"learning_rate": 7.4101244120661105e-06, |
|
"loss": 0.4181, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.3448275862068966, |
|
"grad_norm": 2.2261204719543457, |
|
"learning_rate": 7.376154443903714e-06, |
|
"loss": 0.5089, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.3472906403940887, |
|
"grad_norm": 0.9390556812286377, |
|
"learning_rate": 7.342042203498952e-06, |
|
"loss": 0.5437, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.3497536945812808, |
|
"grad_norm": 0.8173288702964783, |
|
"learning_rate": 7.307789733322146e-06, |
|
"loss": 0.4328, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.3522167487684729, |
|
"grad_norm": 1.0669867992401123, |
|
"learning_rate": 7.273399084239878e-06, |
|
"loss": 0.4557, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.35467980295566504, |
|
"grad_norm": 2.4072816371917725, |
|
"learning_rate": 7.238872315392189e-06, |
|
"loss": 0.4371, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.35714285714285715, |
|
"grad_norm": 0.8072436451911926, |
|
"learning_rate": 7.204211494069292e-06, |
|
"loss": 0.3932, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.35960591133004927, |
|
"grad_norm": 1.3799082040786743, |
|
"learning_rate": 7.169418695587791e-06, |
|
"loss": 0.4526, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.3620689655172414, |
|
"grad_norm": 0.9200634956359863, |
|
"learning_rate": 7.134496003166423e-06, |
|
"loss": 0.4379, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.3645320197044335, |
|
"grad_norm": 2.47597336769104, |
|
"learning_rate": 7.099445507801324e-06, |
|
"loss": 0.4638, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.3669950738916256, |
|
"grad_norm": 0.9331015348434448, |
|
"learning_rate": 7.06426930814083e-06, |
|
"loss": 0.4635, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.3694581280788177, |
|
"grad_norm": 0.8710178732872009, |
|
"learning_rate": 7.028969510359821e-06, |
|
"loss": 0.3501, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.37192118226600984, |
|
"grad_norm": 1.0825324058532715, |
|
"learning_rate": 6.993548228033618e-06, |
|
"loss": 0.4367, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.37438423645320196, |
|
"grad_norm": 0.8021703362464905, |
|
"learning_rate": 6.9580075820114255e-06, |
|
"loss": 0.5561, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.3768472906403941, |
|
"grad_norm": 0.799213707447052, |
|
"learning_rate": 6.922349700289348e-06, |
|
"loss": 0.4704, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.3793103448275862, |
|
"grad_norm": 1.3380686044692993, |
|
"learning_rate": 6.886576717882982e-06, |
|
"loss": 0.4525, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.3817733990147783, |
|
"grad_norm": 0.807714581489563, |
|
"learning_rate": 6.850690776699574e-06, |
|
"loss": 0.3812, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.3842364532019704, |
|
"grad_norm": 0.8078760504722595, |
|
"learning_rate": 6.814694025409773e-06, |
|
"loss": 0.4816, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.3866995073891626, |
|
"grad_norm": 0.884308934211731, |
|
"learning_rate": 6.7785886193189936e-06, |
|
"loss": 0.5723, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.3891625615763547, |
|
"grad_norm": 0.8629475831985474, |
|
"learning_rate": 6.742376720238346e-06, |
|
"loss": 0.4831, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.3916256157635468, |
|
"grad_norm": 0.8618782162666321, |
|
"learning_rate": 6.7060604963552125e-06, |
|
"loss": 0.4798, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.39408866995073893, |
|
"grad_norm": 0.7644566893577576, |
|
"learning_rate": 6.669642122103423e-06, |
|
"loss": 0.4965, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.39655172413793105, |
|
"grad_norm": 1.092326283454895, |
|
"learning_rate": 6.633123778033061e-06, |
|
"loss": 0.4821, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.39901477832512317, |
|
"grad_norm": 0.9640597105026245, |
|
"learning_rate": 6.5965076506799e-06, |
|
"loss": 0.39, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.4014778325123153, |
|
"grad_norm": 1.1585789918899536, |
|
"learning_rate": 6.559795932434489e-06, |
|
"loss": 0.4484, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.4039408866995074, |
|
"grad_norm": 0.8530002236366272, |
|
"learning_rate": 6.522990821410881e-06, |
|
"loss": 0.3783, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.4064039408866995, |
|
"grad_norm": 0.8914724588394165, |
|
"learning_rate": 6.486094521315022e-06, |
|
"loss": 0.4685, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.4088669950738916, |
|
"grad_norm": 0.7827121019363403, |
|
"learning_rate": 6.449109241312803e-06, |
|
"loss": 0.4192, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.41133004926108374, |
|
"grad_norm": 1.0978766679763794, |
|
"learning_rate": 6.412037195897786e-06, |
|
"loss": 0.408, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.41379310344827586, |
|
"grad_norm": 0.8382684588432312, |
|
"learning_rate": 6.3748806047586155e-06, |
|
"loss": 0.457, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.41625615763546797, |
|
"grad_norm": 1.073023796081543, |
|
"learning_rate": 6.337641692646106e-06, |
|
"loss": 0.4697, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.4187192118226601, |
|
"grad_norm": 0.9375430941581726, |
|
"learning_rate": 6.300322689240042e-06, |
|
"loss": 0.4769, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.4211822660098522, |
|
"grad_norm": 0.8222770690917969, |
|
"learning_rate": 6.262925829015675e-06, |
|
"loss": 0.4652, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.4236453201970443, |
|
"grad_norm": 0.9433236718177795, |
|
"learning_rate": 6.2254533511099345e-06, |
|
"loss": 0.4507, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.42610837438423643, |
|
"grad_norm": 0.8848790526390076, |
|
"learning_rate": 6.187907499187357e-06, |
|
"loss": 0.4219, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.42857142857142855, |
|
"grad_norm": 0.8615171313285828, |
|
"learning_rate": 6.150290521305746e-06, |
|
"loss": 0.4878, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.43103448275862066, |
|
"grad_norm": 0.8783863186836243, |
|
"learning_rate": 6.112604669781572e-06, |
|
"loss": 0.4351, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.43349753694581283, |
|
"grad_norm": 0.8565014004707336, |
|
"learning_rate": 6.074852201055121e-06, |
|
"loss": 0.4027, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.43596059113300495, |
|
"grad_norm": 0.8122984766960144, |
|
"learning_rate": 6.037035375555376e-06, |
|
"loss": 0.4788, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.43842364532019706, |
|
"grad_norm": 0.8158956170082092, |
|
"learning_rate": 5.9991564575646855e-06, |
|
"loss": 0.4449, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.4408866995073892, |
|
"grad_norm": 0.8699731826782227, |
|
"learning_rate": 5.961217715083185e-06, |
|
"loss": 0.4864, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.4433497536945813, |
|
"grad_norm": 0.9397453665733337, |
|
"learning_rate": 5.923221419693002e-06, |
|
"loss": 0.518, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.4458128078817734, |
|
"grad_norm": 0.8958430290222168, |
|
"learning_rate": 5.885169846422242e-06, |
|
"loss": 0.4862, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.4482758620689655, |
|
"grad_norm": 0.8838856816291809, |
|
"learning_rate": 5.847065273608777e-06, |
|
"loss": 0.4701, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.45073891625615764, |
|
"grad_norm": 0.8200072050094604, |
|
"learning_rate": 5.808909982763825e-06, |
|
"loss": 0.3679, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.45320197044334976, |
|
"grad_norm": 0.931821346282959, |
|
"learning_rate": 5.770706258435342e-06, |
|
"loss": 0.4082, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.45566502463054187, |
|
"grad_norm": 1.0416653156280518, |
|
"learning_rate": 5.732456388071247e-06, |
|
"loss": 0.4608, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.458128078817734, |
|
"grad_norm": 0.8697349429130554, |
|
"learning_rate": 5.6941626618824445e-06, |
|
"loss": 0.5462, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.4605911330049261, |
|
"grad_norm": 0.760611355304718, |
|
"learning_rate": 5.655827372705712e-06, |
|
"loss": 0.4358, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.4630541871921182, |
|
"grad_norm": 0.8112332820892334, |
|
"learning_rate": 5.61745281586641e-06, |
|
"loss": 0.5072, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.46551724137931033, |
|
"grad_norm": 0.857423722743988, |
|
"learning_rate": 5.579041289041045e-06, |
|
"loss": 0.4662, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.46798029556650245, |
|
"grad_norm": 0.9442394375801086, |
|
"learning_rate": 5.540595092119709e-06, |
|
"loss": 0.5124, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.47044334975369456, |
|
"grad_norm": 0.7853634357452393, |
|
"learning_rate": 5.502116527068363e-06, |
|
"loss": 0.4048, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.4729064039408867, |
|
"grad_norm": 1.0823391675949097, |
|
"learning_rate": 5.463607897791006e-06, |
|
"loss": 0.4303, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.4753694581280788, |
|
"grad_norm": 1.0174390077590942, |
|
"learning_rate": 5.425071509991737e-06, |
|
"loss": 0.467, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.47783251231527096, |
|
"grad_norm": 0.9144822955131531, |
|
"learning_rate": 5.386509671036695e-06, |
|
"loss": 0.4526, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.4802955665024631, |
|
"grad_norm": 0.9582109451293945, |
|
"learning_rate": 5.347924689815906e-06, |
|
"loss": 0.4019, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.4827586206896552, |
|
"grad_norm": 0.7563319802284241, |
|
"learning_rate": 5.309318876605043e-06, |
|
"loss": 0.4331, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.4852216748768473, |
|
"grad_norm": 0.8050902485847473, |
|
"learning_rate": 5.270694542927089e-06, |
|
"loss": 0.4731, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.4876847290640394, |
|
"grad_norm": 0.9717739820480347, |
|
"learning_rate": 5.2320540014139405e-06, |
|
"loss": 0.4701, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.49014778325123154, |
|
"grad_norm": 0.9432171583175659, |
|
"learning_rate": 5.193399565667945e-06, |
|
"loss": 0.3581, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.49261083743842365, |
|
"grad_norm": 1.2452778816223145, |
|
"learning_rate": 5.154733550123357e-06, |
|
"loss": 0.4548, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.49507389162561577, |
|
"grad_norm": 0.78749680519104, |
|
"learning_rate": 5.116058269907779e-06, |
|
"loss": 0.421, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 0.4975369458128079, |
|
"grad_norm": 0.8748036026954651, |
|
"learning_rate": 5.077376040703533e-06, |
|
"loss": 0.4386, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.8578478693962097, |
|
"learning_rate": 5.038689178609011e-06, |
|
"loss": 0.5316, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 0.5024630541871922, |
|
"grad_norm": 0.7721994519233704, |
|
"learning_rate": 5e-06, |
|
"loss": 0.4235, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.5049261083743842, |
|
"grad_norm": 0.7942141890525818, |
|
"learning_rate": 4.96131082139099e-06, |
|
"loss": 0.4798, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.5073891625615764, |
|
"grad_norm": 1.1543359756469727, |
|
"learning_rate": 4.922623959296469e-06, |
|
"loss": 0.4402, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 0.5098522167487685, |
|
"grad_norm": 0.9413177967071533, |
|
"learning_rate": 4.883941730092222e-06, |
|
"loss": 0.5872, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 0.5123152709359606, |
|
"grad_norm": 0.7829416394233704, |
|
"learning_rate": 4.845266449876646e-06, |
|
"loss": 0.4577, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.5147783251231527, |
|
"grad_norm": 0.7771807312965393, |
|
"learning_rate": 4.806600434332056e-06, |
|
"loss": 0.4063, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 0.5172413793103449, |
|
"grad_norm": 0.8723726868629456, |
|
"learning_rate": 4.76794599858606e-06, |
|
"loss": 0.4745, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.5197044334975369, |
|
"grad_norm": 0.8430385589599609, |
|
"learning_rate": 4.729305457072913e-06, |
|
"loss": 0.4174, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 0.5221674876847291, |
|
"grad_norm": 0.6947190165519714, |
|
"learning_rate": 4.690681123394959e-06, |
|
"loss": 0.378, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.5246305418719212, |
|
"grad_norm": 0.932201623916626, |
|
"learning_rate": 4.6520753101840945e-06, |
|
"loss": 0.4578, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 0.5270935960591133, |
|
"grad_norm": 0.8713343739509583, |
|
"learning_rate": 4.613490328963307e-06, |
|
"loss": 0.39, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 0.5295566502463054, |
|
"grad_norm": 0.7538230419158936, |
|
"learning_rate": 4.574928490008264e-06, |
|
"loss": 0.3857, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.5320197044334976, |
|
"grad_norm": 0.8916656970977783, |
|
"learning_rate": 4.536392102208998e-06, |
|
"loss": 0.5202, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.5344827586206896, |
|
"grad_norm": 0.8922424912452698, |
|
"learning_rate": 4.497883472931639e-06, |
|
"loss": 0.4081, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 0.5369458128078818, |
|
"grad_norm": 1.8260999917984009, |
|
"learning_rate": 4.459404907880293e-06, |
|
"loss": 0.4833, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 0.5394088669950738, |
|
"grad_norm": 1.0410164594650269, |
|
"learning_rate": 4.4209587109589565e-06, |
|
"loss": 0.3233, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 0.541871921182266, |
|
"grad_norm": 0.9134721755981445, |
|
"learning_rate": 4.382547184133593e-06, |
|
"loss": 0.373, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.5443349753694581, |
|
"grad_norm": 0.8863281607627869, |
|
"learning_rate": 4.3441726272942895e-06, |
|
"loss": 0.3112, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 0.5467980295566502, |
|
"grad_norm": 0.7182394862174988, |
|
"learning_rate": 4.305837338117557e-06, |
|
"loss": 0.4257, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.5492610837438424, |
|
"grad_norm": 1.0430289506912231, |
|
"learning_rate": 4.267543611928755e-06, |
|
"loss": 0.4123, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 0.5517241379310345, |
|
"grad_norm": 0.9563698768615723, |
|
"learning_rate": 4.229293741564658e-06, |
|
"loss": 0.5008, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.5541871921182266, |
|
"grad_norm": 1.4074031114578247, |
|
"learning_rate": 4.191090017236177e-06, |
|
"loss": 0.4318, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.5566502463054187, |
|
"grad_norm": 0.8157017230987549, |
|
"learning_rate": 4.152934726391223e-06, |
|
"loss": 0.5214, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 0.5591133004926109, |
|
"grad_norm": 0.7647298574447632, |
|
"learning_rate": 4.114830153577759e-06, |
|
"loss": 0.3445, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 0.5615763546798029, |
|
"grad_norm": 1.0865991115570068, |
|
"learning_rate": 4.076778580306999e-06, |
|
"loss": 0.3936, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.5640394088669951, |
|
"grad_norm": 1.014061450958252, |
|
"learning_rate": 4.0387822849168165e-06, |
|
"loss": 0.3716, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 0.5665024630541872, |
|
"grad_norm": 0.7353817224502563, |
|
"learning_rate": 4.000843542435315e-06, |
|
"loss": 0.3828, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.5689655172413793, |
|
"grad_norm": 1.005282998085022, |
|
"learning_rate": 3.962964624444625e-06, |
|
"loss": 0.4572, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 0.5714285714285714, |
|
"grad_norm": 0.9712240099906921, |
|
"learning_rate": 3.92514779894488e-06, |
|
"loss": 0.3327, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.5738916256157636, |
|
"grad_norm": 0.7928905487060547, |
|
"learning_rate": 3.887395330218429e-06, |
|
"loss": 0.3351, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 0.5763546798029556, |
|
"grad_norm": 0.8094484806060791, |
|
"learning_rate": 3.849709478694256e-06, |
|
"loss": 0.4305, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.5788177339901478, |
|
"grad_norm": 3.6431384086608887, |
|
"learning_rate": 3.8120925008126457e-06, |
|
"loss": 0.4964, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.5812807881773399, |
|
"grad_norm": 0.9250912666320801, |
|
"learning_rate": 3.7745466488900663e-06, |
|
"loss": 0.4114, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.583743842364532, |
|
"grad_norm": 1.0961583852767944, |
|
"learning_rate": 3.7370741709843263e-06, |
|
"loss": 0.4588, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 0.5862068965517241, |
|
"grad_norm": 0.857764482498169, |
|
"learning_rate": 3.6996773107599605e-06, |
|
"loss": 0.3729, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.5886699507389163, |
|
"grad_norm": 1.0068374872207642, |
|
"learning_rate": 3.662358307353897e-06, |
|
"loss": 0.4843, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 0.5911330049261084, |
|
"grad_norm": 0.8471587300300598, |
|
"learning_rate": 3.6251193952413866e-06, |
|
"loss": 0.4503, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.5935960591133005, |
|
"grad_norm": 0.8168319463729858, |
|
"learning_rate": 3.587962804102214e-06, |
|
"loss": 0.3802, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 0.5960591133004927, |
|
"grad_norm": 0.8261502981185913, |
|
"learning_rate": 3.550890758687199e-06, |
|
"loss": 0.4659, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 0.5985221674876847, |
|
"grad_norm": 1.202130913734436, |
|
"learning_rate": 3.5139054786849787e-06, |
|
"loss": 0.4397, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 0.6009852216748769, |
|
"grad_norm": 0.8035978078842163, |
|
"learning_rate": 3.4770091785891207e-06, |
|
"loss": 0.4111, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.603448275862069, |
|
"grad_norm": 2.1057868003845215, |
|
"learning_rate": 3.440204067565511e-06, |
|
"loss": 0.4438, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.6059113300492611, |
|
"grad_norm": 0.8185580372810364, |
|
"learning_rate": 3.403492349320101e-06, |
|
"loss": 0.3776, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 0.6083743842364532, |
|
"grad_norm": 0.8482059240341187, |
|
"learning_rate": 3.3668762219669393e-06, |
|
"loss": 0.4184, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 0.6108374384236454, |
|
"grad_norm": 0.8905600309371948, |
|
"learning_rate": 3.330357877896577e-06, |
|
"loss": 0.4145, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.6133004926108374, |
|
"grad_norm": 0.7447388172149658, |
|
"learning_rate": 3.293939503644788e-06, |
|
"loss": 0.4, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 0.6157635467980296, |
|
"grad_norm": 1.7505443096160889, |
|
"learning_rate": 3.2576232797616556e-06, |
|
"loss": 0.3793, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.6182266009852216, |
|
"grad_norm": 0.9543902277946472, |
|
"learning_rate": 3.2214113806810077e-06, |
|
"loss": 0.3999, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 0.6206896551724138, |
|
"grad_norm": 0.889468789100647, |
|
"learning_rate": 3.1853059745902287e-06, |
|
"loss": 0.3603, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.6231527093596059, |
|
"grad_norm": 0.9314963817596436, |
|
"learning_rate": 3.149309223300428e-06, |
|
"loss": 0.4, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 0.625615763546798, |
|
"grad_norm": 0.7808181047439575, |
|
"learning_rate": 3.1134232821170202e-06, |
|
"loss": 0.4358, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 0.6280788177339901, |
|
"grad_norm": 0.8075929284095764, |
|
"learning_rate": 3.0776502997106526e-06, |
|
"loss": 0.4397, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.6305418719211823, |
|
"grad_norm": 0.991936445236206, |
|
"learning_rate": 3.041992417988577e-06, |
|
"loss": 0.3703, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.6330049261083743, |
|
"grad_norm": 0.9331387877464294, |
|
"learning_rate": 3.0064517719663833e-06, |
|
"loss": 0.4161, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 0.6354679802955665, |
|
"grad_norm": 0.9041942358016968, |
|
"learning_rate": 2.9710304896401803e-06, |
|
"loss": 0.3642, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 0.6379310344827587, |
|
"grad_norm": 1.105132818222046, |
|
"learning_rate": 2.935730691859172e-06, |
|
"loss": 0.3843, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 0.6403940886699507, |
|
"grad_norm": 0.7953276634216309, |
|
"learning_rate": 2.9005544921986774e-06, |
|
"loss": 0.3583, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.6428571428571429, |
|
"grad_norm": 0.9104849696159363, |
|
"learning_rate": 2.8655039968335774e-06, |
|
"loss": 0.4262, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 0.645320197044335, |
|
"grad_norm": 1.0020502805709839, |
|
"learning_rate": 2.83058130441221e-06, |
|
"loss": 0.3959, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 0.6477832512315271, |
|
"grad_norm": 1.0070898532867432, |
|
"learning_rate": 2.7957885059307097e-06, |
|
"loss": 0.4128, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 0.6502463054187192, |
|
"grad_norm": 0.8092946410179138, |
|
"learning_rate": 2.761127684607811e-06, |
|
"loss": 0.4152, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.6527093596059114, |
|
"grad_norm": 0.9570103883743286, |
|
"learning_rate": 2.7266009157601226e-06, |
|
"loss": 0.4408, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.6551724137931034, |
|
"grad_norm": 0.9353964924812317, |
|
"learning_rate": 2.692210266677855e-06, |
|
"loss": 0.4548, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 0.6576354679802956, |
|
"grad_norm": 0.7735761404037476, |
|
"learning_rate": 2.65795779650105e-06, |
|
"loss": 0.4139, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 0.6600985221674877, |
|
"grad_norm": 0.8776670098304749, |
|
"learning_rate": 2.6238455560962884e-06, |
|
"loss": 0.3719, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 0.6625615763546798, |
|
"grad_norm": 0.9098663330078125, |
|
"learning_rate": 2.589875587933892e-06, |
|
"loss": 0.4113, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 0.6650246305418719, |
|
"grad_norm": 3.0552971363067627, |
|
"learning_rate": 2.5560499259656323e-06, |
|
"loss": 0.4058, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.6674876847290641, |
|
"grad_norm": 0.7983712553977966, |
|
"learning_rate": 2.522370595502954e-06, |
|
"loss": 0.4385, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 0.6699507389162561, |
|
"grad_norm": 0.695418119430542, |
|
"learning_rate": 2.488839613095695e-06, |
|
"loss": 0.3056, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.6724137931034483, |
|
"grad_norm": 1.0092236995697021, |
|
"learning_rate": 2.4554589864113566e-06, |
|
"loss": 0.3816, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 0.6748768472906403, |
|
"grad_norm": 0.7949216961860657, |
|
"learning_rate": 2.422230714114891e-06, |
|
"loss": 0.3537, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 0.6773399014778325, |
|
"grad_norm": 0.9313486218452454, |
|
"learning_rate": 2.3891567857490373e-06, |
|
"loss": 0.4997, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.6798029556650246, |
|
"grad_norm": 0.8018554449081421, |
|
"learning_rate": 2.3562391816151807e-06, |
|
"loss": 0.417, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.6822660098522167, |
|
"grad_norm": 0.9989979267120361, |
|
"learning_rate": 2.323479872654805e-06, |
|
"loss": 0.4071, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 0.6847290640394089, |
|
"grad_norm": 0.7565258741378784, |
|
"learning_rate": 2.2908808203314637e-06, |
|
"loss": 0.3126, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 0.687192118226601, |
|
"grad_norm": 0.9910544157028198, |
|
"learning_rate": 2.2584439765133453e-06, |
|
"loss": 0.4361, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 0.6896551724137931, |
|
"grad_norm": 0.7968178987503052, |
|
"learning_rate": 2.226171283356409e-06, |
|
"loss": 0.3032, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.6921182266009852, |
|
"grad_norm": 0.8218366503715515, |
|
"learning_rate": 2.1940646731880887e-06, |
|
"loss": 0.3931, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 0.6945812807881774, |
|
"grad_norm": 0.8241420984268188, |
|
"learning_rate": 2.162126068391601e-06, |
|
"loss": 0.4846, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 0.6970443349753694, |
|
"grad_norm": 0.9964359998703003, |
|
"learning_rate": 2.1303573812908383e-06, |
|
"loss": 0.4059, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 0.6995073891625616, |
|
"grad_norm": 0.8862767219543457, |
|
"learning_rate": 2.0987605140358823e-06, |
|
"loss": 0.387, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 0.7019704433497537, |
|
"grad_norm": 0.8447444438934326, |
|
"learning_rate": 2.0673373584890847e-06, |
|
"loss": 0.367, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.7044334975369458, |
|
"grad_norm": 0.719472348690033, |
|
"learning_rate": 2.036089796111825e-06, |
|
"loss": 0.3834, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 0.7068965517241379, |
|
"grad_norm": 0.8975826501846313, |
|
"learning_rate": 2.0050196978518323e-06, |
|
"loss": 0.3538, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 0.7093596059113301, |
|
"grad_norm": 1.0086376667022705, |
|
"learning_rate": 1.9741289240311757e-06, |
|
"loss": 0.498, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.7118226600985221, |
|
"grad_norm": 0.9242886304855347, |
|
"learning_rate": 1.943419324234871e-06, |
|
"loss": 0.442, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 0.7142857142857143, |
|
"grad_norm": 0.8589749932289124, |
|
"learning_rate": 1.9128927372001456e-06, |
|
"loss": 0.3315, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.7167487684729064, |
|
"grad_norm": 1.2725121974945068, |
|
"learning_rate": 1.8825509907063328e-06, |
|
"loss": 0.5441, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 0.7192118226600985, |
|
"grad_norm": 0.975878119468689, |
|
"learning_rate": 1.852395901465441e-06, |
|
"loss": 0.4301, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 0.7216748768472906, |
|
"grad_norm": 0.9230057001113892, |
|
"learning_rate": 1.8224292750133743e-06, |
|
"loss": 0.4157, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 0.7241379310344828, |
|
"grad_norm": 0.9983015656471252, |
|
"learning_rate": 1.79265290560183e-06, |
|
"loss": 0.4511, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 0.7266009852216748, |
|
"grad_norm": 0.8537778258323669, |
|
"learning_rate": 1.7630685760908623e-06, |
|
"loss": 0.4724, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.729064039408867, |
|
"grad_norm": 0.8379884958267212, |
|
"learning_rate": 1.733678057842142e-06, |
|
"loss": 0.3558, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 0.7315270935960592, |
|
"grad_norm": 0.835649311542511, |
|
"learning_rate": 1.7044831106128867e-06, |
|
"loss": 0.3725, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 0.7339901477832512, |
|
"grad_norm": 0.7726622819900513, |
|
"learning_rate": 1.675485482450499e-06, |
|
"loss": 0.3666, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 0.7364532019704434, |
|
"grad_norm": 0.9081035256385803, |
|
"learning_rate": 1.6466869095879079e-06, |
|
"loss": 0.3409, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 0.7389162561576355, |
|
"grad_norm": 0.7757613658905029, |
|
"learning_rate": 1.6180891163396013e-06, |
|
"loss": 0.3568, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.7413793103448276, |
|
"grad_norm": 0.9632595777511597, |
|
"learning_rate": 1.589693814998391e-06, |
|
"loss": 0.4876, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 0.7438423645320197, |
|
"grad_norm": 0.7990944385528564, |
|
"learning_rate": 1.561502705732883e-06, |
|
"loss": 0.4158, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 0.7463054187192119, |
|
"grad_norm": 0.9227884411811829, |
|
"learning_rate": 1.533517476485691e-06, |
|
"loss": 0.3277, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 0.7487684729064039, |
|
"grad_norm": 1.0761818885803223, |
|
"learning_rate": 1.5057398028723514e-06, |
|
"loss": 0.3425, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 0.7512315270935961, |
|
"grad_norm": 0.7749660015106201, |
|
"learning_rate": 1.4781713480810184e-06, |
|
"loss": 0.3679, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.7536945812807881, |
|
"grad_norm": 2.2163593769073486, |
|
"learning_rate": 1.450813762772863e-06, |
|
"loss": 0.407, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 0.7561576354679803, |
|
"grad_norm": 1.069761037826538, |
|
"learning_rate": 1.4236686849832497e-06, |
|
"loss": 0.3972, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 0.7586206896551724, |
|
"grad_norm": 0.9363300800323486, |
|
"learning_rate": 1.3967377400236515e-06, |
|
"loss": 0.4288, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 0.7610837438423645, |
|
"grad_norm": 0.8307360410690308, |
|
"learning_rate": 1.370022540384347e-06, |
|
"loss": 0.5181, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 0.7635467980295566, |
|
"grad_norm": 0.9606369137763977, |
|
"learning_rate": 1.3435246856378524e-06, |
|
"loss": 0.5306, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.7660098522167488, |
|
"grad_norm": 0.9734224677085876, |
|
"learning_rate": 1.3172457623431706e-06, |
|
"loss": 0.53, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 0.7684729064039408, |
|
"grad_norm": 0.9703786373138428, |
|
"learning_rate": 1.2911873439507766e-06, |
|
"loss": 0.4361, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 0.770935960591133, |
|
"grad_norm": 1.0127205848693848, |
|
"learning_rate": 1.2653509907084171e-06, |
|
"loss": 0.3778, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 0.7733990147783252, |
|
"grad_norm": 0.6820623874664307, |
|
"learning_rate": 1.2397382495676873e-06, |
|
"loss": 0.3225, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 0.7758620689655172, |
|
"grad_norm": 1.9675172567367554, |
|
"learning_rate": 1.214350654091413e-06, |
|
"loss": 0.4524, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.7783251231527094, |
|
"grad_norm": 0.8468368649482727, |
|
"learning_rate": 1.1891897243618184e-06, |
|
"loss": 0.4673, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 0.7807881773399015, |
|
"grad_norm": 1.0746312141418457, |
|
"learning_rate": 1.1642569668895171e-06, |
|
"loss": 0.4392, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 0.7832512315270936, |
|
"grad_norm": 1.0559433698654175, |
|
"learning_rate": 1.139553874523313e-06, |
|
"loss": 0.4199, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 0.7857142857142857, |
|
"grad_norm": 0.7627314925193787, |
|
"learning_rate": 1.1150819263608098e-06, |
|
"loss": 0.3582, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 0.7881773399014779, |
|
"grad_norm": 0.8994935750961304, |
|
"learning_rate": 1.0908425876598512e-06, |
|
"loss": 0.3603, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.7906403940886699, |
|
"grad_norm": 0.8457913398742676, |
|
"learning_rate": 1.0668373097507922e-06, |
|
"loss": 0.3291, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 0.7931034482758621, |
|
"grad_norm": 0.8527902364730835, |
|
"learning_rate": 1.0430675299495973e-06, |
|
"loss": 0.3809, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 0.7955665024630542, |
|
"grad_norm": 0.7819089889526367, |
|
"learning_rate": 1.0195346714717813e-06, |
|
"loss": 0.4148, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 0.7980295566502463, |
|
"grad_norm": 0.878807544708252, |
|
"learning_rate": 9.962401433471985e-07, |
|
"loss": 0.3385, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 0.8004926108374384, |
|
"grad_norm": 1.3636759519577026, |
|
"learning_rate": 9.731853403356705e-07, |
|
"loss": 0.378, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.8029556650246306, |
|
"grad_norm": 22.09219741821289, |
|
"learning_rate": 9.5037164284348e-07, |
|
"loss": 0.3528, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 0.8054187192118226, |
|
"grad_norm": 0.763201117515564, |
|
"learning_rate": 9.278004168407151e-07, |
|
"loss": 0.4066, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 0.8078817733990148, |
|
"grad_norm": 0.7910692095756531, |
|
"learning_rate": 9.054730137794887e-07, |
|
"loss": 0.2703, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 0.8103448275862069, |
|
"grad_norm": 0.9270244836807251, |
|
"learning_rate": 8.833907705130091e-07, |
|
"loss": 0.4674, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 0.812807881773399, |
|
"grad_norm": 0.7657389044761658, |
|
"learning_rate": 8.615550092155478e-07, |
|
"loss": 0.3766, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.8152709359605911, |
|
"grad_norm": 0.9968824982643127, |
|
"learning_rate": 8.399670373032665e-07, |
|
"loss": 0.443, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 0.8177339901477833, |
|
"grad_norm": 0.9094595313072205, |
|
"learning_rate": 8.186281473559382e-07, |
|
"loss": 0.4102, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 0.8201970443349754, |
|
"grad_norm": 0.9319184422492981, |
|
"learning_rate": 7.975396170395522e-07, |
|
"loss": 0.3985, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 0.8226600985221675, |
|
"grad_norm": 0.8672550916671753, |
|
"learning_rate": 7.767027090298207e-07, |
|
"loss": 0.4297, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 0.8251231527093597, |
|
"grad_norm": 1.1561846733093262, |
|
"learning_rate": 7.561186709365653e-07, |
|
"loss": 0.5102, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.8275862068965517, |
|
"grad_norm": 1.1552369594573975, |
|
"learning_rate": 7.357887352290227e-07, |
|
"loss": 0.4518, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 0.8300492610837439, |
|
"grad_norm": 8.381624221801758, |
|
"learning_rate": 7.157141191620548e-07, |
|
"loss": 0.4467, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 0.8325123152709359, |
|
"grad_norm": 0.7928099632263184, |
|
"learning_rate": 6.958960247032515e-07, |
|
"loss": 0.3273, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 0.8349753694581281, |
|
"grad_norm": 1.1730235815048218, |
|
"learning_rate": 6.763356384609809e-07, |
|
"loss": 0.4065, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 0.8374384236453202, |
|
"grad_norm": 1.0426114797592163, |
|
"learning_rate": 6.570341316133272e-07, |
|
"loss": 0.4301, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.8399014778325123, |
|
"grad_norm": 0.7004266381263733, |
|
"learning_rate": 6.379926598379727e-07, |
|
"loss": 0.2412, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 0.8423645320197044, |
|
"grad_norm": 0.8835524320602417, |
|
"learning_rate": 6.192123632429986e-07, |
|
"loss": 0.4428, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 0.8448275862068966, |
|
"grad_norm": 0.7402558922767639, |
|
"learning_rate": 6.006943662986275e-07, |
|
"loss": 0.3081, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 0.8472906403940886, |
|
"grad_norm": 0.9316273927688599, |
|
"learning_rate": 5.824397777698859e-07, |
|
"loss": 0.3839, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 0.8497536945812808, |
|
"grad_norm": 1.0180613994598389, |
|
"learning_rate": 5.644496906502233e-07, |
|
"loss": 0.3383, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.8522167487684729, |
|
"grad_norm": 0.9619033932685852, |
|
"learning_rate": 5.4672518209607e-07, |
|
"loss": 0.3643, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 0.854679802955665, |
|
"grad_norm": 1.3424688577651978, |
|
"learning_rate": 5.292673133623372e-07, |
|
"loss": 0.5491, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 0.8571428571428571, |
|
"grad_norm": 1.1439658403396606, |
|
"learning_rate": 5.120771297388788e-07, |
|
"loss": 0.3623, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 0.8596059113300493, |
|
"grad_norm": 0.9782744646072388, |
|
"learning_rate": 4.951556604879049e-07, |
|
"loss": 0.5006, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 0.8620689655172413, |
|
"grad_norm": 0.9459072351455688, |
|
"learning_rate": 4.785039187823503e-07, |
|
"loss": 0.4076, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.8645320197044335, |
|
"grad_norm": 0.9256353378295898, |
|
"learning_rate": 4.6212290164521554e-07, |
|
"loss": 0.4268, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 0.8669950738916257, |
|
"grad_norm": 0.9616571068763733, |
|
"learning_rate": 4.46013589889866e-07, |
|
"loss": 0.2967, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 0.8694581280788177, |
|
"grad_norm": 0.7549436688423157, |
|
"learning_rate": 4.3017694806131163e-07, |
|
"loss": 0.4012, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 0.8719211822660099, |
|
"grad_norm": 0.829087495803833, |
|
"learning_rate": 4.146139243784475e-07, |
|
"loss": 0.3399, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 0.874384236453202, |
|
"grad_norm": 0.885208785533905, |
|
"learning_rate": 3.9932545067728366e-07, |
|
"loss": 0.3451, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.8768472906403941, |
|
"grad_norm": 0.9004522562026978, |
|
"learning_rate": 3.8431244235515366e-07, |
|
"loss": 0.3754, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 0.8793103448275862, |
|
"grad_norm": 0.9170466065406799, |
|
"learning_rate": 3.695757983158954e-07, |
|
"loss": 0.3638, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 0.8817733990147784, |
|
"grad_norm": 1.3507622480392456, |
|
"learning_rate": 3.5511640091604293e-07, |
|
"loss": 0.3724, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 0.8842364532019704, |
|
"grad_norm": 0.8250304460525513, |
|
"learning_rate": 3.409351159119845e-07, |
|
"loss": 0.3516, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 0.8866995073891626, |
|
"grad_norm": 0.7117695212364197, |
|
"learning_rate": 3.270327924081301e-07, |
|
"loss": 0.402, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.8891625615763546, |
|
"grad_norm": 1.0209925174713135, |
|
"learning_rate": 3.134102628060698e-07, |
|
"loss": 0.5029, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 0.8916256157635468, |
|
"grad_norm": 1.013433814048767, |
|
"learning_rate": 3.000683427547374e-07, |
|
"loss": 0.4095, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 0.8940886699507389, |
|
"grad_norm": 0.9493584036827087, |
|
"learning_rate": 2.8700783110156507e-07, |
|
"loss": 0.4316, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 0.896551724137931, |
|
"grad_norm": 0.8495911955833435, |
|
"learning_rate": 2.742295098446623e-07, |
|
"loss": 0.3361, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 0.8990147783251231, |
|
"grad_norm": 0.8885901570320129, |
|
"learning_rate": 2.617341440859883e-07, |
|
"loss": 0.4157, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.9014778325123153, |
|
"grad_norm": 1.3665013313293457, |
|
"learning_rate": 2.4952248198554075e-07, |
|
"loss": 0.4276, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 0.9039408866995073, |
|
"grad_norm": 0.9520452618598938, |
|
"learning_rate": 2.3759525471656163e-07, |
|
"loss": 0.3771, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 0.9064039408866995, |
|
"grad_norm": 0.8242120742797852, |
|
"learning_rate": 2.259531764217604e-07, |
|
"loss": 0.361, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 0.9088669950738916, |
|
"grad_norm": 0.9279873371124268, |
|
"learning_rate": 2.1459694417055033e-07, |
|
"loss": 0.4935, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 0.9113300492610837, |
|
"grad_norm": 1.1166878938674927, |
|
"learning_rate": 2.0352723791731366e-07, |
|
"loss": 0.3956, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.9137931034482759, |
|
"grad_norm": 0.8491089344024658, |
|
"learning_rate": 1.9274472046068805e-07, |
|
"loss": 0.2792, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 0.916256157635468, |
|
"grad_norm": 1.089476227760315, |
|
"learning_rate": 1.8225003740388546e-07, |
|
"loss": 0.3604, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 0.9187192118226601, |
|
"grad_norm": 0.9467745423316956, |
|
"learning_rate": 1.7204381711603046e-07, |
|
"loss": 0.3956, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 0.9211822660098522, |
|
"grad_norm": 0.9419111013412476, |
|
"learning_rate": 1.621266706945429e-07, |
|
"loss": 0.4508, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 0.9236453201970444, |
|
"grad_norm": 2.099689245223999, |
|
"learning_rate": 1.524991919285429e-07, |
|
"loss": 0.5749, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.9261083743842364, |
|
"grad_norm": 1.3350390195846558, |
|
"learning_rate": 1.431619572633014e-07, |
|
"loss": 0.4098, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 0.9285714285714286, |
|
"grad_norm": 0.912725567817688, |
|
"learning_rate": 1.3411552576572562e-07, |
|
"loss": 0.4331, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 0.9310344827586207, |
|
"grad_norm": 0.9236518144607544, |
|
"learning_rate": 1.253604390908819e-07, |
|
"loss": 0.3197, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 0.9334975369458128, |
|
"grad_norm": 0.9659357070922852, |
|
"learning_rate": 1.1689722144956672e-07, |
|
"loss": 0.4172, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 0.9359605911330049, |
|
"grad_norm": 0.7131395936012268, |
|
"learning_rate": 1.0872637957691834e-07, |
|
"loss": 0.3812, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.9384236453201971, |
|
"grad_norm": 1.1041388511657715, |
|
"learning_rate": 1.008484027020773e-07, |
|
"loss": 0.4566, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 0.9408866995073891, |
|
"grad_norm": 0.7971521615982056, |
|
"learning_rate": 9.326376251889202e-08, |
|
"loss": 0.343, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 0.9433497536945813, |
|
"grad_norm": 0.7595178484916687, |
|
"learning_rate": 8.597291315767808e-08, |
|
"loss": 0.37, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 0.9458128078817734, |
|
"grad_norm": 0.709429144859314, |
|
"learning_rate": 7.897629115802553e-08, |
|
"loss": 0.3152, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 0.9482758620689655, |
|
"grad_norm": 9.246243476867676, |
|
"learning_rate": 7.227431544266194e-08, |
|
"loss": 0.6465, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.9507389162561576, |
|
"grad_norm": 1.1263999938964844, |
|
"learning_rate": 6.58673872923693e-08, |
|
"loss": 0.33, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 0.9532019704433498, |
|
"grad_norm": 0.830968976020813, |
|
"learning_rate": 5.97558903219575e-08, |
|
"loss": 0.369, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 0.9556650246305419, |
|
"grad_norm": 0.7675350904464722, |
|
"learning_rate": 5.3940190457294486e-08, |
|
"loss": 0.3185, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 0.958128078817734, |
|
"grad_norm": 1.0136815309524536, |
|
"learning_rate": 4.842063591339763e-08, |
|
"loss": 0.4651, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 0.9605911330049262, |
|
"grad_norm": 0.9319408535957336, |
|
"learning_rate": 4.3197557173584317e-08, |
|
"loss": 0.3876, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.9630541871921182, |
|
"grad_norm": 0.8962835669517517, |
|
"learning_rate": 3.82712669696822e-08, |
|
"loss": 0.4232, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 0.9655172413793104, |
|
"grad_norm": 0.8527126312255859, |
|
"learning_rate": 3.364206026330752e-08, |
|
"loss": 0.3564, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 0.9679802955665024, |
|
"grad_norm": 0.8836209177970886, |
|
"learning_rate": 2.9310214228202016e-08, |
|
"loss": 0.3616, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 0.9704433497536946, |
|
"grad_norm": 0.8913058638572693, |
|
"learning_rate": 2.527598823363786e-08, |
|
"loss": 0.4859, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 0.9729064039408867, |
|
"grad_norm": 0.8344419598579407, |
|
"learning_rate": 2.153962382888841e-08, |
|
"loss": 0.3994, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.9753694581280788, |
|
"grad_norm": 0.9367709755897522, |
|
"learning_rate": 1.8101344728764236e-08, |
|
"loss": 0.321, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 0.9778325123152709, |
|
"grad_norm": 0.862615704536438, |
|
"learning_rate": 1.496135680021993e-08, |
|
"loss": 0.3368, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 0.9802955665024631, |
|
"grad_norm": 0.8697728514671326, |
|
"learning_rate": 1.2119848050025084e-08, |
|
"loss": 0.3877, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 0.9827586206896551, |
|
"grad_norm": 0.985674262046814, |
|
"learning_rate": 9.576988613511084e-09, |
|
"loss": 0.3868, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 0.9852216748768473, |
|
"grad_norm": 0.7607033252716064, |
|
"learning_rate": 7.332930744380906e-09, |
|
"loss": 0.4217, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.9876847290640394, |
|
"grad_norm": 0.9515877962112427, |
|
"learning_rate": 5.387808805594752e-09, |
|
"loss": 0.417, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 0.9901477832512315, |
|
"grad_norm": 1.0010173320770264, |
|
"learning_rate": 3.741739261324817e-09, |
|
"loss": 0.4093, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 0.9926108374384236, |
|
"grad_norm": 0.6813908219337463, |
|
"learning_rate": 2.3948206699819787e-09, |
|
"loss": 0.3174, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 0.9950738916256158, |
|
"grad_norm": 0.9744001626968384, |
|
"learning_rate": 1.347133678313295e-09, |
|
"loss": 0.3682, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 0.9975369458128078, |
|
"grad_norm": 0.7549907565116882, |
|
"learning_rate": 5.987410165758656e-10, |
|
"loss": 0.4171, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.7962020635604858, |
|
"learning_rate": 1.4968749477872746e-10, |
|
"loss": 0.359, |
|
"step": 406 |
|
} |
|
], |
|
"logging_steps": 1.0, |
|
"max_steps": 406, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 0, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 8.842746096485663e+17, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|