|
{ |
|
"best_metric": 0.279557466506958, |
|
"best_model_checkpoint": "output/output_minicpmv26_upsampled_new/checkpoint-100", |
|
"epoch": 0.9720534629404617, |
|
"eval_steps": 100, |
|
"global_step": 400, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.002430133657351154, |
|
"grad_norm": 3.186406373977661, |
|
"learning_rate": 0.0, |
|
"loss": 0.2724, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.004860267314702308, |
|
"grad_norm": 3.186406373977661, |
|
"learning_rate": 0.0, |
|
"loss": 0.475, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.007290400972053463, |
|
"grad_norm": 3.186406373977661, |
|
"learning_rate": 0.0, |
|
"loss": 0.6319, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.009720534629404616, |
|
"grad_norm": 3.186406373977661, |
|
"learning_rate": 0.0, |
|
"loss": 0.5354, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.012150668286755772, |
|
"grad_norm": 9.746641159057617, |
|
"learning_rate": 5.017166594399687e-06, |
|
"loss": 0.6389, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.014580801944106925, |
|
"grad_norm": 10.395784378051758, |
|
"learning_rate": 7.952020911994375e-06, |
|
"loss": 0.6399, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.01701093560145808, |
|
"grad_norm": 3.7885968685150146, |
|
"learning_rate": 1.0034333188799373e-05, |
|
"loss": 0.3931, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.019441069258809233, |
|
"grad_norm": 3.202871799468994, |
|
"learning_rate": 1.164950007226698e-05, |
|
"loss": 0.4881, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.02187120291616039, |
|
"grad_norm": 4.895542144775391, |
|
"learning_rate": 1.2969187506394062e-05, |
|
"loss": 0.3696, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.024301336573511544, |
|
"grad_norm": 2.880216360092163, |
|
"learning_rate": 1.4084967333570947e-05, |
|
"loss": 0.4236, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.026731470230862697, |
|
"grad_norm": 1.7968358993530273, |
|
"learning_rate": 1.505149978319906e-05, |
|
"loss": 0.2569, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.02916160388821385, |
|
"grad_norm": 2.6668541431427, |
|
"learning_rate": 1.590404182398875e-05, |
|
"loss": 0.5213, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.031591737545565005, |
|
"grad_norm": 2.0502774715423584, |
|
"learning_rate": 1.666666666666667e-05, |
|
"loss": 0.3246, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.03402187120291616, |
|
"grad_norm": 1.6508930921554565, |
|
"learning_rate": 1.7356544752637084e-05, |
|
"loss": 0.1883, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.03645200486026731, |
|
"grad_norm": 1.4152283668518066, |
|
"learning_rate": 1.7986354100793748e-05, |
|
"loss": 0.2409, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.038882138517618466, |
|
"grad_norm": 2.176948308944702, |
|
"learning_rate": 1.8565722538447282e-05, |
|
"loss": 0.2042, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.041312272174969626, |
|
"grad_norm": 2.2497684955596924, |
|
"learning_rate": 1.9102133927970633e-05, |
|
"loss": 0.2568, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.04374240583232078, |
|
"grad_norm": 2.5578770637512207, |
|
"learning_rate": 1.9601520984261358e-05, |
|
"loss": 0.1914, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.046172539489671933, |
|
"grad_norm": 1.6818372011184692, |
|
"learning_rate": 2.0068666377598747e-05, |
|
"loss": 0.2355, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.04860267314702309, |
|
"grad_norm": 1.2788841724395752, |
|
"learning_rate": 2.0507482022971233e-05, |
|
"loss": 0.2149, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.05103280680437424, |
|
"grad_norm": 1.4694865942001343, |
|
"learning_rate": 2.0921208418388435e-05, |
|
"loss": 0.1945, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.053462940461725394, |
|
"grad_norm": 1.3222826719284058, |
|
"learning_rate": 2.1312560015880482e-05, |
|
"loss": 0.271, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.05589307411907655, |
|
"grad_norm": 1.3958441019058228, |
|
"learning_rate": 2.1683833261066357e-05, |
|
"loss": 0.2217, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.0583232077764277, |
|
"grad_norm": 1.6378260850906372, |
|
"learning_rate": 2.2036988245565324e-05, |
|
"loss": 0.235, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.060753341433778855, |
|
"grad_norm": 1.7126753330230713, |
|
"learning_rate": 2.2373711347036773e-05, |
|
"loss": 0.1788, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.06318347509113001, |
|
"grad_norm": 1.4920830726623535, |
|
"learning_rate": 2.269546393362655e-05, |
|
"loss": 0.2211, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.06561360874848117, |
|
"grad_norm": 1.6582273244857788, |
|
"learning_rate": 2.3003520695193437e-05, |
|
"loss": 0.1942, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.06804374240583232, |
|
"grad_norm": 1.3107630014419556, |
|
"learning_rate": 2.329900014453396e-05, |
|
"loss": 0.1218, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.07047387606318348, |
|
"grad_norm": 1.492126226425171, |
|
"learning_rate": 2.3582889132846968e-05, |
|
"loss": 0.1473, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.07290400972053462, |
|
"grad_norm": 2.5211129188537598, |
|
"learning_rate": 2.3856062735983123e-05, |
|
"loss": 0.3478, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.07533414337788578, |
|
"grad_norm": 1.6695655584335327, |
|
"learning_rate": 2.4119300522370322e-05, |
|
"loss": 0.1309, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.07776427703523693, |
|
"grad_norm": 1.708465576171875, |
|
"learning_rate": 2.4373299964982603e-05, |
|
"loss": 0.2028, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.08019441069258809, |
|
"grad_norm": 1.2873278856277466, |
|
"learning_rate": 2.4618687578661044e-05, |
|
"loss": 0.1629, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.08262454434993925, |
|
"grad_norm": 1.6095136404037476, |
|
"learning_rate": 2.4856028230571212e-05, |
|
"loss": 0.2027, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.0850546780072904, |
|
"grad_norm": 2.230327844619751, |
|
"learning_rate": 2.5085832971998436e-05, |
|
"loss": 0.2505, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.08748481166464156, |
|
"grad_norm": 0.9581132531166077, |
|
"learning_rate": 2.530856566463146e-05, |
|
"loss": 0.1652, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.0899149453219927, |
|
"grad_norm": 2.2543814182281494, |
|
"learning_rate": 2.552464861737092e-05, |
|
"loss": 0.2478, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.09234507897934387, |
|
"grad_norm": 1.2664082050323486, |
|
"learning_rate": 2.5734467405837933e-05, |
|
"loss": 0.1083, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.09477521263669501, |
|
"grad_norm": 1.69247567653656, |
|
"learning_rate": 2.5938375012788124e-05, |
|
"loss": 0.3731, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.09720534629404617, |
|
"grad_norm": 1.6350871324539185, |
|
"learning_rate": 2.6136695401116585e-05, |
|
"loss": 0.3767, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.09963547995139732, |
|
"grad_norm": 1.7677048444747925, |
|
"learning_rate": 2.6329726610280168e-05, |
|
"loss": 0.1207, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.10206561360874848, |
|
"grad_norm": 1.1837761402130127, |
|
"learning_rate": 2.651774345044166e-05, |
|
"loss": 0.119, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.10449574726609964, |
|
"grad_norm": 1.1551034450531006, |
|
"learning_rate": 2.6700999855466042e-05, |
|
"loss": 0.1566, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.10692588092345079, |
|
"grad_norm": 1.4947562217712402, |
|
"learning_rate": 2.687973094532893e-05, |
|
"loss": 0.2224, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.10935601458080195, |
|
"grad_norm": 1.3775482177734375, |
|
"learning_rate": 2.7054154839965013e-05, |
|
"loss": 0.158, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.1117861482381531, |
|
"grad_norm": 1.6749316453933716, |
|
"learning_rate": 2.722447425965978e-05, |
|
"loss": 0.1058, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.11421628189550426, |
|
"grad_norm": 0.8986448049545288, |
|
"learning_rate": 2.739087794143646e-05, |
|
"loss": 0.1299, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.1166464155528554, |
|
"grad_norm": 1.2980709075927734, |
|
"learning_rate": 2.755354189625573e-05, |
|
"loss": 0.1029, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.11907654921020656, |
|
"grad_norm": 1.271391749382019, |
|
"learning_rate": 2.771263052802624e-05, |
|
"loss": 0.1576, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.12150668286755771, |
|
"grad_norm": 1.4220929145812988, |
|
"learning_rate": 2.7868297632261957e-05, |
|
"loss": 0.1338, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.12393681652490887, |
|
"grad_norm": 0.962883472442627, |
|
"learning_rate": 2.8020687289593123e-05, |
|
"loss": 0.1546, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.12636695018226002, |
|
"grad_norm": 0.8322131633758545, |
|
"learning_rate": 2.8169934667141895e-05, |
|
"loss": 0.1877, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.12879708383961117, |
|
"grad_norm": 1.5774791240692139, |
|
"learning_rate": 2.8316166738933646e-05, |
|
"loss": 0.1473, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.13122721749696234, |
|
"grad_norm": 1.2331901788711548, |
|
"learning_rate": 2.845950293496561e-05, |
|
"loss": 0.226, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.1336573511543135, |
|
"grad_norm": 2.1382243633270264, |
|
"learning_rate": 2.8600055727246657e-05, |
|
"loss": 0.2057, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.13608748481166463, |
|
"grad_norm": 0.8981903195381165, |
|
"learning_rate": 2.8737931160013153e-05, |
|
"loss": 0.1151, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.1385176184690158, |
|
"grad_norm": 2.6175801753997803, |
|
"learning_rate": 2.8873229330382812e-05, |
|
"loss": 0.187, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.14094775212636695, |
|
"grad_norm": 2.0045104026794434, |
|
"learning_rate": 2.9006044824904066e-05, |
|
"loss": 0.1901, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.1433778857837181, |
|
"grad_norm": 1.3968229293823242, |
|
"learning_rate": 2.913646711677001e-05, |
|
"loss": 0.1513, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.14580801944106925, |
|
"grad_norm": 8.191553115844727, |
|
"learning_rate": 2.926458092787486e-05, |
|
"loss": 0.1286, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.14823815309842042, |
|
"grad_norm": 2.5880401134490967, |
|
"learning_rate": 2.939046655938229e-05, |
|
"loss": 0.1799, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.15066828675577157, |
|
"grad_norm": 1.9319133758544922, |
|
"learning_rate": 2.951420019403574e-05, |
|
"loss": 0.2835, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.15309842041312272, |
|
"grad_norm": 2.653153419494629, |
|
"learning_rate": 2.963585417306073e-05, |
|
"loss": 0.1936, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.15552855407047386, |
|
"grad_norm": 1.4456685781478882, |
|
"learning_rate": 2.9755497250179453e-05, |
|
"loss": 0.2256, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.15795868772782504, |
|
"grad_norm": 1.168137788772583, |
|
"learning_rate": 2.98731948249709e-05, |
|
"loss": 0.1318, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.16038882138517618, |
|
"grad_norm": 1.0261298418045044, |
|
"learning_rate": 2.9989009157559694e-05, |
|
"loss": 0.1738, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.16281895504252733, |
|
"grad_norm": 2.1802849769592285, |
|
"learning_rate": 3.010299956639812e-05, |
|
"loss": 0.1304, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.1652490886998785, |
|
"grad_norm": 0.9934577941894531, |
|
"learning_rate": 3.021522261071426e-05, |
|
"loss": 0.1472, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.16767922235722965, |
|
"grad_norm": 1.7603391408920288, |
|
"learning_rate": 3.0325732259031143e-05, |
|
"loss": 0.0965, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.1701093560145808, |
|
"grad_norm": 1.1827404499053955, |
|
"learning_rate": 3.043458004501377e-05, |
|
"loss": 0.0957, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.17253948967193194, |
|
"grad_norm": 1.950810432434082, |
|
"learning_rate": 3.054181521177061e-05, |
|
"loss": 0.1916, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.17496962332928312, |
|
"grad_norm": 1.1083086729049683, |
|
"learning_rate": 3.064748484562093e-05, |
|
"loss": 0.1189, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.17739975698663427, |
|
"grad_norm": 1.8297152519226074, |
|
"learning_rate": 3.0751634000237615e-05, |
|
"loss": 0.1802, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.1798298906439854, |
|
"grad_norm": 1.1761589050292969, |
|
"learning_rate": 3.085430581198459e-05, |
|
"loss": 0.1516, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.1822600243013366, |
|
"grad_norm": 1.0804104804992676, |
|
"learning_rate": 3.095554160718781e-05, |
|
"loss": 0.1177, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.18469015795868773, |
|
"grad_norm": 1.3176584243774414, |
|
"learning_rate": 3.10553810020076e-05, |
|
"loss": 0.2232, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.18712029161603888, |
|
"grad_norm": 0.8258953094482422, |
|
"learning_rate": 3.115386199551628e-05, |
|
"loss": 0.0939, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.18955042527339003, |
|
"grad_norm": 1.2120673656463623, |
|
"learning_rate": 3.1251021056528336e-05, |
|
"loss": 0.0949, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.1919805589307412, |
|
"grad_norm": 2.156952142715454, |
|
"learning_rate": 3.134689320467986e-05, |
|
"loss": 0.1487, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.19441069258809235, |
|
"grad_norm": 1.1182252168655396, |
|
"learning_rate": 3.144151208620804e-05, |
|
"loss": 0.1251, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.1968408262454435, |
|
"grad_norm": 0.9124034643173218, |
|
"learning_rate": 3.1534910044841344e-05, |
|
"loss": 0.0881, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.19927095990279464, |
|
"grad_norm": 1.1577218770980835, |
|
"learning_rate": 3.1627118188174024e-05, |
|
"loss": 0.1401, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.20170109356014582, |
|
"grad_norm": 0.8857666254043579, |
|
"learning_rate": 3.171816644986573e-05, |
|
"loss": 0.1202, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.20413122721749696, |
|
"grad_norm": 0.8261206746101379, |
|
"learning_rate": 3.18080836479775e-05, |
|
"loss": 0.1156, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.2065613608748481, |
|
"grad_norm": 2.775029182434082, |
|
"learning_rate": 3.1896897539728616e-05, |
|
"loss": 0.1777, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.20899149453219928, |
|
"grad_norm": 0.9188308715820312, |
|
"learning_rate": 3.198463487293457e-05, |
|
"loss": 0.0676, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.21142162818955043, |
|
"grad_norm": 1.3610761165618896, |
|
"learning_rate": 3.207132143436469e-05, |
|
"loss": 0.1576, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.21385176184690158, |
|
"grad_norm": 1.8237483501434326, |
|
"learning_rate": 3.215698209523821e-05, |
|
"loss": 0.0858, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.21628189550425272, |
|
"grad_norm": 1.4661478996276855, |
|
"learning_rate": 3.224164085405946e-05, |
|
"loss": 0.1596, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.2187120291616039, |
|
"grad_norm": 0.8811361193656921, |
|
"learning_rate": 3.232532087697698e-05, |
|
"loss": 0.1661, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.22114216281895505, |
|
"grad_norm": 1.274592638015747, |
|
"learning_rate": 3.240804453583615e-05, |
|
"loss": 0.0793, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.2235722964763062, |
|
"grad_norm": 1.125353455543518, |
|
"learning_rate": 3.248983344408188e-05, |
|
"loss": 0.2034, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.22600243013365734, |
|
"grad_norm": 1.1274268627166748, |
|
"learning_rate": 3.2570708490655414e-05, |
|
"loss": 0.117, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.2284325637910085, |
|
"grad_norm": 0.8626168370246887, |
|
"learning_rate": 3.265068987201822e-05, |
|
"loss": 0.0469, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.23086269744835966, |
|
"grad_norm": 0.9799898862838745, |
|
"learning_rate": 3.2729797122425925e-05, |
|
"loss": 0.1822, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.2332928311057108, |
|
"grad_norm": 1.093001127243042, |
|
"learning_rate": 3.280804914256559e-05, |
|
"loss": 0.1182, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.23572296476306198, |
|
"grad_norm": 0.8328416347503662, |
|
"learning_rate": 3.288546422666164e-05, |
|
"loss": 0.1668, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.23815309842041313, |
|
"grad_norm": 1.4609471559524536, |
|
"learning_rate": 3.2962060088147464e-05, |
|
"loss": 0.1545, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.24058323207776428, |
|
"grad_norm": 1.207360029220581, |
|
"learning_rate": 3.3037853883992805e-05, |
|
"loss": 0.2099, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.24301336573511542, |
|
"grad_norm": 0.9304331541061401, |
|
"learning_rate": 3.3112862237770756e-05, |
|
"loss": 0.0543, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.24301336573511542, |
|
"eval_loss": 0.279557466506958, |
|
"eval_runtime": 507.19, |
|
"eval_samples_per_second": 5.418, |
|
"eval_steps_per_second": 0.678, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.2454434993924666, |
|
"grad_norm": 0.8421845436096191, |
|
"learning_rate": 3.3187101261541584e-05, |
|
"loss": 0.117, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.24787363304981774, |
|
"grad_norm": 1.1542550325393677, |
|
"learning_rate": 3.326058657662584e-05, |
|
"loss": 0.172, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.2503037667071689, |
|
"grad_norm": 1.1323540210723877, |
|
"learning_rate": 3.333333333333334e-05, |
|
"loss": 0.1541, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.25273390036452004, |
|
"grad_norm": 0.9894705414772034, |
|
"learning_rate": 3.340535622971072e-05, |
|
"loss": 0.0689, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.2551640340218712, |
|
"grad_norm": 1.0771222114562988, |
|
"learning_rate": 3.3476669529365295e-05, |
|
"loss": 0.1796, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.25759416767922233, |
|
"grad_norm": 0.9119389653205872, |
|
"learning_rate": 3.3547287078419544e-05, |
|
"loss": 0.1092, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.2600243013365735, |
|
"grad_norm": 1.194972038269043, |
|
"learning_rate": 3.361722232164634e-05, |
|
"loss": 0.1077, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.2624544349939247, |
|
"grad_norm": 1.0005968809127808, |
|
"learning_rate": 3.3686488317832306e-05, |
|
"loss": 0.2024, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.2648845686512758, |
|
"grad_norm": 1.0404926538467407, |
|
"learning_rate": 3.375509775441284e-05, |
|
"loss": 0.0672, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.267314702308627, |
|
"grad_norm": 0.9089542031288147, |
|
"learning_rate": 3.382306296142016e-05, |
|
"loss": 0.0831, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.26974483596597815, |
|
"grad_norm": 1.2017402648925781, |
|
"learning_rate": 3.38903959247825e-05, |
|
"loss": 0.1336, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.27217496962332927, |
|
"grad_norm": 0.9206855893135071, |
|
"learning_rate": 3.395710829901039e-05, |
|
"loss": 0.0721, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.27460510328068044, |
|
"grad_norm": 1.4966216087341309, |
|
"learning_rate": 3.402321141930376e-05, |
|
"loss": 0.124, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.2770352369380316, |
|
"grad_norm": 0.7973242998123169, |
|
"learning_rate": 3.4088716313110955e-05, |
|
"loss": 0.0597, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.27946537059538273, |
|
"grad_norm": 0.941777765750885, |
|
"learning_rate": 3.415363371116969e-05, |
|
"loss": 0.0737, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.2818955042527339, |
|
"grad_norm": 0.9339935183525085, |
|
"learning_rate": 3.4217974058057e-05, |
|
"loss": 0.1308, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.284325637910085, |
|
"grad_norm": 1.0190166234970093, |
|
"learning_rate": 3.428174752227455e-05, |
|
"loss": 0.117, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.2867557715674362, |
|
"grad_norm": 1.7908596992492676, |
|
"learning_rate": 3.434496400589353e-05, |
|
"loss": 0.1853, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.2891859052247874, |
|
"grad_norm": 1.1210516691207886, |
|
"learning_rate": 3.440763315378198e-05, |
|
"loss": 0.1775, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.2916160388821385, |
|
"grad_norm": 0.916373610496521, |
|
"learning_rate": 3.446976436243603e-05, |
|
"loss": 0.0902, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.29404617253948967, |
|
"grad_norm": 1.1219305992126465, |
|
"learning_rate": 3.4531366788435425e-05, |
|
"loss": 0.1742, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.29647630619684084, |
|
"grad_norm": 0.6167263388633728, |
|
"learning_rate": 3.459244935654219e-05, |
|
"loss": 0.0674, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.29890643985419196, |
|
"grad_norm": 1.4466544389724731, |
|
"learning_rate": 3.465302076746041e-05, |
|
"loss": 0.1252, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.30133657351154314, |
|
"grad_norm": 1.1142324209213257, |
|
"learning_rate": 3.471308950527417e-05, |
|
"loss": 0.2289, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.3037667071688943, |
|
"grad_norm": 1.2791327238082886, |
|
"learning_rate": 3.477266384457914e-05, |
|
"loss": 0.0885, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.30619684082624543, |
|
"grad_norm": 0.87410968542099, |
|
"learning_rate": 3.48317518573233e-05, |
|
"loss": 0.1188, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.3086269744835966, |
|
"grad_norm": 1.3250837326049805, |
|
"learning_rate": 3.489036141937059e-05, |
|
"loss": 0.1572, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.3110571081409477, |
|
"grad_norm": 1.2806202173233032, |
|
"learning_rate": 3.494850021680094e-05, |
|
"loss": 0.1576, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.3134872417982989, |
|
"grad_norm": 1.1694215536117554, |
|
"learning_rate": 3.500617575195938e-05, |
|
"loss": 0.0828, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.3159173754556501, |
|
"grad_norm": 1.1224102973937988, |
|
"learning_rate": 3.5063395349265945e-05, |
|
"loss": 0.197, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.3183475091130012, |
|
"grad_norm": 5.477957248687744, |
|
"learning_rate": 3.5120166160797804e-05, |
|
"loss": 0.1175, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.32077764277035237, |
|
"grad_norm": 0.8001928925514221, |
|
"learning_rate": 3.517649517165415e-05, |
|
"loss": 0.0827, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.32320777642770354, |
|
"grad_norm": 0.8747657537460327, |
|
"learning_rate": 3.523238920511395e-05, |
|
"loss": 0.0611, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.32563791008505466, |
|
"grad_norm": 0.7952237725257874, |
|
"learning_rate": 3.528785492759607e-05, |
|
"loss": 0.0875, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.32806804374240583, |
|
"grad_norm": 1.1550030708312988, |
|
"learning_rate": 3.5342898853430836e-05, |
|
"loss": 0.1051, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.330498177399757, |
|
"grad_norm": 1.0629292726516724, |
|
"learning_rate": 3.539752734945143e-05, |
|
"loss": 0.0615, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.33292831105710813, |
|
"grad_norm": 1.4681527614593506, |
|
"learning_rate": 3.5451746639413466e-05, |
|
"loss": 0.1419, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.3353584447144593, |
|
"grad_norm": 1.924991488456726, |
|
"learning_rate": 3.550556280825011e-05, |
|
"loss": 0.1506, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.3377885783718105, |
|
"grad_norm": 1.485156774520874, |
|
"learning_rate": 3.55589818061703e-05, |
|
"loss": 0.1149, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.3402187120291616, |
|
"grad_norm": 1.5923975706100464, |
|
"learning_rate": 3.561200945260678e-05, |
|
"loss": 0.1039, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.34264884568651277, |
|
"grad_norm": 1.22085440158844, |
|
"learning_rate": 3.5664651440020616e-05, |
|
"loss": 0.1192, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.3450789793438639, |
|
"grad_norm": 1.2707493305206299, |
|
"learning_rate": 3.571691333756825e-05, |
|
"loss": 0.1605, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.34750911300121506, |
|
"grad_norm": 1.3047083616256714, |
|
"learning_rate": 3.5768800594637304e-05, |
|
"loss": 0.085, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.34993924665856624, |
|
"grad_norm": 1.43068265914917, |
|
"learning_rate": 3.582031854425634e-05, |
|
"loss": 0.1063, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.35236938031591736, |
|
"grad_norm": 1.2630374431610107, |
|
"learning_rate": 3.587147240638428e-05, |
|
"loss": 0.0877, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.35479951397326853, |
|
"grad_norm": 0.7847033143043518, |
|
"learning_rate": 3.5922267291084366e-05, |
|
"loss": 0.0309, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.3572296476306197, |
|
"grad_norm": 1.0574724674224854, |
|
"learning_rate": 3.5972708201587496e-05, |
|
"loss": 0.1964, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.3596597812879708, |
|
"grad_norm": 1.662292718887329, |
|
"learning_rate": 3.6022800037249585e-05, |
|
"loss": 0.0853, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.362089914945322, |
|
"grad_norm": 2.2652666568756104, |
|
"learning_rate": 3.607254759640729e-05, |
|
"loss": 0.1134, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.3645200486026732, |
|
"grad_norm": 2.5058281421661377, |
|
"learning_rate": 3.612195557913627e-05, |
|
"loss": 0.0678, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.3669501822600243, |
|
"grad_norm": 1.5377355813980103, |
|
"learning_rate": 3.6171028589915954e-05, |
|
"loss": 0.2186, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.36938031591737547, |
|
"grad_norm": 1.1862763166427612, |
|
"learning_rate": 3.6219771140204575e-05, |
|
"loss": 0.1957, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.3718104495747266, |
|
"grad_norm": 0.6389171481132507, |
|
"learning_rate": 3.626818765092802e-05, |
|
"loss": 0.1293, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.37424058323207776, |
|
"grad_norm": 0.7620474696159363, |
|
"learning_rate": 3.6316282454886157e-05, |
|
"loss": 0.1237, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.37667071688942894, |
|
"grad_norm": 2.63171124458313, |
|
"learning_rate": 3.636405979907955e-05, |
|
"loss": 0.1276, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.37910085054678005, |
|
"grad_norm": 1.1460998058319092, |
|
"learning_rate": 3.6411523846959985e-05, |
|
"loss": 0.0713, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.38153098420413123, |
|
"grad_norm": 0.7793575525283813, |
|
"learning_rate": 3.645867868060772e-05, |
|
"loss": 0.0684, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.3839611178614824, |
|
"grad_norm": 0.973029613494873, |
|
"learning_rate": 3.6505528302838193e-05, |
|
"loss": 0.0667, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.3863912515188335, |
|
"grad_norm": 2.509524345397949, |
|
"learning_rate": 3.6552076639241027e-05, |
|
"loss": 0.2404, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.3888213851761847, |
|
"grad_norm": 1.7816277742385864, |
|
"learning_rate": 3.65983275401539e-05, |
|
"loss": 0.0928, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.39125151883353587, |
|
"grad_norm": 0.7536255121231079, |
|
"learning_rate": 3.664428478257371e-05, |
|
"loss": 0.1489, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.393681652490887, |
|
"grad_norm": 0.8865494132041931, |
|
"learning_rate": 3.668995207200753e-05, |
|
"loss": 0.1395, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.39611178614823817, |
|
"grad_norm": 1.5424951314926147, |
|
"learning_rate": 3.673533304426541e-05, |
|
"loss": 0.2781, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.3985419198055893, |
|
"grad_norm": 1.0128049850463867, |
|
"learning_rate": 3.67804312671975e-05, |
|
"loss": 0.161, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.40097205346294046, |
|
"grad_norm": 1.1372522115707397, |
|
"learning_rate": 3.682525024237719e-05, |
|
"loss": 0.1299, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.40340218712029163, |
|
"grad_norm": 1.0910232067108154, |
|
"learning_rate": 3.6869793406732636e-05, |
|
"loss": 0.0693, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.40583232077764275, |
|
"grad_norm": 0.8842087388038635, |
|
"learning_rate": 3.69140641341283e-05, |
|
"loss": 0.136, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.4082624544349939, |
|
"grad_norm": 1.3073561191558838, |
|
"learning_rate": 3.695806573689844e-05, |
|
"loss": 0.0366, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.4106925880923451, |
|
"grad_norm": 1.3095803260803223, |
|
"learning_rate": 3.700180146733426e-05, |
|
"loss": 0.1218, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.4131227217496962, |
|
"grad_norm": 1.3316633701324463, |
|
"learning_rate": 3.704527451912639e-05, |
|
"loss": 0.14, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.4155528554070474, |
|
"grad_norm": 0.7762789130210876, |
|
"learning_rate": 3.708848802876438e-05, |
|
"loss": 0.1055, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.41798298906439857, |
|
"grad_norm": 1.0042897462844849, |
|
"learning_rate": 3.7131445076894564e-05, |
|
"loss": 0.0627, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.4204131227217497, |
|
"grad_norm": 2.6669344902038574, |
|
"learning_rate": 3.717414868963791e-05, |
|
"loss": 0.1243, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.42284325637910086, |
|
"grad_norm": 0.6236822605133057, |
|
"learning_rate": 3.721660183986924e-05, |
|
"loss": 0.0497, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.425273390036452, |
|
"grad_norm": 1.0069197416305542, |
|
"learning_rate": 3.725880744845915e-05, |
|
"loss": 0.1257, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.42770352369380316, |
|
"grad_norm": 0.8991190195083618, |
|
"learning_rate": 3.730076838547993e-05, |
|
"loss": 0.1139, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.43013365735115433, |
|
"grad_norm": 1.1606578826904297, |
|
"learning_rate": 3.734248747137666e-05, |
|
"loss": 0.0454, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.43256379100850545, |
|
"grad_norm": 1.6333225965499878, |
|
"learning_rate": 3.738396747810492e-05, |
|
"loss": 0.0885, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.4349939246658566, |
|
"grad_norm": 1.1033371686935425, |
|
"learning_rate": 3.7425211130235834e-05, |
|
"loss": 0.139, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.4374240583232078, |
|
"grad_norm": 0.5453643202781677, |
|
"learning_rate": 3.7466221106030115e-05, |
|
"loss": 0.0543, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.4398541919805589, |
|
"grad_norm": 0.8695247769355774, |
|
"learning_rate": 3.750700003848157e-05, |
|
"loss": 0.1419, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.4422843256379101, |
|
"grad_norm": 1.7627193927764893, |
|
"learning_rate": 3.7547550516331555e-05, |
|
"loss": 0.1566, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.44471445929526127, |
|
"grad_norm": 0.7778682708740234, |
|
"learning_rate": 3.75878750850551e-05, |
|
"loss": 0.0559, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.4471445929526124, |
|
"grad_norm": 0.8081510663032532, |
|
"learning_rate": 3.7627976247819744e-05, |
|
"loss": 0.0729, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.44957472660996356, |
|
"grad_norm": 0.6429978013038635, |
|
"learning_rate": 3.766785646641792e-05, |
|
"loss": 0.0474, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.4520048602673147, |
|
"grad_norm": 1.088581919670105, |
|
"learning_rate": 3.770751816217383e-05, |
|
"loss": 0.0644, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.45443499392466585, |
|
"grad_norm": 1.4708858728408813, |
|
"learning_rate": 3.7746963716825615e-05, |
|
"loss": 0.1651, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.456865127582017, |
|
"grad_norm": 0.8583030700683594, |
|
"learning_rate": 3.778619547338356e-05, |
|
"loss": 0.0822, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.45929526123936815, |
|
"grad_norm": 1.0450993776321411, |
|
"learning_rate": 3.782521573696528e-05, |
|
"loss": 0.0731, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.4617253948967193, |
|
"grad_norm": 1.1490970849990845, |
|
"learning_rate": 3.786402677560832e-05, |
|
"loss": 0.1354, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.4641555285540705, |
|
"grad_norm": 0.5680958032608032, |
|
"learning_rate": 3.790263082106134e-05, |
|
"loss": 0.0836, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.4665856622114216, |
|
"grad_norm": 0.7936691641807556, |
|
"learning_rate": 3.794103006955407e-05, |
|
"loss": 0.0526, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.4690157958687728, |
|
"grad_norm": 1.0569026470184326, |
|
"learning_rate": 3.797922668254715e-05, |
|
"loss": 0.1512, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.47144592952612396, |
|
"grad_norm": 1.2363556623458862, |
|
"learning_rate": 3.801722278746213e-05, |
|
"loss": 0.1316, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.4738760631834751, |
|
"grad_norm": 1.1622111797332764, |
|
"learning_rate": 3.8055020478392495e-05, |
|
"loss": 0.1432, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.47630619684082626, |
|
"grad_norm": 2.1137237548828125, |
|
"learning_rate": 3.809262181679623e-05, |
|
"loss": 0.1273, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.4787363304981774, |
|
"grad_norm": 1.0623483657836914, |
|
"learning_rate": 3.813002883217044e-05, |
|
"loss": 0.1066, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.48116646415552855, |
|
"grad_norm": 1.0300410985946655, |
|
"learning_rate": 3.816724352270863e-05, |
|
"loss": 0.1443, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.4835965978128797, |
|
"grad_norm": 0.6886430382728577, |
|
"learning_rate": 3.8204267855941266e-05, |
|
"loss": 0.0969, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.48602673147023084, |
|
"grad_norm": 0.6511848568916321, |
|
"learning_rate": 3.824110376935989e-05, |
|
"loss": 0.0791, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.48602673147023084, |
|
"eval_loss": 0.311313271522522, |
|
"eval_runtime": 505.4888, |
|
"eval_samples_per_second": 5.436, |
|
"eval_steps_per_second": 0.681, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.488456865127582, |
|
"grad_norm": 0.9628679752349854, |
|
"learning_rate": 3.827775317102552e-05, |
|
"loss": 0.0952, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 0.4908869987849332, |
|
"grad_norm": 1.1625686883926392, |
|
"learning_rate": 3.831421794016178e-05, |
|
"loss": 0.1665, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 0.4933171324422843, |
|
"grad_norm": 1.7337137460708618, |
|
"learning_rate": 3.835049992773302e-05, |
|
"loss": 0.141, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 0.4957472660996355, |
|
"grad_norm": 1.1475183963775635, |
|
"learning_rate": 3.838660095700815e-05, |
|
"loss": 0.0996, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.49817739975698666, |
|
"grad_norm": 0.6650044918060303, |
|
"learning_rate": 3.84225228241104e-05, |
|
"loss": 0.0611, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.5006075334143378, |
|
"grad_norm": 0.6770364046096802, |
|
"learning_rate": 3.8458267298553554e-05, |
|
"loss": 0.051, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 0.503037667071689, |
|
"grad_norm": 1.22215735912323, |
|
"learning_rate": 3.8493836123764984e-05, |
|
"loss": 0.0724, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 0.5054678007290401, |
|
"grad_norm": 1.1168265342712402, |
|
"learning_rate": 3.852923101759591e-05, |
|
"loss": 0.116, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.5078979343863913, |
|
"grad_norm": 0.7812952995300293, |
|
"learning_rate": 3.856445367281923e-05, |
|
"loss": 0.0726, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 0.5103280680437424, |
|
"grad_norm": 0.7324075698852539, |
|
"learning_rate": 3.859950575761529e-05, |
|
"loss": 0.0359, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.5127582017010935, |
|
"grad_norm": 0.8063955903053284, |
|
"learning_rate": 3.8634388916046025e-05, |
|
"loss": 0.0912, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 0.5151883353584447, |
|
"grad_norm": 4.057889938354492, |
|
"learning_rate": 3.866910476851757e-05, |
|
"loss": 0.2407, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.5176184690157959, |
|
"grad_norm": 0.7670718431472778, |
|
"learning_rate": 3.870365491223199e-05, |
|
"loss": 0.058, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 0.520048602673147, |
|
"grad_norm": 0.717004120349884, |
|
"learning_rate": 3.8738040921628215e-05, |
|
"loss": 0.0308, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 0.5224787363304981, |
|
"grad_norm": 1.2053970098495483, |
|
"learning_rate": 3.877226434881253e-05, |
|
"loss": 0.091, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.5249088699878494, |
|
"grad_norm": 1.3508349657058716, |
|
"learning_rate": 3.880632672397897e-05, |
|
"loss": 0.1201, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.5273390036452005, |
|
"grad_norm": 0.6895163059234619, |
|
"learning_rate": 3.884022955581985e-05, |
|
"loss": 0.0403, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 0.5297691373025516, |
|
"grad_norm": 0.7722972631454468, |
|
"learning_rate": 3.887397433192676e-05, |
|
"loss": 0.0984, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 0.5321992709599028, |
|
"grad_norm": 0.897186279296875, |
|
"learning_rate": 3.890756251918219e-05, |
|
"loss": 0.1564, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 0.534629404617254, |
|
"grad_norm": 0.5847256183624268, |
|
"learning_rate": 3.894099556414216e-05, |
|
"loss": 0.0905, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.5370595382746051, |
|
"grad_norm": 0.7907549142837524, |
|
"learning_rate": 3.897427489341009e-05, |
|
"loss": 0.075, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 0.5394896719319563, |
|
"grad_norm": 0.8477361798286438, |
|
"learning_rate": 3.900740191400198e-05, |
|
"loss": 0.028, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.5419198055893074, |
|
"grad_norm": 1.1732168197631836, |
|
"learning_rate": 3.904037801370344e-05, |
|
"loss": 0.0921, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 0.5443499392466585, |
|
"grad_norm": 0.9721484184265137, |
|
"learning_rate": 3.9073204561418514e-05, |
|
"loss": 0.1273, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.5467800729040098, |
|
"grad_norm": 0.6401745676994324, |
|
"learning_rate": 3.9105882907510644e-05, |
|
"loss": 0.0433, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.5492102065613609, |
|
"grad_norm": 1.2300125360488892, |
|
"learning_rate": 3.913841438413601e-05, |
|
"loss": 0.0574, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 0.551640340218712, |
|
"grad_norm": 0.9645660519599915, |
|
"learning_rate": 3.917080030556938e-05, |
|
"loss": 0.0491, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 0.5540704738760632, |
|
"grad_norm": 1.5600403547286987, |
|
"learning_rate": 3.9203041968522716e-05, |
|
"loss": 0.1232, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.5565006075334143, |
|
"grad_norm": 1.07868230342865, |
|
"learning_rate": 3.923514065245669e-05, |
|
"loss": 0.0674, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 0.5589307411907655, |
|
"grad_norm": 0.933269739151001, |
|
"learning_rate": 3.926709761988538e-05, |
|
"loss": 0.0753, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.5613608748481167, |
|
"grad_norm": 2.12733793258667, |
|
"learning_rate": 3.929891411667424e-05, |
|
"loss": 0.1144, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 0.5637910085054678, |
|
"grad_norm": 1.380554437637329, |
|
"learning_rate": 3.933059137233147e-05, |
|
"loss": 0.0843, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.5662211421628189, |
|
"grad_norm": 1.0420360565185547, |
|
"learning_rate": 3.9362130600293214e-05, |
|
"loss": 0.111, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 0.56865127582017, |
|
"grad_norm": 0.6409225463867188, |
|
"learning_rate": 3.9393532998202405e-05, |
|
"loss": 0.0425, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.5710814094775213, |
|
"grad_norm": 0.8021889328956604, |
|
"learning_rate": 3.942479974818166e-05, |
|
"loss": 0.0386, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.5735115431348724, |
|
"grad_norm": 1.5499992370605469, |
|
"learning_rate": 3.945593201710032e-05, |
|
"loss": 0.0404, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.5759416767922235, |
|
"grad_norm": 0.7606313824653625, |
|
"learning_rate": 3.9486930956835724e-05, |
|
"loss": 0.0789, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 0.5783718104495748, |
|
"grad_norm": 0.9025908708572388, |
|
"learning_rate": 3.951779770452894e-05, |
|
"loss": 0.0617, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.5808019441069259, |
|
"grad_norm": 1.1557445526123047, |
|
"learning_rate": 3.954853338283512e-05, |
|
"loss": 0.0751, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 0.583232077764277, |
|
"grad_norm": 1.5104789733886719, |
|
"learning_rate": 3.9579139100168404e-05, |
|
"loss": 0.0732, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.5856622114216282, |
|
"grad_norm": 0.9768268465995789, |
|
"learning_rate": 3.960961595094187e-05, |
|
"loss": 0.0797, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 0.5880923450789793, |
|
"grad_norm": 0.7394477725028992, |
|
"learning_rate": 3.96399650158023e-05, |
|
"loss": 0.0397, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 0.5905224787363305, |
|
"grad_norm": 0.742852509021759, |
|
"learning_rate": 3.96701873618601e-05, |
|
"loss": 0.0742, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 0.5929526123936817, |
|
"grad_norm": 0.7512255311012268, |
|
"learning_rate": 3.970028404291448e-05, |
|
"loss": 0.0281, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.5953827460510328, |
|
"grad_norm": 0.6248149871826172, |
|
"learning_rate": 3.9730256099673865e-05, |
|
"loss": 0.0375, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.5978128797083839, |
|
"grad_norm": 0.8596628904342651, |
|
"learning_rate": 3.976010455997187e-05, |
|
"loss": 0.1213, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 0.6002430133657352, |
|
"grad_norm": 0.7119196057319641, |
|
"learning_rate": 3.978983043897883e-05, |
|
"loss": 0.0849, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 0.6026731470230863, |
|
"grad_norm": 0.8290873765945435, |
|
"learning_rate": 3.981943473940888e-05, |
|
"loss": 0.0531, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.6051032806804374, |
|
"grad_norm": 0.4561799168586731, |
|
"learning_rate": 3.984891845172299e-05, |
|
"loss": 0.0327, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 0.6075334143377886, |
|
"grad_norm": 0.8212061524391174, |
|
"learning_rate": 3.987828255432777e-05, |
|
"loss": 0.0769, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.6099635479951397, |
|
"grad_norm": 0.6895563006401062, |
|
"learning_rate": 3.9907528013770276e-05, |
|
"loss": 0.0405, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 0.6123936816524909, |
|
"grad_norm": 1.4649648666381836, |
|
"learning_rate": 3.993665578492894e-05, |
|
"loss": 0.1539, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.6148238153098421, |
|
"grad_norm": 1.6070215702056885, |
|
"learning_rate": 3.9965666811200624e-05, |
|
"loss": 0.2167, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 0.6172539489671932, |
|
"grad_norm": 0.9879380464553833, |
|
"learning_rate": 3.999456202468397e-05, |
|
"loss": 0.1, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 0.6196840826245443, |
|
"grad_norm": 0.8622350096702576, |
|
"learning_rate": 4.002334234635907e-05, |
|
"loss": 0.1184, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.6221142162818954, |
|
"grad_norm": 1.95242440700531, |
|
"learning_rate": 4.005200868626364e-05, |
|
"loss": 0.1317, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.6245443499392467, |
|
"grad_norm": 1.3259199857711792, |
|
"learning_rate": 4.008056194366564e-05, |
|
"loss": 0.1522, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 0.6269744835965978, |
|
"grad_norm": 0.9809350371360779, |
|
"learning_rate": 4.010900300723259e-05, |
|
"loss": 0.0581, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 0.6294046172539489, |
|
"grad_norm": 1.1465263366699219, |
|
"learning_rate": 4.013733275519749e-05, |
|
"loss": 0.1303, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 0.6318347509113001, |
|
"grad_norm": 1.4853742122650146, |
|
"learning_rate": 4.016555205552158e-05, |
|
"loss": 0.0983, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.6342648845686513, |
|
"grad_norm": 0.4413992166519165, |
|
"learning_rate": 4.0193661766053834e-05, |
|
"loss": 0.0473, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 0.6366950182260024, |
|
"grad_norm": 0.9577488899230957, |
|
"learning_rate": 4.022166273468753e-05, |
|
"loss": 0.15, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 0.6391251518833536, |
|
"grad_norm": 0.7132108211517334, |
|
"learning_rate": 4.024955579951363e-05, |
|
"loss": 0.1153, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 0.6415552855407047, |
|
"grad_norm": 0.7191299796104431, |
|
"learning_rate": 4.027734178897136e-05, |
|
"loss": 0.0538, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.6439854191980559, |
|
"grad_norm": 0.6709555983543396, |
|
"learning_rate": 4.030502152199576e-05, |
|
"loss": 0.1552, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.6464155528554071, |
|
"grad_norm": 0.8649526834487915, |
|
"learning_rate": 4.033259580816264e-05, |
|
"loss": 0.1492, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 0.6488456865127582, |
|
"grad_norm": 0.6234789490699768, |
|
"learning_rate": 4.036006544783052e-05, |
|
"loss": 0.0777, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 0.6512758201701093, |
|
"grad_norm": 1.0768671035766602, |
|
"learning_rate": 4.0387431232280135e-05, |
|
"loss": 0.177, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 0.6537059538274606, |
|
"grad_norm": 0.7391580939292908, |
|
"learning_rate": 4.041469394385112e-05, |
|
"loss": 0.1217, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 0.6561360874848117, |
|
"grad_norm": 0.5944250226020813, |
|
"learning_rate": 4.0441854356076257e-05, |
|
"loss": 0.0678, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.6585662211421628, |
|
"grad_norm": 0.5040566325187683, |
|
"learning_rate": 4.046891323381315e-05, |
|
"loss": 0.0865, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 0.660996354799514, |
|
"grad_norm": 1.0286433696746826, |
|
"learning_rate": 4.049587133337347e-05, |
|
"loss": 0.0643, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.6634264884568651, |
|
"grad_norm": 1.6537009477615356, |
|
"learning_rate": 4.0522729402649793e-05, |
|
"loss": 0.1008, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 0.6658566221142163, |
|
"grad_norm": 0.7121666669845581, |
|
"learning_rate": 4.0549488181240096e-05, |
|
"loss": 0.0865, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 0.6682867557715675, |
|
"grad_norm": 0.8037539720535278, |
|
"learning_rate": 4.057614840056998e-05, |
|
"loss": 0.0976, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.6707168894289186, |
|
"grad_norm": 0.6083033680915833, |
|
"learning_rate": 4.06027107840126e-05, |
|
"loss": 0.1118, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.6731470230862697, |
|
"grad_norm": 1.8657127618789673, |
|
"learning_rate": 4.0629176047006474e-05, |
|
"loss": 0.0523, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 0.675577156743621, |
|
"grad_norm": 0.6102950572967529, |
|
"learning_rate": 4.065554489717105e-05, |
|
"loss": 0.0926, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 0.6780072904009721, |
|
"grad_norm": 0.7026309370994568, |
|
"learning_rate": 4.068181803442029e-05, |
|
"loss": 0.0968, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 0.6804374240583232, |
|
"grad_norm": 0.6937738656997681, |
|
"learning_rate": 4.0707996151074147e-05, |
|
"loss": 0.0554, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.6828675577156743, |
|
"grad_norm": 1.4066294431686401, |
|
"learning_rate": 4.073407993196794e-05, |
|
"loss": 0.0964, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 0.6852976913730255, |
|
"grad_norm": 0.6314956545829773, |
|
"learning_rate": 4.076007005455996e-05, |
|
"loss": 0.1161, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 0.6877278250303767, |
|
"grad_norm": 0.9460674524307251, |
|
"learning_rate": 4.0785967189036986e-05, |
|
"loss": 0.044, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 0.6901579586877278, |
|
"grad_norm": 0.7385574579238892, |
|
"learning_rate": 4.0811771998418e-05, |
|
"loss": 0.118, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 0.692588092345079, |
|
"grad_norm": 0.7021672129631042, |
|
"learning_rate": 4.083748513865602e-05, |
|
"loss": 0.085, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.6950182260024301, |
|
"grad_norm": 0.6057882308959961, |
|
"learning_rate": 4.086310725873818e-05, |
|
"loss": 0.0389, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 0.6974483596597812, |
|
"grad_norm": 0.7348142862319946, |
|
"learning_rate": 4.0888639000783966e-05, |
|
"loss": 0.0888, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 0.6998784933171325, |
|
"grad_norm": 0.8555133938789368, |
|
"learning_rate": 4.0914081000141844e-05, |
|
"loss": 0.0917, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.7023086269744836, |
|
"grad_norm": 0.8100624084472656, |
|
"learning_rate": 4.0939433885484055e-05, |
|
"loss": 0.0811, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 0.7047387606318347, |
|
"grad_norm": 0.5672865509986877, |
|
"learning_rate": 4.0964698278899874e-05, |
|
"loss": 0.0389, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.707168894289186, |
|
"grad_norm": 0.842689573764801, |
|
"learning_rate": 4.0989874795987185e-05, |
|
"loss": 0.0887, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 0.7095990279465371, |
|
"grad_norm": 1.099148154258728, |
|
"learning_rate": 4.1014964045942465e-05, |
|
"loss": 0.0592, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 0.7120291616038882, |
|
"grad_norm": 1.0394737720489502, |
|
"learning_rate": 4.103996663164927e-05, |
|
"loss": 0.0568, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 0.7144592952612394, |
|
"grad_norm": 0.5482613444328308, |
|
"learning_rate": 4.106488314976513e-05, |
|
"loss": 0.0724, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 0.7168894289185905, |
|
"grad_norm": 0.6032484173774719, |
|
"learning_rate": 4.108971419080698e-05, |
|
"loss": 0.0448, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.7193195625759417, |
|
"grad_norm": 0.7295458316802979, |
|
"learning_rate": 4.111446033923516e-05, |
|
"loss": 0.093, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 0.7217496962332929, |
|
"grad_norm": 0.5472877621650696, |
|
"learning_rate": 4.113912217353596e-05, |
|
"loss": 0.0799, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 0.724179829890644, |
|
"grad_norm": 0.682966411113739, |
|
"learning_rate": 4.116370026630272e-05, |
|
"loss": 0.0575, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 0.7266099635479951, |
|
"grad_norm": 0.4737589657306671, |
|
"learning_rate": 4.118819518431564e-05, |
|
"loss": 0.0949, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 0.7290400972053463, |
|
"grad_norm": 0.6645620465278625, |
|
"learning_rate": 4.121260748862021e-05, |
|
"loss": 0.0233, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.7290400972053463, |
|
"eval_loss": 0.3244495689868927, |
|
"eval_runtime": 503.9011, |
|
"eval_samples_per_second": 5.453, |
|
"eval_steps_per_second": 0.683, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.7314702308626975, |
|
"grad_norm": 0.6423314809799194, |
|
"learning_rate": 4.123693773460426e-05, |
|
"loss": 0.0774, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 0.7339003645200486, |
|
"grad_norm": 0.6238884329795837, |
|
"learning_rate": 4.126118647207383e-05, |
|
"loss": 0.0638, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 0.7363304981773997, |
|
"grad_norm": 2.3978679180145264, |
|
"learning_rate": 4.1285354245327715e-05, |
|
"loss": 0.1022, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 0.7387606318347509, |
|
"grad_norm": 0.49952298402786255, |
|
"learning_rate": 4.1309441593230726e-05, |
|
"loss": 0.0477, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 0.741190765492102, |
|
"grad_norm": 1.5169883966445923, |
|
"learning_rate": 4.133344904928585e-05, |
|
"loss": 0.1601, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.7436208991494532, |
|
"grad_norm": 0.5875476002693176, |
|
"learning_rate": 4.1357377141705084e-05, |
|
"loss": 0.0866, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 0.7460510328068044, |
|
"grad_norm": 0.9874062538146973, |
|
"learning_rate": 4.1381226393479236e-05, |
|
"loss": 0.0901, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 0.7484811664641555, |
|
"grad_norm": 1.2504879236221313, |
|
"learning_rate": 4.1404997322446435e-05, |
|
"loss": 0.1227, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 0.7509113001215066, |
|
"grad_norm": 0.6713179349899292, |
|
"learning_rate": 4.142869044135967e-05, |
|
"loss": 0.0347, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 0.7533414337788579, |
|
"grad_norm": 0.8156313896179199, |
|
"learning_rate": 4.145230625795311e-05, |
|
"loss": 0.0548, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.755771567436209, |
|
"grad_norm": 0.9566905498504639, |
|
"learning_rate": 4.14758452750074e-05, |
|
"loss": 0.1255, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 0.7582017010935601, |
|
"grad_norm": 0.8393445611000061, |
|
"learning_rate": 4.149930799041392e-05, |
|
"loss": 0.0587, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 0.7606318347509113, |
|
"grad_norm": 0.637996256351471, |
|
"learning_rate": 4.152269489723788e-05, |
|
"loss": 0.0881, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 0.7630619684082625, |
|
"grad_norm": 0.8390913605690002, |
|
"learning_rate": 4.1546006483780626e-05, |
|
"loss": 0.0881, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 0.7654921020656136, |
|
"grad_norm": 0.7430179715156555, |
|
"learning_rate": 4.156924323364072e-05, |
|
"loss": 0.0409, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.7679222357229648, |
|
"grad_norm": 0.7785168886184692, |
|
"learning_rate": 4.1592405625774144e-05, |
|
"loss": 0.0511, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 0.7703523693803159, |
|
"grad_norm": 0.8135663866996765, |
|
"learning_rate": 4.161549413455358e-05, |
|
"loss": 0.0703, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 0.772782503037667, |
|
"grad_norm": 0.8496614694595337, |
|
"learning_rate": 4.163850922982668e-05, |
|
"loss": 0.1502, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 0.7752126366950183, |
|
"grad_norm": 0.8001265525817871, |
|
"learning_rate": 4.16614513769734e-05, |
|
"loss": 0.0917, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 0.7776427703523694, |
|
"grad_norm": 0.5384124517440796, |
|
"learning_rate": 4.1684321036962526e-05, |
|
"loss": 0.0574, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.7800729040097205, |
|
"grad_norm": 0.6082786917686462, |
|
"learning_rate": 4.170711866640721e-05, |
|
"loss": 0.0285, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 0.7825030376670717, |
|
"grad_norm": 0.6169834136962891, |
|
"learning_rate": 4.1729844717619684e-05, |
|
"loss": 0.0529, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 0.7849331713244229, |
|
"grad_norm": 1.1811317205429077, |
|
"learning_rate": 4.17524996386651e-05, |
|
"loss": 0.0449, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 0.787363304981774, |
|
"grad_norm": 0.7238284945487976, |
|
"learning_rate": 4.177508387341454e-05, |
|
"loss": 0.046, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 0.7897934386391251, |
|
"grad_norm": 1.2236160039901733, |
|
"learning_rate": 4.179759786159719e-05, |
|
"loss": 0.0871, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.7922235722964763, |
|
"grad_norm": 0.8143868446350098, |
|
"learning_rate": 4.182004203885172e-05, |
|
"loss": 0.0629, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 0.7946537059538274, |
|
"grad_norm": 0.33017951250076294, |
|
"learning_rate": 4.184241683677687e-05, |
|
"loss": 0.0117, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 0.7970838396111786, |
|
"grad_norm": 0.3505575358867645, |
|
"learning_rate": 4.1864722682981245e-05, |
|
"loss": 0.0158, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 0.7995139732685298, |
|
"grad_norm": 1.4309784173965454, |
|
"learning_rate": 4.188696000113232e-05, |
|
"loss": 0.0492, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 0.8019441069258809, |
|
"grad_norm": 1.023452639579773, |
|
"learning_rate": 4.190912921100477e-05, |
|
"loss": 0.0623, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.804374240583232, |
|
"grad_norm": 0.7057633996009827, |
|
"learning_rate": 4.1931230728527994e-05, |
|
"loss": 0.0468, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 0.8068043742405833, |
|
"grad_norm": 0.8735628724098206, |
|
"learning_rate": 4.195326496583291e-05, |
|
"loss": 0.0709, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 0.8092345078979344, |
|
"grad_norm": 3.259680986404419, |
|
"learning_rate": 4.1975232331298125e-05, |
|
"loss": 0.0491, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 0.8116646415552855, |
|
"grad_norm": 1.0592741966247559, |
|
"learning_rate": 4.1997133229595316e-05, |
|
"loss": 0.044, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 0.8140947752126367, |
|
"grad_norm": 0.5978744029998779, |
|
"learning_rate": 4.201896806173394e-05, |
|
"loss": 0.0563, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.8165249088699879, |
|
"grad_norm": 0.8989129662513733, |
|
"learning_rate": 4.2040737225105335e-05, |
|
"loss": 0.0582, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 0.818955042527339, |
|
"grad_norm": 0.9518970251083374, |
|
"learning_rate": 4.206244111352608e-05, |
|
"loss": 0.0764, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 0.8213851761846902, |
|
"grad_norm": 0.48601099848747253, |
|
"learning_rate": 4.2084080117280756e-05, |
|
"loss": 0.0495, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 0.8238153098420413, |
|
"grad_norm": 0.6095461249351501, |
|
"learning_rate": 4.210565462316407e-05, |
|
"loss": 0.0234, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 0.8262454434993924, |
|
"grad_norm": 0.6459489464759827, |
|
"learning_rate": 4.2127165014522315e-05, |
|
"loss": 0.0236, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.8286755771567437, |
|
"grad_norm": 0.8500393629074097, |
|
"learning_rate": 4.214861167129425e-05, |
|
"loss": 0.1073, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 0.8311057108140948, |
|
"grad_norm": 0.7379801869392395, |
|
"learning_rate": 4.2169994970051365e-05, |
|
"loss": 0.0779, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 0.8335358444714459, |
|
"grad_norm": 0.7911482453346252, |
|
"learning_rate": 4.219131528403759e-05, |
|
"loss": 0.083, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 0.8359659781287971, |
|
"grad_norm": 1.379714012145996, |
|
"learning_rate": 4.22125729832083e-05, |
|
"loss": 0.0527, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 0.8383961117861483, |
|
"grad_norm": 0.7738386988639832, |
|
"learning_rate": 4.2233768434268914e-05, |
|
"loss": 0.0439, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.8408262454434994, |
|
"grad_norm": 0.9524173140525818, |
|
"learning_rate": 4.225490200071284e-05, |
|
"loss": 0.0635, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 0.8432563791008505, |
|
"grad_norm": 0.843377947807312, |
|
"learning_rate": 4.227597404285883e-05, |
|
"loss": 0.0531, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 0.8456865127582017, |
|
"grad_norm": 2.120123863220215, |
|
"learning_rate": 4.229698491788791e-05, |
|
"loss": 0.068, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 0.8481166464155528, |
|
"grad_norm": 0.804137647151947, |
|
"learning_rate": 4.231793497987961e-05, |
|
"loss": 0.0577, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 0.850546780072904, |
|
"grad_norm": 0.8609597086906433, |
|
"learning_rate": 4.2338824579847904e-05, |
|
"loss": 0.0204, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.8529769137302552, |
|
"grad_norm": 0.7206110954284668, |
|
"learning_rate": 4.235965406577636e-05, |
|
"loss": 0.0682, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 0.8554070473876063, |
|
"grad_norm": 0.8393117785453796, |
|
"learning_rate": 4.2380423782653e-05, |
|
"loss": 0.0442, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 0.8578371810449574, |
|
"grad_norm": 0.5761812329292297, |
|
"learning_rate": 4.240113407250459e-05, |
|
"loss": 0.0985, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 0.8602673147023087, |
|
"grad_norm": 0.6769473552703857, |
|
"learning_rate": 4.24217852744304e-05, |
|
"loss": 0.0347, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 0.8626974483596598, |
|
"grad_norm": 0.6801010370254517, |
|
"learning_rate": 4.244237772463552e-05, |
|
"loss": 0.0541, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.8651275820170109, |
|
"grad_norm": 0.9833145141601562, |
|
"learning_rate": 4.246291175646371e-05, |
|
"loss": 0.0909, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 0.8675577156743621, |
|
"grad_norm": 0.8934769034385681, |
|
"learning_rate": 4.24833877004298e-05, |
|
"loss": 0.0769, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 0.8699878493317132, |
|
"grad_norm": 0.523007869720459, |
|
"learning_rate": 4.250380588425157e-05, |
|
"loss": 0.0413, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 0.8724179829890644, |
|
"grad_norm": 2.014488935470581, |
|
"learning_rate": 4.2524166632881255e-05, |
|
"loss": 0.0651, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 0.8748481166464156, |
|
"grad_norm": 0.8834489583969116, |
|
"learning_rate": 4.254447026853656e-05, |
|
"loss": 0.0288, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.8772782503037667, |
|
"grad_norm": 0.8947015404701233, |
|
"learning_rate": 4.2564717110731244e-05, |
|
"loss": 0.1693, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 0.8797083839611178, |
|
"grad_norm": 1.9122964143753052, |
|
"learning_rate": 4.258490747630532e-05, |
|
"loss": 0.0738, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 0.8821385176184691, |
|
"grad_norm": 0.8690800070762634, |
|
"learning_rate": 4.260504167945479e-05, |
|
"loss": 0.0472, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 0.8845686512758202, |
|
"grad_norm": 0.7206094861030579, |
|
"learning_rate": 4.2625120031760965e-05, |
|
"loss": 0.0881, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 0.8869987849331713, |
|
"grad_norm": 0.6016607284545898, |
|
"learning_rate": 4.264514284221944e-05, |
|
"loss": 0.0403, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.8894289185905225, |
|
"grad_norm": 0.8117083311080933, |
|
"learning_rate": 4.266511041726854e-05, |
|
"loss": 0.0533, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 0.8918590522478737, |
|
"grad_norm": 0.7658905982971191, |
|
"learning_rate": 4.26850230608176e-05, |
|
"loss": 0.067, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 0.8942891859052248, |
|
"grad_norm": 0.8042786717414856, |
|
"learning_rate": 4.2704881074274584e-05, |
|
"loss": 0.08, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 0.8967193195625759, |
|
"grad_norm": 0.5545147657394409, |
|
"learning_rate": 4.272468475657351e-05, |
|
"loss": 0.0534, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 0.8991494532199271, |
|
"grad_norm": 0.7363461256027222, |
|
"learning_rate": 4.2744434404201497e-05, |
|
"loss": 0.0394, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.9015795868772782, |
|
"grad_norm": 0.5137555003166199, |
|
"learning_rate": 4.27641303112253e-05, |
|
"loss": 0.0602, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 0.9040097205346294, |
|
"grad_norm": 0.7932950258255005, |
|
"learning_rate": 4.278377276931767e-05, |
|
"loss": 0.0711, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 0.9064398541919806, |
|
"grad_norm": 2.459850788116455, |
|
"learning_rate": 4.2803362067783256e-05, |
|
"loss": 0.1969, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 0.9088699878493317, |
|
"grad_norm": 0.6758642792701721, |
|
"learning_rate": 4.2822898493584104e-05, |
|
"loss": 0.1131, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 0.9113001215066828, |
|
"grad_norm": 0.6686858534812927, |
|
"learning_rate": 4.284238233136496e-05, |
|
"loss": 0.0951, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.913730255164034, |
|
"grad_norm": 0.8299110531806946, |
|
"learning_rate": 4.286181386347813e-05, |
|
"loss": 0.046, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 0.9161603888213852, |
|
"grad_norm": 2.4468648433685303, |
|
"learning_rate": 4.288119337000801e-05, |
|
"loss": 0.222, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 0.9185905224787363, |
|
"grad_norm": 1.187517523765564, |
|
"learning_rate": 4.2900521128795315e-05, |
|
"loss": 0.1061, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 0.9210206561360875, |
|
"grad_norm": 0.6776664853096008, |
|
"learning_rate": 4.291979741546102e-05, |
|
"loss": 0.0287, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 0.9234507897934386, |
|
"grad_norm": 0.6103045344352722, |
|
"learning_rate": 4.293902250342989e-05, |
|
"loss": 0.0439, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.9258809234507898, |
|
"grad_norm": 0.44634753465652466, |
|
"learning_rate": 4.295819666395376e-05, |
|
"loss": 0.03, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 0.928311057108141, |
|
"grad_norm": 0.8639276027679443, |
|
"learning_rate": 4.297732016613454e-05, |
|
"loss": 0.1479, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 0.9307411907654921, |
|
"grad_norm": 0.7611154913902283, |
|
"learning_rate": 4.299639327694684e-05, |
|
"loss": 0.0403, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 0.9331713244228432, |
|
"grad_norm": 0.8710222244262695, |
|
"learning_rate": 4.3015416261260325e-05, |
|
"loss": 0.0522, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 0.9356014580801945, |
|
"grad_norm": 0.7666921615600586, |
|
"learning_rate": 4.303438938186182e-05, |
|
"loss": 0.0303, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.9380315917375456, |
|
"grad_norm": 0.5829209089279175, |
|
"learning_rate": 4.305331289947705e-05, |
|
"loss": 0.0315, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 0.9404617253948967, |
|
"grad_norm": 0.9368737936019897, |
|
"learning_rate": 4.3072187072792184e-05, |
|
"loss": 0.0661, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 0.9428918590522479, |
|
"grad_norm": 0.43866387009620667, |
|
"learning_rate": 4.309101215847502e-05, |
|
"loss": 0.0335, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 0.945321992709599, |
|
"grad_norm": 0.6128959059715271, |
|
"learning_rate": 4.3109788411195924e-05, |
|
"loss": 0.108, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 0.9477521263669502, |
|
"grad_norm": 0.5031237602233887, |
|
"learning_rate": 4.312851608364853e-05, |
|
"loss": 0.0332, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.9501822600243013, |
|
"grad_norm": 0.6804950833320618, |
|
"learning_rate": 4.314719542657013e-05, |
|
"loss": 0.0259, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 0.9526123936816525, |
|
"grad_norm": 0.9872898459434509, |
|
"learning_rate": 4.3165826688761796e-05, |
|
"loss": 0.0281, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 0.9550425273390036, |
|
"grad_norm": 1.9561671018600464, |
|
"learning_rate": 4.318441011710833e-05, |
|
"loss": 0.0342, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 0.9574726609963548, |
|
"grad_norm": 0.6444841027259827, |
|
"learning_rate": 4.3202945956597786e-05, |
|
"loss": 0.035, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 0.959902794653706, |
|
"grad_norm": 0.49825266003608704, |
|
"learning_rate": 4.3221434450340956e-05, |
|
"loss": 0.0248, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.9623329283110571, |
|
"grad_norm": 0.548538863658905, |
|
"learning_rate": 4.323987583959045e-05, |
|
"loss": 0.076, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 0.9647630619684082, |
|
"grad_norm": 0.6437348127365112, |
|
"learning_rate": 4.325827036375957e-05, |
|
"loss": 0.0749, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 0.9671931956257594, |
|
"grad_norm": 0.5854353308677673, |
|
"learning_rate": 4.327661826044101e-05, |
|
"loss": 0.0413, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 0.9696233292831106, |
|
"grad_norm": 1.0665239095687866, |
|
"learning_rate": 4.329491976542521e-05, |
|
"loss": 0.0574, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 0.9720534629404617, |
|
"grad_norm": 0.7473851442337036, |
|
"learning_rate": 4.331317511271859e-05, |
|
"loss": 0.1447, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.9720534629404617, |
|
"eval_loss": 0.31331515312194824, |
|
"eval_runtime": 503.3031, |
|
"eval_samples_per_second": 5.46, |
|
"eval_steps_per_second": 0.683, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.9720534629404617, |
|
"step": 400, |
|
"total_flos": 1.1378390115664527e+18, |
|
"train_loss": 0.11899106367724016, |
|
"train_runtime": 12503.2939, |
|
"train_samples_per_second": 25.593, |
|
"train_steps_per_second": 0.8 |
|
} |
|
], |
|
"logging_steps": 1.0, |
|
"max_steps": 10000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 25, |
|
"save_steps": 100, |
|
"total_flos": 1.1378390115664527e+18, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|