{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.9964020148716717,
  "eval_steps": 1400,
  "global_step": 2082,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    { "epoch": 0.007195970256656272, "grad_norm": 3.382328658618869, "learning_rate": 1.9138755980861244e-07, "loss": 1.1034, "step": 5 },
    { "epoch": 0.014391940513312544, "grad_norm": 2.9332510464158155, "learning_rate": 3.827751196172249e-07, "loss": 1.1248, "step": 10 },
    { "epoch": 0.021587910769968816, "grad_norm": 2.8021125575132175, "learning_rate": 5.741626794258373e-07, "loss": 1.0979, "step": 15 },
    { "epoch": 0.02878388102662509, "grad_norm": 2.5443691578126004, "learning_rate": 7.655502392344498e-07, "loss": 1.1128, "step": 20 },
    { "epoch": 0.035979851283281364, "grad_norm": 2.823918046213155, "learning_rate": 9.569377990430622e-07, "loss": 1.0726, "step": 25 },
    { "epoch": 0.04317582153993763, "grad_norm": 2.0326907547127124, "learning_rate": 1.1483253588516746e-06, "loss": 1.0043, "step": 30 },
    { "epoch": 0.05037179179659391, "grad_norm": 2.168998278242428, "learning_rate": 1.339712918660287e-06, "loss": 1.0079, "step": 35 },
    { "epoch": 0.05756776205325018, "grad_norm": 2.152879633190319, "learning_rate": 1.5311004784688995e-06, "loss": 1.0086, "step": 40 },
    { "epoch": 0.06476373230990645, "grad_norm": 1.7771809680542394, "learning_rate": 1.722488038277512e-06, "loss": 0.9862, "step": 45 },
    { "epoch": 0.07195970256656273, "grad_norm": 1.6796335234145752, "learning_rate": 1.9138755980861244e-06, "loss": 0.9757, "step": 50 },
    { "epoch": 0.079155672823219, "grad_norm": 1.7111004529632545, "learning_rate": 2.1052631578947366e-06, "loss": 0.9067, "step": 55 },
    { "epoch": 0.08635164307987526, "grad_norm": 2.1161929062800184, "learning_rate": 2.2966507177033493e-06, "loss": 0.9534, "step": 60 },
    { "epoch": 0.09354761333653154, "grad_norm": 1.739361359363285, "learning_rate": 2.4880382775119615e-06, "loss": 0.9368, "step": 65 },
    { "epoch": 0.10074358359318782, "grad_norm": 1.9645645051440501, "learning_rate": 2.679425837320574e-06, "loss": 0.8762, "step": 70 },
    { "epoch": 0.10793955384984409, "grad_norm": 2.291876148211727, "learning_rate": 2.8708133971291864e-06, "loss": 0.9079, "step": 75 },
    { "epoch": 0.11513552410650035, "grad_norm": 1.9350312982517304, "learning_rate": 3.062200956937799e-06, "loss": 0.9063, "step": 80 },
    { "epoch": 0.12233149436315663, "grad_norm": 1.723686001407234, "learning_rate": 3.2535885167464113e-06, "loss": 0.9141, "step": 85 },
    { "epoch": 0.1295274646198129, "grad_norm": 1.7291796951049525, "learning_rate": 3.444976076555024e-06, "loss": 0.8852, "step": 90 },
    { "epoch": 0.13672343487646918, "grad_norm": 1.5591618462436518, "learning_rate": 3.636363636363636e-06, "loss": 0.8812, "step": 95 },
    { "epoch": 0.14391940513312546, "grad_norm": 2.5857950863717596, "learning_rate": 3.827751196172249e-06, "loss": 0.8719, "step": 100 },
    { "epoch": 0.15111537538978173, "grad_norm": 1.882950492021297, "learning_rate": 4.019138755980861e-06, "loss": 0.923, "step": 105 },
    { "epoch": 0.158311345646438, "grad_norm": 1.9095611162862844, "learning_rate": 4.210526315789473e-06, "loss": 0.8466, "step": 110 },
    { "epoch": 0.16550731590309425, "grad_norm": 1.7349719997638062, "learning_rate": 4.4019138755980855e-06, "loss": 0.8433, "step": 115 },
    { "epoch": 0.17270328615975053, "grad_norm": 1.6057643525733838, "learning_rate": 4.5933014354066986e-06, "loss": 0.8901, "step": 120 },
    { "epoch": 0.1798992564164068, "grad_norm": 1.636440080236875, "learning_rate": 4.784688995215311e-06, "loss": 0.9158, "step": 125 },
    { "epoch": 0.18709522667306308, "grad_norm": 1.7680919340453238, "learning_rate": 4.976076555023923e-06, "loss": 0.9012, "step": 130 },
    { "epoch": 0.19429119692971936, "grad_norm": 1.73915276490131, "learning_rate": 5.167464114832536e-06, "loss": 0.8372, "step": 135 },
    { "epoch": 0.20148716718637563, "grad_norm": 1.4761634523053184, "learning_rate": 5.358851674641148e-06, "loss": 0.8631, "step": 140 },
    { "epoch": 0.2086831374430319, "grad_norm": 1.7615083334015453, "learning_rate": 5.5502392344497606e-06, "loss": 0.8275, "step": 145 },
    { "epoch": 0.21587910769968818, "grad_norm": 1.5283874292360122, "learning_rate": 5.741626794258373e-06, "loss": 0.8625, "step": 150 },
    { "epoch": 0.22307507795634446, "grad_norm": 1.6114784875621726, "learning_rate": 5.933014354066985e-06, "loss": 0.891, "step": 155 },
    { "epoch": 0.2302710482130007, "grad_norm": 1.6981117694270553, "learning_rate": 6.124401913875598e-06, "loss": 0.8548, "step": 160 },
    { "epoch": 0.23746701846965698, "grad_norm": 1.7093051041039082, "learning_rate": 6.31578947368421e-06, "loss": 0.8953, "step": 165 },
    { "epoch": 0.24466298872631326, "grad_norm": 1.6829774560988056, "learning_rate": 6.5071770334928226e-06, "loss": 0.7894, "step": 170 },
    { "epoch": 0.25185895898296956, "grad_norm": 1.7224253982363609, "learning_rate": 6.698564593301436e-06, "loss": 0.9342, "step": 175 },
    { "epoch": 0.2590549292396258, "grad_norm": 1.718825453842612, "learning_rate": 6.889952153110048e-06, "loss": 0.8765, "step": 180 },
    { "epoch": 0.26625089949628206, "grad_norm": 1.8299643097592468, "learning_rate": 7.081339712918659e-06, "loss": 0.8562, "step": 185 },
    { "epoch": 0.27344686975293836, "grad_norm": 1.9650794343564884, "learning_rate": 7.272727272727272e-06, "loss": 0.8468, "step": 190 },
    { "epoch": 0.2806428400095946, "grad_norm": 1.6665409567649216, "learning_rate": 7.4641148325358846e-06, "loss": 0.8394, "step": 195 },
    { "epoch": 0.2878388102662509, "grad_norm": 1.6178516494615538, "learning_rate": 7.655502392344498e-06, "loss": 0.9063, "step": 200 },
    { "epoch": 0.29503478052290716, "grad_norm": 1.737750416660729, "learning_rate": 7.84688995215311e-06, "loss": 0.8376, "step": 205 },
    { "epoch": 0.30223075077956346, "grad_norm": 1.6775958248025469, "learning_rate": 7.9999943732958e-06, "loss": 0.8839, "step": 210 },
    { "epoch": 0.3094267210362197, "grad_norm": 1.6351855660641077, "learning_rate": 7.999797440310976e-06, "loss": 0.8138, "step": 215 },
    { "epoch": 0.316622691292876, "grad_norm": 1.5160913628770296, "learning_rate": 7.999319187945908e-06, "loss": 0.8634, "step": 220 },
    { "epoch": 0.32381866154953226, "grad_norm": 1.5346049341737504, "learning_rate": 7.998559649837715e-06, "loss": 0.8777, "step": 225 },
    { "epoch": 0.3310146318061885, "grad_norm": 1.7202200592837338, "learning_rate": 7.997518879407302e-06, "loss": 0.9041, "step": 230 },
    { "epoch": 0.3382106020628448, "grad_norm": 1.6194787415884186, "learning_rate": 7.996196949855597e-06, "loss": 0.8567, "step": 235 },
    { "epoch": 0.34540657231950106, "grad_norm": 1.5761200757620502, "learning_rate": 7.994593954158409e-06, "loss": 0.8683, "step": 240 },
    { "epoch": 0.35260254257615736, "grad_norm": 1.6216799441610352, "learning_rate": 7.992710005059886e-06, "loss": 0.8718, "step": 245 },
    { "epoch": 0.3597985128328136, "grad_norm": 1.4982517397299326, "learning_rate": 7.990545235064588e-06, "loss": 0.8491, "step": 250 },
    { "epoch": 0.3669944830894699, "grad_norm": 1.7915522189289803, "learning_rate": 7.988099796428161e-06, "loss": 0.8546, "step": 255 },
    { "epoch": 0.37419045334612616, "grad_norm": 1.6209807612052567, "learning_rate": 7.985373861146636e-06, "loss": 0.8112, "step": 260 },
    { "epoch": 0.38138642360278247, "grad_norm": 1.5354364067126935, "learning_rate": 7.98236762094433e-06, "loss": 0.8484, "step": 265 },
    { "epoch": 0.3885823938594387, "grad_norm": 1.7666681382505078, "learning_rate": 7.979081287260356e-06, "loss": 0.8752, "step": 270 },
    { "epoch": 0.39577836411609496, "grad_norm": 1.6458650430939799, "learning_rate": 7.975515091233757e-06, "loss": 0.8294, "step": 275 },
    { "epoch": 0.40297433437275126, "grad_norm": 1.7589737639189482, "learning_rate": 7.971669283687252e-06, "loss": 0.8269, "step": 280 },
    { "epoch": 0.4101703046294075, "grad_norm": 1.597742594232911, "learning_rate": 7.967544135109583e-06, "loss": 0.8873, "step": 285 },
    { "epoch": 0.4173662748860638, "grad_norm": 1.516759424294559, "learning_rate": 7.963139935636505e-06, "loss": 0.8162, "step": 290 },
    { "epoch": 0.42456224514272006, "grad_norm": 1.6997534481298684, "learning_rate": 7.958456995030372e-06, "loss": 0.8202, "step": 295 },
    { "epoch": 0.43175821539937637, "grad_norm": 1.726622337993047, "learning_rate": 7.95349564265835e-06, "loss": 0.8456, "step": 300 },
    { "epoch": 0.4389541856560326, "grad_norm": 1.647760739349251, "learning_rate": 7.94825622746925e-06, "loss": 0.8648, "step": 305 },
    { "epoch": 0.4461501559126889, "grad_norm": 1.7211951131144776, "learning_rate": 7.942739117968995e-06, "loss": 0.8272, "step": 310 },
    { "epoch": 0.45334612616934516, "grad_norm": 1.5335036085040443, "learning_rate": 7.936944702194691e-06, "loss": 0.878, "step": 315 },
    { "epoch": 0.4605420964260014, "grad_norm": 1.4848768316508538, "learning_rate": 7.93087338768734e-06, "loss": 0.8456, "step": 320 },
    { "epoch": 0.4677380666826577, "grad_norm": 1.5394168018437981, "learning_rate": 7.924525601463173e-06, "loss": 0.8427, "step": 325 },
    { "epoch": 0.47493403693931396, "grad_norm": 1.559312757501083, "learning_rate": 7.91790178998362e-06, "loss": 0.8202, "step": 330 },
    { "epoch": 0.48213000719597027, "grad_norm": 1.5792217882137543, "learning_rate": 7.91100241912391e-06, "loss": 0.8419, "step": 335 },
    { "epoch": 0.4893259774526265, "grad_norm": 1.738156211889958, "learning_rate": 7.9038279741403e-06, "loss": 0.8659, "step": 340 },
    { "epoch": 0.4965219477092828, "grad_norm": 1.7060742398951603, "learning_rate": 7.896378959635946e-06, "loss": 0.8564, "step": 345 },
    { "epoch": 0.5037179179659391, "grad_norm": 1.483857852582522, "learning_rate": 7.888655899525413e-06, "loss": 0.8122, "step": 350 },
    { "epoch": 0.5109138882225953, "grad_norm": 1.468389601692516, "learning_rate": 7.880659336997833e-06, "loss": 0.887, "step": 355 },
    { "epoch": 0.5181098584792516, "grad_norm": 1.5870295716868312, "learning_rate": 7.872389834478688e-06, "loss": 0.8813, "step": 360 },
    { "epoch": 0.5253058287359079, "grad_norm": 1.8895020621596357, "learning_rate": 7.863847973590265e-06, "loss": 0.8626, "step": 365 },
    { "epoch": 0.5325017989925641, "grad_norm": 1.550855518803781, "learning_rate": 7.855034355110736e-06, "loss": 0.8546, "step": 370 },
    { "epoch": 0.5396977692492204, "grad_norm": 1.5611172491745864, "learning_rate": 7.845949598931918e-06, "loss": 0.848, "step": 375 },
    { "epoch": 0.5468937395058767, "grad_norm": 1.5611908671622932, "learning_rate": 7.836594344015661e-06, "loss": 0.8738, "step": 380 },
    { "epoch": 0.554089709762533, "grad_norm": 1.5092812979555394, "learning_rate": 7.826969248348915e-06, "loss": 0.8693, "step": 385 },
    { "epoch": 0.5612856800191892, "grad_norm": 2.804211699082011, "learning_rate": 7.817074988897446e-06, "loss": 0.8373, "step": 390 },
    { "epoch": 0.5684816502758455, "grad_norm": 1.6322846593476272, "learning_rate": 7.806912261558232e-06, "loss": 0.8179, "step": 395 },
    { "epoch": 0.5756776205325018, "grad_norm": 1.537211750195004, "learning_rate": 7.796481781110504e-06, "loss": 0.8881, "step": 400 },
    { "epoch": 0.5828735907891581, "grad_norm": 1.6424496003416038, "learning_rate": 7.785784281165491e-06, "loss": 0.8285, "step": 405 },
    { "epoch": 0.5900695610458143, "grad_norm": 1.712983639226015, "learning_rate": 7.774820514114804e-06, "loss": 0.8471, "step": 410 },
    { "epoch": 0.5972655313024706, "grad_norm": 1.6543355090148368, "learning_rate": 7.763591251077532e-06, "loss": 0.8181, "step": 415 },
    { "epoch": 0.6044615015591269, "grad_norm": 1.7695569793499315, "learning_rate": 7.752097281845998e-06, "loss": 0.8317, "step": 420 },
    { "epoch": 0.6116574718157831, "grad_norm": 1.6408594582199851, "learning_rate": 7.740339414830216e-06, "loss": 0.8822, "step": 425 },
    { "epoch": 0.6188534420724394, "grad_norm": 1.532763883790403, "learning_rate": 7.72831847700103e-06, "loss": 0.8858, "step": 430 },
    { "epoch": 0.6260494123290957, "grad_norm": 1.5699531832045912, "learning_rate": 7.71603531383195e-06, "loss": 0.803, "step": 435 },
    { "epoch": 0.633245382585752, "grad_norm": 1.4023688398705096, "learning_rate": 7.703490789239685e-06, "loss": 0.8015, "step": 440 },
    { "epoch": 0.6404413528424082, "grad_norm": 1.743812894669404, "learning_rate": 7.690685785523388e-06, "loss": 0.8398, "step": 445 },
    { "epoch": 0.6476373230990645, "grad_norm": 1.5621879493455977, "learning_rate": 7.677621203302591e-06, "loss": 0.7979, "step": 450 },
    { "epoch": 0.6548332933557208, "grad_norm": 1.5193447362876678, "learning_rate": 7.66429796145387e-06, "loss": 0.8125, "step": 455 },
    { "epoch": 0.662029263612377, "grad_norm": 1.6795960658239877, "learning_rate": 7.650716997046216e-06, "loss": 0.8477, "step": 460 },
    { "epoch": 0.6692252338690333, "grad_norm": 1.585934440158675, "learning_rate": 7.636879265275119e-06, "loss": 0.845, "step": 465 },
    { "epoch": 0.6764212041256896, "grad_norm": 1.550239333699456, "learning_rate": 7.622785739395397e-06, "loss": 0.8723, "step": 470 },
    { "epoch": 0.6836171743823459, "grad_norm": 1.6342679772831266, "learning_rate": 7.608437410652739e-06, "loss": 0.8237, "step": 475 },
    { "epoch": 0.6908131446390021, "grad_norm": 1.9009580720490618, "learning_rate": 7.593835288213984e-06, "loss": 0.8525, "step": 480 },
    { "epoch": 0.6980091148956584, "grad_norm": 1.4797149380660173, "learning_rate": 7.578980399096153e-06, "loss": 0.8343, "step": 485 },
    { "epoch": 0.7052050851523147, "grad_norm": 1.6927806221547512, "learning_rate": 7.5638737880942e-06, "loss": 0.819, "step": 490 },
    { "epoch": 0.712401055408971, "grad_norm": 1.6132959543020267, "learning_rate": 7.548516517707544e-06, "loss": 0.8177, "step": 495 },
    { "epoch": 0.7195970256656272, "grad_norm": 1.5783090956116979, "learning_rate": 7.532909668065329e-06, "loss": 0.8217, "step": 500 },
    { "epoch": 0.7267929959222835, "grad_norm": 1.439090732947757, "learning_rate": 7.517054336850457e-06, "loss": 0.8617, "step": 505 },
    { "epoch": 0.7339889661789398, "grad_norm": 1.906650403594057, "learning_rate": 7.500951639222389e-06, "loss": 0.8427, "step": 510 },
    { "epoch": 0.741184936435596, "grad_norm": 1.5718009905007941, "learning_rate": 7.484602707738707e-06, "loss": 0.9079, "step": 515 },
    { "epoch": 0.7483809066922523, "grad_norm": 1.716197310859666, "learning_rate": 7.468008692275457e-06, "loss": 0.8278, "step": 520 },
    { "epoch": 0.7555768769489086, "grad_norm": 1.5485542395076963, "learning_rate": 7.45117075994628e-06, "loss": 0.8326, "step": 525 },
    { "epoch": 0.7627728472055649, "grad_norm": 1.4519889463898326, "learning_rate": 7.434090095020318e-06, "loss": 0.7923, "step": 530 },
    { "epoch": 0.7699688174622211, "grad_norm": 1.4499255632268135, "learning_rate": 7.416767898838926e-06, "loss": 0.8449, "step": 535 },
    { "epoch": 0.7771647877188774, "grad_norm": 1.498217506498266, "learning_rate": 7.399205389731172e-06, "loss": 0.8462, "step": 540 },
    { "epoch": 0.7843607579755337, "grad_norm": 1.441416826072204, "learning_rate": 7.381403802928153e-06, "loss": 0.7864, "step": 545 },
    { "epoch": 0.7915567282321899, "grad_norm": 1.5728216952970062, "learning_rate": 7.363364390476114e-06, "loss": 0.779, "step": 550 },
    { "epoch": 0.7987526984888462, "grad_norm": 1.579757123001059, "learning_rate": 7.34508842114839e-06, "loss": 0.8342, "step": 555 },
    { "epoch": 0.8059486687455025, "grad_norm": 1.5204038228338386, "learning_rate": 7.326577180356162e-06, "loss": 0.8202, "step": 560 },
    { "epoch": 0.8131446390021588, "grad_norm": 1.6070382704422732, "learning_rate": 7.30783197005806e-06, "loss": 0.7948, "step": 565 },
    { "epoch": 0.820340609258815, "grad_norm": 1.4952361279508235, "learning_rate": 7.288854108668586e-06, "loss": 0.8451, "step": 570 },
    { "epoch": 0.8275365795154713, "grad_norm": 1.4373323975649135, "learning_rate": 7.2696449309653795e-06, "loss": 0.8381, "step": 575 },
    { "epoch": 0.8347325497721276, "grad_norm": 1.292833703910724, "learning_rate": 7.250205787995353e-06, "loss": 0.8286, "step": 580 },
    { "epoch": 0.8419285200287839, "grad_norm": 1.2885838799591653, "learning_rate": 7.230538046979654e-06, "loss": 0.8506, "step": 585 },
    { "epoch": 0.8491244902854401, "grad_norm": 1.4134558176619048, "learning_rate": 7.210643091217513e-06, "loss": 0.8411, "step": 590 },
    { "epoch": 0.8563204605420964, "grad_norm": 1.6657530487665673, "learning_rate": 7.1905223199889425e-06, "loss": 0.834, "step": 595 },
    { "epoch": 0.8635164307987527, "grad_norm": 1.7801521918282026, "learning_rate": 7.170177148456331e-06, "loss": 0.8461, "step": 600 },
    { "epoch": 0.8707124010554089, "grad_norm": 1.5712149389587902, "learning_rate": 7.149609007564903e-06, "loss": 0.8683, "step": 605 },
    { "epoch": 0.8779083713120652, "grad_norm": 1.5017107860836532, "learning_rate": 7.128819343942077e-06, "loss": 0.8442, "step": 610 },
    { "epoch": 0.8851043415687215, "grad_norm": 1.4727249376876361, "learning_rate": 7.107809619795722e-06, "loss": 0.8668, "step": 615 },
    { "epoch": 0.8923003118253778, "grad_norm": 1.428036074094576, "learning_rate": 7.086581312811309e-06, "loss": 0.773, "step": 620 },
    { "epoch": 0.899496282082034, "grad_norm": 1.459125555270072, "learning_rate": 7.065135916047992e-06, "loss": 0.8551, "step": 625 },
    { "epoch": 0.9066922523386903, "grad_norm": 1.4437358399730527, "learning_rate": 7.043474937833581e-06, "loss": 0.8055, "step": 630 },
    { "epoch": 0.9138882225953466, "grad_norm": 1.6487138533805716, "learning_rate": 7.021599901658467e-06, "loss": 0.8162, "step": 635 },
    { "epoch": 0.9210841928520028, "grad_norm": 1.5886402538030315, "learning_rate": 6.999512346068467e-06, "loss": 0.8472, "step": 640 },
    { "epoch": 0.9282801631086591, "grad_norm": 1.580701659838036, "learning_rate": 6.977213824556613e-06, "loss": 0.8185, "step": 645 },
    { "epoch": 0.9354761333653154, "grad_norm": 1.3791827207697653, "learning_rate": 6.95470590545389e-06, "loss": 0.8424, "step": 650 },
    { "epoch": 0.9426721036219717, "grad_norm": 1.5248977943916064, "learning_rate": 6.931990171818923e-06, "loss": 0.8829, "step": 655 },
    { "epoch": 0.9498680738786279, "grad_norm": 1.4962085290443874, "learning_rate": 6.909068221326647e-06, "loss": 0.8236, "step": 660 },
    { "epoch": 0.9570640441352842, "grad_norm": 1.6074178928012908, "learning_rate": 6.88594166615593e-06, "loss": 0.8165, "step": 665 },
    { "epoch": 0.9642600143919405, "grad_norm": 1.538150422370725, "learning_rate": 6.8626121328761824e-06, "loss": 0.8155, "step": 670 },
    { "epoch": 0.9714559846485968, "grad_norm": 1.4261928133080215, "learning_rate": 6.839081262332957e-06, "loss": 0.8271, "step": 675 },
    { "epoch": 0.978651954905253, "grad_norm": 1.4811633224353407, "learning_rate": 6.815350709532544e-06, "loss": 0.8417, "step": 680 },
    { "epoch": 0.9858479251619093, "grad_norm": 1.5008044573312285, "learning_rate": 6.791422143525564e-06, "loss": 0.859, "step": 685 },
    { "epoch": 0.9930438954185656, "grad_norm": 1.5977699325206687, "learning_rate": 6.767297247289585e-06, "loss": 0.8663, "step": 690 },
    { "epoch": 1.0002398656752218, "grad_norm": 2.198059715592546, "learning_rate": 6.742977717610744e-06, "loss": 0.8427, "step": 695 },
    { "epoch": 1.0074358359318782, "grad_norm": 1.447486858474852, "learning_rate": 6.718465264964414e-06, "loss": 0.5445, "step": 700 },
    { "epoch": 1.0146318061885344, "grad_norm": 1.5387360085110118, "learning_rate": 6.693761613394899e-06, "loss": 0.5585, "step": 705 },
    { "epoch": 1.0218277764451906, "grad_norm": 1.3617154485025116, "learning_rate": 6.668868500394172e-06, "loss": 0.4605, "step": 710 },
    { "epoch": 1.029023746701847, "grad_norm": 1.433780770938119, "learning_rate": 6.643787676779671e-06, "loss": 0.5254, "step": 715 },
    { "epoch": 1.0362197169585032, "grad_norm": 1.4766138665890287, "learning_rate": 6.618520906571171e-06, "loss": 0.476, "step": 720 },
    { "epoch": 1.0434156872151594, "grad_norm": 1.4473624764279305, "learning_rate": 6.593069966866694e-06, "loss": 0.5404, "step": 725 },
    { "epoch": 1.0506116574718158, "grad_norm": 1.4713255574009756, "learning_rate": 6.567436647717535e-06, "loss": 0.5293, "step": 730 },
    { "epoch": 1.057807627728472, "grad_norm": 1.3099551452546936, "learning_rate": 6.541622752002355e-06, "loss": 0.5168, "step": 735 },
    { "epoch": 1.0650035979851282, "grad_norm": 1.4756303619125208, "learning_rate": 6.515630095300383e-06, "loss": 0.5253, "step": 740 },
    { "epoch": 1.0721995682417846, "grad_norm": 1.2607903619444512, "learning_rate": 6.489460505763713e-06, "loss": 0.5203, "step": 745 },
    { "epoch": 1.0793955384984408, "grad_norm": 1.2668860008648557, "learning_rate": 6.463115823988732e-06, "loss": 0.5133, "step": 750 },
    { "epoch": 1.0865915087550972, "grad_norm": 1.5445317289615172, "learning_rate": 6.436597902886655e-06, "loss": 0.5399, "step": 755 },
    { "epoch": 1.0937874790117534, "grad_norm": 1.373346882134398, "learning_rate": 6.409908607553217e-06, "loss": 0.4742, "step": 760 },
    { "epoch": 1.1009834492684096, "grad_norm": 1.2423912396823111, "learning_rate": 6.38304981513748e-06, "loss": 0.4928, "step": 765 },
    { "epoch": 1.108179419525066, "grad_norm": 1.564288911463839, "learning_rate": 6.3560234147098155e-06, "loss": 0.509, "step": 770 },
    { "epoch": 1.1153753897817222, "grad_norm": 1.592002787628597, "learning_rate": 6.328831307129039e-06, "loss": 0.5373, "step": 775 },
    { "epoch": 1.1225713600383784, "grad_norm": 1.2953958374191832, "learning_rate": 6.30147540490871e-06, "loss": 0.5053, "step": 780 },
    { "epoch": 1.1297673302950348, "grad_norm": 1.3851707132475586, "learning_rate": 6.27395763208263e-06, "loss": 0.5138, "step": 785 },
    { "epoch": 1.136963300551691, "grad_norm": 1.5083938363704992, "learning_rate": 6.246279924069504e-06, "loss": 0.4639, "step": 790 },
    { "epoch": 1.1441592708083472, "grad_norm": 1.4583984595795128, "learning_rate": 6.218444227536832e-06, "loss": 0.509, "step": 795 },
    { "epoch": 1.1513552410650036, "grad_norm": 1.1915713777559744, "learning_rate": 6.190452500263975e-06, "loss": 0.4771, "step": 800 },
    { "epoch": 1.1585512113216598, "grad_norm": 1.2538864728177044, "learning_rate": 6.162306711004474e-06, "loss": 0.4927, "step": 805 },
    { "epoch": 1.165747181578316, "grad_norm": 1.3142042016311857, "learning_rate": 6.134008839347575e-06, "loss": 0.4884, "step": 810 },
    { "epoch": 1.1729431518349724, "grad_norm": 1.232777503769632, "learning_rate": 6.105560875578994e-06, "loss": 0.5273, "step": 815 },
    { "epoch": 1.1801391220916286, "grad_norm": 1.502479848796588, "learning_rate": 6.076964820540937e-06, "loss": 0.5086, "step": 820 },
    { "epoch": 1.187335092348285, "grad_norm": 1.5121283948236117, "learning_rate": 6.048222685491374e-06, "loss": 0.5374, "step": 825 },
    { "epoch": 1.1945310626049412, "grad_norm": 1.8544252686881426, "learning_rate": 6.019336491962581e-06, "loss": 0.5381, "step": 830 },
    { "epoch": 1.2017270328615974, "grad_norm": 1.3559009500960952, "learning_rate": 5.990308271618956e-06, "loss": 0.4939, "step": 835 },
    { "epoch": 1.2089230031182538, "grad_norm": 1.6991974082062777, "learning_rate": 5.961140066114128e-06, "loss": 0.5429, "step": 840 },
    { "epoch": 1.21611897337491, "grad_norm": 1.40613574888153, "learning_rate": 5.931833926947358e-06, "loss": 0.4778, "step": 845 },
    { "epoch": 1.2233149436315662, "grad_norm": 1.291827429944612, "learning_rate": 5.902391915319252e-06, "loss": 0.4604, "step": 850 },
    { "epoch": 1.2305109138882226, "grad_norm": 1.5403421830718962, "learning_rate": 5.872816101986789e-06, "loss": 0.4993, "step": 855 },
    { "epoch": 1.2377068841448788, "grad_norm": 1.295984623620993, "learning_rate": 5.843108567117678e-06, "loss": 0.4972, "step": 860 },
    { "epoch": 1.244902854401535, "grad_norm": 1.372473134492806, "learning_rate": 5.813271400144051e-06, "loss": 0.5199, "step": 865 },
    { "epoch": 1.2520988246581914, "grad_norm": 1.3747684408273264, "learning_rate": 5.783306699615512e-06, "loss": 0.5136, "step": 870 },
    { "epoch": 1.2592947949148476, "grad_norm": 1.3792887268238399, "learning_rate": 5.753216573051526e-06, "loss": 0.5045, "step": 875 },
    { "epoch": 1.266490765171504, "grad_norm": 1.6127625261536036, "learning_rate": 5.723003136793208e-06, "loss": 0.5003, "step": 880 },
    { "epoch": 1.2736867354281602, "grad_norm": 1.4440630275399904, "learning_rate": 5.692668515854457e-06, "loss": 0.4521, "step": 885 },
    { "epoch": 1.2808827056848164, "grad_norm": 1.5683931030948375, "learning_rate": 5.662214843772506e-06, "loss": 0.5435, "step": 890 },
    { "epoch": 1.2880786759414729, "grad_norm": 1.4069551760135997, "learning_rate": 5.631644262457861e-06, "loss": 0.5326, "step": 895 },
    { "epoch": 1.295274646198129, "grad_norm": 1.3933443999205188, "learning_rate": 5.600958922043651e-06, "loss": 0.4905, "step": 900 },
    { "epoch": 1.3024706164547855, "grad_norm": 2.0908428594407344, "learning_rate": 5.570160980734405e-06, "loss": 0.4444, "step": 905 },
    { "epoch": 1.3096665867114416, "grad_norm": 1.6732069684734259, "learning_rate": 5.539252604654256e-06, "loss": 0.5535, "step": 910 },
    { "epoch": 1.3168625569680978, "grad_norm": 1.4666496678005971, "learning_rate": 5.50823596769459e-06, "loss": 0.4977, "step": 915 },
    { "epoch": 1.324058527224754, "grad_norm": 1.3633392168469545, "learning_rate": 5.477113251361149e-06, "loss": 0.5118, "step": 920 },
    { "epoch": 1.3312544974814104, "grad_norm": 1.2762316788464472, "learning_rate": 5.445886644620601e-06, "loss": 0.5136, "step": 925 },
    { "epoch": 1.3384504677380666, "grad_norm": 1.5424740001396617, "learning_rate": 5.414558343746579e-06, "loss": 0.4926, "step": 930 },
    { "epoch": 1.345646437994723, "grad_norm": 1.3847453947317292, "learning_rate": 5.38313055216521e-06, "loss": 0.5458, "step": 935 },
    { "epoch": 1.3528424082513792, "grad_norm": 1.3877715039925074, "learning_rate": 5.351605480300143e-06, "loss": 0.4637, "step": 940 },
    { "epoch": 1.3600383785080354, "grad_norm": 1.3969957173831602, "learning_rate": 5.319985345417079e-06, "loss": 0.4787, "step": 945 },
    { "epoch": 1.3672343487646919, "grad_norm": 1.669129051283974, "learning_rate": 5.288272371467827e-06, "loss": 0.484, "step": 950 },
    { "epoch": 1.374430319021348, "grad_norm": 1.3646255552767974, "learning_rate": 5.256468788933881e-06, "loss": 0.4782, "step": 955 },
    { "epoch": 1.3816262892780042, "grad_norm": 1.384029909257828, "learning_rate": 5.2245768346695494e-06, "loss": 0.5021, "step": 960 },
    { "epoch": 1.3888222595346607, "grad_norm": 1.3642725031029292, "learning_rate": 5.192598751744621e-06, "loss": 0.476, "step": 965 },
    { "epoch": 1.3960182297913168, "grad_norm": 1.4324224845783868, "learning_rate": 5.160536789286612e-06, "loss": 0.4966, "step": 970 },
    { "epoch": 1.403214200047973, "grad_norm": 1.3426650482840705, "learning_rate": 5.128393202322565e-06, "loss": 0.5116, "step": 975 },
    { "epoch": 1.4104101703046295, "grad_norm": 1.4860297463165402, "learning_rate": 5.096170251620458e-06, "loss": 0.512, "step": 980 },
    { "epoch": 1.4176061405612856, "grad_norm": 1.3809734192523142, "learning_rate": 5.063870203530188e-06, "loss": 0.5128, "step": 985 },
    { "epoch": 1.424802110817942, "grad_norm": 1.61363323216821, "learning_rate": 5.031495329824175e-06, "loss": 0.5342, "step": 990 },
    { "epoch": 1.4319980810745983, "grad_norm": 2.088959286090713, "learning_rate": 4.999047907537582e-06, "loss": 0.489, "step": 995 },
    { "epoch": 1.4391940513312544, "grad_norm": 1.5116312432174273, "learning_rate": 4.966530218808157e-06, "loss": 0.4968, "step": 1000 },
    { "epoch": 1.4463900215879106, "grad_norm": 1.4453482163933629, "learning_rate": 4.933944550715725e-06, "loss": 0.5297, "step": 1005 },
    { "epoch": 1.453585991844567, "grad_norm": 1.5874368851505785, "learning_rate": 4.901293195121338e-06, "loss": 0.5005, "step": 1010 },
    { "epoch": 1.4607819621012232, "grad_norm": 1.5140612281307557, "learning_rate": 4.868578448506067e-06, "loss": 0.5425, "step": 1015 },
    { "epoch": 1.4679779323578797, "grad_norm": 1.5247613079347937, "learning_rate": 4.835802611809492e-06, "loss": 0.5246, "step": 1020 },
    { "epoch": 1.4751739026145358, "grad_norm": 1.4922253999621244, "learning_rate": 4.802967990267867e-06, "loss": 0.5129, "step": 1025 },
    { "epoch": 1.482369872871192, "grad_norm": 1.5098918292072203, "learning_rate": 4.770076893251986e-06, "loss": 0.5239, "step": 1030 },
    { "epoch": 1.4895658431278485, "grad_norm": 1.6715329520651785, "learning_rate": 4.7371316341047484e-06, "loss": 0.5659, "step": 1035 },
    { "epoch": 1.4967618133845046, "grad_norm": 1.4935236860112138, "learning_rate": 4.704134529978471e-06, "loss": 0.4914, "step": 1040 },
    { "epoch": 1.503957783641161, "grad_norm": 1.4122572036404084, "learning_rate": 4.671087901671899e-06, "loss": 0.4798, "step": 1045 },
    { "epoch": 1.5111537538978173, "grad_norm": 1.4681688735493286, "learning_rate": 4.637994073466981e-06, "loss": 0.5051, "step": 1050 },
    { "epoch": 1.5183497241544734, "grad_norm": 1.44942902813713, "learning_rate": 4.604855372965394e-06, "loss": 0.539, "step": 1055 },
    { "epoch": 1.5255456944111296, "grad_norm": 1.236269924531647, "learning_rate": 4.5716741309248445e-06, "loss": 0.5305, "step": 1060 },
    { "epoch": 1.532741664667786, "grad_norm": 1.6240033228942266, "learning_rate": 4.538452681095123e-06, "loss": 0.5531, "step": 1065 },
    { "epoch": 1.5399376349244425, "grad_norm": 1.5622316143038788, "learning_rate": 4.5051933600539705e-06, "loss": 0.494, "step": 1070 },
    { "epoch": 1.5471336051810987, "grad_norm": 1.5638109251408436, "learning_rate": 4.471898507042745e-06, "loss": 0.533, "step": 1075 },
    { "epoch": 1.5543295754377549, "grad_norm": 1.5029077042303538, "learning_rate": 4.438570463801884e-06, "loss": 0.513, "step": 1080 },
    { "epoch": 1.561525545694411, "grad_norm": 1.372369046315647, "learning_rate": 4.405211574406209e-06, "loss": 0.4698, "step": 1085 },
    { "epoch": 1.5687215159510672, "grad_norm": 1.6322850083649267, "learning_rate": 4.371824185100054e-06, "loss": 0.4607, "step": 1090 },
    { "epoch": 1.5759174862077237, "grad_norm": 1.6038884982185644, "learning_rate": 4.338410644132256e-06, "loss": 0.4918, "step": 1095 },
    { "epoch": 1.58311345646438, "grad_norm": 1.510163759577701, "learning_rate": 4.304973301590977e-06, "loss": 0.5141, "step": 1100 },
    { "epoch": 1.5903094267210363, "grad_norm": 1.3447962919620353, "learning_rate": 4.271514509238434e-06, "loss": 0.5719, "step": 1105 },
    { "epoch": 1.5975053969776924, "grad_norm": 1.4095809553505452, "learning_rate": 4.238036620345477e-06, "loss": 0.5378, "step": 1110 },
    { "epoch": 1.6047013672343486, "grad_norm": 1.5471600021180223, "learning_rate": 4.204541989526083e-06, "loss": 0.5159, "step": 1115 },
    { "epoch": 1.611897337491005, "grad_norm": 1.3407123732434036, "learning_rate": 4.171032972571744e-06, "loss": 0.514, "step": 1120 },
    { "epoch": 1.6190933077476615, "grad_norm": 1.5587421912806174, "learning_rate": 4.137511926285779e-06, "loss": 0.4943, "step": 1125 },
    { "epoch": 1.6262892780043177, "grad_norm": 1.5228893274275985, "learning_rate": 4.103981208317571e-06, "loss": 0.5161, "step": 1130 },
    { "epoch": 1.6334852482609739, "grad_norm": 1.3816307683209126, "learning_rate": 4.070443176996745e-06, "loss": 0.5036, "step": 1135 },
    { "epoch": 1.64068121851763, "grad_norm": 1.551360790563111, "learning_rate": 4.036900191167301e-06, "loss": 0.4973, "step": 1140 },
    { "epoch": 1.6478771887742862, "grad_norm": 1.3412086458854404, "learning_rate": 4.003354610021701e-06, "loss": 0.5029, "step": 1145 },
    { "epoch": 1.6550731590309427, "grad_norm": 1.4793955535305545, "learning_rate": 3.96980879293495e-06, "loss": 0.4925, "step": 1150 },
    { "epoch": 1.662269129287599, "grad_norm": 1.2902630164808577, "learning_rate": 3.9362650992986465e-06, "loss": 0.4906, "step": 1155 },
    { "epoch": 1.6694650995442553, "grad_norm": 1.5152262911206174, "learning_rate": 3.902725888355037e-06, "loss": 0.5019, "step": 1160 },
    { "epoch": 1.6766610698009115, "grad_norm": 1.577445286461562, "learning_rate": 3.869193519031086e-06, "loss": 0.49, "step": 1165 },
    { "epoch": 1.6838570400575676, "grad_norm": 1.4532186104713023, "learning_rate": 3.835670349772566e-06, "loss": 0.47, "step": 1170 },
    { "epoch": 1.691053010314224, "grad_norm": 1.2527595036522912, "learning_rate": 3.802158738378176e-06, "loss": 0.4508, "step": 1175 },
    { "epoch": 1.6982489805708805, "grad_norm": 1.4942224309571124, "learning_rate": 3.7686610418337083e-06, "loss": 0.5039, "step": 1180 },
    { "epoch": 1.7054449508275367, "grad_norm": 1.72911326258422, "learning_rate": 3.7351796161462796e-06, "loss": 0.4808, "step": 1185 },
    { "epoch": 1.7126409210841929, "grad_norm": 1.3747290591542072, "learning_rate": 3.7017168161786215e-06, "loss": 0.4993, "step": 1190 },
    { "epoch": 1.719836891340849, "grad_norm": 1.4764634140281585, "learning_rate": 3.6682749954834548e-06, "loss": 0.5115, "step": 1195 },
    { "epoch": 1.7270328615975052, "grad_norm": 1.6701936544900278, "learning_rate": 3.634856506137956e-06, "loss": 0.5653, "step": 1200 },
    { "epoch": 1.7342288318541617, "grad_norm": 1.5206960936360632, "learning_rate": 3.6014636985783287e-06, "loss": 0.521, "step": 1205 },
    { "epoch": 1.741424802110818, "grad_norm": 1.5138010545314067, "learning_rate": 3.568098921434488e-06, "loss": 0.4856, "step": 1210 },
    { "epoch": 1.7486207723674743, "grad_norm": 1.5508467662920749, "learning_rate": 3.534764521364879e-06, "loss": 0.4846, "step": 1215 },
    { "epoch": 1.7558167426241305, "grad_norm": 1.294214634658577, "learning_rate": 3.501462842891418e-06, "loss": 0.4876, "step": 1220 },
    { "epoch": 1.7630127128807866, "grad_norm": 1.4066532330498682, "learning_rate": 3.4681962282346023e-06, "loss": 0.4644, "step": 1225 },
    { "epoch": 1.770208683137443, "grad_norm": 1.4435875920097863, "learning_rate": 3.4349670171487714e-06, "loss": 0.5199, "step": 1230 },
    { "epoch": 1.7774046533940993, "grad_norm": 1.5007291535231901, "learning_rate": 3.4017775467575446e-06, "loss": 0.5224, "step": 1235 },
    { "epoch": 1.7846006236507557, "grad_norm": 1.2699334841273793, "learning_rate": 3.3686301513894416e-06, "loss": 0.4914, "step": 1240 },
    { "epoch": 1.7917965939074119, "grad_norm": 1.2195698381350315, "learning_rate": 3.3355271624137037e-06, "loss": 0.4719, "step": 1245 },
    { "epoch": 1.798992564164068, "grad_norm": 1.3895500926839512, "learning_rate": 3.3024709080763186e-06, "loss": 0.5144, "step": 1250 },
    { "epoch": 1.8061885344207242, "grad_norm": 1.48913348325029, "learning_rate": 3.269463713336268e-06, "loss": 0.5103, "step": 1255 },
    { "epoch": 1.8133845046773807, "grad_norm": 1.3658603273721261, "learning_rate": 3.236507899702005e-06, "loss": 0.473, "step": 1260 },
    { "epoch": 1.820580474934037, "grad_norm": 1.3775281668274946, "learning_rate": 3.2036057850681745e-06, "loss": 0.514, "step": 1265 },
    { "epoch": 1.8277764451906933, "grad_norm": 1.631774330401289, "learning_rate": 3.170759683552586e-06, "loss": 0.5163, "step": 1270 },
    { "epoch": 1.8349724154473495, "grad_norm": 1.384315211463836, "learning_rate": 3.137971905333458e-06, "loss": 0.4752, "step": 1275 },
    { "epoch": 1.8421683857040057, "grad_norm": 1.4207130788508293, "learning_rate": 3.1052447564869343e-06, "loss": 0.5018, "step": 1280 },
    { "epoch": 1.849364355960662, "grad_norm": 1.5148685580490273, "learning_rate": 3.0725805388248834e-06, "loss": 0.5127, "step": 1285 },
    { "epoch": 1.8565603262173183, "grad_norm": 1.4919953654248346, "learning_rate": 3.039981549733014e-06, "loss": 0.4971, "step": 1290 },
    { "epoch": 1.8637562964739747, "grad_norm": 2.065773895317616, "learning_rate": 3.007450082009283e-06, "loss": 0.4843, "step": 1295 },
    { "epoch": 1.8709522667306309, "grad_norm": 1.4849907079900204, "learning_rate": 2.9749884237026426e-06, "loss": 0.5102, "step": 1300 },
    { "epoch": 1.878148236987287, "grad_norm": 1.7567340972637704, "learning_rate": 2.9425988579521103e-06, "loss": 0.4901, "step": 1305 },
    { "epoch": 1.8853442072439432, "grad_norm": 1.4271802301503538, "learning_rate": 2.910283662826188e-06, "loss": 0.4805, "step": 1310 },
    { "epoch": 1.8925401775005997, "grad_norm": 1.479518679920681, "learning_rate": 2.8780451111626384e-06, "loss": 0.4908, "step": 1315 },
    { "epoch": 1.899736147757256, "grad_norm": 1.5062976034854971, "learning_rate": 2.8458854704086275e-06, "loss": 0.491, "step": 1320 },
    { "epoch": 1.9069321180139123, "grad_norm": 1.552858614788501, "learning_rate": 2.8138070024612504e-06, "loss": 0.4787, "step": 1325 },
    { "epoch": 1.9141280882705685, "grad_norm": 1.668691870366938, "learning_rate": 2.7818119635084392e-06, "loss": 0.536, "step": 1330 },
    { "epoch": 1.9213240585272247, "grad_norm": 1.4857517196291667, "learning_rate": 2.749902603870283e-06, "loss": 0.5047, "step": 1335 },
    { "epoch": 1.928520028783881, "grad_norm": 2.9875786042021932, "learning_rate": 2.7180811678407525e-06, "loss": 0.504, "step": 1340 },
    { "epoch": 1.9357159990405373, "grad_norm": 1.3595766267302944, "learning_rate": 2.686349893529849e-06, "loss": 0.4863, "step": 1345 },
    { "epoch": 1.9429119692971937, "grad_norm": 1.4889142920291538, "learning_rate": 2.6547110127061975e-06, "loss": 0.4926, "step": 1350 },
    { "epoch": 1.9501079395538499, "grad_norm": 1.451668900558599, "learning_rate": 2.6231667506400706e-06, "loss": 0.4984, "step": 1355 },
    { "epoch": 1.957303909810506, "grad_norm": 1.47137694607456, "learning_rate": 2.591719325946883e-06, "loss": 0.5209, "step": 1360 },
    { "epoch": 1.9644998800671623, "grad_norm": 1.5284770765800149, "learning_rate": 2.560370950431146e-06, "loss": 0.4603, "step": 1365 },
    { "epoch": 1.9716958503238187, "grad_norm": 1.402896466872614, "learning_rate": 2.5291238289309054e-06, "loss": 0.5077, "step": 1370 },
    { "epoch": 1.978891820580475, "grad_norm": 1.4817926638302614, "learning_rate": 2.497980159162667e-06, "loss": 0.4839, "step": 1375 },
    { "epoch": 1.9860877908371313, "grad_norm": 1.5453436757112435, "learning_rate": 2.466942131566824e-06, "loss": 0.4888, "step": 1380 },
    { "epoch": 1.9932837610937875, "grad_norm": 1.4335485637084342, "learning_rate": 2.4360119291535955e-06, "loss": 0.4917, "step": 1385 },
    { "epoch": 2.0004797313504437, "grad_norm": 1.6143369616539034, "learning_rate": 2.405191727349489e-06, "loss": 0.4993, "step": 1390 },
    { "epoch": 2.0076757016071, "grad_norm": 1.2224443867251211, "learning_rate": 2.3744836938442936e-06, "loss": 0.2088, "step": 1395 },
    { "epoch": 2.0148716718637565, "grad_norm": 1.2602299678447657, "learning_rate": 2.3438899884386185e-06, "loss": 0.1941, "step": 1400 },
    { "epoch": 2.0148716718637565, "eval_loss": 0.9261869192123413, "eval_runtime": 740.3886, "eval_samples_per_second": 10.008, "eval_steps_per_second": 0.627, "step": 1400 },
    { "epoch": 2.0220676421204127, "grad_norm": 1.323505095452205, "learning_rate": 2.3134127628919927e-06, "loss": 0.1915, "step": 1405 },
    { "epoch": 2.029263612377069, "grad_norm": 1.2958926807268987, "learning_rate": 2.2830541607715136e-06, "loss": 0.1736, "step": 1410 },
    { "epoch": 2.036459582633725, "grad_norm": 1.271545256175082, "learning_rate": 2.2528163173010927e-06, "loss": 0.1845, "step": 1415 },
    { "epoch": 2.0436555528903813, "grad_norm": 1.5029993532268295, "learning_rate": 2.2227013592112757e-06, "loss": 0.1893, "step": 1420 },
    { "epoch": 2.0508515231470374, "grad_norm": 1.2921857666544403, "learning_rate": 2.192711404589658e-06, "loss": 0.1958, "step": 1425 },
    { "epoch": 2.058047493403694, "grad_norm": 1.2460289218576504, "learning_rate": 2.162848562731916e-06, "loss": 0.1994, "step": 1430 },
    { "epoch": 2.0652434636603503, "grad_norm": 1.2881623243419067, "learning_rate": 2.133114933993452e-06, "loss": 0.1935, "step": 1435 },
    { "epoch": 2.0724394339170065, "grad_norm": 1.1792218418956621, "learning_rate": 2.1035126096416704e-06, "loss": 0.1951, "step": 1440 },
    { "epoch": 2.0796354041736627, "grad_norm": 1.284870948942911, "learning_rate": 2.07404367170889e-06, "loss": 0.1948, "step": 1445 },
    { "epoch": 2.086831374430319, "grad_norm": 1.2222749381574636, "learning_rate": 2.0447101928459083e-06, "loss": 0.1927, "step": 1450 },
    { "epoch": 2.0940273446869755, "grad_norm": 1.4089391140981338, "learning_rate": 2.0155142361762256e-06, "loss": 0.1553, "step": 1455 },
    { "epoch": 2.1012233149436317, "grad_norm": 1.1670069976157664, "learning_rate": 1.986457855150937e-06, "loss": 0.1882, "step": 1460 },
    { "epoch": 2.108419285200288, "grad_norm": 1.20035808468667, "learning_rate": 1.957543093404309e-06, "loss": 0.1723, "step": 1465 },
    { "epoch": 2.115615255456944, "grad_norm": 1.2485627377889825, "learning_rate": 1.9287719846100366e-06, "loss": 0.1841, "step": 1470 },
    { "epoch": 2.1228112257136003, "grad_norm": 1.4758861123400375, "learning_rate": 1.900146552338222e-06, "loss": 0.1989, "step": 1475 },
    { "epoch": 2.1300071959702565, "grad_norm": 1.415660377393989, "learning_rate": 1.8716688099130336e-06, "loss": 0.1792, "step": 1480 },
    { "epoch": 2.137203166226913, "grad_norm": 1.1398390415745234, "learning_rate": 1.8433407602711122e-06, "loss": 0.1828, "step": 1485 },
    { "epoch": 2.1443991364835693, "grad_norm": 1.436825768706905, "learning_rate": 1.8151643958206963e-06, "loss": 0.1873, "step": 1490 },
    { "epoch": 2.1515951067402255, "grad_norm": 1.2111903866598819, "learning_rate": 1.7871416983014864e-06, "loss": 0.1747, "step": 1495 },
    { "epoch": 2.1587910769968817, "grad_norm": 1.592322648486121, "learning_rate": 1.7592746386452641e-06, "loss": 0.1981, "step": 1500 },
    { "epoch": 2.165987047253538, "grad_norm": 1.3381476033081696, "learning_rate": 1.7315651768372734e-06, "loss": 0.1752, "step": 1505 },
    { "epoch": 2.1731830175101945, "grad_norm": 1.5243125399513529, "learning_rate": 1.7040152617783607e-06, "loss": 0.1797, "step": 1510 },
    { "epoch": 2.1803789877668507, "grad_norm": 1.5148343638714192, "learning_rate": 1.6766268311479078e-06, "loss": 0.193, "step": 1515 },
    { "epoch": 2.187574958023507, "grad_norm": 1.315102374687142, "learning_rate": 1.649401811267546e-06, "loss": 0.1889, "step": 1520 },
    { "epoch": 2.194770928280163, "grad_norm": 1.5038597370043303, "learning_rate": 1.622342116965672e-06, "loss": 0.2193, "step": 1525 },
    { "epoch": 2.2019668985368193, "grad_norm": 1.3456620640508148, "learning_rate": 1.595449651442771e-06, "loss": 0.1842, "step": 1530 },
    { "epoch": 2.2091628687934755, "grad_norm": 1.3647300470767014, "learning_rate": 1.5687263061375595e-06, "loss": 0.1752, "step": 1535 },
    { "epoch": 2.216358839050132, "grad_norm": 1.417987485184227, "learning_rate": 1.5421739605939518e-06, "loss": 0.1728, "step": 1540 },
    { "epoch": 2.2235548093067883, "grad_norm": 1.5887020304276804, "learning_rate": 1.5157944823288672e-06, "loss": 0.1637, "step": 1545 },
    { "epoch": 2.2307507795634445, "grad_norm": 1.3375708672110973, "learning_rate": 1.4895897267008782e-06, "loss": 0.1792, "step": 1550 },
    { "epoch": 2.2379467498201007, "grad_norm": 1.3565700525423485, "learning_rate": 1.463561536779724e-06, "loss": 0.1921, "step": 1555 },
    { "epoch": 2.245142720076757, "grad_norm": 1.5551856772129453, "learning_rate": 1.4377117432166718e-06, "loss": 0.1618, "step": 1560 },
    { "epoch": 2.2523386903334135, "grad_norm": 1.2100448164204372, "learning_rate": 1.4120421641157662e-06, "loss": 0.1928, "step": 1565 },
    { "epoch": 2.2595346605900697, "grad_norm": 1.438877153368831, "learning_rate": 1.386554604905955e-06, "loss": 0.1774, "step": 1570 },
    { "epoch": 2.266730630846726, "grad_norm": 1.2780217507242704, "learning_rate": 1.3612508582141065e-06, "loss": 0.1871, "step": 1575 },
    { "epoch": 2.273926601103382, "grad_norm": 1.3558845492725387, "learning_rate": 1.3361327037389295e-06, "loss": 0.2018, "step": 1580 },
    { "epoch": 2.2811225713600383, "grad_norm": 1.3490250928179355, "learning_rate": 1.3112019081257986e-06, "loss": 0.1731, "step": 1585 },
    { "epoch": 2.2883185416166945, "grad_norm": 1.2405141654870557, "learning_rate": 1.2864602248425018e-06, "loss": 0.1886, "step": 1590 },
    { "epoch": 2.295514511873351, "grad_norm": 1.2873724354006912, "learning_rate": 1.2619093940559138e-06, "loss": 0.1868, "step": 1595 },
    { "epoch": 2.3027104821300073, "grad_norm": 1.3107124153475105, "learning_rate": 1.2375511425096013e-06, "loss": 0.187, "step": 1600 },
    { "epoch": 2.3099064523866635, "grad_norm": 1.3468010137925535, "learning_rate": 1.213387183402378e-06, "loss": 0.1771, "step": 1605 },
    { "epoch": 2.3171024226433197, "grad_norm": 1.4179240822671797, "learning_rate": 1.1894192162678086e-06, "loss": 0.1654, "step": 1610 },
    { "epoch": 2.324298392899976, "grad_norm": 1.3848546480668056, "learning_rate": 1.165648926854672e-06, "loss": 0.1838, "step": 1615 },
    { "epoch": 2.331494363156632, "grad_norm": 1.5195023852589002, "learning_rate": 1.1420779870084052e-06, "loss": 0.1955, "step": 1620 },
    { "epoch": 2.3386903334132887, "grad_norm": 1.2410727385995408, "learning_rate": 1.1187080545535064e-06, "loss": 0.1685, "step": 1625 },
    { "epoch": 2.345886303669945, "grad_norm": 1.1237477417805415, "learning_rate": 1.09554077317694e-06, "loss": 0.1824, "step": 1630 },
    { "epoch": 2.353082273926601, "grad_norm": 1.2937342096954545, "learning_rate": 1.0725777723125301e-06, "loss": 0.1943, "step": 1635 },
    { "epoch": 2.3602782441832573, "grad_norm": 1.2828779926698606, "learning_rate": 1.0498206670263567e-06, "loss": 0.1832, "step": 1640 },
    { "epoch": 2.3674742144399135, "grad_norm": 1.253425033010922, "learning_rate": 1.0272710579031616e-06, "loss": 0.2044, "step": 1645 },
    { "epoch": 2.37467018469657, "grad_norm": 1.3678742472737333, "learning_rate": 1.0049305309337758e-06, "loss": 0.1672, "step": 1650 },
    { "epoch": 2.3818661549532263, "grad_norm": 1.5542727998398753, "learning_rate": 9.82800657403569e-07, "loss": 0.1955, "step": 1655 },
    { "epoch": 2.3890621252098825, "grad_norm": 1.4017624513152087, "learning_rate": 9.60882993781937e-07, "loss": 0.1733, "step": 1660 },
    { "epoch": 2.3962580954665387, "grad_norm": 1.199342554533447, "learning_rate": 9.391790816128304e-07, "loss": 0.1649, "step": 1665 },
    { "epoch": 2.403454065723195, "grad_norm": 1.2335679341459465, "learning_rate": 9.176904474063319e-07, "loss": 0.198, "step": 1670 },
    { "epoch": 2.4106500359798515, "grad_norm": 1.4585828923578052, "learning_rate": 8.964186025312908e-07, "loss": 0.1988, "step": 1675 },
    { "epoch": 2.4178460062365077, "grad_norm": 1.4518660782918198, "learning_rate": 8.753650431090252e-07, "loss": 0.1701, "step": 1680 },
    { "epoch": 2.425041976493164, "grad_norm": 1.3322728405275928, "learning_rate": 8.545312499080922e-07, "loss": 0.1729, "step": 1685 },
    { "epoch": 2.43223794674982, "grad_norm": 1.3067316057050342, "learning_rate": 8.339186882401445e-07, "loss": 0.1874, "step": 1690 },
    { "epoch": 2.4394339170064763, "grad_norm": 1.4177678336114292, "learning_rate": 8.135288078568656e-07, "loss": 0.2021, "step": 1695 },
    { "epoch": 2.4466298872631325, "grad_norm": 1.3121080863750958, "learning_rate": 7.933630428480049e-07, "loss": 0.1699, "step": 1700 },
    { "epoch": 2.453825857519789, "grad_norm": 1.3185780885959946, "learning_rate": 7.734228115405161e-07, "loss": 0.1624, "step": 1705 },
    { "epoch": 2.4610218277764453, "grad_norm": 1.33019533604804, "learning_rate": 7.537095163987972e-07, "loss": 0.1784, "step": 1710 },
    { "epoch": 2.4682177980331015, "grad_norm": 1.3853774517952444, "learning_rate": 7.342245439260537e-07, "loss": 0.1824, "step": 1715 },
    { "epoch": 2.4754137682897577, "grad_norm": 1.1804687459435843, "learning_rate": 7.149692645667804e-07, "loss": 0.1693, "step": 1720 },
    { "epoch": 2.482609738546414, "grad_norm": 1.250231314429457, "learning_rate": 6.959450326103722e-07, "loss": 0.2067, "step": 1725 },
    { "epoch": 2.48980570880307, "grad_norm": 1.3184620916504868, "learning_rate": 6.771531860958726e-07, "loss": 0.1557, "step": 1730 },
    { "epoch": 2.4970016790597267, "grad_norm": 1.3996911523285738, "learning_rate": 6.585950467178656e-07, "loss": 0.1984, "step": 1735 },
    { "epoch": 2.504197649316383, "grad_norm": 1.330732277956789, "learning_rate": 6.402719197335181e-07, "loss": 0.1656, "step": 1740 },
    { "epoch": 2.511393619573039, "grad_norm": 1.3782406997114114, "learning_rate": 6.22185093870772e-07, "loss": 0.1669, "step": 1745 },
    { "epoch": 2.5185895898296953, "grad_norm": 1.4431968846802443, "learning_rate": 6.043358412377069e-07, "loss": 0.1799, "step": 1750 },
    { "epoch": 2.5257855600863515, "grad_norm": 1.1865288276002492, "learning_rate": 5.867254172330689e-07, "loss": 0.1614, "step": 1755 },
    { "epoch": 2.532981530343008, "grad_norm": 1.3447844251083265, "learning_rate": 5.693550604579722e-07, "loss": 0.1761, "step": 1760 },
    { "epoch": 2.5401775005996643, "grad_norm": 1.312290863998097, "learning_rate": 5.52225992628784e-07, "loss": 0.175, "step": 1765 },
    { "epoch": 2.5473734708563205, "grad_norm": 1.325480546799902, "learning_rate": 5.353394184912012e-07, "loss": 0.1893, "step": 1770 },
    { "epoch": 2.5545694411129767, "grad_norm": 1.211006197074522, "learning_rate": 5.186965257355092e-07, "loss": 0.1738, "step": 1775 },
    { "epoch": 2.561765411369633, "grad_norm": 1.2613128106853304, "learning_rate": 5.022984849130542e-07, "loss": 0.1735, "step": 1780 },
    { "epoch": 2.5689613816262895, "grad_norm": 1.4240080375407917, "learning_rate": 4.861464493539116e-07, "loss": 0.209, "step": 1785 },
    { "epoch": 2.5761573518829457, "grad_norm": 1.212642870699417, "learning_rate": 4.702415550857668e-07, "loss": 0.1661, "step": 1790 },
    { "epoch": 2.583353322139602, "grad_norm": 1.19899124906289, "learning_rate": 4.5458492075401845e-07, "loss": 0.1871, "step": 1795 },
    { "epoch": 2.590549292396258, "grad_norm": 1.2451776201467897, "learning_rate": 4.391776475430964e-07, "loss": 0.1736, "step": 1800 },
    { "epoch": 2.5977452626529143, "grad_norm": 1.4217111682942414, "learning_rate": 4.240208190990149e-07, "loss": 0.1656, "step": 1805 },
    { "epoch": 2.604941232909571, "grad_norm": 1.154023125578338, "learning_rate": 4.0911550145315356e-07, "loss": 0.176, "step": 1810 },
    { "epoch": 2.6121372031662267, "grad_norm": 1.2517982852871838, "learning_rate": 3.944627429472809e-07, "loss": 0.168, "step": 1815 },
    { "epoch": 2.6193331734228833, "grad_norm": 1.3001175217867729, "learning_rate": 3.8006357415981947e-07, "loss": 0.1582, "step": 1820 },
    { "epoch": 2.6265291436795395, "grad_norm": 1.4179539106113206, "learning_rate": 3.659190078333667e-07, "loss": 0.1901, "step": 1825 },
    { "epoch": 2.6337251139361957, "grad_norm": 1.2865481274768071, "learning_rate": 3.5203003880345786e-07, "loss": 0.1825, "step": 1830 },
    { "epoch": 2.640921084192852, "grad_norm": 1.2107327771575902, "learning_rate": 3.383976439286007e-07, "loss": 0.178, "step": 1835 },
    { "epoch": 2.648117054449508, "grad_norm": 1.4930579520298934, "learning_rate": 3.250227820215694e-07, "loss": 0.1795,
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 2.6553130247061647, |
|
"grad_norm": 1.7580144453795274, |
|
"learning_rate": 3.119063937819666e-07, |
|
"loss": 0.1988, |
|
"step": 1845 |
|
}, |
|
{ |
|
"epoch": 2.662508994962821, |
|
"grad_norm": 1.389677232858989, |
|
"learning_rate": 2.990494017300604e-07, |
|
"loss": 0.189, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 2.669704965219477, |
|
"grad_norm": 1.4778063736068945, |
|
"learning_rate": 2.864527101419032e-07, |
|
"loss": 0.2053, |
|
"step": 1855 |
|
}, |
|
{ |
|
"epoch": 2.6769009354761333, |
|
"grad_norm": 1.2577420076989798, |
|
"learning_rate": 2.7411720498572744e-07, |
|
"loss": 0.1917, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 2.6840969057327895, |
|
"grad_norm": 1.5130645195940433, |
|
"learning_rate": 2.6204375385963494e-07, |
|
"loss": 0.161, |
|
"step": 1865 |
|
}, |
|
{ |
|
"epoch": 2.691292875989446, |
|
"grad_norm": 1.080302530956707, |
|
"learning_rate": 2.502332059305745e-07, |
|
"loss": 0.1752, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 2.6984888462461023, |
|
"grad_norm": 1.306131392662643, |
|
"learning_rate": 2.386863918746167e-07, |
|
"loss": 0.1968, |
|
"step": 1875 |
|
}, |
|
{ |
|
"epoch": 2.7056848165027585, |
|
"grad_norm": 1.3461515984684975, |
|
"learning_rate": 2.2740412381853223e-07, |
|
"loss": 0.183, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 2.7128807867594147, |
|
"grad_norm": 1.4486810753503954, |
|
"learning_rate": 2.1638719528266835e-07, |
|
"loss": 0.1938, |
|
"step": 1885 |
|
}, |
|
{ |
|
"epoch": 2.720076757016071, |
|
"grad_norm": 1.035281562927121, |
|
"learning_rate": 2.0563638112514047e-07, |
|
"loss": 0.1823, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 2.7272727272727275, |
|
"grad_norm": 1.4407581503306328, |
|
"learning_rate": 1.9515243748733455e-07, |
|
"loss": 0.1648, |
|
"step": 1895 |
|
}, |
|
{ |
|
"epoch": 2.7344686975293837, |
|
"grad_norm": 1.2289916037617492, |
|
"learning_rate": 1.8493610174072248e-07, |
|
"loss": 0.1716, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 2.74166466778604, |
|
"grad_norm": 1.1641895371111006, |
|
"learning_rate": 1.7498809243500133e-07, |
|
"loss": 0.1659, |
|
"step": 1905 |
|
}, |
|
{ |
|
"epoch": 2.748860638042696, |
|
"grad_norm": 1.2177853046541605, |
|
"learning_rate": 1.6530910924755603e-07, |
|
"loss": 0.1905, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 2.7560566082993523, |
|
"grad_norm": 1.373432351937655, |
|
"learning_rate": 1.5589983293424802e-07, |
|
"loss": 0.1948, |
|
"step": 1915 |
|
}, |
|
{ |
|
"epoch": 2.7632525785560085, |
|
"grad_norm": 1.2223263222194338, |
|
"learning_rate": 1.4676092528153495e-07, |
|
"loss": 0.1635, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 2.7704485488126647, |
|
"grad_norm": 1.323423227538223, |
|
"learning_rate": 1.378930290599265e-07, |
|
"loss": 0.1941, |
|
"step": 1925 |
|
}, |
|
{ |
|
"epoch": 2.7776445190693213, |
|
"grad_norm": 1.1747342118230812, |
|
"learning_rate": 1.29296767978774e-07, |
|
"loss": 0.1556, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 2.7848404893259775, |
|
"grad_norm": 1.4551159106122102, |
|
"learning_rate": 1.2097274664240486e-07, |
|
"loss": 0.1778, |
|
"step": 1935 |
|
}, |
|
{ |
|
"epoch": 2.7920364595826337, |
|
"grad_norm": 1.2601624341009796, |
|
"learning_rate": 1.1292155050759689e-07, |
|
"loss": 0.183, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 2.79923242983929, |
|
"grad_norm": 1.2755552581393579, |
|
"learning_rate": 1.0514374584240338e-07, |
|
"loss": 0.1623, |
|
"step": 1945 |
|
}, |
|
{ |
|
"epoch": 2.806428400095946, |
|
"grad_norm": 1.2967219351574655, |
|
"learning_rate": 9.763987968632293e-08, |
|
"loss": 0.1895, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 2.8136243703526027, |
|
"grad_norm": 1.5216756650995487, |
|
"learning_rate": 9.04104798118257e-08, |
|
"loss": 0.18, |
|
"step": 1955 |
|
}, |
|
{ |
|
"epoch": 2.820820340609259, |
|
"grad_norm": 1.2463724400999108, |
|
"learning_rate": 8.345605468723427e-08, |
|
"loss": 0.1855, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 2.828016310865915, |
|
"grad_norm": 1.3612982361619894, |
|
"learning_rate": 7.677709344095883e-08, |
|
"loss": 0.1971, |
|
"step": 1965 |
|
}, |
|
{ |
|
"epoch": 2.8352122811225713, |
|
"grad_norm": 1.1485240309542117, |
|
"learning_rate": 7.037406582709815e-08, |
|
"loss": 0.1673, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 2.8424082513792275, |
|
"grad_norm": 1.129849173373603, |
|
"learning_rate": 6.424742219239698e-08, |
|
"loss": 0.1688, |
|
"step": 1975 |
|
}, |
|
{ |
|
"epoch": 2.849604221635884, |
|
"grad_norm": 1.271858319859489, |
|
"learning_rate": 5.839759344457462e-08, |
|
"loss": 0.1864, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 2.8568001918925403, |
|
"grad_norm": 1.3695271997638596, |
|
"learning_rate": 5.282499102201532e-08, |
|
"loss": 0.182, |
|
"step": 1985 |
|
}, |
|
{ |
|
"epoch": 2.8639961621491965, |
|
"grad_norm": 1.2817216551660306, |
|
"learning_rate": 4.753000686483189e-08, |
|
"loss": 0.191, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 2.8711921324058527, |
|
"grad_norm": 1.2963119208915668, |
|
"learning_rate": 4.2513013387298846e-08, |
|
"loss": 0.1877, |
|
"step": 1995 |
|
}, |
|
{ |
|
"epoch": 2.878388102662509, |
|
"grad_norm": 1.290101189771667, |
|
"learning_rate": 3.7774363451658744e-08, |
|
"loss": 0.1796, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 2.8855840729191655, |
|
"grad_norm": 1.4819813983719878, |
|
"learning_rate": 3.331439034330552e-08, |
|
"loss": 0.1763, |
|
"step": 2005 |
|
}, |
|
{ |
|
"epoch": 2.8927800431758213, |
|
"grad_norm": 1.4019557468116053, |
|
"learning_rate": 2.913340774734152e-08, |
|
"loss": 0.1708, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 2.899976013432478, |
|
"grad_norm": 1.2339439577357936, |
|
"learning_rate": 2.5231709726516005e-08, |
|
"loss": 0.1789, |
|
"step": 2015 |
|
}, |
|
{ |
|
"epoch": 2.907171983689134, |
|
"grad_norm": 1.2294457543053794, |
|
"learning_rate": 2.1609570700543478e-08, |
|
"loss": 0.1575, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 2.9143679539457903, |
|
"grad_norm": 1.6910934029285385, |
|
"learning_rate": 1.826724542680047e-08, |
|
"loss": 0.1853, |
|
"step": 2025 |
|
}, |
|
{ |
|
"epoch": 2.9215639242024465, |
|
"grad_norm": 1.5951062337979125, |
|
"learning_rate": 1.5204968982410527e-08, |
|
"loss": 0.1994, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 2.9287598944591027, |
|
"grad_norm": 1.244634963059678, |
|
"learning_rate": 1.2422956747708546e-08, |
|
"loss": 0.1792, |
|
"step": 2035 |
|
}, |
|
{ |
|
"epoch": 2.9359558647157593, |
|
"grad_norm": 1.3731755256138354, |
|
"learning_rate": 9.92140439109157e-09, |
|
"loss": 0.1855, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 2.9431518349724155, |
|
"grad_norm": 1.487158498377204, |
|
"learning_rate": 7.700487855260007e-09, |
|
"loss": 0.1713, |
|
"step": 2045 |
|
}, |
|
{ |
|
"epoch": 2.9503478052290717, |
|
"grad_norm": 1.195702473189198, |
|
"learning_rate": 5.760363344839536e-09, |
|
"loss": 0.1756, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 2.957543775485728, |
|
"grad_norm": 1.2219249635030713, |
|
"learning_rate": 4.101167315396559e-09, |
|
"loss": 0.1705, |
|
"step": 2055 |
|
}, |
|
{ |
|
"epoch": 2.964739745742384, |
|
"grad_norm": 1.1937579061988086, |
|
"learning_rate": 2.7230164638401e-09, |
|
"loss": 0.1669, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 2.9719357159990407, |
|
"grad_norm": 1.0911166566357549, |
|
"learning_rate": 1.626007720214595e-09, |
|
"loss": 0.1556, |
|
"step": 2065 |
|
}, |
|
{ |
|
"epoch": 2.979131686255697, |
|
"grad_norm": 1.155309788494306, |
|
"learning_rate": 8.102182408822322e-10, |
|
"loss": 0.1572, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 2.986327656512353, |
|
"grad_norm": 1.1809384871247852, |
|
"learning_rate": 2.7570540309618253e-10, |
|
"loss": 0.1651, |
|
"step": 2075 |
|
}, |
|
{ |
|
"epoch": 2.9935236267690093, |
|
"grad_norm": 2.41813981269854, |
|
"learning_rate": 2.2506800965604867e-11, |
|
"loss": 0.1452, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 2.9964020148716717, |
|
"step": 2082, |
|
"total_flos": 5.073775214995702e+17, |
|
"train_loss": 0.5172660127031643, |
|
"train_runtime": 62603.1223, |
|
"train_samples_per_second": 3.196, |
|
"train_steps_per_second": 0.033 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 2082, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 700, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 5.073775214995702e+17, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|