{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 20, "global_step": 12178, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0016424069473813874, "grad_norm": 0.4667005240917206, "learning_rate": 0.0002, "loss": 1.9661, "step": 20 }, { "epoch": 0.003284813894762775, "grad_norm": 0.5031771063804626, "learning_rate": 0.0002, "loss": 1.602, "step": 40 }, { "epoch": 0.004927220842144162, "grad_norm": 0.4090685546398163, "learning_rate": 0.0002, "loss": 1.4703, "step": 60 }, { "epoch": 0.00656962778952555, "grad_norm": 0.4099690020084381, "learning_rate": 0.0002, "loss": 1.3652, "step": 80 }, { "epoch": 0.008212034736906937, "grad_norm": 0.4610142111778259, "learning_rate": 0.0002, "loss": 1.4386, "step": 100 }, { "epoch": 0.009854441684288324, "grad_norm": 0.3908289968967438, "learning_rate": 0.0002, "loss": 1.3151, "step": 120 }, { "epoch": 0.011496848631669712, "grad_norm": 0.4541659951210022, "learning_rate": 0.0002, "loss": 1.1233, "step": 140 }, { "epoch": 0.0131392555790511, "grad_norm": 0.43324407935142517, "learning_rate": 0.0002, "loss": 1.1266, "step": 160 }, { "epoch": 0.014781662526432487, "grad_norm": 0.3396519720554352, "learning_rate": 0.0002, "loss": 1.1004, "step": 180 }, { "epoch": 0.016424069473813873, "grad_norm": 0.5125846266746521, "learning_rate": 0.0002, "loss": 1.1258, "step": 200 }, { "epoch": 0.01806647642119526, "grad_norm": 0.4572688937187195, "learning_rate": 0.0002, "loss": 1.1796, "step": 220 }, { "epoch": 0.01970888336857665, "grad_norm": 0.434186190366745, "learning_rate": 0.0002, "loss": 1.1016, "step": 240 }, { "epoch": 0.021351290315958036, "grad_norm": 0.5205552577972412, "learning_rate": 0.0002, "loss": 1.0419, "step": 260 }, { "epoch": 0.022993697263339424, "grad_norm": 0.3958785831928253, "learning_rate": 0.0002, "loss": 0.9515, "step": 280 }, { "epoch": 0.02463610421072081, "grad_norm": 0.46327391266822815, "learning_rate": 0.0002, "loss": 1.0079, "step": 300 }, { "epoch": 0.0262785111581022, "grad_norm": 0.39861008524894714, "learning_rate": 0.0002, "loss": 0.9755, "step": 320 }, { "epoch": 0.027920918105483587, "grad_norm": 0.42074650526046753, "learning_rate": 0.0002, "loss": 0.9435, "step": 340 }, { "epoch": 0.029563325052864974, "grad_norm": 0.41754183173179626, "learning_rate": 0.0002, "loss": 0.9376, "step": 360 }, { "epoch": 0.031205732000246362, "grad_norm": 0.3933572769165039, "learning_rate": 0.0002, "loss": 0.9489, "step": 380 }, { "epoch": 0.032848138947627746, "grad_norm": 0.4244033992290497, "learning_rate": 0.0002, "loss": 0.9759, "step": 400 }, { "epoch": 0.034490545895009134, "grad_norm": 0.3638761639595032, "learning_rate": 0.0002, "loss": 0.9371, "step": 420 }, { "epoch": 0.03613295284239052, "grad_norm": 0.4706399738788605, "learning_rate": 0.0002, "loss": 0.8464, "step": 440 }, { "epoch": 0.03777535978977191, "grad_norm": 0.4349803328514099, "learning_rate": 0.0002, "loss": 0.8918, "step": 460 }, { "epoch": 0.0394177667371533, "grad_norm": 0.3831111490726471, "learning_rate": 0.0002, "loss": 0.8366, "step": 480 }, { "epoch": 0.041060173684534684, "grad_norm": 0.4122432470321655, "learning_rate": 0.0002, "loss": 0.8444, "step": 500 }, { "epoch": 0.04270258063191607, "grad_norm": 0.3296256959438324, "learning_rate": 0.0002, "loss": 0.8301, "step": 520 }, { "epoch": 0.04434498757929746, "grad_norm": 0.3447166979312897, "learning_rate": 0.0002, "loss": 0.857, "step": 540 }, { "epoch": 0.04598739452667885, "grad_norm": 0.4408610761165619, "learning_rate": 0.0002, "loss": 0.8356, "step": 560 }, { "epoch": 0.047629801474060235, "grad_norm": 0.4657248854637146, "learning_rate": 0.0002, "loss": 0.7525, "step": 580 }, { "epoch": 0.04927220842144162, "grad_norm": 0.35138434171676636, "learning_rate": 0.0002, "loss": 0.7486, "step": 600 }, { "epoch": 0.05091461536882301, "grad_norm": 0.4687822461128235, "learning_rate": 0.0002, "loss": 0.8169, "step": 620 }, { "epoch": 0.0525570223162044, "grad_norm": 0.465108186006546, "learning_rate": 0.0002, "loss": 0.738, "step": 640 }, { "epoch": 0.054199429263585785, "grad_norm": 0.3954925835132599, "learning_rate": 0.0002, "loss": 0.7627, "step": 660 }, { "epoch": 0.05584183621096717, "grad_norm": 0.5010778307914734, "learning_rate": 0.0002, "loss": 0.7273, "step": 680 }, { "epoch": 0.05748424315834856, "grad_norm": 0.6221648454666138, "learning_rate": 0.0002, "loss": 0.7506, "step": 700 }, { "epoch": 0.05912665010572995, "grad_norm": 0.4075715243816376, "learning_rate": 0.0002, "loss": 0.7587, "step": 720 }, { "epoch": 0.060769057053111336, "grad_norm": 0.4346787631511688, "learning_rate": 0.0002, "loss": 0.7627, "step": 740 }, { "epoch": 0.062411464000492724, "grad_norm": 0.4146323800086975, "learning_rate": 0.0002, "loss": 0.6642, "step": 760 }, { "epoch": 0.06405387094787411, "grad_norm": 0.4093219041824341, "learning_rate": 0.0002, "loss": 0.7148, "step": 780 }, { "epoch": 0.06569627789525549, "grad_norm": 0.4016498327255249, "learning_rate": 0.0002, "loss": 0.6522, "step": 800 }, { "epoch": 0.06733868484263689, "grad_norm": 0.436252236366272, "learning_rate": 0.0002, "loss": 0.6884, "step": 820 }, { "epoch": 0.06898109179001827, "grad_norm": 0.4362093508243561, "learning_rate": 0.0002, "loss": 0.7185, "step": 840 }, { "epoch": 0.07062349873739966, "grad_norm": 0.42092448472976685, "learning_rate": 0.0002, "loss": 0.6702, "step": 860 }, { "epoch": 0.07226590568478104, "grad_norm": 0.4649953842163086, "learning_rate": 0.0002, "loss": 0.6753, "step": 880 }, { "epoch": 0.07390831263216244, "grad_norm": 0.4321405589580536, "learning_rate": 0.0002, "loss": 0.6578, "step": 900 }, { "epoch": 0.07555071957954382, "grad_norm": 0.5045340657234192, "learning_rate": 0.0002, "loss": 0.6993, "step": 920 }, { "epoch": 0.07719312652692521, "grad_norm": 0.5063377022743225, "learning_rate": 0.0002, "loss": 0.6654, "step": 940 }, { "epoch": 0.0788355334743066, "grad_norm": 0.41710513830184937, "learning_rate": 0.0002, "loss": 0.6264, "step": 960 }, { "epoch": 0.08047794042168799, "grad_norm": 0.4204249083995819, "learning_rate": 0.0002, "loss": 0.6683, "step": 980 }, { "epoch": 0.08212034736906937, "grad_norm": 0.44983726739883423, "learning_rate": 0.0002, "loss": 0.6592, "step": 1000 }, { "epoch": 0.08376275431645076, "grad_norm": 0.5991094708442688, "learning_rate": 0.0002, "loss": 0.6197, "step": 1020 }, { "epoch": 0.08540516126383214, "grad_norm": 0.3672972619533539, "learning_rate": 0.0002, "loss": 0.5656, "step": 1040 }, { "epoch": 0.08704756821121354, "grad_norm": 0.503656804561615, "learning_rate": 0.0002, "loss": 0.6017, "step": 1060 }, { "epoch": 0.08868997515859492, "grad_norm": 0.49204686284065247, "learning_rate": 0.0002, "loss": 0.6421, "step": 1080 }, { "epoch": 0.09033238210597631, "grad_norm": 0.45617127418518066, "learning_rate": 0.0002, "loss": 0.6176, "step": 1100 }, { "epoch": 0.0919747890533577, "grad_norm": 0.49607595801353455, "learning_rate": 0.0002, "loss": 0.5595, "step": 1120 }, { "epoch": 0.09361719600073909, "grad_norm": 0.39171984791755676, "learning_rate": 0.0002, "loss": 0.5479, "step": 1140 }, { "epoch": 0.09525960294812047, "grad_norm": 0.4964667558670044, "learning_rate": 0.0002, "loss": 0.5937, "step": 1160 }, { "epoch": 0.09690200989550186, "grad_norm": 0.40392565727233887, "learning_rate": 0.0002, "loss": 0.5888, "step": 1180 }, { "epoch": 0.09854441684288325, "grad_norm": 0.4721887409687042, "learning_rate": 0.0002, "loss": 0.5345, "step": 1200 }, { "epoch": 0.10018682379026464, "grad_norm": 0.4130144417285919, "learning_rate": 0.0002, "loss": 0.599, "step": 1220 }, { "epoch": 0.10182923073764602, "grad_norm": 0.4222985506057739, "learning_rate": 0.0002, "loss": 0.5762, "step": 1240 }, { "epoch": 0.1034716376850274, "grad_norm": 0.47171750664711, "learning_rate": 0.0002, "loss": 0.5619, "step": 1260 }, { "epoch": 0.1051140446324088, "grad_norm": 0.40906137228012085, "learning_rate": 0.0002, "loss": 0.5137, "step": 1280 }, { "epoch": 0.10675645157979018, "grad_norm": 0.43774527311325073, "learning_rate": 0.0002, "loss": 0.5888, "step": 1300 }, { "epoch": 0.10839885852717157, "grad_norm": 0.5423911213874817, "learning_rate": 0.0002, "loss": 0.5409, "step": 1320 }, { "epoch": 0.11004126547455295, "grad_norm": 0.4405030906200409, "learning_rate": 0.0002, "loss": 0.5248, "step": 1340 }, { "epoch": 0.11168367242193435, "grad_norm": 0.4299491345882416, "learning_rate": 0.0002, "loss": 0.5196, "step": 1360 }, { "epoch": 0.11332607936931573, "grad_norm": 0.5445800423622131, "learning_rate": 0.0002, "loss": 0.5524, "step": 1380 }, { "epoch": 0.11496848631669712, "grad_norm": 0.42257580161094666, "learning_rate": 0.0002, "loss": 0.5266, "step": 1400 }, { "epoch": 0.1166108932640785, "grad_norm": 0.4614318907260895, "learning_rate": 0.0002, "loss": 0.5593, "step": 1420 }, { "epoch": 0.1182533002114599, "grad_norm": 0.5021907687187195, "learning_rate": 0.0002, "loss": 0.5183, "step": 1440 }, { "epoch": 0.11989570715884128, "grad_norm": 0.39399659633636475, "learning_rate": 0.0002, "loss": 0.516, "step": 1460 }, { "epoch": 0.12153811410622267, "grad_norm": 0.5128427743911743, "learning_rate": 0.0002, "loss": 0.5067, "step": 1480 }, { "epoch": 0.12318052105360405, "grad_norm": 0.41359153389930725, "learning_rate": 0.0002, "loss": 0.508, "step": 1500 }, { "epoch": 0.12482292800098545, "grad_norm": 0.5723029375076294, "learning_rate": 0.0002, "loss": 0.4955, "step": 1520 }, { "epoch": 0.12646533494836684, "grad_norm": 0.4619792699813843, "learning_rate": 0.0002, "loss": 0.5398, "step": 1540 }, { "epoch": 0.12810774189574822, "grad_norm": 0.5200566649436951, "learning_rate": 0.0002, "loss": 0.5213, "step": 1560 }, { "epoch": 0.1297501488431296, "grad_norm": 0.4156297445297241, "learning_rate": 0.0002, "loss": 0.4895, "step": 1580 }, { "epoch": 0.13139255579051098, "grad_norm": 0.43649184703826904, "learning_rate": 0.0002, "loss": 0.4809, "step": 1600 }, { "epoch": 0.1330349627378924, "grad_norm": 0.38926875591278076, "learning_rate": 0.0002, "loss": 0.4819, "step": 1620 }, { "epoch": 0.13467736968527377, "grad_norm": 0.45897549390792847, "learning_rate": 0.0002, "loss": 0.4619, "step": 1640 }, { "epoch": 0.13631977663265515, "grad_norm": 0.4487549364566803, "learning_rate": 0.0002, "loss": 0.4737, "step": 1660 }, { "epoch": 0.13796218358003653, "grad_norm": 0.36948007345199585, "learning_rate": 0.0002, "loss": 0.4576, "step": 1680 }, { "epoch": 0.13960459052741794, "grad_norm": 0.38834378123283386, "learning_rate": 0.0002, "loss": 0.4464, "step": 1700 }, { "epoch": 0.14124699747479932, "grad_norm": 0.5436655879020691, "learning_rate": 0.0002, "loss": 0.4616, "step": 1720 }, { "epoch": 0.1428894044221807, "grad_norm": 0.3576355278491974, "learning_rate": 0.0002, "loss": 0.4669, "step": 1740 }, { "epoch": 0.14453181136956209, "grad_norm": 0.4736698269844055, "learning_rate": 0.0002, "loss": 0.4788, "step": 1760 }, { "epoch": 0.1461742183169435, "grad_norm": 0.4074772596359253, "learning_rate": 0.0002, "loss": 0.4214, "step": 1780 }, { "epoch": 0.14781662526432487, "grad_norm": 0.4454910457134247, "learning_rate": 0.0002, "loss": 0.4407, "step": 1800 }, { "epoch": 0.14945903221170626, "grad_norm": 0.4039610028266907, "learning_rate": 0.0002, "loss": 0.4585, "step": 1820 }, { "epoch": 0.15110143915908764, "grad_norm": 0.4431604743003845, "learning_rate": 0.0002, "loss": 0.4483, "step": 1840 }, { "epoch": 0.15274384610646902, "grad_norm": 0.4190782606601715, "learning_rate": 0.0002, "loss": 0.4516, "step": 1860 }, { "epoch": 0.15438625305385043, "grad_norm": 0.2951456606388092, "learning_rate": 0.0002, "loss": 0.4584, "step": 1880 }, { "epoch": 0.1560286600012318, "grad_norm": 0.4400006830692291, "learning_rate": 0.0002, "loss": 0.4533, "step": 1900 }, { "epoch": 0.1576710669486132, "grad_norm": 0.3839446008205414, "learning_rate": 0.0002, "loss": 0.4489, "step": 1920 }, { "epoch": 0.15931347389599457, "grad_norm": 0.41484808921813965, "learning_rate": 0.0002, "loss": 0.422, "step": 1940 }, { "epoch": 0.16095588084337598, "grad_norm": 0.5211725831031799, "learning_rate": 0.0002, "loss": 0.4379, "step": 1960 }, { "epoch": 0.16259828779075736, "grad_norm": 0.3866327106952667, "learning_rate": 0.0002, "loss": 0.4279, "step": 1980 }, { "epoch": 0.16424069473813874, "grad_norm": 0.3327186107635498, "learning_rate": 0.0002, "loss": 0.417, "step": 2000 }, { "epoch": 0.16588310168552012, "grad_norm": 0.46427205204963684, "learning_rate": 0.0002, "loss": 0.4411, "step": 2020 }, { "epoch": 0.16752550863290153, "grad_norm": 0.4826524257659912, "learning_rate": 0.0002, "loss": 0.4359, "step": 2040 }, { "epoch": 0.1691679155802829, "grad_norm": 0.4641328454017639, "learning_rate": 0.0002, "loss": 0.4691, "step": 2060 }, { "epoch": 0.1708103225276643, "grad_norm": 0.525749683380127, "learning_rate": 0.0002, "loss": 0.4297, "step": 2080 }, { "epoch": 0.17245272947504567, "grad_norm": 0.45604804158210754, "learning_rate": 0.0002, "loss": 0.4411, "step": 2100 }, { "epoch": 0.17409513642242708, "grad_norm": 0.3894326984882355, "learning_rate": 0.0002, "loss": 0.4098, "step": 2120 }, { "epoch": 0.17573754336980846, "grad_norm": 0.34401944279670715, "learning_rate": 0.0002, "loss": 0.406, "step": 2140 }, { "epoch": 0.17737995031718984, "grad_norm": 0.3576812148094177, "learning_rate": 0.0002, "loss": 0.4024, "step": 2160 }, { "epoch": 0.17902235726457122, "grad_norm": 0.4276871979236603, "learning_rate": 0.0002, "loss": 0.4085, "step": 2180 }, { "epoch": 0.18066476421195263, "grad_norm": 0.49007973074913025, "learning_rate": 0.0002, "loss": 0.4104, "step": 2200 }, { "epoch": 0.182307171159334, "grad_norm": 0.4573257267475128, "learning_rate": 0.0002, "loss": 0.4041, "step": 2220 }, { "epoch": 0.1839495781067154, "grad_norm": 0.4118468463420868, "learning_rate": 0.0002, "loss": 0.3984, "step": 2240 }, { "epoch": 0.18559198505409677, "grad_norm": 0.357284277677536, "learning_rate": 0.0002, "loss": 0.4212, "step": 2260 }, { "epoch": 0.18723439200147818, "grad_norm": 0.4252781867980957, "learning_rate": 0.0002, "loss": 0.3924, "step": 2280 }, { "epoch": 0.18887679894885956, "grad_norm": 0.40546557307243347, "learning_rate": 0.0002, "loss": 0.398, "step": 2300 }, { "epoch": 0.19051920589624094, "grad_norm": 0.4305673837661743, "learning_rate": 0.0002, "loss": 0.398, "step": 2320 }, { "epoch": 0.19216161284362232, "grad_norm": 0.40348726511001587, "learning_rate": 0.0002, "loss": 0.4031, "step": 2340 }, { "epoch": 0.19380401979100373, "grad_norm": 0.48159924149513245, "learning_rate": 0.0002, "loss": 0.3926, "step": 2360 }, { "epoch": 0.1954464267383851, "grad_norm": 0.5939348936080933, "learning_rate": 0.0002, "loss": 0.3963, "step": 2380 }, { "epoch": 0.1970888336857665, "grad_norm": 0.42593804001808167, "learning_rate": 0.0002, "loss": 0.3925, "step": 2400 }, { "epoch": 0.19873124063314787, "grad_norm": 0.515277624130249, "learning_rate": 0.0002, "loss": 0.3753, "step": 2420 }, { "epoch": 0.20037364758052928, "grad_norm": 0.43423864245414734, "learning_rate": 0.0002, "loss": 0.396, "step": 2440 }, { "epoch": 0.20201605452791066, "grad_norm": 0.3857817053794861, "learning_rate": 0.0002, "loss": 0.3834, "step": 2460 }, { "epoch": 0.20365846147529204, "grad_norm": 0.3945648670196533, "learning_rate": 0.0002, "loss": 0.3768, "step": 2480 }, { "epoch": 0.20530086842267342, "grad_norm": 0.46411946415901184, "learning_rate": 0.0002, "loss": 0.3852, "step": 2500 }, { "epoch": 0.2069432753700548, "grad_norm": 0.3779551684856415, "learning_rate": 0.0002, "loss": 0.3767, "step": 2520 }, { "epoch": 0.2085856823174362, "grad_norm": 0.4743368625640869, "learning_rate": 0.0002, "loss": 0.4253, "step": 2540 }, { "epoch": 0.2102280892648176, "grad_norm": 0.4278275668621063, "learning_rate": 0.0002, "loss": 0.3558, "step": 2560 }, { "epoch": 0.21187049621219897, "grad_norm": 0.42412903904914856, "learning_rate": 0.0002, "loss": 0.3934, "step": 2580 }, { "epoch": 0.21351290315958035, "grad_norm": 7.02437162399292, "learning_rate": 0.0002, "loss": 0.3972, "step": 2600 }, { "epoch": 0.21515531010696176, "grad_norm": 0.46447402238845825, "learning_rate": 0.0002, "loss": 0.3742, "step": 2620 }, { "epoch": 0.21679771705434314, "grad_norm": 0.4078330993652344, "learning_rate": 0.0002, "loss": 0.3954, "step": 2640 }, { "epoch": 0.21844012400172452, "grad_norm": 0.39751455187797546, "learning_rate": 0.0002, "loss": 0.36, "step": 2660 }, { "epoch": 0.2200825309491059, "grad_norm": 0.4075968265533447, "learning_rate": 0.0002, "loss": 0.3894, "step": 2680 }, { "epoch": 0.2217249378964873, "grad_norm": 0.39630162715911865, "learning_rate": 0.0002, "loss": 0.3748, "step": 2700 }, { "epoch": 0.2233673448438687, "grad_norm": 0.42885056138038635, "learning_rate": 0.0002, "loss": 0.3496, "step": 2720 }, { "epoch": 0.22500975179125007, "grad_norm": 0.4635525941848755, "learning_rate": 0.0002, "loss": 0.3494, "step": 2740 }, { "epoch": 0.22665215873863145, "grad_norm": 0.48458898067474365, "learning_rate": 0.0002, "loss": 0.387, "step": 2760 }, { "epoch": 0.22829456568601286, "grad_norm": 0.49742501974105835, "learning_rate": 0.0002, "loss": 0.3717, "step": 2780 }, { "epoch": 0.22993697263339424, "grad_norm": 0.4279645085334778, "learning_rate": 0.0002, "loss": 0.3537, "step": 2800 }, { "epoch": 0.23157937958077562, "grad_norm": 0.5221889615058899, "learning_rate": 0.0002, "loss": 0.3676, "step": 2820 }, { "epoch": 0.233221786528157, "grad_norm": 0.5390656590461731, "learning_rate": 0.0002, "loss": 0.3439, "step": 2840 }, { "epoch": 0.2348641934755384, "grad_norm": 0.4269630014896393, "learning_rate": 0.0002, "loss": 0.3663, "step": 2860 }, { "epoch": 0.2365066004229198, "grad_norm": 0.37411990761756897, "learning_rate": 0.0002, "loss": 0.3779, "step": 2880 }, { "epoch": 0.23814900737030117, "grad_norm": 0.3186222016811371, "learning_rate": 0.0002, "loss": 0.3513, "step": 2900 }, { "epoch": 0.23979141431768256, "grad_norm": 0.33270496129989624, "learning_rate": 0.0002, "loss": 0.3534, "step": 2920 }, { "epoch": 0.24143382126506396, "grad_norm": 0.4496273100376129, "learning_rate": 0.0002, "loss": 0.3588, "step": 2940 }, { "epoch": 0.24307622821244534, "grad_norm": 0.35411253571510315, "learning_rate": 0.0002, "loss": 0.3466, "step": 2960 }, { "epoch": 0.24471863515982673, "grad_norm": 0.4333256185054779, "learning_rate": 0.0002, "loss": 0.3555, "step": 2980 }, { "epoch": 0.2463610421072081, "grad_norm": 0.3264130651950836, "learning_rate": 0.0002, "loss": 0.3345, "step": 3000 }, { "epoch": 0.24800344905458951, "grad_norm": 0.3925504684448242, "learning_rate": 0.0002, "loss": 0.3559, "step": 3020 }, { "epoch": 0.2496458560019709, "grad_norm": 0.4186360836029053, "learning_rate": 0.0002, "loss": 0.3458, "step": 3040 }, { "epoch": 0.2512882629493523, "grad_norm": 0.4656223952770233, "learning_rate": 0.0002, "loss": 0.349, "step": 3060 }, { "epoch": 0.2529306698967337, "grad_norm": 0.4535064399242401, "learning_rate": 0.0002, "loss": 0.3474, "step": 3080 }, { "epoch": 0.25457307684411506, "grad_norm": 0.37564146518707275, "learning_rate": 0.0002, "loss": 0.3454, "step": 3100 }, { "epoch": 0.25621548379149645, "grad_norm": 0.36363497376441956, "learning_rate": 0.0002, "loss": 0.3515, "step": 3120 }, { "epoch": 0.2578578907388778, "grad_norm": 0.380750447511673, "learning_rate": 0.0002, "loss": 0.3653, "step": 3140 }, { "epoch": 0.2595002976862592, "grad_norm": 0.3188472092151642, "learning_rate": 0.0002, "loss": 0.3596, "step": 3160 }, { "epoch": 0.2611427046336406, "grad_norm": 0.4478905200958252, "learning_rate": 0.0002, "loss": 0.3567, "step": 3180 }, { "epoch": 0.26278511158102197, "grad_norm": 0.4925800859928131, "learning_rate": 0.0002, "loss": 0.3466, "step": 3200 }, { "epoch": 0.26442751852840335, "grad_norm": 0.3702840209007263, "learning_rate": 0.0002, "loss": 0.3327, "step": 3220 }, { "epoch": 0.2660699254757848, "grad_norm": 0.35024309158325195, "learning_rate": 0.0002, "loss": 0.3524, "step": 3240 }, { "epoch": 0.26771233242316617, "grad_norm": 0.4079764783382416, "learning_rate": 0.0002, "loss": 0.338, "step": 3260 }, { "epoch": 0.26935473937054755, "grad_norm": 0.4466266632080078, "learning_rate": 0.0002, "loss": 0.3465, "step": 3280 }, { "epoch": 0.2709971463179289, "grad_norm": 0.4438311457633972, "learning_rate": 0.0002, "loss": 0.3396, "step": 3300 }, { "epoch": 0.2726395532653103, "grad_norm": 0.37101468443870544, "learning_rate": 0.0002, "loss": 0.3392, "step": 3320 }, { "epoch": 0.2742819602126917, "grad_norm": 0.41411712765693665, "learning_rate": 0.0002, "loss": 0.3341, "step": 3340 }, { "epoch": 0.27592436716007307, "grad_norm": 0.47411611676216125, "learning_rate": 0.0002, "loss": 0.3355, "step": 3360 }, { "epoch": 0.27756677410745445, "grad_norm": 0.4871801733970642, "learning_rate": 0.0002, "loss": 0.3627, "step": 3380 }, { "epoch": 0.2792091810548359, "grad_norm": 0.47128844261169434, "learning_rate": 0.0002, "loss": 0.324, "step": 3400 }, { "epoch": 0.28085158800221727, "grad_norm": 0.4556843042373657, "learning_rate": 0.0002, "loss": 0.3443, "step": 3420 }, { "epoch": 0.28249399494959865, "grad_norm": 0.3775945007801056, "learning_rate": 0.0002, "loss": 0.3401, "step": 3440 }, { "epoch": 0.28413640189698003, "grad_norm": 0.377316415309906, "learning_rate": 0.0002, "loss": 0.3478, "step": 3460 }, { "epoch": 0.2857788088443614, "grad_norm": 0.336944580078125, "learning_rate": 0.0002, "loss": 0.3382, "step": 3480 }, { "epoch": 0.2874212157917428, "grad_norm": 0.4296940863132477, "learning_rate": 0.0002, "loss": 0.3361, "step": 3500 }, { "epoch": 0.28906362273912417, "grad_norm": 0.4638020396232605, "learning_rate": 0.0002, "loss": 0.3583, "step": 3520 }, { "epoch": 0.29070602968650555, "grad_norm": 0.4074634313583374, "learning_rate": 0.0002, "loss": 0.3601, "step": 3540 }, { "epoch": 0.292348436633887, "grad_norm": 0.3634164035320282, "learning_rate": 0.0002, "loss": 0.3216, "step": 3560 }, { "epoch": 0.29399084358126837, "grad_norm": 0.43480202555656433, "learning_rate": 0.0002, "loss": 0.33, "step": 3580 }, { "epoch": 0.29563325052864975, "grad_norm": 0.42778658866882324, "learning_rate": 0.0002, "loss": 0.3408, "step": 3600 }, { "epoch": 0.29727565747603113, "grad_norm": 0.3778844177722931, "learning_rate": 0.0002, "loss": 0.3309, "step": 3620 }, { "epoch": 0.2989180644234125, "grad_norm": 0.33491814136505127, "learning_rate": 0.0002, "loss": 0.3011, "step": 3640 }, { "epoch": 0.3005604713707939, "grad_norm": 0.5079118609428406, "learning_rate": 0.0002, "loss": 0.3079, "step": 3660 }, { "epoch": 0.30220287831817527, "grad_norm": 0.3751799166202545, "learning_rate": 0.0002, "loss": 0.3286, "step": 3680 }, { "epoch": 0.30384528526555665, "grad_norm": 0.4447515904903412, "learning_rate": 0.0002, "loss": 0.2991, "step": 3700 }, { "epoch": 0.30548769221293803, "grad_norm": 0.33741819858551025, "learning_rate": 0.0002, "loss": 0.3169, "step": 3720 }, { "epoch": 0.30713009916031947, "grad_norm": 0.3624327480792999, "learning_rate": 0.0002, "loss": 0.3213, "step": 3740 }, { "epoch": 0.30877250610770085, "grad_norm": 0.5299442410469055, "learning_rate": 0.0002, "loss": 0.3476, "step": 3760 }, { "epoch": 0.31041491305508223, "grad_norm": 0.3178050220012665, "learning_rate": 0.0002, "loss": 0.329, "step": 3780 }, { "epoch": 0.3120573200024636, "grad_norm": 0.3178127408027649, "learning_rate": 0.0002, "loss": 0.3046, "step": 3800 }, { "epoch": 0.313699726949845, "grad_norm": 0.4366089403629303, "learning_rate": 0.0002, "loss": 0.3179, "step": 3820 }, { "epoch": 0.3153421338972264, "grad_norm": 0.47534024715423584, "learning_rate": 0.0002, "loss": 0.3377, "step": 3840 }, { "epoch": 0.31698454084460775, "grad_norm": 0.4247181713581085, "learning_rate": 0.0002, "loss": 0.311, "step": 3860 }, { "epoch": 0.31862694779198913, "grad_norm": 0.5085952877998352, "learning_rate": 0.0002, "loss": 0.3197, "step": 3880 }, { "epoch": 0.32026935473937057, "grad_norm": 0.3649958372116089, "learning_rate": 0.0002, "loss": 0.3243, "step": 3900 }, { "epoch": 0.32191176168675195, "grad_norm": 0.43816304206848145, "learning_rate": 0.0002, "loss": 0.3232, "step": 3920 }, { "epoch": 0.32355416863413333, "grad_norm": 0.32603034377098083, "learning_rate": 0.0002, "loss": 0.3155, "step": 3940 }, { "epoch": 0.3251965755815147, "grad_norm": 0.4867421090602875, "learning_rate": 0.0002, "loss": 0.3102, "step": 3960 }, { "epoch": 0.3268389825288961, "grad_norm": 0.3843926191329956, "learning_rate": 0.0002, "loss": 0.3035, "step": 3980 }, { "epoch": 0.3284813894762775, "grad_norm": 0.49313676357269287, "learning_rate": 0.0002, "loss": 0.322, "step": 4000 }, { "epoch": 0.33012379642365886, "grad_norm": 0.4102085530757904, "learning_rate": 0.0002, "loss": 0.3206, "step": 4020 }, { "epoch": 0.33176620337104024, "grad_norm": 0.47901496291160583, "learning_rate": 0.0002, "loss": 0.3131, "step": 4040 }, { "epoch": 0.33340861031842167, "grad_norm": 0.40674644708633423, "learning_rate": 0.0002, "loss": 0.3091, "step": 4060 }, { "epoch": 0.33505101726580305, "grad_norm": 0.44038107991218567, "learning_rate": 0.0002, "loss": 0.3116, "step": 4080 }, { "epoch": 0.33669342421318443, "grad_norm": 0.3919316828250885, "learning_rate": 0.0002, "loss": 0.3077, "step": 4100 }, { "epoch": 0.3383358311605658, "grad_norm": 0.38622769713401794, "learning_rate": 0.0002, "loss": 0.302, "step": 4120 }, { "epoch": 0.3399782381079472, "grad_norm": 0.4685916602611542, "learning_rate": 0.0002, "loss": 0.3234, "step": 4140 }, { "epoch": 0.3416206450553286, "grad_norm": 0.3348797559738159, "learning_rate": 0.0002, "loss": 0.3205, "step": 4160 }, { "epoch": 0.34326305200270996, "grad_norm": 0.4265504777431488, "learning_rate": 0.0002, "loss": 0.3101, "step": 4180 }, { "epoch": 0.34490545895009134, "grad_norm": 0.4005930423736572, "learning_rate": 0.0002, "loss": 0.3096, "step": 4200 }, { "epoch": 0.3465478658974728, "grad_norm": 0.4154227674007416, "learning_rate": 0.0002, "loss": 0.3188, "step": 4220 }, { "epoch": 0.34819027284485415, "grad_norm": 0.30359068512916565, "learning_rate": 0.0002, "loss": 0.2966, "step": 4240 }, { "epoch": 0.34983267979223553, "grad_norm": 0.35363709926605225, "learning_rate": 0.0002, "loss": 0.3189, "step": 4260 }, { "epoch": 0.3514750867396169, "grad_norm": 0.43156126141548157, "learning_rate": 0.0002, "loss": 0.2951, "step": 4280 }, { "epoch": 0.3531174936869983, "grad_norm": 0.4593096077442169, "learning_rate": 0.0002, "loss": 0.3048, "step": 4300 }, { "epoch": 0.3547599006343797, "grad_norm": 0.49352073669433594, "learning_rate": 0.0002, "loss": 0.301, "step": 4320 }, { "epoch": 0.35640230758176106, "grad_norm": 0.4053367078304291, "learning_rate": 0.0002, "loss": 0.311, "step": 4340 }, { "epoch": 0.35804471452914244, "grad_norm": 0.3465437889099121, "learning_rate": 0.0002, "loss": 0.3186, "step": 4360 }, { "epoch": 0.3596871214765238, "grad_norm": 0.4525587558746338, "learning_rate": 0.0002, "loss": 0.3126, "step": 4380 }, { "epoch": 0.36132952842390526, "grad_norm": 0.4213342070579529, "learning_rate": 0.0002, "loss": 0.3041, "step": 4400 }, { "epoch": 0.36297193537128664, "grad_norm": 0.37421244382858276, "learning_rate": 0.0002, "loss": 0.3295, "step": 4420 }, { "epoch": 0.364614342318668, "grad_norm": 0.4033282697200775, "learning_rate": 0.0002, "loss": 0.3031, "step": 4440 }, { "epoch": 0.3662567492660494, "grad_norm": 0.45873841643333435, "learning_rate": 0.0002, "loss": 0.2819, "step": 4460 }, { "epoch": 0.3678991562134308, "grad_norm": 0.36195841431617737, "learning_rate": 0.0002, "loss": 0.2908, "step": 4480 }, { "epoch": 0.36954156316081216, "grad_norm": 0.39707615971565247, "learning_rate": 0.0002, "loss": 0.3023, "step": 4500 }, { "epoch": 0.37118397010819354, "grad_norm": 0.3999727666378021, "learning_rate": 0.0002, "loss": 0.31, "step": 4520 }, { "epoch": 0.3728263770555749, "grad_norm": 0.36880913376808167, "learning_rate": 0.0002, "loss": 0.3017, "step": 4540 }, { "epoch": 0.37446878400295636, "grad_norm": 0.36656180024147034, "learning_rate": 0.0002, "loss": 0.3129, "step": 4560 }, { "epoch": 0.37611119095033774, "grad_norm": 0.4566299021244049, "learning_rate": 0.0002, "loss": 0.3039, "step": 4580 }, { "epoch": 0.3777535978977191, "grad_norm": 0.3202304542064667, "learning_rate": 0.0002, "loss": 0.2827, "step": 4600 }, { "epoch": 0.3793960048451005, "grad_norm": 0.4553089439868927, "learning_rate": 0.0002, "loss": 0.3401, "step": 4620 }, { "epoch": 0.3810384117924819, "grad_norm": 0.40536269545555115, "learning_rate": 0.0002, "loss": 0.3038, "step": 4640 }, { "epoch": 0.38268081873986326, "grad_norm": 0.36675453186035156, "learning_rate": 0.0002, "loss": 0.3198, "step": 4660 }, { "epoch": 0.38432322568724464, "grad_norm": 0.41660359501838684, "learning_rate": 0.0002, "loss": 0.2904, "step": 4680 }, { "epoch": 0.385965632634626, "grad_norm": 0.2889881134033203, "learning_rate": 0.0002, "loss": 0.3076, "step": 4700 }, { "epoch": 0.38760803958200746, "grad_norm": 0.3077252507209778, "learning_rate": 0.0002, "loss": 0.3087, "step": 4720 }, { "epoch": 0.38925044652938884, "grad_norm": 0.43053752183914185, "learning_rate": 0.0002, "loss": 0.2994, "step": 4740 }, { "epoch": 0.3908928534767702, "grad_norm": 0.39978402853012085, "learning_rate": 0.0002, "loss": 0.2825, "step": 4760 }, { "epoch": 0.3925352604241516, "grad_norm": 0.39721283316612244, "learning_rate": 0.0002, "loss": 0.3002, "step": 4780 }, { "epoch": 0.394177667371533, "grad_norm": 0.4234716296195984, "learning_rate": 0.0002, "loss": 0.281, "step": 4800 }, { "epoch": 0.39582007431891436, "grad_norm": 0.41390299797058105, "learning_rate": 0.0002, "loss": 0.3015, "step": 4820 }, { "epoch": 0.39746248126629574, "grad_norm": 0.8412930369377136, "learning_rate": 0.0002, "loss": 0.3034, "step": 4840 }, { "epoch": 0.3991048882136771, "grad_norm": 0.4165583848953247, "learning_rate": 0.0002, "loss": 0.2844, "step": 4860 }, { "epoch": 0.40074729516105856, "grad_norm": 0.4212113618850708, "learning_rate": 0.0002, "loss": 0.2847, "step": 4880 }, { "epoch": 0.40238970210843994, "grad_norm": 0.46880143880844116, "learning_rate": 0.0002, "loss": 0.2877, "step": 4900 }, { "epoch": 0.4040321090558213, "grad_norm": 0.33470281958580017, "learning_rate": 0.0002, "loss": 0.3006, "step": 4920 }, { "epoch": 0.4056745160032027, "grad_norm": 0.41939905285835266, "learning_rate": 0.0002, "loss": 0.3014, "step": 4940 }, { "epoch": 0.4073169229505841, "grad_norm": 0.4031718671321869, "learning_rate": 0.0002, "loss": 0.2959, "step": 4960 }, { "epoch": 0.40895932989796546, "grad_norm": 0.3611488938331604, "learning_rate": 0.0002, "loss": 0.3175, "step": 4980 }, { "epoch": 0.41060173684534684, "grad_norm": 0.38445645570755005, "learning_rate": 0.0002, "loss": 0.2897, "step": 5000 }, { "epoch": 0.4122441437927282, "grad_norm": 0.3903651833534241, "learning_rate": 0.0002, "loss": 0.2716, "step": 5020 }, { "epoch": 0.4138865507401096, "grad_norm": 0.39842015504837036, "learning_rate": 0.0002, "loss": 0.2987, "step": 5040 }, { "epoch": 0.41552895768749104, "grad_norm": 0.4211498200893402, "learning_rate": 0.0002, "loss": 0.3027, "step": 5060 }, { "epoch": 0.4171713646348724, "grad_norm": 0.4767220914363861, "learning_rate": 0.0002, "loss": 0.2897, "step": 5080 }, { "epoch": 0.4188137715822538, "grad_norm": 0.4871378242969513, "learning_rate": 0.0002, "loss": 0.2874, "step": 5100 }, { "epoch": 0.4204561785296352, "grad_norm": 0.3960734009742737, "learning_rate": 0.0002, "loss": 0.2903, "step": 5120 }, { "epoch": 0.42209858547701656, "grad_norm": 0.3350552022457123, "learning_rate": 0.0002, "loss": 0.2835, "step": 5140 }, { "epoch": 0.42374099242439794, "grad_norm": 0.34975695610046387, "learning_rate": 0.0002, "loss": 0.3025, "step": 5160 }, { "epoch": 0.4253833993717793, "grad_norm": 0.3886794149875641, "learning_rate": 0.0002, "loss": 0.289, "step": 5180 }, { "epoch": 0.4270258063191607, "grad_norm": 0.4114588797092438, "learning_rate": 0.0002, "loss": 0.2802, "step": 5200 }, { "epoch": 0.42866821326654214, "grad_norm": 0.4368172585964203, "learning_rate": 0.0002, "loss": 0.2918, "step": 5220 }, { "epoch": 0.4303106202139235, "grad_norm": 0.2889314889907837, "learning_rate": 0.0002, "loss": 0.2854, "step": 5240 }, { "epoch": 0.4319530271613049, "grad_norm": 0.3999134600162506, "learning_rate": 0.0002, "loss": 0.2955, "step": 5260 }, { "epoch": 0.4335954341086863, "grad_norm": 0.32143938541412354, "learning_rate": 0.0002, "loss": 0.2836, "step": 5280 }, { "epoch": 0.43523784105606766, "grad_norm": 0.4069638252258301, "learning_rate": 0.0002, "loss": 0.2854, "step": 5300 }, { "epoch": 0.43688024800344905, "grad_norm": 0.46609416604042053, "learning_rate": 0.0002, "loss": 0.2777, "step": 5320 }, { "epoch": 0.4385226549508304, "grad_norm": 0.35112160444259644, "learning_rate": 0.0002, "loss": 0.2896, "step": 5340 }, { "epoch": 0.4401650618982118, "grad_norm": 0.4243420660495758, "learning_rate": 0.0002, "loss": 0.2743, "step": 5360 }, { "epoch": 0.44180746884559324, "grad_norm": 0.45615971088409424, "learning_rate": 0.0002, "loss": 0.2699, "step": 5380 }, { "epoch": 0.4434498757929746, "grad_norm": 0.4836295247077942, "learning_rate": 0.0002, "loss": 0.2932, "step": 5400 }, { "epoch": 0.445092282740356, "grad_norm": 0.41774359345436096, "learning_rate": 0.0002, "loss": 0.2869, "step": 5420 }, { "epoch": 0.4467346896877374, "grad_norm": 0.3904239535331726, "learning_rate": 0.0002, "loss": 0.2798, "step": 5440 }, { "epoch": 0.44837709663511877, "grad_norm": 0.3867247700691223, "learning_rate": 0.0002, "loss": 0.2668, "step": 5460 }, { "epoch": 0.45001950358250015, "grad_norm": 0.33975329995155334, "learning_rate": 0.0002, "loss": 0.2805, "step": 5480 }, { "epoch": 0.4516619105298815, "grad_norm": 0.30403727293014526, "learning_rate": 0.0002, "loss": 0.2747, "step": 5500 }, { "epoch": 0.4533043174772629, "grad_norm": 0.4227672219276428, "learning_rate": 0.0002, "loss": 0.2699, "step": 5520 }, { "epoch": 0.4549467244246443, "grad_norm": 0.38823801279067993, "learning_rate": 0.0002, "loss": 0.256, "step": 5540 }, { "epoch": 0.4565891313720257, "grad_norm": 0.3460341691970825, "learning_rate": 0.0002, "loss": 0.2768, "step": 5560 }, { "epoch": 0.4582315383194071, "grad_norm": 0.40843436121940613, "learning_rate": 0.0002, "loss": 0.2829, "step": 5580 }, { "epoch": 0.4598739452667885, "grad_norm": 0.411004900932312, "learning_rate": 0.0002, "loss": 0.2849, "step": 5600 }, { "epoch": 0.46151635221416987, "grad_norm": 0.5354210138320923, "learning_rate": 0.0002, "loss": 0.298, "step": 5620 }, { "epoch": 0.46315875916155125, "grad_norm": 0.3296845555305481, "learning_rate": 0.0002, "loss": 0.2571, "step": 5640 }, { "epoch": 0.46480116610893263, "grad_norm": 0.404950350522995, "learning_rate": 0.0002, "loss": 0.2843, "step": 5660 }, { "epoch": 0.466443573056314, "grad_norm": 0.3697005808353424, "learning_rate": 0.0002, "loss": 0.2655, "step": 5680 }, { "epoch": 0.4680859800036954, "grad_norm": 0.3465549945831299, "learning_rate": 0.0002, "loss": 0.282, "step": 5700 }, { "epoch": 0.4697283869510768, "grad_norm": 0.4802212119102478, "learning_rate": 0.0002, "loss": 0.2672, "step": 5720 }, { "epoch": 0.4713707938984582, "grad_norm": 0.3909721076488495, "learning_rate": 0.0002, "loss": 0.2704, "step": 5740 }, { "epoch": 0.4730132008458396, "grad_norm": 0.41303369402885437, "learning_rate": 0.0002, "loss": 0.2797, "step": 5760 }, { "epoch": 0.47465560779322097, "grad_norm": 0.32934170961380005, "learning_rate": 0.0002, "loss": 0.2903, "step": 5780 }, { "epoch": 0.47629801474060235, "grad_norm": 0.375072181224823, "learning_rate": 0.0002, "loss": 0.2752, "step": 5800 }, { "epoch": 0.47794042168798373, "grad_norm": 0.35390418767929077, "learning_rate": 0.0002, "loss": 0.2755, "step": 5820 }, { "epoch": 0.4795828286353651, "grad_norm": 0.3856378197669983, "learning_rate": 0.0002, "loss": 0.2699, "step": 5840 }, { "epoch": 0.4812252355827465, "grad_norm": 0.2624310851097107, "learning_rate": 0.0002, "loss": 0.2654, "step": 5860 }, { "epoch": 0.4828676425301279, "grad_norm": 0.43709930777549744, "learning_rate": 0.0002, "loss": 0.2768, "step": 5880 }, { "epoch": 0.4845100494775093, "grad_norm": 0.3971209228038788, "learning_rate": 0.0002, "loss": 0.2728, "step": 5900 }, { "epoch": 0.4861524564248907, "grad_norm": 0.3937450647354126, "learning_rate": 0.0002, "loss": 0.2836, "step": 5920 }, { "epoch": 0.48779486337227207, "grad_norm": 0.3925333023071289, "learning_rate": 0.0002, "loss": 0.2653, "step": 5940 }, { "epoch": 0.48943727031965345, "grad_norm": 0.3056396245956421, "learning_rate": 0.0002, "loss": 0.2593, "step": 5960 }, { "epoch": 0.49107967726703483, "grad_norm": 0.349110871553421, "learning_rate": 0.0002, "loss": 0.2872, "step": 5980 }, { "epoch": 0.4927220842144162, "grad_norm": 0.37678685784339905, "learning_rate": 0.0002, "loss": 0.2779, "step": 6000 }, { "epoch": 0.4943644911617976, "grad_norm": 0.37364938855171204, "learning_rate": 0.0002, "loss": 0.2612, "step": 6020 }, { "epoch": 0.49600689810917903, "grad_norm": 0.3885985016822815, "learning_rate": 0.0002, "loss": 0.2701, "step": 6040 }, { "epoch": 0.4976493050565604, "grad_norm": 0.4726998507976532, "learning_rate": 0.0002, "loss": 0.258, "step": 6060 }, { "epoch": 0.4992917120039418, "grad_norm": 0.3752720355987549, "learning_rate": 0.0002, "loss": 0.2873, "step": 6080 }, { "epoch": 0.5009341189513231, "grad_norm": 0.5174003839492798, "learning_rate": 0.0002, "loss": 0.2677, "step": 6100 }, { "epoch": 0.5025765258987046, "grad_norm": 0.39343810081481934, "learning_rate": 0.0002, "loss": 0.2498, "step": 6120 }, { "epoch": 0.504218932846086, "grad_norm": 0.3367049992084503, "learning_rate": 0.0002, "loss": 0.2555, "step": 6140 }, { "epoch": 0.5058613397934674, "grad_norm": 0.3384205400943756, "learning_rate": 0.0002, "loss": 0.2865, "step": 6160 }, { "epoch": 0.5075037467408487, "grad_norm": 0.37642723321914673, "learning_rate": 0.0002, "loss": 0.2677, "step": 6180 }, { "epoch": 0.5091461536882301, "grad_norm": 0.31989771127700806, "learning_rate": 0.0002, "loss": 0.2675, "step": 6200 }, { "epoch": 0.5107885606356115, "grad_norm": 0.30809977650642395, "learning_rate": 0.0002, "loss": 0.2562, "step": 6220 }, { "epoch": 0.5124309675829929, "grad_norm": 0.3463954031467438, "learning_rate": 0.0002, "loss": 0.2576, "step": 6240 }, { "epoch": 0.5140733745303743, "grad_norm": 0.3789072036743164, "learning_rate": 0.0002, "loss": 0.2679, "step": 6260 }, { "epoch": 0.5157157814777557, "grad_norm": 0.458978533744812, "learning_rate": 0.0002, "loss": 0.2596, "step": 6280 }, { "epoch": 0.517358188425137, "grad_norm": 0.3515280783176422, "learning_rate": 0.0002, "loss": 0.2629, "step": 6300 }, { "epoch": 0.5190005953725184, "grad_norm": 0.42611977458000183, "learning_rate": 0.0002, "loss": 0.2674, "step": 6320 }, { "epoch": 0.5206430023198998, "grad_norm": 0.3865070641040802, "learning_rate": 0.0002, "loss": 0.2714, "step": 6340 }, { "epoch": 0.5222854092672812, "grad_norm": 0.3559401333332062, "learning_rate": 0.0002, "loss": 0.2751, "step": 6360 }, { "epoch": 0.5239278162146626, "grad_norm": 0.3181537389755249, "learning_rate": 0.0002, "loss": 0.2724, "step": 6380 }, { "epoch": 0.5255702231620439, "grad_norm": 0.37673598527908325, "learning_rate": 0.0002, "loss": 0.2711, "step": 6400 }, { "epoch": 0.5272126301094253, "grad_norm": 0.44122573733329773, "learning_rate": 0.0002, "loss": 0.2617, "step": 6420 }, { "epoch": 0.5288550370568067, "grad_norm": 0.4779141843318939, "learning_rate": 0.0002, "loss": 0.2602, "step": 6440 }, { "epoch": 0.5304974440041882, "grad_norm": 0.3975127339363098, "learning_rate": 0.0002, "loss": 0.2472, "step": 6460 }, { "epoch": 0.5321398509515696, "grad_norm": 0.3808406591415405, "learning_rate": 0.0002, "loss": 0.2623, "step": 6480 }, { "epoch": 0.533782257898951, "grad_norm": 0.340666264295578, "learning_rate": 0.0002, "loss": 0.2806, "step": 6500 }, { "epoch": 0.5354246648463323, "grad_norm": 0.41233885288238525, "learning_rate": 0.0002, "loss": 0.2458, "step": 6520 }, { "epoch": 0.5370670717937137, "grad_norm": 0.28576114773750305, "learning_rate": 0.0002, "loss": 0.2638, "step": 6540 }, { "epoch": 0.5387094787410951, "grad_norm": 0.4704492688179016, "learning_rate": 0.0002, "loss": 0.2735, "step": 6560 }, { "epoch": 0.5403518856884765, "grad_norm": 0.43339604139328003, "learning_rate": 0.0002, "loss": 0.2667, "step": 6580 }, { "epoch": 0.5419942926358579, "grad_norm": 0.332878440618515, "learning_rate": 0.0002, "loss": 0.2513, "step": 6600 }, { "epoch": 0.5436366995832392, "grad_norm": 0.34620800614356995, "learning_rate": 0.0002, "loss": 0.2768, "step": 6620 }, { "epoch": 0.5452791065306206, "grad_norm": 0.46673691272735596, "learning_rate": 0.0002, "loss": 0.2597, "step": 6640 }, { "epoch": 0.546921513478002, "grad_norm": 0.36888402700424194, "learning_rate": 0.0002, "loss": 0.2453, "step": 6660 }, { "epoch": 0.5485639204253834, "grad_norm": 0.363007515668869, "learning_rate": 0.0002, "loss": 0.2545, "step": 6680 }, { "epoch": 0.5502063273727648, "grad_norm": 0.3927077353000641, "learning_rate": 0.0002, "loss": 0.2597, "step": 6700 }, { "epoch": 0.5518487343201461, "grad_norm": 0.36897674202919006, "learning_rate": 0.0002, "loss": 0.2571, "step": 6720 }, { "epoch": 0.5534911412675275, "grad_norm": 0.3425733149051666, "learning_rate": 0.0002, "loss": 0.2624, "step": 6740 }, { "epoch": 0.5551335482149089, "grad_norm": 0.3315962553024292, "learning_rate": 0.0002, "loss": 0.2656, "step": 6760 }, { "epoch": 0.5567759551622903, "grad_norm": 0.4456098675727844, "learning_rate": 0.0002, "loss": 0.266, "step": 6780 }, { "epoch": 0.5584183621096718, "grad_norm": 0.4146248996257782, "learning_rate": 0.0002, "loss": 0.2631, "step": 6800 }, { "epoch": 0.5600607690570532, "grad_norm": 0.3591421842575073, "learning_rate": 0.0002, "loss": 0.2475, "step": 6820 }, { "epoch": 0.5617031760044345, "grad_norm": 0.4540598690509796, "learning_rate": 0.0002, "loss": 0.2667, "step": 6840 }, { "epoch": 0.5633455829518159, "grad_norm": 0.4394567906856537, "learning_rate": 0.0002, "loss": 0.2673, "step": 6860 }, { "epoch": 0.5649879898991973, "grad_norm": 0.3273297846317291, "learning_rate": 0.0002, "loss": 0.2631, "step": 6880 }, { "epoch": 0.5666303968465787, "grad_norm": 0.3828592896461487, "learning_rate": 0.0002, "loss": 0.2601, "step": 6900 }, { "epoch": 0.5682728037939601, "grad_norm": 0.24124163389205933, "learning_rate": 0.0002, "loss": 0.2507, "step": 6920 }, { "epoch": 0.5699152107413414, "grad_norm": 0.4403514564037323, "learning_rate": 0.0002, "loss": 0.2686, "step": 6940 }, { "epoch": 0.5715576176887228, "grad_norm": 0.39177918434143066, "learning_rate": 0.0002, "loss": 0.255, "step": 6960 }, { "epoch": 0.5732000246361042, "grad_norm": 0.41621333360671997, "learning_rate": 0.0002, "loss": 0.2472, "step": 6980 }, { "epoch": 0.5748424315834856, "grad_norm": 0.4051215648651123, "learning_rate": 0.0002, "loss": 0.2692, "step": 7000 }, { "epoch": 0.576484838530867, "grad_norm": 0.9351252317428589, "learning_rate": 0.0002, "loss": 0.2519, "step": 7020 }, { "epoch": 0.5781272454782483, "grad_norm": 0.38004037737846375, "learning_rate": 0.0002, "loss": 0.2683, "step": 7040 }, { "epoch": 0.5797696524256297, "grad_norm": 0.31271103024482727, "learning_rate": 0.0002, "loss": 0.2554, "step": 7060 }, { "epoch": 0.5814120593730111, "grad_norm": 0.3766959607601166, "learning_rate": 0.0002, "loss": 0.2555, "step": 7080 }, { "epoch": 0.5830544663203925, "grad_norm": 2.4575226306915283, "learning_rate": 0.0002, "loss": 0.2673, "step": 7100 }, { "epoch": 0.584696873267774, "grad_norm": 0.3419061005115509, "learning_rate": 0.0002, "loss": 0.2484, "step": 7120 }, { "epoch": 0.5863392802151554, "grad_norm": 0.3647725284099579, "learning_rate": 0.0002, "loss": 0.2614, "step": 7140 }, { "epoch": 0.5879816871625367, "grad_norm": 0.39643993973731995, "learning_rate": 0.0002, "loss": 0.2583, "step": 7160 }, { "epoch": 0.5896240941099181, "grad_norm": 0.37024736404418945, "learning_rate": 0.0002, "loss": 0.2605, "step": 7180 }, { "epoch": 0.5912665010572995, "grad_norm": 0.4551810324192047, "learning_rate": 0.0002, "loss": 0.2512, "step": 7200 }, { "epoch": 0.5929089080046809, "grad_norm": 0.2843814492225647, "learning_rate": 0.0002, "loss": 0.2504, "step": 7220 }, { "epoch": 0.5945513149520623, "grad_norm": 0.3765452206134796, "learning_rate": 0.0002, "loss": 0.2557, "step": 7240 }, { "epoch": 0.5961937218994436, "grad_norm": 0.4625066816806793, "learning_rate": 0.0002, "loss": 0.2433, "step": 7260 }, { "epoch": 0.597836128846825, "grad_norm": 0.4870743453502655, "learning_rate": 0.0002, "loss": 0.2494, "step": 7280 }, { "epoch": 0.5994785357942064, "grad_norm": 0.4229605197906494, "learning_rate": 0.0002, "loss": 0.2553, "step": 7300 }, { "epoch": 0.6011209427415878, "grad_norm": 0.37593892216682434, "learning_rate": 0.0002, "loss": 0.2523, "step": 7320 }, { "epoch": 0.6027633496889692, "grad_norm": 0.36149609088897705, "learning_rate": 0.0002, "loss": 0.2582, "step": 7340 }, { "epoch": 0.6044057566363505, "grad_norm": 0.3866046071052551, "learning_rate": 0.0002, "loss": 0.2534, "step": 7360 }, { "epoch": 0.6060481635837319, "grad_norm": 0.4623259902000427, "learning_rate": 0.0002, "loss": 0.2542, "step": 7380 }, { "epoch": 0.6076905705311133, "grad_norm": 0.32349276542663574, "learning_rate": 0.0002, "loss": 0.2437, "step": 7400 }, { "epoch": 0.6093329774784947, "grad_norm": 0.386561781167984, "learning_rate": 0.0002, "loss": 0.2494, "step": 7420 }, { "epoch": 0.6109753844258761, "grad_norm": 0.36509180068969727, "learning_rate": 0.0002, "loss": 0.2559, "step": 7440 }, { "epoch": 0.6126177913732576, "grad_norm": 0.3628571331501007, "learning_rate": 0.0002, "loss": 0.26, "step": 7460 }, { "epoch": 0.6142601983206389, "grad_norm": 0.3218732476234436, "learning_rate": 0.0002, "loss": 0.2487, "step": 7480 }, { "epoch": 0.6159026052680203, "grad_norm": 0.3551442623138428, "learning_rate": 0.0002, "loss": 0.231, "step": 7500 }, { "epoch": 0.6175450122154017, "grad_norm": 0.40962496399879456, "learning_rate": 0.0002, "loss": 0.2486, "step": 7520 }, { "epoch": 0.6191874191627831, "grad_norm": 0.48531442880630493, "learning_rate": 0.0002, "loss": 0.2547, "step": 7540 }, { "epoch": 0.6208298261101645, "grad_norm": 0.387851357460022, "learning_rate": 0.0002, "loss": 0.2655, "step": 7560 }, { "epoch": 0.6224722330575458, "grad_norm": 0.3165546953678131, "learning_rate": 0.0002, "loss": 0.2499, "step": 7580 }, { "epoch": 0.6241146400049272, "grad_norm": 0.3393017649650574, "learning_rate": 0.0002, "loss": 0.2546, "step": 7600 }, { "epoch": 0.6257570469523086, "grad_norm": 0.3975006639957428, "learning_rate": 0.0002, "loss": 0.255, "step": 7620 }, { "epoch": 0.62739945389969, "grad_norm": 0.4458036720752716, "learning_rate": 0.0002, "loss": 0.2671, "step": 7640 }, { "epoch": 0.6290418608470714, "grad_norm": 0.34977594017982483, "learning_rate": 0.0002, "loss": 0.2438, "step": 7660 }, { "epoch": 0.6306842677944527, "grad_norm": 0.4126521646976471, "learning_rate": 0.0002, "loss": 0.2473, "step": 7680 }, { "epoch": 0.6323266747418341, "grad_norm": 0.35712817311286926, "learning_rate": 0.0002, "loss": 0.2568, "step": 7700 }, { "epoch": 0.6339690816892155, "grad_norm": 0.3464488983154297, "learning_rate": 0.0002, "loss": 0.26, "step": 7720 }, { "epoch": 0.6356114886365969, "grad_norm": 0.40559422969818115, "learning_rate": 0.0002, "loss": 0.2531, "step": 7740 }, { "epoch": 0.6372538955839783, "grad_norm": 0.3709222972393036, "learning_rate": 0.0002, "loss": 0.257, "step": 7760 }, { "epoch": 0.6388963025313598, "grad_norm": 0.3671443462371826, "learning_rate": 0.0002, "loss": 0.243, "step": 7780 }, { "epoch": 0.6405387094787411, "grad_norm": 0.39361605048179626, "learning_rate": 0.0002, "loss": 0.2569, "step": 7800 }, { "epoch": 0.6421811164261225, "grad_norm": 0.41323602199554443, "learning_rate": 0.0002, "loss": 0.2465, "step": 7820 }, { "epoch": 0.6438235233735039, "grad_norm": 0.4266330301761627, "learning_rate": 0.0002, "loss": 0.2495, "step": 7840 }, { "epoch": 0.6454659303208853, "grad_norm": 0.3892604112625122, "learning_rate": 0.0002, "loss": 0.2505, "step": 7860 }, { "epoch": 0.6471083372682667, "grad_norm": 0.43539443612098694, "learning_rate": 0.0002, "loss": 0.2643, "step": 7880 }, { "epoch": 0.648750744215648, "grad_norm": 0.3637757897377014, "learning_rate": 0.0002, "loss": 0.2557, "step": 7900 }, { "epoch": 0.6503931511630294, "grad_norm": 0.42761602997779846, "learning_rate": 0.0002, "loss": 0.2578, "step": 7920 }, { "epoch": 0.6520355581104108, "grad_norm": 0.38917163014411926, "learning_rate": 0.0002, "loss": 0.2593, "step": 7940 }, { "epoch": 0.6536779650577922, "grad_norm": 0.42814767360687256, "learning_rate": 0.0002, "loss": 0.2412, "step": 7960 }, { "epoch": 0.6553203720051736, "grad_norm": 0.3543958365917206, "learning_rate": 0.0002, "loss": 0.2485, "step": 7980 }, { "epoch": 0.656962778952555, "grad_norm": 0.3452099859714508, "learning_rate": 0.0002, "loss": 0.2519, "step": 8000 }, { "epoch": 0.6586051858999363, "grad_norm": 0.38600897789001465, "learning_rate": 0.0002, "loss": 0.2443, "step": 8020 }, { "epoch": 0.6602475928473177, "grad_norm": 0.35474061965942383, "learning_rate": 0.0002, "loss": 0.2435, "step": 8040 }, { "epoch": 0.6618899997946991, "grad_norm": 0.48493891954421997, "learning_rate": 0.0002, "loss": 0.2564, "step": 8060 }, { "epoch": 0.6635324067420805, "grad_norm": 0.40137720108032227, "learning_rate": 0.0002, "loss": 0.2592, "step": 8080 }, { "epoch": 0.6651748136894619, "grad_norm": 0.38460877537727356, "learning_rate": 0.0002, "loss": 0.2387, "step": 8100 }, { "epoch": 0.6668172206368433, "grad_norm": 0.3780753016471863, "learning_rate": 0.0002, "loss": 0.2517, "step": 8120 }, { "epoch": 0.6684596275842247, "grad_norm": 0.30384665727615356, "learning_rate": 0.0002, "loss": 0.2442, "step": 8140 }, { "epoch": 0.6701020345316061, "grad_norm": 0.34080567955970764, "learning_rate": 0.0002, "loss": 0.2443, "step": 8160 }, { "epoch": 0.6717444414789875, "grad_norm": 0.3789510130882263, "learning_rate": 0.0002, "loss": 0.2462, "step": 8180 }, { "epoch": 0.6733868484263689, "grad_norm": 0.3566538989543915, "learning_rate": 0.0002, "loss": 0.2418, "step": 8200 }, { "epoch": 0.6750292553737502, "grad_norm": 0.3436945676803589, "learning_rate": 0.0002, "loss": 0.2353, "step": 8220 }, { "epoch": 0.6766716623211316, "grad_norm": 0.35046547651290894, "learning_rate": 0.0002, "loss": 0.2521, "step": 8240 }, { "epoch": 0.678314069268513, "grad_norm": 0.3671397566795349, "learning_rate": 0.0002, "loss": 0.2505, "step": 8260 }, { "epoch": 0.6799564762158944, "grad_norm": 0.33368802070617676, "learning_rate": 0.0002, "loss": 0.2663, "step": 8280 }, { "epoch": 0.6815988831632758, "grad_norm": 0.35810762643814087, "learning_rate": 0.0002, "loss": 0.2467, "step": 8300 }, { "epoch": 0.6832412901106572, "grad_norm": 0.3913412094116211, "learning_rate": 0.0002, "loss": 0.2544, "step": 8320 }, { "epoch": 0.6848836970580385, "grad_norm": 0.3313830494880676, "learning_rate": 0.0002, "loss": 0.2551, "step": 8340 }, { "epoch": 0.6865261040054199, "grad_norm": 0.3506488502025604, "learning_rate": 0.0002, "loss": 0.2416, "step": 8360 }, { "epoch": 0.6881685109528013, "grad_norm": 0.3841126561164856, "learning_rate": 0.0002, "loss": 0.2531, "step": 8380 }, { "epoch": 0.6898109179001827, "grad_norm": 0.38030919432640076, "learning_rate": 0.0002, "loss": 0.2374, "step": 8400 }, { "epoch": 0.691453324847564, "grad_norm": 0.3643128573894501, "learning_rate": 0.0002, "loss": 0.2616, "step": 8420 }, { "epoch": 0.6930957317949455, "grad_norm": 0.37401241064071655, "learning_rate": 0.0002, "loss": 0.2424, "step": 8440 }, { "epoch": 0.6947381387423269, "grad_norm": 0.42304474115371704, "learning_rate": 0.0002, "loss": 0.2491, "step": 8460 }, { "epoch": 0.6963805456897083, "grad_norm": 0.3441920280456543, "learning_rate": 0.0002, "loss": 0.2429, "step": 8480 }, { "epoch": 0.6980229526370897, "grad_norm": 0.33383867144584656, "learning_rate": 0.0002, "loss": 0.2361, "step": 8500 }, { "epoch": 0.6996653595844711, "grad_norm": 0.42935657501220703, "learning_rate": 0.0002, "loss": 0.2598, "step": 8520 }, { "epoch": 0.7013077665318525, "grad_norm": 0.5143205523490906, "learning_rate": 0.0002, "loss": 0.2348, "step": 8540 }, { "epoch": 0.7029501734792338, "grad_norm": 0.37915435433387756, "learning_rate": 0.0002, "loss": 0.2277, "step": 8560 }, { "epoch": 0.7045925804266152, "grad_norm": 0.3202255666255951, "learning_rate": 0.0002, "loss": 0.2474, "step": 8580 }, { "epoch": 0.7062349873739966, "grad_norm": 0.3681676387786865, "learning_rate": 0.0002, "loss": 0.2417, "step": 8600 }, { "epoch": 0.707877394321378, "grad_norm": 0.41214585304260254, "learning_rate": 0.0002, "loss": 0.2356, "step": 8620 }, { "epoch": 0.7095198012687594, "grad_norm": 0.35259029269218445, "learning_rate": 0.0002, "loss": 0.2394, "step": 8640 }, { "epoch": 0.7111622082161407, "grad_norm": 0.47768017649650574, "learning_rate": 0.0002, "loss": 0.248, "step": 8660 }, { "epoch": 0.7128046151635221, "grad_norm": 0.3282839059829712, "learning_rate": 0.0002, "loss": 0.2336, "step": 8680 }, { "epoch": 0.7144470221109035, "grad_norm": 0.441099613904953, "learning_rate": 0.0002, "loss": 0.2631, "step": 8700 }, { "epoch": 0.7160894290582849, "grad_norm": 0.3486292362213135, "learning_rate": 0.0002, "loss": 0.2531, "step": 8720 }, { "epoch": 0.7177318360056663, "grad_norm": 0.33037880063056946, "learning_rate": 0.0002, "loss": 0.2405, "step": 8740 }, { "epoch": 0.7193742429530476, "grad_norm": 0.47114354372024536, "learning_rate": 0.0002, "loss": 0.2665, "step": 8760 }, { "epoch": 0.7210166499004291, "grad_norm": 0.34797531366348267, "learning_rate": 0.0002, "loss": 0.2481, "step": 8780 }, { "epoch": 0.7226590568478105, "grad_norm": 0.43183642625808716, "learning_rate": 0.0002, "loss": 0.242, "step": 8800 }, { "epoch": 0.7243014637951919, "grad_norm": 0.4230342507362366, "learning_rate": 0.0002, "loss": 0.2363, "step": 8820 }, { "epoch": 0.7259438707425733, "grad_norm": 0.40553364157676697, "learning_rate": 0.0002, "loss": 0.2422, "step": 8840 }, { "epoch": 0.7275862776899547, "grad_norm": 0.34155145287513733, "learning_rate": 0.0002, "loss": 0.2422, "step": 8860 }, { "epoch": 0.729228684637336, "grad_norm": 0.4095294773578644, "learning_rate": 0.0002, "loss": 0.2605, "step": 8880 }, { "epoch": 0.7308710915847174, "grad_norm": 0.36541318893432617, "learning_rate": 0.0002, "loss": 0.2516, "step": 8900 }, { "epoch": 0.7325134985320988, "grad_norm": 0.40149998664855957, "learning_rate": 0.0002, "loss": 0.2515, "step": 8920 }, { "epoch": 0.7341559054794802, "grad_norm": 0.3220469653606415, "learning_rate": 0.0002, "loss": 0.2361, "step": 8940 }, { "epoch": 0.7357983124268616, "grad_norm": 0.3153376579284668, "learning_rate": 0.0002, "loss": 0.2325, "step": 8960 }, { "epoch": 0.7374407193742429, "grad_norm": 0.3046116530895233, "learning_rate": 0.0002, "loss": 0.2502, "step": 8980 }, { "epoch": 0.7390831263216243, "grad_norm": 0.502663791179657, "learning_rate": 0.0002, "loss": 0.2471, "step": 9000 }, { "epoch": 0.7407255332690057, "grad_norm": 0.35168886184692383, "learning_rate": 0.0002, "loss": 0.2309, "step": 9020 }, { "epoch": 0.7423679402163871, "grad_norm": 0.43629148602485657, "learning_rate": 0.0002, "loss": 0.2423, "step": 9040 }, { "epoch": 0.7440103471637685, "grad_norm": 0.35909175872802734, "learning_rate": 0.0002, "loss": 0.2453, "step": 9060 }, { "epoch": 0.7456527541111498, "grad_norm": 0.3052688539028168, "learning_rate": 0.0002, "loss": 0.2413, "step": 9080 }, { "epoch": 0.7472951610585313, "grad_norm": 0.2708439230918884, "learning_rate": 0.0002, "loss": 0.2237, "step": 9100 }, { "epoch": 0.7489375680059127, "grad_norm": 0.3965560495853424, "learning_rate": 0.0002, "loss": 0.2423, "step": 9120 }, { "epoch": 0.7505799749532941, "grad_norm": 0.3895662724971771, "learning_rate": 0.0002, "loss": 0.249, "step": 9140 }, { "epoch": 0.7522223819006755, "grad_norm": 0.32124513387680054, "learning_rate": 0.0002, "loss": 0.2376, "step": 9160 }, { "epoch": 0.7538647888480569, "grad_norm": 0.716029167175293, "learning_rate": 0.0002, "loss": 0.2529, "step": 9180 }, { "epoch": 0.7555071957954382, "grad_norm": 0.3812948167324066, "learning_rate": 0.0002, "loss": 0.2269, "step": 9200 }, { "epoch": 0.7571496027428196, "grad_norm": 0.37073054909706116, "learning_rate": 0.0002, "loss": 0.235, "step": 9220 }, { "epoch": 0.758792009690201, "grad_norm": 0.4043092727661133, "learning_rate": 0.0002, "loss": 0.2345, "step": 9240 }, { "epoch": 0.7604344166375824, "grad_norm": 0.3160434365272522, "learning_rate": 0.0002, "loss": 0.2412, "step": 9260 }, { "epoch": 0.7620768235849638, "grad_norm": 0.35415521264076233, "learning_rate": 0.0002, "loss": 0.2358, "step": 9280 }, { "epoch": 0.7637192305323451, "grad_norm": 0.41371211409568787, "learning_rate": 0.0002, "loss": 0.2317, "step": 9300 }, { "epoch": 0.7653616374797265, "grad_norm": 0.4175126850605011, "learning_rate": 0.0002, "loss": 0.2547, "step": 9320 }, { "epoch": 0.7670040444271079, "grad_norm": 0.39811649918556213, "learning_rate": 0.0002, "loss": 0.2462, "step": 9340 }, { "epoch": 0.7686464513744893, "grad_norm": 0.33596447110176086, "learning_rate": 0.0002, "loss": 0.2368, "step": 9360 }, { "epoch": 0.7702888583218707, "grad_norm": 0.36754104495048523, "learning_rate": 0.0002, "loss": 0.2484, "step": 9380 }, { "epoch": 0.771931265269252, "grad_norm": 0.38244250416755676, "learning_rate": 0.0002, "loss": 0.2364, "step": 9400 }, { "epoch": 0.7735736722166334, "grad_norm": 0.3366243839263916, "learning_rate": 0.0002, "loss": 0.2194, "step": 9420 }, { "epoch": 0.7752160791640149, "grad_norm": 0.39877885580062866, "learning_rate": 0.0002, "loss": 0.2469, "step": 9440 }, { "epoch": 0.7768584861113963, "grad_norm": 0.2690157890319824, "learning_rate": 0.0002, "loss": 0.2459, "step": 9460 }, { "epoch": 0.7785008930587777, "grad_norm": 0.3678382337093353, "learning_rate": 0.0002, "loss": 0.2192, "step": 9480 }, { "epoch": 0.7801433000061591, "grad_norm": 0.3121150732040405, "learning_rate": 0.0002, "loss": 0.2438, "step": 9500 }, { "epoch": 0.7817857069535404, "grad_norm": 0.3517535626888275, "learning_rate": 0.0002, "loss": 0.2495, "step": 9520 }, { "epoch": 0.7834281139009218, "grad_norm": 0.434817910194397, "learning_rate": 0.0002, "loss": 0.2532, "step": 9540 }, { "epoch": 0.7850705208483032, "grad_norm": 0.35570958256721497, "learning_rate": 0.0002, "loss": 0.2467, "step": 9560 }, { "epoch": 0.7867129277956846, "grad_norm": 0.4270517826080322, "learning_rate": 0.0002, "loss": 0.2337, "step": 9580 }, { "epoch": 0.788355334743066, "grad_norm": 0.2827800214290619, "learning_rate": 0.0002, "loss": 0.2309, "step": 9600 }, { "epoch": 0.7899977416904473, "grad_norm": 0.39158400893211365, "learning_rate": 0.0002, "loss": 0.2366, "step": 9620 }, { "epoch": 0.7916401486378287, "grad_norm": 0.32538673281669617, "learning_rate": 0.0002, "loss": 0.2389, "step": 9640 }, { "epoch": 0.7932825555852101, "grad_norm": 0.3370015323162079, "learning_rate": 0.0002, "loss": 0.2377, "step": 9660 }, { "epoch": 0.7949249625325915, "grad_norm": 0.3779650032520294, "learning_rate": 0.0002, "loss": 0.2339, "step": 9680 }, { "epoch": 0.7965673694799729, "grad_norm": 0.36034300923347473, "learning_rate": 0.0002, "loss": 0.2427, "step": 9700 }, { "epoch": 0.7982097764273542, "grad_norm": 0.3154286742210388, "learning_rate": 0.0002, "loss": 0.2338, "step": 9720 }, { "epoch": 0.7998521833747356, "grad_norm": 0.3282501697540283, "learning_rate": 0.0002, "loss": 0.2408, "step": 9740 }, { "epoch": 0.8014945903221171, "grad_norm": 0.41291025280952454, "learning_rate": 0.0002, "loss": 0.2507, "step": 9760 }, { "epoch": 0.8031369972694985, "grad_norm": 0.3961363136768341, "learning_rate": 0.0002, "loss": 0.2281, "step": 9780 }, { "epoch": 0.8047794042168799, "grad_norm": 0.47485384345054626, "learning_rate": 0.0002, "loss": 0.2349, "step": 9800 }, { "epoch": 0.8064218111642613, "grad_norm": 0.3284982740879059, "learning_rate": 0.0002, "loss": 0.2288, "step": 9820 }, { "epoch": 0.8080642181116426, "grad_norm": 0.38867270946502686, "learning_rate": 0.0002, "loss": 0.2328, "step": 9840 }, { "epoch": 0.809706625059024, "grad_norm": 0.44371268153190613, "learning_rate": 0.0002, "loss": 0.2416, "step": 9860 }, { "epoch": 0.8113490320064054, "grad_norm": 0.2462434470653534, "learning_rate": 0.0002, "loss": 0.2391, "step": 9880 }, { "epoch": 0.8129914389537868, "grad_norm": 0.31762421131134033, "learning_rate": 0.0002, "loss": 0.2467, "step": 9900 }, { "epoch": 0.8146338459011682, "grad_norm": 0.40011724829673767, "learning_rate": 0.0002, "loss": 0.2351, "step": 9920 }, { "epoch": 0.8162762528485495, "grad_norm": 0.2972090542316437, "learning_rate": 0.0002, "loss": 0.2469, "step": 9940 }, { "epoch": 0.8179186597959309, "grad_norm": 0.4047238230705261, "learning_rate": 0.0002, "loss": 0.2257, "step": 9960 }, { "epoch": 0.8195610667433123, "grad_norm": 0.36663326621055603, "learning_rate": 0.0002, "loss": 0.2302, "step": 9980 }, { "epoch": 0.8212034736906937, "grad_norm": 0.49191904067993164, "learning_rate": 0.0002, "loss": 0.242, "step": 10000 }, { "epoch": 0.8228458806380751, "grad_norm": 0.4621546268463135, "learning_rate": 0.0002, "loss": 0.2324, "step": 10020 }, { "epoch": 0.8244882875854564, "grad_norm": 0.4055505394935608, "learning_rate": 0.0002, "loss": 0.2373, "step": 10040 }, { "epoch": 0.8261306945328378, "grad_norm": 0.34892845153808594, "learning_rate": 0.0002, "loss": 0.23, "step": 10060 }, { "epoch": 0.8277731014802192, "grad_norm": 0.33453091979026794, "learning_rate": 0.0002, "loss": 0.2348, "step": 10080 }, { "epoch": 0.8294155084276007, "grad_norm": 0.3283565640449524, "learning_rate": 0.0002, "loss": 0.2314, "step": 10100 }, { "epoch": 0.8310579153749821, "grad_norm": 0.35970717668533325, "learning_rate": 0.0002, "loss": 0.2336, "step": 10120 }, { "epoch": 0.8327003223223635, "grad_norm": 0.3093232810497284, "learning_rate": 0.0002, "loss": 0.2363, "step": 10140 }, { "epoch": 0.8343427292697448, "grad_norm": 0.4389066696166992, "learning_rate": 0.0002, "loss": 0.2422, "step": 10160 }, { "epoch": 0.8359851362171262, "grad_norm": 0.44654580950737, "learning_rate": 0.0002, "loss": 0.232, "step": 10180 }, { "epoch": 0.8376275431645076, "grad_norm": 0.2830391526222229, "learning_rate": 0.0002, "loss": 0.2476, "step": 10200 }, { "epoch": 0.839269950111889, "grad_norm": 0.31547674536705017, "learning_rate": 0.0002, "loss": 0.231, "step": 10220 }, { "epoch": 0.8409123570592704, "grad_norm": 0.45748040080070496, "learning_rate": 0.0002, "loss": 0.2372, "step": 10240 }, { "epoch": 0.8425547640066517, "grad_norm": 0.34882062673568726, "learning_rate": 0.0002, "loss": 0.2376, "step": 10260 }, { "epoch": 0.8441971709540331, "grad_norm": 0.3529532849788666, "learning_rate": 0.0002, "loss": 0.2323, "step": 10280 }, { "epoch": 0.8458395779014145, "grad_norm": 0.33054473996162415, "learning_rate": 0.0002, "loss": 0.2376, "step": 10300 }, { "epoch": 0.8474819848487959, "grad_norm": 0.3015061616897583, "learning_rate": 0.0002, "loss": 0.2243, "step": 10320 }, { "epoch": 0.8491243917961773, "grad_norm": 0.3048664629459381, "learning_rate": 0.0002, "loss": 0.2318, "step": 10340 }, { "epoch": 0.8507667987435586, "grad_norm": 0.31459841132164, "learning_rate": 0.0002, "loss": 0.2307, "step": 10360 }, { "epoch": 0.85240920569094, "grad_norm": 0.39160168170928955, "learning_rate": 0.0002, "loss": 0.2407, "step": 10380 }, { "epoch": 0.8540516126383214, "grad_norm": 0.30392590165138245, "learning_rate": 0.0002, "loss": 0.2206, "step": 10400 }, { "epoch": 0.8556940195857029, "grad_norm": 0.3656589686870575, "learning_rate": 0.0002, "loss": 0.229, "step": 10420 }, { "epoch": 0.8573364265330843, "grad_norm": 0.35856541991233826, "learning_rate": 0.0002, "loss": 0.2361, "step": 10440 }, { "epoch": 0.8589788334804657, "grad_norm": 0.3591729402542114, "learning_rate": 0.0002, "loss": 0.2232, "step": 10460 }, { "epoch": 0.860621240427847, "grad_norm": 0.36023178696632385, "learning_rate": 0.0002, "loss": 0.2495, "step": 10480 }, { "epoch": 0.8622636473752284, "grad_norm": 0.38790059089660645, "learning_rate": 0.0002, "loss": 0.2288, "step": 10500 }, { "epoch": 0.8639060543226098, "grad_norm": 0.39627397060394287, "learning_rate": 0.0002, "loss": 0.24, "step": 10520 }, { "epoch": 0.8655484612699912, "grad_norm": 0.32167407870292664, "learning_rate": 0.0002, "loss": 0.2365, "step": 10540 }, { "epoch": 0.8671908682173726, "grad_norm": 0.34265172481536865, "learning_rate": 0.0002, "loss": 0.2419, "step": 10560 }, { "epoch": 0.868833275164754, "grad_norm": 0.3236486613750458, "learning_rate": 0.0002, "loss": 0.2326, "step": 10580 }, { "epoch": 0.8704756821121353, "grad_norm": 0.3700607120990753, "learning_rate": 0.0002, "loss": 0.2361, "step": 10600 }, { "epoch": 0.8721180890595167, "grad_norm": 0.33969688415527344, "learning_rate": 0.0002, "loss": 0.2236, "step": 10620 }, { "epoch": 0.8737604960068981, "grad_norm": 0.2824096083641052, "learning_rate": 0.0002, "loss": 0.2415, "step": 10640 }, { "epoch": 0.8754029029542795, "grad_norm": 0.3842727243900299, "learning_rate": 0.0002, "loss": 0.2223, "step": 10660 }, { "epoch": 0.8770453099016609, "grad_norm": 0.36808887124061584, "learning_rate": 0.0002, "loss": 0.2253, "step": 10680 }, { "epoch": 0.8786877168490422, "grad_norm": 0.4065176844596863, "learning_rate": 0.0002, "loss": 0.2274, "step": 10700 }, { "epoch": 0.8803301237964236, "grad_norm": 0.3421749174594879, "learning_rate": 0.0002, "loss": 0.2309, "step": 10720 }, { "epoch": 0.881972530743805, "grad_norm": 0.30610519647598267, "learning_rate": 0.0002, "loss": 0.2213, "step": 10740 }, { "epoch": 0.8836149376911865, "grad_norm": 0.40341177582740784, "learning_rate": 0.0002, "loss": 0.229, "step": 10760 }, { "epoch": 0.8852573446385679, "grad_norm": 0.43038755655288696, "learning_rate": 0.0002, "loss": 0.2312, "step": 10780 }, { "epoch": 0.8868997515859492, "grad_norm": 0.26736319065093994, "learning_rate": 0.0002, "loss": 0.2375, "step": 10800 }, { "epoch": 0.8885421585333306, "grad_norm": 0.34479281306266785, "learning_rate": 0.0002, "loss": 0.2342, "step": 10820 }, { "epoch": 0.890184565480712, "grad_norm": 0.32857152819633484, "learning_rate": 0.0002, "loss": 0.2352, "step": 10840 }, { "epoch": 0.8918269724280934, "grad_norm": 0.30919578671455383, "learning_rate": 0.0002, "loss": 0.2133, "step": 10860 }, { "epoch": 0.8934693793754748, "grad_norm": 0.3049899637699127, "learning_rate": 0.0002, "loss": 0.2374, "step": 10880 }, { "epoch": 0.8951117863228562, "grad_norm": 0.4088539779186249, "learning_rate": 0.0002, "loss": 0.2377, "step": 10900 }, { "epoch": 0.8967541932702375, "grad_norm": 0.3318689167499542, "learning_rate": 0.0002, "loss": 0.2459, "step": 10920 }, { "epoch": 0.8983966002176189, "grad_norm": 0.38051754236221313, "learning_rate": 0.0002, "loss": 0.2305, "step": 10940 }, { "epoch": 0.9000390071650003, "grad_norm": 0.401080846786499, "learning_rate": 0.0002, "loss": 0.2297, "step": 10960 }, { "epoch": 0.9016814141123817, "grad_norm": 0.30713602900505066, "learning_rate": 0.0002, "loss": 0.2254, "step": 10980 }, { "epoch": 0.903323821059763, "grad_norm": 0.37888234853744507, "learning_rate": 0.0002, "loss": 0.2346, "step": 11000 }, { "epoch": 0.9049662280071444, "grad_norm": 0.3106231689453125, "learning_rate": 0.0002, "loss": 0.2206, "step": 11020 }, { "epoch": 0.9066086349545258, "grad_norm": 0.44297677278518677, "learning_rate": 0.0002, "loss": 0.2218, "step": 11040 }, { "epoch": 0.9082510419019072, "grad_norm": 0.3375784456729889, "learning_rate": 0.0002, "loss": 0.2273, "step": 11060 }, { "epoch": 0.9098934488492886, "grad_norm": 0.4860747158527374, "learning_rate": 0.0002, "loss": 0.2317, "step": 11080 }, { "epoch": 0.9115358557966701, "grad_norm": 0.2880633771419525, "learning_rate": 0.0002, "loss": 0.2398, "step": 11100 }, { "epoch": 0.9131782627440514, "grad_norm": 0.4085402190685272, "learning_rate": 0.0002, "loss": 0.234, "step": 11120 }, { "epoch": 0.9148206696914328, "grad_norm": 0.38998520374298096, "learning_rate": 0.0002, "loss": 0.2402, "step": 11140 }, { "epoch": 0.9164630766388142, "grad_norm": 0.40508535504341125, "learning_rate": 0.0002, "loss": 0.2136, "step": 11160 }, { "epoch": 0.9181054835861956, "grad_norm": 0.3789615035057068, "learning_rate": 0.0002, "loss": 0.2267, "step": 11180 }, { "epoch": 0.919747890533577, "grad_norm": 0.3882130980491638, "learning_rate": 0.0002, "loss": 0.2276, "step": 11200 }, { "epoch": 0.9213902974809584, "grad_norm": 0.3001303970813751, "learning_rate": 0.0002, "loss": 0.2313, "step": 11220 }, { "epoch": 0.9230327044283397, "grad_norm": 0.4514042139053345, "learning_rate": 0.0002, "loss": 0.2204, "step": 11240 }, { "epoch": 0.9246751113757211, "grad_norm": 0.43372517824172974, "learning_rate": 0.0002, "loss": 0.2294, "step": 11260 }, { "epoch": 0.9263175183231025, "grad_norm": 0.2934057414531708, "learning_rate": 0.0002, "loss": 0.2308, "step": 11280 }, { "epoch": 0.9279599252704839, "grad_norm": 0.4067831337451935, "learning_rate": 0.0002, "loss": 0.2329, "step": 11300 }, { "epoch": 0.9296023322178653, "grad_norm": 0.3299509584903717, "learning_rate": 0.0002, "loss": 0.2214, "step": 11320 }, { "epoch": 0.9312447391652466, "grad_norm": 0.35204941034317017, "learning_rate": 0.0002, "loss": 0.239, "step": 11340 }, { "epoch": 0.932887146112628, "grad_norm": 0.30878013372421265, "learning_rate": 0.0002, "loss": 0.2248, "step": 11360 }, { "epoch": 0.9345295530600094, "grad_norm": 0.392170786857605, "learning_rate": 0.0002, "loss": 0.2274, "step": 11380 }, { "epoch": 0.9361719600073908, "grad_norm": 0.4151529371738434, "learning_rate": 0.0002, "loss": 0.2186, "step": 11400 }, { "epoch": 0.9378143669547723, "grad_norm": 0.3535741865634918, "learning_rate": 0.0002, "loss": 0.2285, "step": 11420 }, { "epoch": 0.9394567739021537, "grad_norm": 0.3477960526943207, "learning_rate": 0.0002, "loss": 0.2313, "step": 11440 }, { "epoch": 0.941099180849535, "grad_norm": 0.3621846139431, "learning_rate": 0.0002, "loss": 0.2317, "step": 11460 }, { "epoch": 0.9427415877969164, "grad_norm": 0.3844580352306366, "learning_rate": 0.0002, "loss": 0.2345, "step": 11480 }, { "epoch": 0.9443839947442978, "grad_norm": 0.3395872116088867, "learning_rate": 0.0002, "loss": 0.2233, "step": 11500 }, { "epoch": 0.9460264016916792, "grad_norm": 0.4554111063480377, "learning_rate": 0.0002, "loss": 0.2324, "step": 11520 }, { "epoch": 0.9476688086390606, "grad_norm": 0.34367838501930237, "learning_rate": 0.0002, "loss": 0.2157, "step": 11540 }, { "epoch": 0.9493112155864419, "grad_norm": 0.2760342061519623, "learning_rate": 0.0002, "loss": 0.2278, "step": 11560 }, { "epoch": 0.9509536225338233, "grad_norm": 0.4382875859737396, "learning_rate": 0.0002, "loss": 0.2361, "step": 11580 }, { "epoch": 0.9525960294812047, "grad_norm": 0.3573220670223236, "learning_rate": 0.0002, "loss": 0.2241, "step": 11600 }, { "epoch": 0.9542384364285861, "grad_norm": 0.3491596579551697, "learning_rate": 0.0002, "loss": 0.2258, "step": 11620 }, { "epoch": 0.9558808433759675, "grad_norm": 0.42366743087768555, "learning_rate": 0.0002, "loss": 0.2406, "step": 11640 }, { "epoch": 0.9575232503233488, "grad_norm": 0.3748779892921448, "learning_rate": 0.0002, "loss": 0.2305, "step": 11660 }, { "epoch": 0.9591656572707302, "grad_norm": 0.40864527225494385, "learning_rate": 0.0002, "loss": 0.235, "step": 11680 }, { "epoch": 0.9608080642181116, "grad_norm": 0.41164445877075195, "learning_rate": 0.0002, "loss": 0.2195, "step": 11700 }, { "epoch": 0.962450471165493, "grad_norm": 0.46402692794799805, "learning_rate": 0.0002, "loss": 0.2266, "step": 11720 }, { "epoch": 0.9640928781128744, "grad_norm": 0.32727622985839844, "learning_rate": 0.0002, "loss": 0.2324, "step": 11740 }, { "epoch": 0.9657352850602559, "grad_norm": 0.4346349537372589, "learning_rate": 0.0002, "loss": 0.2257, "step": 11760 }, { "epoch": 0.9673776920076372, "grad_norm": 0.3470235764980316, "learning_rate": 0.0002, "loss": 0.2333, "step": 11780 }, { "epoch": 0.9690200989550186, "grad_norm": 0.48941469192504883, "learning_rate": 0.0002, "loss": 0.2336, "step": 11800 }, { "epoch": 0.9706625059024, "grad_norm": 0.3959124982357025, "learning_rate": 0.0002, "loss": 0.2221, "step": 11820 }, { "epoch": 0.9723049128497814, "grad_norm": 0.40877676010131836, "learning_rate": 0.0002, "loss": 0.232, "step": 11840 }, { "epoch": 0.9739473197971628, "grad_norm": 0.4087940454483032, "learning_rate": 0.0002, "loss": 0.2195, "step": 11860 }, { "epoch": 0.9755897267445441, "grad_norm": 0.3967040181159973, "learning_rate": 0.0002, "loss": 0.234, "step": 11880 }, { "epoch": 0.9772321336919255, "grad_norm": 0.41639575362205505, "learning_rate": 0.0002, "loss": 0.221, "step": 11900 }, { "epoch": 0.9788745406393069, "grad_norm": 0.304775595664978, "learning_rate": 0.0002, "loss": 0.2283, "step": 11920 }, { "epoch": 0.9805169475866883, "grad_norm": 0.41931501030921936, "learning_rate": 0.0002, "loss": 0.2263, "step": 11940 }, { "epoch": 0.9821593545340697, "grad_norm": 0.34010422229766846, "learning_rate": 0.0002, "loss": 0.222, "step": 11960 }, { "epoch": 0.983801761481451, "grad_norm": 0.3099174499511719, "learning_rate": 0.0002, "loss": 0.2221, "step": 11980 }, { "epoch": 0.9854441684288324, "grad_norm": 0.3627716600894928, "learning_rate": 0.0002, "loss": 0.2419, "step": 12000 }, { "epoch": 0.9870865753762138, "grad_norm": 0.3797793388366699, "learning_rate": 0.0002, "loss": 0.2289, "step": 12020 }, { "epoch": 0.9887289823235952, "grad_norm": 0.34914806485176086, "learning_rate": 0.0002, "loss": 0.2211, "step": 12040 }, { "epoch": 0.9903713892709766, "grad_norm": 0.35985666513442993, "learning_rate": 0.0002, "loss": 0.2271, "step": 12060 }, { "epoch": 0.9920137962183581, "grad_norm": 0.3159051835536957, "learning_rate": 0.0002, "loss": 0.2364, "step": 12080 }, { "epoch": 0.9936562031657394, "grad_norm": 0.29203563928604126, "learning_rate": 0.0002, "loss": 0.2429, "step": 12100 }, { "epoch": 0.9952986101131208, "grad_norm": 0.32187801599502563, "learning_rate": 0.0002, "loss": 0.2386, "step": 12120 }, { "epoch": 0.9969410170605022, "grad_norm": 0.35564154386520386, "learning_rate": 0.0002, "loss": 0.2349, "step": 12140 }, { "epoch": 0.9985834240078836, "grad_norm": 0.3589749336242676, "learning_rate": 0.0002, "loss": 0.2275, "step": 12160 }, { "epoch": 1.0, "eval_loss": 0.28475141525268555, "eval_runtime": 907.1315, "eval_samples_per_second": 4.174, "eval_steps_per_second": 0.523, "step": 12178 } ], "logging_steps": 20, "max_steps": 16000, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 77, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 7.843715322728153e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }