{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.999802839116719, "eval_steps": 1268, "global_step": 10144, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0001971608832807571, "grad_norm": 3.145049268342521, "learning_rate": 1.0000000000000002e-06, "loss": 0.6851, "step": 1 }, { "epoch": 0.0001971608832807571, "eval_loss": 0.6705650091171265, "eval_runtime": 342.719, "eval_samples_per_second": 23.722, "eval_steps_per_second": 1.485, "step": 1 }, { "epoch": 0.0003943217665615142, "grad_norm": 2.928706161869534, "learning_rate": 2.0000000000000003e-06, "loss": 0.6183, "step": 2 }, { "epoch": 0.0005914826498422713, "grad_norm": 2.8063413659680485, "learning_rate": 3e-06, "loss": 0.6447, "step": 3 }, { "epoch": 0.0007886435331230284, "grad_norm": 4.474770355859649, "learning_rate": 4.000000000000001e-06, "loss": 0.6615, "step": 4 }, { "epoch": 0.0009858044164037854, "grad_norm": 2.83892152250444, "learning_rate": 5e-06, "loss": 0.5571, "step": 5 }, { "epoch": 0.0011829652996845426, "grad_norm": 3.174661998811544, "learning_rate": 6e-06, "loss": 0.6092, "step": 6 }, { "epoch": 0.0013801261829652998, "grad_norm": 2.086667997610238, "learning_rate": 7e-06, "loss": 0.5469, "step": 7 }, { "epoch": 0.0015772870662460567, "grad_norm": 2.4093348319308454, "learning_rate": 8.000000000000001e-06, "loss": 0.6155, "step": 8 }, { "epoch": 0.001774447949526814, "grad_norm": 1.753355191522501, "learning_rate": 9e-06, "loss": 0.5135, "step": 9 }, { "epoch": 0.001971608832807571, "grad_norm": 1.813162138301549, "learning_rate": 1e-05, "loss": 0.5444, "step": 10 }, { "epoch": 0.002168769716088328, "grad_norm": 1.5430930747728506, "learning_rate": 1.1000000000000001e-05, "loss": 0.5032, "step": 11 }, { "epoch": 0.002365930599369085, "grad_norm": 1.4956193189962286, "learning_rate": 1.2e-05, "loss": 0.5294, "step": 12 }, { "epoch": 0.0025630914826498424, "grad_norm": 1.7344594420796293, "learning_rate": 1.3000000000000001e-05, "loss": 0.5647, "step": 13 }, { "epoch": 0.0027602523659305996, "grad_norm": 1.1966218841914706, "learning_rate": 1.4e-05, "loss": 0.5103, "step": 14 }, { "epoch": 0.0029574132492113563, "grad_norm": 1.432228734486573, "learning_rate": 1.5000000000000002e-05, "loss": 0.5, "step": 15 }, { "epoch": 0.0031545741324921135, "grad_norm": 1.4645984670917185, "learning_rate": 1.6000000000000003e-05, "loss": 0.4978, "step": 16 }, { "epoch": 0.0033517350157728706, "grad_norm": 1.3563922951463974, "learning_rate": 1.7e-05, "loss": 0.506, "step": 17 }, { "epoch": 0.003548895899053628, "grad_norm": 1.3871081225739526, "learning_rate": 1.8e-05, "loss": 0.5311, "step": 18 }, { "epoch": 0.003746056782334385, "grad_norm": 1.280010346938333, "learning_rate": 1.9e-05, "loss": 0.5433, "step": 19 }, { "epoch": 0.003943217665615142, "grad_norm": 2.284117117524146, "learning_rate": 2e-05, "loss": 0.5448, "step": 20 }, { "epoch": 0.004140378548895899, "grad_norm": 1.6225424151126402, "learning_rate": 1.9999999879870974e-05, "loss": 0.5044, "step": 21 }, { "epoch": 0.004337539432176656, "grad_norm": 1.6087069713987625, "learning_rate": 1.99999995194839e-05, "loss": 0.4712, "step": 22 }, { "epoch": 0.004534700315457414, "grad_norm": 1.4815279243249726, "learning_rate": 1.9999998918838782e-05, "loss": 0.5043, "step": 23 }, { "epoch": 0.00473186119873817, "grad_norm": 1.2421866810695292, "learning_rate": 1.9999998077935636e-05, "loss": 0.4933, "step": 24 }, { "epoch": 0.004929022082018927, "grad_norm": 1.392319279479948, "learning_rate": 1.9999996996774485e-05, "loss": 0.4657, "step": 25 }, { "epoch": 0.005126182965299685, "grad_norm": 1.613538190971824, "learning_rate": 1.9999995675355352e-05, "loss": 0.5023, "step": 26 }, { "epoch": 0.0053233438485804415, "grad_norm": 1.3274753395366978, "learning_rate": 1.999999411367827e-05, "loss": 0.486, "step": 27 }, { "epoch": 0.005520504731861199, "grad_norm": 1.4785198394383092, "learning_rate": 1.9999992311743276e-05, "loss": 0.4713, "step": 28 }, { "epoch": 0.005717665615141956, "grad_norm": 1.4857486979381507, "learning_rate": 1.9999990269550415e-05, "loss": 0.4881, "step": 29 }, { "epoch": 0.005914826498422713, "grad_norm": 1.36552629938149, "learning_rate": 1.9999987987099734e-05, "loss": 0.5463, "step": 30 }, { "epoch": 0.00611198738170347, "grad_norm": 1.2130083488975558, "learning_rate": 1.999998546439129e-05, "loss": 0.5119, "step": 31 }, { "epoch": 0.006309148264984227, "grad_norm": 1.4571092650286936, "learning_rate": 1.999998270142514e-05, "loss": 0.5107, "step": 32 }, { "epoch": 0.0065063091482649845, "grad_norm": 1.3046426394488964, "learning_rate": 1.9999979698201355e-05, "loss": 0.5163, "step": 33 }, { "epoch": 0.006703470031545741, "grad_norm": 1.3429233230107818, "learning_rate": 1.999997645472e-05, "loss": 0.4949, "step": 34 }, { "epoch": 0.006900630914826498, "grad_norm": 1.1865584873872308, "learning_rate": 1.9999972970981164e-05, "loss": 0.5042, "step": 35 }, { "epoch": 0.007097791798107256, "grad_norm": 1.2089421648447298, "learning_rate": 1.999996924698492e-05, "loss": 0.4996, "step": 36 }, { "epoch": 0.007294952681388012, "grad_norm": 1.2171992786206751, "learning_rate": 1.9999965282731364e-05, "loss": 0.5232, "step": 37 }, { "epoch": 0.00749211356466877, "grad_norm": 1.5903110532514266, "learning_rate": 1.9999961078220587e-05, "loss": 0.5206, "step": 38 }, { "epoch": 0.007689274447949527, "grad_norm": 1.2539942673764422, "learning_rate": 1.9999956633452696e-05, "loss": 0.4656, "step": 39 }, { "epoch": 0.007886435331230283, "grad_norm": 1.1722354147042853, "learning_rate": 1.9999951948427793e-05, "loss": 0.4811, "step": 40 }, { "epoch": 0.008083596214511041, "grad_norm": 1.520380985973137, "learning_rate": 1.9999947023145992e-05, "loss": 0.4745, "step": 41 }, { "epoch": 0.008280757097791799, "grad_norm": 1.181201745034852, "learning_rate": 1.999994185760741e-05, "loss": 0.5026, "step": 42 }, { "epoch": 0.008477917981072555, "grad_norm": 1.2315559894723096, "learning_rate": 1.9999936451812168e-05, "loss": 0.5071, "step": 43 }, { "epoch": 0.008675078864353312, "grad_norm": 1.0027266084772204, "learning_rate": 1.9999930805760403e-05, "loss": 0.4293, "step": 44 }, { "epoch": 0.00887223974763407, "grad_norm": 1.5355646992423921, "learning_rate": 1.999992491945225e-05, "loss": 0.5075, "step": 45 }, { "epoch": 0.009069400630914827, "grad_norm": 1.2141404668583387, "learning_rate": 1.9999918792887844e-05, "loss": 0.5002, "step": 46 }, { "epoch": 0.009266561514195583, "grad_norm": 1.1874086565333581, "learning_rate": 1.9999912426067335e-05, "loss": 0.5214, "step": 47 }, { "epoch": 0.00946372239747634, "grad_norm": 1.310468730323023, "learning_rate": 1.999990581899088e-05, "loss": 0.4954, "step": 48 }, { "epoch": 0.009660883280757098, "grad_norm": 1.1678068139516498, "learning_rate": 1.9999898971658632e-05, "loss": 0.4874, "step": 49 }, { "epoch": 0.009858044164037854, "grad_norm": 1.2436720803185994, "learning_rate": 1.9999891884070764e-05, "loss": 0.5053, "step": 50 }, { "epoch": 0.010055205047318612, "grad_norm": 1.191814384498826, "learning_rate": 1.999988455622744e-05, "loss": 0.4437, "step": 51 }, { "epoch": 0.01025236593059937, "grad_norm": 1.0535554735149129, "learning_rate": 1.9999876988128832e-05, "loss": 0.4572, "step": 52 }, { "epoch": 0.010449526813880125, "grad_norm": 1.5861895667849393, "learning_rate": 1.9999869179775126e-05, "loss": 0.5274, "step": 53 }, { "epoch": 0.010646687697160883, "grad_norm": 1.3410047214951284, "learning_rate": 1.9999861131166513e-05, "loss": 0.5134, "step": 54 }, { "epoch": 0.01084384858044164, "grad_norm": 1.5169630762672912, "learning_rate": 1.9999852842303183e-05, "loss": 0.498, "step": 55 }, { "epoch": 0.011041009463722398, "grad_norm": 1.221925524795734, "learning_rate": 1.9999844313185335e-05, "loss": 0.5275, "step": 56 }, { "epoch": 0.011238170347003154, "grad_norm": 5.091704408593217, "learning_rate": 1.9999835543813174e-05, "loss": 0.4799, "step": 57 }, { "epoch": 0.011435331230283912, "grad_norm": 2.4153909798169337, "learning_rate": 1.9999826534186914e-05, "loss": 0.4921, "step": 58 }, { "epoch": 0.01163249211356467, "grad_norm": 1.547979859587359, "learning_rate": 1.9999817284306766e-05, "loss": 0.5229, "step": 59 }, { "epoch": 0.011829652996845425, "grad_norm": 1.5690337913664598, "learning_rate": 1.9999807794172955e-05, "loss": 0.4735, "step": 60 }, { "epoch": 0.012026813880126183, "grad_norm": 1.6033315849616043, "learning_rate": 1.999979806378571e-05, "loss": 0.5152, "step": 61 }, { "epoch": 0.01222397476340694, "grad_norm": 1.2835879687259206, "learning_rate": 1.9999788093145264e-05, "loss": 0.5271, "step": 62 }, { "epoch": 0.012421135646687698, "grad_norm": 1.2507778090991872, "learning_rate": 1.9999777882251857e-05, "loss": 0.4735, "step": 63 }, { "epoch": 0.012618296529968454, "grad_norm": 1.0918550876107425, "learning_rate": 1.999976743110573e-05, "loss": 0.489, "step": 64 }, { "epoch": 0.012815457413249211, "grad_norm": 1.3178360947947898, "learning_rate": 1.999975673970714e-05, "loss": 0.5017, "step": 65 }, { "epoch": 0.013012618296529969, "grad_norm": 1.226739326525634, "learning_rate": 1.9999745808056344e-05, "loss": 0.4785, "step": 66 }, { "epoch": 0.013209779179810725, "grad_norm": 1.1354158482899859, "learning_rate": 1.99997346361536e-05, "loss": 0.4727, "step": 67 }, { "epoch": 0.013406940063091483, "grad_norm": 1.069961387810125, "learning_rate": 1.9999723223999178e-05, "loss": 0.4692, "step": 68 }, { "epoch": 0.01360410094637224, "grad_norm": 0.9762520384137559, "learning_rate": 1.999971157159335e-05, "loss": 0.4922, "step": 69 }, { "epoch": 0.013801261829652996, "grad_norm": 1.1048692949441026, "learning_rate": 1.99996996789364e-05, "loss": 0.4762, "step": 70 }, { "epoch": 0.013998422712933754, "grad_norm": 1.4422307357408406, "learning_rate": 1.9999687546028617e-05, "loss": 0.517, "step": 71 }, { "epoch": 0.014195583596214511, "grad_norm": 1.2130637644825035, "learning_rate": 1.9999675172870286e-05, "loss": 0.4549, "step": 72 }, { "epoch": 0.014392744479495269, "grad_norm": 1.3283546807946351, "learning_rate": 1.9999662559461704e-05, "loss": 0.4912, "step": 73 }, { "epoch": 0.014589905362776025, "grad_norm": 1.2835417017942559, "learning_rate": 1.9999649705803178e-05, "loss": 0.5079, "step": 74 }, { "epoch": 0.014787066246056782, "grad_norm": 1.0779965377716199, "learning_rate": 1.9999636611895018e-05, "loss": 0.4633, "step": 75 }, { "epoch": 0.01498422712933754, "grad_norm": 1.0321678770877212, "learning_rate": 1.999962327773753e-05, "loss": 0.4854, "step": 76 }, { "epoch": 0.015181388012618296, "grad_norm": 1.2808800261718734, "learning_rate": 1.9999609703331045e-05, "loss": 0.4931, "step": 77 }, { "epoch": 0.015378548895899053, "grad_norm": 1.3610125610716681, "learning_rate": 1.999959588867588e-05, "loss": 0.5009, "step": 78 }, { "epoch": 0.015575709779179811, "grad_norm": 1.0655940435477698, "learning_rate": 1.999958183377237e-05, "loss": 0.4816, "step": 79 }, { "epoch": 0.015772870662460567, "grad_norm": 1.2930806912423924, "learning_rate": 1.999956753862086e-05, "loss": 0.4807, "step": 80 }, { "epoch": 0.015970031545741326, "grad_norm": 1.287162958203915, "learning_rate": 1.9999553003221682e-05, "loss": 0.4588, "step": 81 }, { "epoch": 0.016167192429022082, "grad_norm": 1.1339051396074737, "learning_rate": 1.9999538227575196e-05, "loss": 0.5081, "step": 82 }, { "epoch": 0.016364353312302838, "grad_norm": 1.1265294750507961, "learning_rate": 1.9999523211681746e-05, "loss": 0.4669, "step": 83 }, { "epoch": 0.016561514195583597, "grad_norm": 1.1931313211356958, "learning_rate": 1.99995079555417e-05, "loss": 0.48, "step": 84 }, { "epoch": 0.016758675078864353, "grad_norm": 0.999555485327999, "learning_rate": 1.9999492459155424e-05, "loss": 0.4786, "step": 85 }, { "epoch": 0.01695583596214511, "grad_norm": 1.1669083683078765, "learning_rate": 1.9999476722523287e-05, "loss": 0.4581, "step": 86 }, { "epoch": 0.01715299684542587, "grad_norm": 1.0996260118567778, "learning_rate": 1.9999460745645673e-05, "loss": 0.4871, "step": 87 }, { "epoch": 0.017350157728706624, "grad_norm": 1.1438449589174153, "learning_rate": 1.999944452852296e-05, "loss": 0.4852, "step": 88 }, { "epoch": 0.01754731861198738, "grad_norm": 1.0031606047710635, "learning_rate": 1.9999428071155535e-05, "loss": 0.5007, "step": 89 }, { "epoch": 0.01774447949526814, "grad_norm": 1.3469859967905786, "learning_rate": 1.9999411373543804e-05, "loss": 0.5102, "step": 90 }, { "epoch": 0.017941640378548895, "grad_norm": 1.1936672885104658, "learning_rate": 1.9999394435688158e-05, "loss": 0.5077, "step": 91 }, { "epoch": 0.018138801261829655, "grad_norm": 1.3174882849640317, "learning_rate": 1.9999377257589012e-05, "loss": 0.4961, "step": 92 }, { "epoch": 0.01833596214511041, "grad_norm": 1.2572224290689644, "learning_rate": 1.9999359839246775e-05, "loss": 0.498, "step": 93 }, { "epoch": 0.018533123028391166, "grad_norm": 1.1310411895258263, "learning_rate": 1.9999342180661863e-05, "loss": 0.5175, "step": 94 }, { "epoch": 0.018730283911671926, "grad_norm": 1.2365159429858565, "learning_rate": 1.9999324281834705e-05, "loss": 0.4588, "step": 95 }, { "epoch": 0.01892744479495268, "grad_norm": 1.3269862941988586, "learning_rate": 1.9999306142765726e-05, "loss": 0.485, "step": 96 }, { "epoch": 0.019124605678233438, "grad_norm": 1.1672780497391835, "learning_rate": 1.9999287763455367e-05, "loss": 0.4964, "step": 97 }, { "epoch": 0.019321766561514197, "grad_norm": 1.2280978200722572, "learning_rate": 1.9999269143904066e-05, "loss": 0.5031, "step": 98 }, { "epoch": 0.019518927444794953, "grad_norm": 1.1756264159886103, "learning_rate": 1.999925028411227e-05, "loss": 0.4558, "step": 99 }, { "epoch": 0.01971608832807571, "grad_norm": 1.0797299658749118, "learning_rate": 1.9999231184080434e-05, "loss": 0.509, "step": 100 }, { "epoch": 0.019913249211356468, "grad_norm": 1.5073253405939093, "learning_rate": 1.9999211843809018e-05, "loss": 0.4829, "step": 101 }, { "epoch": 0.020110410094637224, "grad_norm": 1.22115492589763, "learning_rate": 1.9999192263298485e-05, "loss": 0.4954, "step": 102 }, { "epoch": 0.02030757097791798, "grad_norm": 1.215526599323687, "learning_rate": 1.9999172442549307e-05, "loss": 0.4973, "step": 103 }, { "epoch": 0.02050473186119874, "grad_norm": 1.0092097441473407, "learning_rate": 1.9999152381561955e-05, "loss": 0.461, "step": 104 }, { "epoch": 0.020701892744479495, "grad_norm": 1.1251707920134784, "learning_rate": 1.9999132080336915e-05, "loss": 0.503, "step": 105 }, { "epoch": 0.02089905362776025, "grad_norm": 1.0138460258654038, "learning_rate": 1.9999111538874677e-05, "loss": 0.4765, "step": 106 }, { "epoch": 0.02109621451104101, "grad_norm": 1.2072707360767827, "learning_rate": 1.999909075717573e-05, "loss": 0.4894, "step": 107 }, { "epoch": 0.021293375394321766, "grad_norm": 1.0013269764600548, "learning_rate": 1.9999069735240578e-05, "loss": 0.4957, "step": 108 }, { "epoch": 0.021490536277602525, "grad_norm": 1.1353117571038165, "learning_rate": 1.999904847306972e-05, "loss": 0.5045, "step": 109 }, { "epoch": 0.02168769716088328, "grad_norm": 1.057179146627742, "learning_rate": 1.999902697066367e-05, "loss": 0.4928, "step": 110 }, { "epoch": 0.021884858044164037, "grad_norm": 0.9423530793571464, "learning_rate": 1.999900522802295e-05, "loss": 0.5128, "step": 111 }, { "epoch": 0.022082018927444796, "grad_norm": 0.968781602059733, "learning_rate": 1.9998983245148072e-05, "loss": 0.4547, "step": 112 }, { "epoch": 0.022279179810725552, "grad_norm": 0.9113728069244939, "learning_rate": 1.999896102203957e-05, "loss": 0.4814, "step": 113 }, { "epoch": 0.022476340694006308, "grad_norm": 1.1042295670503781, "learning_rate": 1.999893855869798e-05, "loss": 0.4613, "step": 114 }, { "epoch": 0.022673501577287068, "grad_norm": 1.034777939325516, "learning_rate": 1.999891585512384e-05, "loss": 0.5244, "step": 115 }, { "epoch": 0.022870662460567823, "grad_norm": 0.9820709661002166, "learning_rate": 1.999889291131769e-05, "loss": 0.5079, "step": 116 }, { "epoch": 0.02306782334384858, "grad_norm": 2.571070138444174, "learning_rate": 1.9998869727280088e-05, "loss": 0.4896, "step": 117 }, { "epoch": 0.02326498422712934, "grad_norm": 1.5658204423287487, "learning_rate": 1.9998846303011588e-05, "loss": 0.4715, "step": 118 }, { "epoch": 0.023462145110410094, "grad_norm": 0.9704506073467836, "learning_rate": 1.9998822638512757e-05, "loss": 0.4948, "step": 119 }, { "epoch": 0.02365930599369085, "grad_norm": 1.8219918715734673, "learning_rate": 1.9998798733784155e-05, "loss": 0.4449, "step": 120 }, { "epoch": 0.02385646687697161, "grad_norm": 1.3674996874754528, "learning_rate": 1.9998774588826362e-05, "loss": 0.4736, "step": 121 }, { "epoch": 0.024053627760252366, "grad_norm": 1.0825809239978976, "learning_rate": 1.999875020363996e-05, "loss": 0.4462, "step": 122 }, { "epoch": 0.02425078864353312, "grad_norm": 1.2402946359904268, "learning_rate": 1.999872557822553e-05, "loss": 0.464, "step": 123 }, { "epoch": 0.02444794952681388, "grad_norm": 0.9734568557016353, "learning_rate": 1.999870071258367e-05, "loss": 0.5062, "step": 124 }, { "epoch": 0.024645110410094637, "grad_norm": 1.32265424416242, "learning_rate": 1.999867560671497e-05, "loss": 0.458, "step": 125 }, { "epoch": 0.024842271293375396, "grad_norm": 1.0916211714042101, "learning_rate": 1.999865026062004e-05, "loss": 0.4639, "step": 126 }, { "epoch": 0.025039432176656152, "grad_norm": 1.0101120690939442, "learning_rate": 1.999862467429948e-05, "loss": 0.5054, "step": 127 }, { "epoch": 0.025236593059936908, "grad_norm": 1.07583395846967, "learning_rate": 1.9998598847753918e-05, "loss": 0.4147, "step": 128 }, { "epoch": 0.025433753943217667, "grad_norm": 1.0153450766886587, "learning_rate": 1.999857278098396e-05, "loss": 0.4608, "step": 129 }, { "epoch": 0.025630914826498423, "grad_norm": 0.9636929458831386, "learning_rate": 1.999854647399024e-05, "loss": 0.4435, "step": 130 }, { "epoch": 0.02582807570977918, "grad_norm": 0.976995842942242, "learning_rate": 1.999851992677339e-05, "loss": 0.468, "step": 131 }, { "epoch": 0.026025236593059938, "grad_norm": 1.2900795634559068, "learning_rate": 1.999849313933405e-05, "loss": 0.4996, "step": 132 }, { "epoch": 0.026222397476340694, "grad_norm": 1.053599579751552, "learning_rate": 1.9998466111672856e-05, "loss": 0.4973, "step": 133 }, { "epoch": 0.02641955835962145, "grad_norm": 1.018304999333016, "learning_rate": 1.9998438843790463e-05, "loss": 0.4754, "step": 134 }, { "epoch": 0.02661671924290221, "grad_norm": 0.9962199755386415, "learning_rate": 1.9998411335687527e-05, "loss": 0.5074, "step": 135 }, { "epoch": 0.026813880126182965, "grad_norm": 1.0381641170143454, "learning_rate": 1.9998383587364706e-05, "loss": 0.5516, "step": 136 }, { "epoch": 0.02701104100946372, "grad_norm": 1.0875760040456148, "learning_rate": 1.999835559882267e-05, "loss": 0.4803, "step": 137 }, { "epoch": 0.02720820189274448, "grad_norm": 1.5281532146270271, "learning_rate": 1.9998327370062086e-05, "loss": 0.4919, "step": 138 }, { "epoch": 0.027405362776025236, "grad_norm": 0.9062501284108182, "learning_rate": 1.9998298901083637e-05, "loss": 0.4639, "step": 139 }, { "epoch": 0.027602523659305992, "grad_norm": 0.9073298345394162, "learning_rate": 1.9998270191888002e-05, "loss": 0.476, "step": 140 }, { "epoch": 0.02779968454258675, "grad_norm": 0.8859849922750372, "learning_rate": 1.9998241242475876e-05, "loss": 0.4808, "step": 141 }, { "epoch": 0.027996845425867507, "grad_norm": 9.232525438376628, "learning_rate": 1.9998212052847955e-05, "loss": 0.5524, "step": 142 }, { "epoch": 0.028194006309148267, "grad_norm": 1.1971641286106405, "learning_rate": 1.9998182623004935e-05, "loss": 0.4861, "step": 143 }, { "epoch": 0.028391167192429023, "grad_norm": 1.1014546150503517, "learning_rate": 1.9998152952947526e-05, "loss": 0.4994, "step": 144 }, { "epoch": 0.02858832807570978, "grad_norm": 1.1251735110198886, "learning_rate": 1.9998123042676444e-05, "loss": 0.5235, "step": 145 }, { "epoch": 0.028785488958990538, "grad_norm": 1.0757931397923415, "learning_rate": 1.9998092892192403e-05, "loss": 0.4805, "step": 146 }, { "epoch": 0.028982649842271294, "grad_norm": 1.0601740904924608, "learning_rate": 1.9998062501496126e-05, "loss": 0.4652, "step": 147 }, { "epoch": 0.02917981072555205, "grad_norm": 1.0857317459697904, "learning_rate": 1.999803187058835e-05, "loss": 0.4949, "step": 148 }, { "epoch": 0.02937697160883281, "grad_norm": 1.2154921400920446, "learning_rate": 1.99980009994698e-05, "loss": 0.4988, "step": 149 }, { "epoch": 0.029574132492113565, "grad_norm": 1.0127571620302098, "learning_rate": 1.999796988814123e-05, "loss": 0.4898, "step": 150 }, { "epoch": 0.02977129337539432, "grad_norm": 0.9897255202956721, "learning_rate": 1.9997938536603386e-05, "loss": 0.5307, "step": 151 }, { "epoch": 0.02996845425867508, "grad_norm": 20.686417789329063, "learning_rate": 1.999790694485701e-05, "loss": 0.551, "step": 152 }, { "epoch": 0.030165615141955836, "grad_norm": 1.7162332300485896, "learning_rate": 1.999787511290287e-05, "loss": 0.5087, "step": 153 }, { "epoch": 0.03036277602523659, "grad_norm": 1.3514560362581123, "learning_rate": 1.999784304074173e-05, "loss": 0.5216, "step": 154 }, { "epoch": 0.03055993690851735, "grad_norm": 1.5606869153465206, "learning_rate": 1.9997810728374362e-05, "loss": 0.4768, "step": 155 }, { "epoch": 0.030757097791798107, "grad_norm": 1.3648443694718702, "learning_rate": 1.999777817580154e-05, "loss": 0.494, "step": 156 }, { "epoch": 0.030954258675078863, "grad_norm": 1.132691943500696, "learning_rate": 1.9997745383024043e-05, "loss": 0.4662, "step": 157 }, { "epoch": 0.031151419558359622, "grad_norm": 1.2793599540460188, "learning_rate": 1.9997712350042663e-05, "loss": 0.4834, "step": 158 }, { "epoch": 0.03134858044164038, "grad_norm": 1.1425965940998726, "learning_rate": 1.9997679076858193e-05, "loss": 0.4975, "step": 159 }, { "epoch": 0.031545741324921134, "grad_norm": 1.1861645099171103, "learning_rate": 1.9997645563471432e-05, "loss": 0.4793, "step": 160 }, { "epoch": 0.03174290220820189, "grad_norm": 1.0972295116283801, "learning_rate": 1.9997611809883187e-05, "loss": 0.4746, "step": 161 }, { "epoch": 0.03194006309148265, "grad_norm": 1.066451222541386, "learning_rate": 1.9997577816094266e-05, "loss": 0.4907, "step": 162 }, { "epoch": 0.03213722397476341, "grad_norm": 1.0553209971214357, "learning_rate": 1.9997543582105484e-05, "loss": 0.4623, "step": 163 }, { "epoch": 0.032334384858044164, "grad_norm": 0.978588088915193, "learning_rate": 1.999750910791767e-05, "loss": 0.4758, "step": 164 }, { "epoch": 0.03253154574132492, "grad_norm": 1.1746969879352085, "learning_rate": 1.9997474393531648e-05, "loss": 0.5184, "step": 165 }, { "epoch": 0.032728706624605676, "grad_norm": 1.1684805326278327, "learning_rate": 1.999743943894825e-05, "loss": 0.5299, "step": 166 }, { "epoch": 0.03292586750788644, "grad_norm": 1.0632876477663906, "learning_rate": 1.999740424416832e-05, "loss": 0.4858, "step": 167 }, { "epoch": 0.033123028391167195, "grad_norm": 1.0392183455349258, "learning_rate": 1.9997368809192704e-05, "loss": 0.4637, "step": 168 }, { "epoch": 0.03332018927444795, "grad_norm": 1.0283561237168466, "learning_rate": 1.999733313402225e-05, "loss": 0.5204, "step": 169 }, { "epoch": 0.033517350157728706, "grad_norm": 1.1695473033049606, "learning_rate": 1.999729721865782e-05, "loss": 0.5182, "step": 170 }, { "epoch": 0.03371451104100946, "grad_norm": 0.9872390177704469, "learning_rate": 1.999726106310027e-05, "loss": 0.4961, "step": 171 }, { "epoch": 0.03391167192429022, "grad_norm": 2.3231197797384, "learning_rate": 1.9997224667350474e-05, "loss": 0.4806, "step": 172 }, { "epoch": 0.03410883280757098, "grad_norm": 1.1146751680169542, "learning_rate": 1.9997188031409302e-05, "loss": 0.4741, "step": 173 }, { "epoch": 0.03430599369085174, "grad_norm": 0.896181610550275, "learning_rate": 1.9997151155277638e-05, "loss": 0.4462, "step": 174 }, { "epoch": 0.03450315457413249, "grad_norm": 0.9768565430868928, "learning_rate": 1.9997114038956367e-05, "loss": 0.4981, "step": 175 }, { "epoch": 0.03470031545741325, "grad_norm": 0.9551378209936853, "learning_rate": 1.999707668244638e-05, "loss": 0.4674, "step": 176 }, { "epoch": 0.034897476340694004, "grad_norm": 0.9851958907905455, "learning_rate": 1.9997039085748576e-05, "loss": 0.4604, "step": 177 }, { "epoch": 0.03509463722397476, "grad_norm": 1.0161464160279716, "learning_rate": 1.9997001248863858e-05, "loss": 0.4541, "step": 178 }, { "epoch": 0.03529179810725552, "grad_norm": 1.050476492771695, "learning_rate": 1.9996963171793132e-05, "loss": 0.4949, "step": 179 }, { "epoch": 0.03548895899053628, "grad_norm": 1.1900294403214005, "learning_rate": 1.999692485453732e-05, "loss": 0.4815, "step": 180 }, { "epoch": 0.035686119873817035, "grad_norm": 1.1031147141223552, "learning_rate": 1.9996886297097335e-05, "loss": 0.4862, "step": 181 }, { "epoch": 0.03588328075709779, "grad_norm": 1.083256023575967, "learning_rate": 1.9996847499474102e-05, "loss": 0.4587, "step": 182 }, { "epoch": 0.03608044164037855, "grad_norm": 0.9670193439952784, "learning_rate": 1.9996808461668565e-05, "loss": 0.4513, "step": 183 }, { "epoch": 0.03627760252365931, "grad_norm": 0.8979692827898681, "learning_rate": 1.999676918368165e-05, "loss": 0.4517, "step": 184 }, { "epoch": 0.036474763406940065, "grad_norm": 0.9969877135191793, "learning_rate": 1.9996729665514306e-05, "loss": 0.4933, "step": 185 }, { "epoch": 0.03667192429022082, "grad_norm": 0.9692177718636533, "learning_rate": 1.999668990716748e-05, "loss": 0.5174, "step": 186 }, { "epoch": 0.03686908517350158, "grad_norm": 0.8842091129334964, "learning_rate": 1.999664990864213e-05, "loss": 0.4742, "step": 187 }, { "epoch": 0.03706624605678233, "grad_norm": 0.9359713100345171, "learning_rate": 1.9996609669939214e-05, "loss": 0.4754, "step": 188 }, { "epoch": 0.03726340694006309, "grad_norm": 0.9191517387573663, "learning_rate": 1.9996569191059705e-05, "loss": 0.4416, "step": 189 }, { "epoch": 0.03746056782334385, "grad_norm": 0.9570764893044853, "learning_rate": 1.9996528472004567e-05, "loss": 0.5044, "step": 190 }, { "epoch": 0.03765772870662461, "grad_norm": 0.9708144686903228, "learning_rate": 1.999648751277478e-05, "loss": 0.4986, "step": 191 }, { "epoch": 0.03785488958990536, "grad_norm": 0.9143041364090021, "learning_rate": 1.9996446313371334e-05, "loss": 0.4715, "step": 192 }, { "epoch": 0.03805205047318612, "grad_norm": 1.0756442068587027, "learning_rate": 1.9996404873795216e-05, "loss": 0.466, "step": 193 }, { "epoch": 0.038249211356466875, "grad_norm": 0.8509637591107831, "learning_rate": 1.999636319404742e-05, "loss": 0.4419, "step": 194 }, { "epoch": 0.03844637223974763, "grad_norm": 0.8874791124952598, "learning_rate": 1.9996321274128947e-05, "loss": 0.4899, "step": 195 }, { "epoch": 0.038643533123028394, "grad_norm": 0.9741286408649374, "learning_rate": 1.9996279114040806e-05, "loss": 0.4753, "step": 196 }, { "epoch": 0.03884069400630915, "grad_norm": 0.9085224492157351, "learning_rate": 1.999623671378401e-05, "loss": 0.4701, "step": 197 }, { "epoch": 0.039037854889589906, "grad_norm": 1.0258910989454444, "learning_rate": 1.9996194073359576e-05, "loss": 0.4888, "step": 198 }, { "epoch": 0.03923501577287066, "grad_norm": 0.8126732806815207, "learning_rate": 1.999615119276853e-05, "loss": 0.4555, "step": 199 }, { "epoch": 0.03943217665615142, "grad_norm": 0.8218884414629809, "learning_rate": 1.99961080720119e-05, "loss": 0.4769, "step": 200 }, { "epoch": 0.03962933753943218, "grad_norm": 0.8991946813956051, "learning_rate": 1.9996064711090727e-05, "loss": 0.4588, "step": 201 }, { "epoch": 0.039826498422712936, "grad_norm": 0.8566916147286883, "learning_rate": 1.9996021110006046e-05, "loss": 0.5019, "step": 202 }, { "epoch": 0.04002365930599369, "grad_norm": 0.9756060213649459, "learning_rate": 1.9995977268758912e-05, "loss": 0.4746, "step": 203 }, { "epoch": 0.04022082018927445, "grad_norm": 2.1550975546469253, "learning_rate": 1.9995933187350372e-05, "loss": 0.5151, "step": 204 }, { "epoch": 0.040417981072555204, "grad_norm": 2.0312951209754386, "learning_rate": 1.999588886578149e-05, "loss": 0.4713, "step": 205 }, { "epoch": 0.04061514195583596, "grad_norm": 1.0090851939886327, "learning_rate": 1.9995844304053325e-05, "loss": 0.4607, "step": 206 }, { "epoch": 0.04081230283911672, "grad_norm": 1.2675954246551175, "learning_rate": 1.9995799502166952e-05, "loss": 0.4975, "step": 207 }, { "epoch": 0.04100946372239748, "grad_norm": 1.218002669237388, "learning_rate": 1.9995754460123445e-05, "loss": 0.4877, "step": 208 }, { "epoch": 0.041206624605678234, "grad_norm": 1.621757149431934, "learning_rate": 1.999570917792389e-05, "loss": 0.487, "step": 209 }, { "epoch": 0.04140378548895899, "grad_norm": 1.0341497665720387, "learning_rate": 1.999566365556937e-05, "loss": 0.5016, "step": 210 }, { "epoch": 0.041600946372239746, "grad_norm": 0.882195008783823, "learning_rate": 1.9995617893060984e-05, "loss": 0.4523, "step": 211 }, { "epoch": 0.0417981072555205, "grad_norm": 0.9026396173054654, "learning_rate": 1.9995571890399827e-05, "loss": 0.4561, "step": 212 }, { "epoch": 0.041995268138801264, "grad_norm": 1.2144739906269006, "learning_rate": 1.9995525647587005e-05, "loss": 0.5047, "step": 213 }, { "epoch": 0.04219242902208202, "grad_norm": 0.975053328028896, "learning_rate": 1.9995479164623633e-05, "loss": 0.4736, "step": 214 }, { "epoch": 0.042389589905362776, "grad_norm": 0.9231877842298208, "learning_rate": 1.9995432441510824e-05, "loss": 0.4898, "step": 215 }, { "epoch": 0.04258675078864353, "grad_norm": 0.9144587574515857, "learning_rate": 1.9995385478249697e-05, "loss": 0.4796, "step": 216 }, { "epoch": 0.04278391167192429, "grad_norm": 17.62666522642842, "learning_rate": 1.999533827484139e-05, "loss": 0.6332, "step": 217 }, { "epoch": 0.04298107255520505, "grad_norm": 1.471356897407039, "learning_rate": 1.9995290831287032e-05, "loss": 0.4722, "step": 218 }, { "epoch": 0.04317823343848581, "grad_norm": 11.254738838618625, "learning_rate": 1.9995243147587758e-05, "loss": 0.5691, "step": 219 }, { "epoch": 0.04337539432176656, "grad_norm": 2.055362498950427, "learning_rate": 1.999519522374472e-05, "loss": 0.4621, "step": 220 }, { "epoch": 0.04357255520504732, "grad_norm": 8.194523341432308, "learning_rate": 1.999514705975907e-05, "loss": 0.4652, "step": 221 }, { "epoch": 0.043769716088328074, "grad_norm": 1.9182748476507985, "learning_rate": 1.9995098655631957e-05, "loss": 0.5171, "step": 222 }, { "epoch": 0.04396687697160883, "grad_norm": 1.0480239186671922, "learning_rate": 1.9995050011364557e-05, "loss": 0.4751, "step": 223 }, { "epoch": 0.04416403785488959, "grad_norm": 1.665247357622267, "learning_rate": 1.9995001126958025e-05, "loss": 0.4874, "step": 224 }, { "epoch": 0.04436119873817035, "grad_norm": 1.2047598455545891, "learning_rate": 1.999495200241355e-05, "loss": 0.4712, "step": 225 }, { "epoch": 0.044558359621451105, "grad_norm": 1.4064682020754653, "learning_rate": 1.9994902637732295e-05, "loss": 0.4641, "step": 226 }, { "epoch": 0.04475552050473186, "grad_norm": 1.354286339165095, "learning_rate": 1.999485303291546e-05, "loss": 0.4793, "step": 227 }, { "epoch": 0.044952681388012616, "grad_norm": 1.5121544057772933, "learning_rate": 1.9994803187964233e-05, "loss": 0.5025, "step": 228 }, { "epoch": 0.04514984227129337, "grad_norm": 1.0667226038238984, "learning_rate": 1.9994753102879807e-05, "loss": 0.4352, "step": 229 }, { "epoch": 0.045347003154574135, "grad_norm": 2.1283935213841714, "learning_rate": 1.999470277766339e-05, "loss": 0.4862, "step": 230 }, { "epoch": 0.04554416403785489, "grad_norm": 0.9323160156698969, "learning_rate": 1.9994652212316193e-05, "loss": 0.4586, "step": 231 }, { "epoch": 0.04574132492113565, "grad_norm": 1.2774644395318477, "learning_rate": 1.9994601406839428e-05, "loss": 0.5293, "step": 232 }, { "epoch": 0.0459384858044164, "grad_norm": 0.8563504697504617, "learning_rate": 1.9994550361234314e-05, "loss": 0.4671, "step": 233 }, { "epoch": 0.04613564668769716, "grad_norm": 1.2981705612870134, "learning_rate": 1.9994499075502078e-05, "loss": 0.4774, "step": 234 }, { "epoch": 0.04633280757097792, "grad_norm": 0.9650905057900812, "learning_rate": 1.999444754964395e-05, "loss": 0.4785, "step": 235 }, { "epoch": 0.04652996845425868, "grad_norm": 0.9892482279995838, "learning_rate": 1.9994395783661177e-05, "loss": 0.478, "step": 236 }, { "epoch": 0.04672712933753943, "grad_norm": 1.0372935377294314, "learning_rate": 1.9994343777554995e-05, "loss": 0.4965, "step": 237 }, { "epoch": 0.04692429022082019, "grad_norm": 1.2079564548732702, "learning_rate": 1.9994291531326656e-05, "loss": 0.4444, "step": 238 }, { "epoch": 0.047121451104100945, "grad_norm": 0.9768363136576569, "learning_rate": 1.999423904497741e-05, "loss": 0.4985, "step": 239 }, { "epoch": 0.0473186119873817, "grad_norm": 1.7614504439224654, "learning_rate": 1.999418631850853e-05, "loss": 0.4653, "step": 240 }, { "epoch": 0.047515772870662464, "grad_norm": 1.1032925804972276, "learning_rate": 1.9994133351921274e-05, "loss": 0.4659, "step": 241 }, { "epoch": 0.04771293375394322, "grad_norm": 1.2804412541262307, "learning_rate": 1.9994080145216908e-05, "loss": 0.4483, "step": 242 }, { "epoch": 0.047910094637223975, "grad_norm": 2.294484377346188, "learning_rate": 1.9994026698396727e-05, "loss": 0.4887, "step": 243 }, { "epoch": 0.04810725552050473, "grad_norm": 1.118613544316812, "learning_rate": 1.9993973011462004e-05, "loss": 0.4775, "step": 244 }, { "epoch": 0.04830441640378549, "grad_norm": 0.8403959866676638, "learning_rate": 1.999391908441403e-05, "loss": 0.4442, "step": 245 }, { "epoch": 0.04850157728706624, "grad_norm": 1.0300870576969843, "learning_rate": 1.9993864917254103e-05, "loss": 0.4462, "step": 246 }, { "epoch": 0.048698738170347006, "grad_norm": 1.1077983845276296, "learning_rate": 1.9993810509983524e-05, "loss": 0.4789, "step": 247 }, { "epoch": 0.04889589905362776, "grad_norm": 0.9138468109346035, "learning_rate": 1.9993755862603597e-05, "loss": 0.4638, "step": 248 }, { "epoch": 0.04909305993690852, "grad_norm": 0.9955167957819017, "learning_rate": 1.9993700975115636e-05, "loss": 0.5144, "step": 249 }, { "epoch": 0.04929022082018927, "grad_norm": 0.9520927314114271, "learning_rate": 1.9993645847520965e-05, "loss": 0.4976, "step": 250 }, { "epoch": 0.04948738170347003, "grad_norm": 1.2526305722815652, "learning_rate": 1.9993590479820906e-05, "loss": 0.5092, "step": 251 }, { "epoch": 0.04968454258675079, "grad_norm": 0.8493530635842205, "learning_rate": 1.9993534872016784e-05, "loss": 0.4517, "step": 252 }, { "epoch": 0.04988170347003155, "grad_norm": 0.9146559563429385, "learning_rate": 1.999347902410994e-05, "loss": 0.5143, "step": 253 }, { "epoch": 0.050078864353312304, "grad_norm": 1.2331946908319966, "learning_rate": 1.9993422936101715e-05, "loss": 0.5273, "step": 254 }, { "epoch": 0.05027602523659306, "grad_norm": 0.9053104001937419, "learning_rate": 1.9993366607993457e-05, "loss": 0.4623, "step": 255 }, { "epoch": 0.050473186119873815, "grad_norm": 1.0114462146919174, "learning_rate": 1.999331003978652e-05, "loss": 0.484, "step": 256 }, { "epoch": 0.05067034700315457, "grad_norm": 0.8468019047992964, "learning_rate": 1.9993253231482258e-05, "loss": 0.477, "step": 257 }, { "epoch": 0.050867507886435334, "grad_norm": 0.9113246734228752, "learning_rate": 1.999319618308204e-05, "loss": 0.4913, "step": 258 }, { "epoch": 0.05106466876971609, "grad_norm": 0.8740898510256843, "learning_rate": 1.999313889458724e-05, "loss": 0.504, "step": 259 }, { "epoch": 0.051261829652996846, "grad_norm": 0.8543091738206727, "learning_rate": 1.9993081365999228e-05, "loss": 0.4752, "step": 260 }, { "epoch": 0.0514589905362776, "grad_norm": 0.8447390016252053, "learning_rate": 1.9993023597319387e-05, "loss": 0.4582, "step": 261 }, { "epoch": 0.05165615141955836, "grad_norm": 0.8559956935990739, "learning_rate": 1.999296558854911e-05, "loss": 0.4567, "step": 262 }, { "epoch": 0.051853312302839114, "grad_norm": 0.8497707119769089, "learning_rate": 1.9992907339689786e-05, "loss": 0.4681, "step": 263 }, { "epoch": 0.052050473186119876, "grad_norm": 1.2959384495585367, "learning_rate": 1.9992848850742817e-05, "loss": 0.4612, "step": 264 }, { "epoch": 0.05224763406940063, "grad_norm": 0.9252677800758691, "learning_rate": 1.9992790121709604e-05, "loss": 0.4969, "step": 265 }, { "epoch": 0.05244479495268139, "grad_norm": 0.9986026073347362, "learning_rate": 1.9992731152591563e-05, "loss": 0.483, "step": 266 }, { "epoch": 0.052641955835962144, "grad_norm": 0.8766211673811323, "learning_rate": 1.999267194339011e-05, "loss": 0.4952, "step": 267 }, { "epoch": 0.0528391167192429, "grad_norm": 0.9062026471143431, "learning_rate": 1.9992612494106666e-05, "loss": 0.4922, "step": 268 }, { "epoch": 0.05303627760252366, "grad_norm": 1.2051311308802126, "learning_rate": 1.999255280474266e-05, "loss": 0.4528, "step": 269 }, { "epoch": 0.05323343848580442, "grad_norm": 0.9481095486211566, "learning_rate": 1.9992492875299528e-05, "loss": 0.4676, "step": 270 }, { "epoch": 0.053430599369085174, "grad_norm": 1.0581042227575974, "learning_rate": 1.9992432705778703e-05, "loss": 0.5275, "step": 271 }, { "epoch": 0.05362776025236593, "grad_norm": 1.0862738326883588, "learning_rate": 1.9992372296181637e-05, "loss": 0.4802, "step": 272 }, { "epoch": 0.053824921135646686, "grad_norm": 1.1530116773068577, "learning_rate": 1.999231164650978e-05, "loss": 0.4963, "step": 273 }, { "epoch": 0.05402208201892744, "grad_norm": 1.1406385581126395, "learning_rate": 1.999225075676459e-05, "loss": 0.5017, "step": 274 }, { "epoch": 0.054219242902208205, "grad_norm": 0.9840133715884154, "learning_rate": 1.9992189626947528e-05, "loss": 0.4184, "step": 275 }, { "epoch": 0.05441640378548896, "grad_norm": 0.8369535854434705, "learning_rate": 1.9992128257060064e-05, "loss": 0.4766, "step": 276 }, { "epoch": 0.05461356466876972, "grad_norm": 0.9733589505757138, "learning_rate": 1.999206664710367e-05, "loss": 0.4783, "step": 277 }, { "epoch": 0.05481072555205047, "grad_norm": 1.1016083321043935, "learning_rate": 1.999200479707983e-05, "loss": 0.5023, "step": 278 }, { "epoch": 0.05500788643533123, "grad_norm": 0.9453559986774039, "learning_rate": 1.9991942706990028e-05, "loss": 0.4997, "step": 279 }, { "epoch": 0.055205047318611984, "grad_norm": 1.1205398036178456, "learning_rate": 1.999188037683576e-05, "loss": 0.4918, "step": 280 }, { "epoch": 0.05540220820189275, "grad_norm": 0.8547868381519064, "learning_rate": 1.9991817806618512e-05, "loss": 0.4691, "step": 281 }, { "epoch": 0.0555993690851735, "grad_norm": 1.101644837851159, "learning_rate": 1.99917549963398e-05, "loss": 0.478, "step": 282 }, { "epoch": 0.05579652996845426, "grad_norm": 0.9648045124396167, "learning_rate": 1.9991691946001123e-05, "loss": 0.451, "step": 283 }, { "epoch": 0.055993690851735015, "grad_norm": 0.9890972584569301, "learning_rate": 1.9991628655604006e-05, "loss": 0.4688, "step": 284 }, { "epoch": 0.05619085173501577, "grad_norm": 1.087262812823818, "learning_rate": 1.999156512514996e-05, "loss": 0.4974, "step": 285 }, { "epoch": 0.05638801261829653, "grad_norm": 1.1458887553291872, "learning_rate": 1.9991501354640517e-05, "loss": 0.4982, "step": 286 }, { "epoch": 0.05658517350157729, "grad_norm": 0.8822768555579248, "learning_rate": 1.9991437344077212e-05, "loss": 0.4486, "step": 287 }, { "epoch": 0.056782334384858045, "grad_norm": 1.2313826346986292, "learning_rate": 1.9991373093461574e-05, "loss": 0.4873, "step": 288 }, { "epoch": 0.0569794952681388, "grad_norm": 0.8882605229939141, "learning_rate": 1.9991308602795156e-05, "loss": 0.4553, "step": 289 }, { "epoch": 0.05717665615141956, "grad_norm": 1.0115742101244216, "learning_rate": 1.9991243872079495e-05, "loss": 0.4561, "step": 290 }, { "epoch": 0.05737381703470031, "grad_norm": 0.9908774101006372, "learning_rate": 1.999117890131616e-05, "loss": 0.4427, "step": 291 }, { "epoch": 0.057570977917981075, "grad_norm": 0.8561287541214222, "learning_rate": 1.9991113690506705e-05, "loss": 0.4268, "step": 292 }, { "epoch": 0.05776813880126183, "grad_norm": 1.0671041225310995, "learning_rate": 1.99910482396527e-05, "loss": 0.48, "step": 293 }, { "epoch": 0.05796529968454259, "grad_norm": 0.9685606768815737, "learning_rate": 1.9990982548755712e-05, "loss": 0.5016, "step": 294 }, { "epoch": 0.05816246056782334, "grad_norm": 0.9888136254935049, "learning_rate": 1.9990916617817323e-05, "loss": 0.5133, "step": 295 }, { "epoch": 0.0583596214511041, "grad_norm": 0.9279140857322566, "learning_rate": 1.9990850446839114e-05, "loss": 0.4645, "step": 296 }, { "epoch": 0.058556782334384855, "grad_norm": 0.9503914334506955, "learning_rate": 1.999078403582268e-05, "loss": 0.4442, "step": 297 }, { "epoch": 0.05875394321766562, "grad_norm": 0.8179461934678155, "learning_rate": 1.9990717384769617e-05, "loss": 0.4266, "step": 298 }, { "epoch": 0.058951104100946373, "grad_norm": 0.9592664796746861, "learning_rate": 1.9990650493681517e-05, "loss": 0.4843, "step": 299 }, { "epoch": 0.05914826498422713, "grad_norm": 0.9196227211585776, "learning_rate": 1.999058336256e-05, "loss": 0.4678, "step": 300 }, { "epoch": 0.059345425867507885, "grad_norm": 1.2796468769464633, "learning_rate": 1.9990515991406666e-05, "loss": 0.4968, "step": 301 }, { "epoch": 0.05954258675078864, "grad_norm": 0.9210540437615927, "learning_rate": 1.9990448380223145e-05, "loss": 0.4315, "step": 302 }, { "epoch": 0.059739747634069404, "grad_norm": 0.9150812147912519, "learning_rate": 1.9990380529011056e-05, "loss": 0.4992, "step": 303 }, { "epoch": 0.05993690851735016, "grad_norm": 1.0184465463567445, "learning_rate": 1.9990312437772025e-05, "loss": 0.5149, "step": 304 }, { "epoch": 0.060134069400630916, "grad_norm": 0.909544636631595, "learning_rate": 1.99902441065077e-05, "loss": 0.4802, "step": 305 }, { "epoch": 0.06033123028391167, "grad_norm": 1.0603662944162957, "learning_rate": 1.9990175535219708e-05, "loss": 0.4475, "step": 306 }, { "epoch": 0.06052839116719243, "grad_norm": 1.241117709798254, "learning_rate": 1.999010672390971e-05, "loss": 0.477, "step": 307 }, { "epoch": 0.06072555205047318, "grad_norm": 1.035832208551567, "learning_rate": 1.9990037672579347e-05, "loss": 0.4822, "step": 308 }, { "epoch": 0.060922712933753946, "grad_norm": 0.9408863869538917, "learning_rate": 1.9989968381230288e-05, "loss": 0.4862, "step": 309 }, { "epoch": 0.0611198738170347, "grad_norm": 0.9296848076140912, "learning_rate": 1.998989884986419e-05, "loss": 0.5125, "step": 310 }, { "epoch": 0.06131703470031546, "grad_norm": 0.8918476259653481, "learning_rate": 1.998982907848273e-05, "loss": 0.4937, "step": 311 }, { "epoch": 0.061514195583596214, "grad_norm": 1.4848687664773468, "learning_rate": 1.9989759067087582e-05, "loss": 0.4865, "step": 312 }, { "epoch": 0.06171135646687697, "grad_norm": 0.8732760816687123, "learning_rate": 1.998968881568043e-05, "loss": 0.4778, "step": 313 }, { "epoch": 0.061908517350157725, "grad_norm": 1.1341625184193884, "learning_rate": 1.998961832426295e-05, "loss": 0.4798, "step": 314 }, { "epoch": 0.06210567823343849, "grad_norm": 0.8993091141038897, "learning_rate": 1.9989547592836853e-05, "loss": 0.4292, "step": 315 }, { "epoch": 0.062302839116719244, "grad_norm": 0.8850853933714552, "learning_rate": 1.998947662140383e-05, "loss": 0.4594, "step": 316 }, { "epoch": 0.0625, "grad_norm": 1.1127472937800988, "learning_rate": 1.9989405409965585e-05, "loss": 0.4943, "step": 317 }, { "epoch": 0.06269716088328076, "grad_norm": 0.866897163477894, "learning_rate": 1.998933395852383e-05, "loss": 0.4633, "step": 318 }, { "epoch": 0.06289432176656151, "grad_norm": 0.9547891953542145, "learning_rate": 1.9989262267080284e-05, "loss": 0.5155, "step": 319 }, { "epoch": 0.06309148264984227, "grad_norm": 5.818229069199477, "learning_rate": 1.998919033563667e-05, "loss": 0.5422, "step": 320 }, { "epoch": 0.06328864353312302, "grad_norm": 1.5240274072221687, "learning_rate": 1.998911816419471e-05, "loss": 0.4835, "step": 321 }, { "epoch": 0.06348580441640378, "grad_norm": 0.9717985560653184, "learning_rate": 1.9989045752756145e-05, "loss": 0.4848, "step": 322 }, { "epoch": 0.06368296529968455, "grad_norm": 1.1194941379469816, "learning_rate": 1.998897310132271e-05, "loss": 0.465, "step": 323 }, { "epoch": 0.0638801261829653, "grad_norm": 1.3015806776449268, "learning_rate": 1.9988900209896148e-05, "loss": 0.5076, "step": 324 }, { "epoch": 0.06407728706624606, "grad_norm": 1.3495583310297246, "learning_rate": 1.998882707847822e-05, "loss": 0.4891, "step": 325 }, { "epoch": 0.06427444794952682, "grad_norm": 1.094314000355448, "learning_rate": 1.9988753707070675e-05, "loss": 0.4603, "step": 326 }, { "epoch": 0.06447160883280757, "grad_norm": 0.8976180746045599, "learning_rate": 1.998868009567528e-05, "loss": 0.4852, "step": 327 }, { "epoch": 0.06466876971608833, "grad_norm": 1.176456935066941, "learning_rate": 1.99886062442938e-05, "loss": 0.4505, "step": 328 }, { "epoch": 0.06486593059936908, "grad_norm": 0.8621831797378902, "learning_rate": 1.9988532152928012e-05, "loss": 0.4521, "step": 329 }, { "epoch": 0.06506309148264984, "grad_norm": 1.1751922853480894, "learning_rate": 1.9988457821579698e-05, "loss": 0.4684, "step": 330 }, { "epoch": 0.0652602523659306, "grad_norm": 1.214299413735334, "learning_rate": 1.9988383250250636e-05, "loss": 0.5136, "step": 331 }, { "epoch": 0.06545741324921135, "grad_norm": 1.3935797324505006, "learning_rate": 1.9988308438942626e-05, "loss": 0.543, "step": 332 }, { "epoch": 0.06565457413249211, "grad_norm": 1.3113325461742111, "learning_rate": 1.9988233387657462e-05, "loss": 0.469, "step": 333 }, { "epoch": 0.06585173501577288, "grad_norm": 0.9832962911329318, "learning_rate": 1.9988158096396945e-05, "loss": 0.4977, "step": 334 }, { "epoch": 0.06604889589905363, "grad_norm": 1.4696155006597584, "learning_rate": 1.998808256516289e-05, "loss": 0.5422, "step": 335 }, { "epoch": 0.06624605678233439, "grad_norm": 1.1348175666726867, "learning_rate": 1.9988006793957106e-05, "loss": 0.4752, "step": 336 }, { "epoch": 0.06644321766561515, "grad_norm": 1.5792191486101228, "learning_rate": 1.9987930782781416e-05, "loss": 0.5261, "step": 337 }, { "epoch": 0.0666403785488959, "grad_norm": 1.0368934464451127, "learning_rate": 1.9987854531637644e-05, "loss": 0.4642, "step": 338 }, { "epoch": 0.06683753943217666, "grad_norm": 1.0337884912885194, "learning_rate": 1.9987778040527625e-05, "loss": 0.4755, "step": 339 }, { "epoch": 0.06703470031545741, "grad_norm": 1.133861117613178, "learning_rate": 1.9987701309453195e-05, "loss": 0.4828, "step": 340 }, { "epoch": 0.06723186119873817, "grad_norm": 1.111075907788898, "learning_rate": 1.99876243384162e-05, "loss": 0.4941, "step": 341 }, { "epoch": 0.06742902208201892, "grad_norm": 1.2284579964954567, "learning_rate": 1.9987547127418485e-05, "loss": 0.4999, "step": 342 }, { "epoch": 0.06762618296529968, "grad_norm": 1.0071834128462238, "learning_rate": 1.9987469676461904e-05, "loss": 0.4132, "step": 343 }, { "epoch": 0.06782334384858044, "grad_norm": 1.1346939479337024, "learning_rate": 1.9987391985548326e-05, "loss": 0.5326, "step": 344 }, { "epoch": 0.06802050473186119, "grad_norm": 0.8817839707262805, "learning_rate": 1.9987314054679615e-05, "loss": 0.4856, "step": 345 }, { "epoch": 0.06821766561514196, "grad_norm": 0.967996798695509, "learning_rate": 1.998723588385764e-05, "loss": 0.5015, "step": 346 }, { "epoch": 0.06841482649842272, "grad_norm": 1.2535242900070593, "learning_rate": 1.9987157473084276e-05, "loss": 0.4721, "step": 347 }, { "epoch": 0.06861198738170347, "grad_norm": 0.9971572384429465, "learning_rate": 1.9987078822361412e-05, "loss": 0.4866, "step": 348 }, { "epoch": 0.06880914826498423, "grad_norm": 0.9880019131723238, "learning_rate": 1.9986999931690937e-05, "loss": 0.4217, "step": 349 }, { "epoch": 0.06900630914826499, "grad_norm": 0.8510336475538133, "learning_rate": 1.9986920801074747e-05, "loss": 0.4997, "step": 350 }, { "epoch": 0.06920347003154574, "grad_norm": 1.3697939147272762, "learning_rate": 1.9986841430514743e-05, "loss": 0.4791, "step": 351 }, { "epoch": 0.0694006309148265, "grad_norm": 0.9037413372344649, "learning_rate": 1.9986761820012833e-05, "loss": 0.4642, "step": 352 }, { "epoch": 0.06959779179810725, "grad_norm": 1.1537513355491777, "learning_rate": 1.9986681969570924e-05, "loss": 0.4693, "step": 353 }, { "epoch": 0.06979495268138801, "grad_norm": 0.7969873717195902, "learning_rate": 1.9986601879190938e-05, "loss": 0.4309, "step": 354 }, { "epoch": 0.06999211356466876, "grad_norm": 1.0506442175587962, "learning_rate": 1.9986521548874802e-05, "loss": 0.4575, "step": 355 }, { "epoch": 0.07018927444794952, "grad_norm": 0.958839633619317, "learning_rate": 1.9986440978624444e-05, "loss": 0.4848, "step": 356 }, { "epoch": 0.07038643533123029, "grad_norm": 0.8487815542536234, "learning_rate": 1.9986360168441798e-05, "loss": 0.4346, "step": 357 }, { "epoch": 0.07058359621451105, "grad_norm": 0.9101190136837835, "learning_rate": 1.998627911832881e-05, "loss": 0.4798, "step": 358 }, { "epoch": 0.0707807570977918, "grad_norm": 0.8359817460302178, "learning_rate": 1.998619782828742e-05, "loss": 0.4457, "step": 359 }, { "epoch": 0.07097791798107256, "grad_norm": 0.9727976821516041, "learning_rate": 1.9986116298319585e-05, "loss": 0.5337, "step": 360 }, { "epoch": 0.07117507886435331, "grad_norm": 0.7658661529985589, "learning_rate": 1.998603452842727e-05, "loss": 0.4475, "step": 361 }, { "epoch": 0.07137223974763407, "grad_norm": 0.9267521801472242, "learning_rate": 1.998595251861243e-05, "loss": 0.5341, "step": 362 }, { "epoch": 0.07156940063091483, "grad_norm": 1.5288633084722636, "learning_rate": 1.998587026887704e-05, "loss": 0.4811, "step": 363 }, { "epoch": 0.07176656151419558, "grad_norm": 1.1549526328305526, "learning_rate": 1.9985787779223073e-05, "loss": 0.4785, "step": 364 }, { "epoch": 0.07196372239747634, "grad_norm": 1.0840635546798056, "learning_rate": 1.9985705049652513e-05, "loss": 0.499, "step": 365 }, { "epoch": 0.0721608832807571, "grad_norm": 0.779747962979529, "learning_rate": 1.998562208016735e-05, "loss": 0.4663, "step": 366 }, { "epoch": 0.07235804416403785, "grad_norm": 1.1129287322836847, "learning_rate": 1.9985538870769573e-05, "loss": 0.4727, "step": 367 }, { "epoch": 0.07255520504731862, "grad_norm": 0.853670529824468, "learning_rate": 1.9985455421461183e-05, "loss": 0.4947, "step": 368 }, { "epoch": 0.07275236593059937, "grad_norm": 0.924674407680012, "learning_rate": 1.9985371732244188e-05, "loss": 0.4584, "step": 369 }, { "epoch": 0.07294952681388013, "grad_norm": 0.8629571748054644, "learning_rate": 1.9985287803120595e-05, "loss": 0.4705, "step": 370 }, { "epoch": 0.07314668769716089, "grad_norm": 0.8242683110246124, "learning_rate": 1.998520363409242e-05, "loss": 0.4579, "step": 371 }, { "epoch": 0.07334384858044164, "grad_norm": 1.499462700491384, "learning_rate": 1.9985119225161688e-05, "loss": 0.4607, "step": 372 }, { "epoch": 0.0735410094637224, "grad_norm": 0.9669915502441957, "learning_rate": 1.9985034576330425e-05, "loss": 0.4956, "step": 373 }, { "epoch": 0.07373817034700315, "grad_norm": 1.0411892472276396, "learning_rate": 1.998494968760067e-05, "loss": 0.4557, "step": 374 }, { "epoch": 0.07393533123028391, "grad_norm": 0.8567052653582087, "learning_rate": 1.998486455897445e-05, "loss": 0.5177, "step": 375 }, { "epoch": 0.07413249211356467, "grad_norm": 1.1116739095705466, "learning_rate": 1.998477919045382e-05, "loss": 0.4716, "step": 376 }, { "epoch": 0.07432965299684542, "grad_norm": 0.8626595222872029, "learning_rate": 1.9984693582040834e-05, "loss": 0.5426, "step": 377 }, { "epoch": 0.07452681388012618, "grad_norm": 0.9587022633462172, "learning_rate": 1.998460773373754e-05, "loss": 0.462, "step": 378 }, { "epoch": 0.07472397476340693, "grad_norm": 0.8941067614117472, "learning_rate": 1.9984521645546007e-05, "loss": 0.4989, "step": 379 }, { "epoch": 0.0749211356466877, "grad_norm": 0.9295392099689723, "learning_rate": 1.9984435317468298e-05, "loss": 0.4745, "step": 380 }, { "epoch": 0.07511829652996846, "grad_norm": 0.8793262590604104, "learning_rate": 1.998434874950649e-05, "loss": 0.4847, "step": 381 }, { "epoch": 0.07531545741324921, "grad_norm": 0.9197548867944333, "learning_rate": 1.9984261941662666e-05, "loss": 0.4822, "step": 382 }, { "epoch": 0.07551261829652997, "grad_norm": 0.918094619940462, "learning_rate": 1.9984174893938908e-05, "loss": 0.473, "step": 383 }, { "epoch": 0.07570977917981073, "grad_norm": 0.834295051365838, "learning_rate": 1.998408760633731e-05, "loss": 0.4423, "step": 384 }, { "epoch": 0.07590694006309148, "grad_norm": 0.8187104700252339, "learning_rate": 1.998400007885996e-05, "loss": 0.4582, "step": 385 }, { "epoch": 0.07610410094637224, "grad_norm": 1.0466182710170804, "learning_rate": 1.9983912311508977e-05, "loss": 0.4694, "step": 386 }, { "epoch": 0.076301261829653, "grad_norm": 0.8718063081983285, "learning_rate": 1.998382430428645e-05, "loss": 0.476, "step": 387 }, { "epoch": 0.07649842271293375, "grad_norm": 0.7915900428994362, "learning_rate": 1.9983736057194512e-05, "loss": 0.416, "step": 388 }, { "epoch": 0.0766955835962145, "grad_norm": 0.8158965365869196, "learning_rate": 1.9983647570235274e-05, "loss": 0.4667, "step": 389 }, { "epoch": 0.07689274447949526, "grad_norm": 0.7997150055795212, "learning_rate": 1.9983558843410863e-05, "loss": 0.4466, "step": 390 }, { "epoch": 0.07708990536277603, "grad_norm": 0.8940610724543107, "learning_rate": 1.998346987672341e-05, "loss": 0.4658, "step": 391 }, { "epoch": 0.07728706624605679, "grad_norm": 1.294715926753004, "learning_rate": 1.9983380670175054e-05, "loss": 0.4642, "step": 392 }, { "epoch": 0.07748422712933754, "grad_norm": 11.425972122218342, "learning_rate": 1.998329122376794e-05, "loss": 0.5511, "step": 393 }, { "epoch": 0.0776813880126183, "grad_norm": 1.1867295493168513, "learning_rate": 1.998320153750421e-05, "loss": 0.4462, "step": 394 }, { "epoch": 0.07787854889589906, "grad_norm": 0.9729316139359963, "learning_rate": 1.998311161138603e-05, "loss": 0.4976, "step": 395 }, { "epoch": 0.07807570977917981, "grad_norm": 0.9157344759346018, "learning_rate": 1.998302144541555e-05, "loss": 0.4709, "step": 396 }, { "epoch": 0.07827287066246057, "grad_norm": 0.934161687151528, "learning_rate": 1.998293103959494e-05, "loss": 0.4673, "step": 397 }, { "epoch": 0.07847003154574132, "grad_norm": 1.4223262840464979, "learning_rate": 1.9982840393926374e-05, "loss": 0.5139, "step": 398 }, { "epoch": 0.07866719242902208, "grad_norm": 1.131720070151689, "learning_rate": 1.9982749508412026e-05, "loss": 0.5327, "step": 399 }, { "epoch": 0.07886435331230283, "grad_norm": 1.2074643005200005, "learning_rate": 1.998265838305409e-05, "loss": 0.4974, "step": 400 }, { "epoch": 0.07906151419558359, "grad_norm": 0.9531948695115436, "learning_rate": 1.998256701785474e-05, "loss": 0.4841, "step": 401 }, { "epoch": 0.07925867507886436, "grad_norm": 1.043133116018871, "learning_rate": 1.998247541281618e-05, "loss": 0.4506, "step": 402 }, { "epoch": 0.07945583596214512, "grad_norm": 1.0246039260585653, "learning_rate": 1.9982383567940606e-05, "loss": 0.4445, "step": 403 }, { "epoch": 0.07965299684542587, "grad_norm": 1.0452483998819864, "learning_rate": 1.9982291483230232e-05, "loss": 0.4851, "step": 404 }, { "epoch": 0.07985015772870663, "grad_norm": 1.1683875566798174, "learning_rate": 1.9982199158687266e-05, "loss": 0.4548, "step": 405 }, { "epoch": 0.08004731861198738, "grad_norm": 0.906125687513742, "learning_rate": 1.9982106594313924e-05, "loss": 0.4704, "step": 406 }, { "epoch": 0.08024447949526814, "grad_norm": 5.213779615448294, "learning_rate": 1.9982013790112437e-05, "loss": 0.526, "step": 407 }, { "epoch": 0.0804416403785489, "grad_norm": 1.9791118557774463, "learning_rate": 1.9981920746085025e-05, "loss": 0.4971, "step": 408 }, { "epoch": 0.08063880126182965, "grad_norm": 8.292277341865951, "learning_rate": 1.9981827462233932e-05, "loss": 0.4924, "step": 409 }, { "epoch": 0.08083596214511041, "grad_norm": 1.4694955541234114, "learning_rate": 1.99817339385614e-05, "loss": 0.5109, "step": 410 }, { "epoch": 0.08103312302839116, "grad_norm": 1.0200131993418244, "learning_rate": 1.9981640175069663e-05, "loss": 0.4837, "step": 411 }, { "epoch": 0.08123028391167192, "grad_norm": 1.241162294871943, "learning_rate": 1.998154617176099e-05, "loss": 0.4752, "step": 412 }, { "epoch": 0.08142744479495267, "grad_norm": 1.6151902756221161, "learning_rate": 1.9981451928637627e-05, "loss": 0.4739, "step": 413 }, { "epoch": 0.08162460567823344, "grad_norm": 1.2407308825596557, "learning_rate": 1.9981357445701846e-05, "loss": 0.4345, "step": 414 }, { "epoch": 0.0818217665615142, "grad_norm": 0.9936264286526395, "learning_rate": 1.9981262722955913e-05, "loss": 0.5211, "step": 415 }, { "epoch": 0.08201892744479496, "grad_norm": 1.3047191271197602, "learning_rate": 1.9981167760402104e-05, "loss": 0.4827, "step": 416 }, { "epoch": 0.08221608832807571, "grad_norm": 0.8876022805465946, "learning_rate": 1.9981072558042705e-05, "loss": 0.4838, "step": 417 }, { "epoch": 0.08241324921135647, "grad_norm": 1.3442900789849086, "learning_rate": 1.9980977115879997e-05, "loss": 0.4614, "step": 418 }, { "epoch": 0.08261041009463722, "grad_norm": 1.5881293239590277, "learning_rate": 1.998088143391628e-05, "loss": 0.4813, "step": 419 }, { "epoch": 0.08280757097791798, "grad_norm": 1.1153506722397977, "learning_rate": 1.9980785512153846e-05, "loss": 0.5095, "step": 420 }, { "epoch": 0.08300473186119874, "grad_norm": 0.9249721852533636, "learning_rate": 1.9980689350595004e-05, "loss": 0.4764, "step": 421 }, { "epoch": 0.08320189274447949, "grad_norm": 1.073860170389333, "learning_rate": 1.9980592949242063e-05, "loss": 0.4597, "step": 422 }, { "epoch": 0.08339905362776025, "grad_norm": 0.8354298196477534, "learning_rate": 1.998049630809734e-05, "loss": 0.4805, "step": 423 }, { "epoch": 0.083596214511041, "grad_norm": 0.9975582382283317, "learning_rate": 1.9980399427163154e-05, "loss": 0.4734, "step": 424 }, { "epoch": 0.08379337539432177, "grad_norm": 0.7461081355848223, "learning_rate": 1.9980302306441834e-05, "loss": 0.4456, "step": 425 }, { "epoch": 0.08399053627760253, "grad_norm": 0.8992637044263566, "learning_rate": 1.9980204945935716e-05, "loss": 0.4514, "step": 426 }, { "epoch": 0.08418769716088328, "grad_norm": 0.9484738986938942, "learning_rate": 1.9980107345647133e-05, "loss": 0.5281, "step": 427 }, { "epoch": 0.08438485804416404, "grad_norm": 0.8453814347866732, "learning_rate": 1.9980009505578438e-05, "loss": 0.4493, "step": 428 }, { "epoch": 0.0845820189274448, "grad_norm": 0.8262276270862967, "learning_rate": 1.9979911425731978e-05, "loss": 0.4483, "step": 429 }, { "epoch": 0.08477917981072555, "grad_norm": 0.856137009508442, "learning_rate": 1.9979813106110108e-05, "loss": 0.4635, "step": 430 }, { "epoch": 0.08497634069400631, "grad_norm": 0.8889039446695481, "learning_rate": 1.997971454671519e-05, "loss": 0.4308, "step": 431 }, { "epoch": 0.08517350157728706, "grad_norm": 8.27925993626678, "learning_rate": 1.9979615747549594e-05, "loss": 0.4852, "step": 432 }, { "epoch": 0.08537066246056782, "grad_norm": 2.064710279361049, "learning_rate": 1.9979516708615696e-05, "loss": 0.4843, "step": 433 }, { "epoch": 0.08556782334384858, "grad_norm": 0.9477116643465063, "learning_rate": 1.997941742991587e-05, "loss": 0.4534, "step": 434 }, { "epoch": 0.08576498422712933, "grad_norm": 0.9584045524018094, "learning_rate": 1.9979317911452503e-05, "loss": 0.4787, "step": 435 }, { "epoch": 0.0859621451104101, "grad_norm": 0.8809961453962474, "learning_rate": 1.997921815322799e-05, "loss": 0.4742, "step": 436 }, { "epoch": 0.08615930599369086, "grad_norm": 1.0555900314632443, "learning_rate": 1.997911815524472e-05, "loss": 0.4854, "step": 437 }, { "epoch": 0.08635646687697161, "grad_norm": 0.8568644478491275, "learning_rate": 1.9979017917505102e-05, "loss": 0.5062, "step": 438 }, { "epoch": 0.08655362776025237, "grad_norm": 0.899069617822026, "learning_rate": 1.997891744001155e-05, "loss": 0.4572, "step": 439 }, { "epoch": 0.08675078864353312, "grad_norm": 0.858458600280124, "learning_rate": 1.997881672276646e-05, "loss": 0.4561, "step": 440 }, { "epoch": 0.08694794952681388, "grad_norm": 0.8872507068731488, "learning_rate": 1.9978715765772266e-05, "loss": 0.4893, "step": 441 }, { "epoch": 0.08714511041009464, "grad_norm": 0.8001811452184983, "learning_rate": 1.9978614569031388e-05, "loss": 0.4473, "step": 442 }, { "epoch": 0.08734227129337539, "grad_norm": 0.7719094886129825, "learning_rate": 1.997851313254626e-05, "loss": 0.4162, "step": 443 }, { "epoch": 0.08753943217665615, "grad_norm": 0.8070187892782263, "learning_rate": 1.997841145631932e-05, "loss": 0.4357, "step": 444 }, { "epoch": 0.0877365930599369, "grad_norm": 0.7876463515387281, "learning_rate": 1.9978309540353013e-05, "loss": 0.4511, "step": 445 }, { "epoch": 0.08793375394321766, "grad_norm": 0.888526192109841, "learning_rate": 1.9978207384649778e-05, "loss": 0.482, "step": 446 }, { "epoch": 0.08813091482649842, "grad_norm": 0.8684918565462284, "learning_rate": 1.9978104989212078e-05, "loss": 0.488, "step": 447 }, { "epoch": 0.08832807570977919, "grad_norm": 0.9067510910757722, "learning_rate": 1.997800235404237e-05, "loss": 0.4546, "step": 448 }, { "epoch": 0.08852523659305994, "grad_norm": 0.8445544456418074, "learning_rate": 1.9977899479143117e-05, "loss": 0.4509, "step": 449 }, { "epoch": 0.0887223974763407, "grad_norm": 0.8441154566325766, "learning_rate": 1.9977796364516796e-05, "loss": 0.4794, "step": 450 }, { "epoch": 0.08891955835962145, "grad_norm": 0.9219556253568325, "learning_rate": 1.997769301016588e-05, "loss": 0.4876, "step": 451 }, { "epoch": 0.08911671924290221, "grad_norm": 0.8432925267438068, "learning_rate": 1.997758941609286e-05, "loss": 0.4947, "step": 452 }, { "epoch": 0.08931388012618297, "grad_norm": 0.8912819991209812, "learning_rate": 1.9977485582300215e-05, "loss": 0.441, "step": 453 }, { "epoch": 0.08951104100946372, "grad_norm": 0.8123986878321956, "learning_rate": 1.9977381508790446e-05, "loss": 0.461, "step": 454 }, { "epoch": 0.08970820189274448, "grad_norm": 0.9178026802956625, "learning_rate": 1.997727719556605e-05, "loss": 0.5066, "step": 455 }, { "epoch": 0.08990536277602523, "grad_norm": 0.8987063196006664, "learning_rate": 1.9977172642629537e-05, "loss": 0.44, "step": 456 }, { "epoch": 0.09010252365930599, "grad_norm": 0.890159715184649, "learning_rate": 1.9977067849983412e-05, "loss": 0.4605, "step": 457 }, { "epoch": 0.09029968454258674, "grad_norm": 0.95233992862221, "learning_rate": 1.9976962817630202e-05, "loss": 0.4727, "step": 458 }, { "epoch": 0.09049684542586751, "grad_norm": 0.9459884973587354, "learning_rate": 1.9976857545572425e-05, "loss": 0.4619, "step": 459 }, { "epoch": 0.09069400630914827, "grad_norm": 1.2642744480328538, "learning_rate": 1.997675203381261e-05, "loss": 0.5395, "step": 460 }, { "epoch": 0.09089116719242903, "grad_norm": 1.0028131372893, "learning_rate": 1.997664628235329e-05, "loss": 0.4673, "step": 461 }, { "epoch": 0.09108832807570978, "grad_norm": 0.9306510247064776, "learning_rate": 1.9976540291197015e-05, "loss": 0.4886, "step": 462 }, { "epoch": 0.09128548895899054, "grad_norm": 0.905251801766755, "learning_rate": 1.9976434060346324e-05, "loss": 0.5161, "step": 463 }, { "epoch": 0.0914826498422713, "grad_norm": 0.929602575181201, "learning_rate": 1.9976327589803767e-05, "loss": 0.4745, "step": 464 }, { "epoch": 0.09167981072555205, "grad_norm": 3.0755405650687653, "learning_rate": 1.997622087957191e-05, "loss": 0.4565, "step": 465 }, { "epoch": 0.0918769716088328, "grad_norm": 1.3035255174455254, "learning_rate": 1.9976113929653312e-05, "loss": 0.4869, "step": 466 }, { "epoch": 0.09207413249211356, "grad_norm": 0.863482479091288, "learning_rate": 1.997600674005054e-05, "loss": 0.4533, "step": 467 }, { "epoch": 0.09227129337539432, "grad_norm": 1.007184735840567, "learning_rate": 1.9975899310766173e-05, "loss": 0.4697, "step": 468 }, { "epoch": 0.09246845425867507, "grad_norm": 1.1269484430615138, "learning_rate": 1.997579164180279e-05, "loss": 0.5163, "step": 469 }, { "epoch": 0.09266561514195584, "grad_norm": 1.1397659345471354, "learning_rate": 1.9975683733162987e-05, "loss": 0.4959, "step": 470 }, { "epoch": 0.0928627760252366, "grad_norm": 0.9860221015384155, "learning_rate": 1.9975575584849346e-05, "loss": 0.4788, "step": 471 }, { "epoch": 0.09305993690851735, "grad_norm": 0.8672216442193951, "learning_rate": 1.9975467196864465e-05, "loss": 0.4846, "step": 472 }, { "epoch": 0.09325709779179811, "grad_norm": 1.125980566236637, "learning_rate": 1.9975358569210952e-05, "loss": 0.467, "step": 473 }, { "epoch": 0.09345425867507887, "grad_norm": 0.9201790577474425, "learning_rate": 1.9975249701891414e-05, "loss": 0.4767, "step": 474 }, { "epoch": 0.09365141955835962, "grad_norm": 1.0505904231912129, "learning_rate": 1.9975140594908472e-05, "loss": 0.4671, "step": 475 }, { "epoch": 0.09384858044164038, "grad_norm": 0.8373445019309896, "learning_rate": 1.9975031248264746e-05, "loss": 0.5008, "step": 476 }, { "epoch": 0.09404574132492113, "grad_norm": 1.016203023349781, "learning_rate": 1.997492166196286e-05, "loss": 0.4614, "step": 477 }, { "epoch": 0.09424290220820189, "grad_norm": 0.785117485057104, "learning_rate": 1.9974811836005446e-05, "loss": 0.465, "step": 478 }, { "epoch": 0.09444006309148265, "grad_norm": 1.0204263244954062, "learning_rate": 1.9974701770395147e-05, "loss": 0.461, "step": 479 }, { "epoch": 0.0946372239747634, "grad_norm": 2.1396373733832785, "learning_rate": 1.9974591465134606e-05, "loss": 0.4751, "step": 480 }, { "epoch": 0.09483438485804416, "grad_norm": 1.112339191932461, "learning_rate": 1.9974480920226472e-05, "loss": 0.454, "step": 481 }, { "epoch": 0.09503154574132493, "grad_norm": 1.0016882760903807, "learning_rate": 1.9974370135673398e-05, "loss": 0.4848, "step": 482 }, { "epoch": 0.09522870662460568, "grad_norm": 1.0366723685296435, "learning_rate": 1.9974259111478054e-05, "loss": 0.4619, "step": 483 }, { "epoch": 0.09542586750788644, "grad_norm": 0.7816273224252894, "learning_rate": 1.9974147847643103e-05, "loss": 0.4356, "step": 484 }, { "epoch": 0.0956230283911672, "grad_norm": 2.2595110257069386, "learning_rate": 1.9974036344171215e-05, "loss": 0.528, "step": 485 }, { "epoch": 0.09582018927444795, "grad_norm": 0.9462755974465362, "learning_rate": 1.997392460106507e-05, "loss": 0.4768, "step": 486 }, { "epoch": 0.0960173501577287, "grad_norm": 0.9715227579239536, "learning_rate": 1.997381261832736e-05, "loss": 0.4725, "step": 487 }, { "epoch": 0.09621451104100946, "grad_norm": 0.9626137085486789, "learning_rate": 1.9973700395960765e-05, "loss": 0.4768, "step": 488 }, { "epoch": 0.09641167192429022, "grad_norm": 0.7957423308586874, "learning_rate": 1.997358793396799e-05, "loss": 0.4333, "step": 489 }, { "epoch": 0.09660883280757097, "grad_norm": 0.8587376909161047, "learning_rate": 1.9973475232351728e-05, "loss": 0.4677, "step": 490 }, { "epoch": 0.09680599369085173, "grad_norm": 0.8640406505371612, "learning_rate": 1.9973362291114697e-05, "loss": 0.4915, "step": 491 }, { "epoch": 0.09700315457413249, "grad_norm": 22.479833252401775, "learning_rate": 1.99732491102596e-05, "loss": 0.5093, "step": 492 }, { "epoch": 0.09720031545741326, "grad_norm": 1.7104531508622285, "learning_rate": 1.9973135689789167e-05, "loss": 0.4697, "step": 493 }, { "epoch": 0.09739747634069401, "grad_norm": 0.8723153591836569, "learning_rate": 1.9973022029706117e-05, "loss": 0.4539, "step": 494 }, { "epoch": 0.09759463722397477, "grad_norm": 1.1539417078528944, "learning_rate": 1.997290813001318e-05, "loss": 0.47, "step": 495 }, { "epoch": 0.09779179810725552, "grad_norm": 0.8862353677994035, "learning_rate": 1.9972793990713093e-05, "loss": 0.4891, "step": 496 }, { "epoch": 0.09798895899053628, "grad_norm": 1.1144995212251465, "learning_rate": 1.9972679611808603e-05, "loss": 0.4823, "step": 497 }, { "epoch": 0.09818611987381703, "grad_norm": 1.4516257465643632, "learning_rate": 1.997256499330245e-05, "loss": 0.5077, "step": 498 }, { "epoch": 0.09838328075709779, "grad_norm": 0.9004639995601033, "learning_rate": 1.9972450135197397e-05, "loss": 0.4404, "step": 499 }, { "epoch": 0.09858044164037855, "grad_norm": 0.8035330529361184, "learning_rate": 1.9972335037496195e-05, "loss": 0.456, "step": 500 }, { "epoch": 0.0987776025236593, "grad_norm": 0.8385329004850893, "learning_rate": 1.9972219700201612e-05, "loss": 0.4332, "step": 501 }, { "epoch": 0.09897476340694006, "grad_norm": 0.8495426403609305, "learning_rate": 1.9972104123316422e-05, "loss": 0.4541, "step": 502 }, { "epoch": 0.09917192429022081, "grad_norm": 0.7382887096139712, "learning_rate": 1.9971988306843403e-05, "loss": 0.4095, "step": 503 }, { "epoch": 0.09936908517350158, "grad_norm": 0.8439026090011481, "learning_rate": 1.997187225078533e-05, "loss": 0.4443, "step": 504 }, { "epoch": 0.09956624605678234, "grad_norm": 0.8075139072966796, "learning_rate": 1.9971755955144995e-05, "loss": 0.4608, "step": 505 }, { "epoch": 0.0997634069400631, "grad_norm": 0.818587342922555, "learning_rate": 1.9971639419925197e-05, "loss": 0.4486, "step": 506 }, { "epoch": 0.09996056782334385, "grad_norm": 0.8577985021487166, "learning_rate": 1.997152264512873e-05, "loss": 0.4494, "step": 507 }, { "epoch": 0.10015772870662461, "grad_norm": 0.8628896861024081, "learning_rate": 1.9971405630758402e-05, "loss": 0.474, "step": 508 }, { "epoch": 0.10035488958990536, "grad_norm": 0.799898383721641, "learning_rate": 1.9971288376817023e-05, "loss": 0.4588, "step": 509 }, { "epoch": 0.10055205047318612, "grad_norm": 0.7095652696234459, "learning_rate": 1.997117088330741e-05, "loss": 0.4359, "step": 510 }, { "epoch": 0.10074921135646688, "grad_norm": 0.9284886130598523, "learning_rate": 1.9971053150232387e-05, "loss": 0.4517, "step": 511 }, { "epoch": 0.10094637223974763, "grad_norm": 0.8423011199925328, "learning_rate": 1.9970935177594787e-05, "loss": 0.4511, "step": 512 }, { "epoch": 0.10114353312302839, "grad_norm": 0.7810718155478004, "learning_rate": 1.9970816965397435e-05, "loss": 0.5072, "step": 513 }, { "epoch": 0.10134069400630914, "grad_norm": 0.8041359390874309, "learning_rate": 1.9970698513643178e-05, "loss": 0.4703, "step": 514 }, { "epoch": 0.1015378548895899, "grad_norm": 1.53976548457417, "learning_rate": 1.9970579822334856e-05, "loss": 0.4518, "step": 515 }, { "epoch": 0.10173501577287067, "grad_norm": 0.9907811278036276, "learning_rate": 1.9970460891475328e-05, "loss": 0.476, "step": 516 }, { "epoch": 0.10193217665615142, "grad_norm": 0.8081338018995352, "learning_rate": 1.997034172106745e-05, "loss": 0.4691, "step": 517 }, { "epoch": 0.10212933753943218, "grad_norm": 1.0051078776013984, "learning_rate": 1.9970222311114078e-05, "loss": 0.5074, "step": 518 }, { "epoch": 0.10232649842271294, "grad_norm": 0.8105369088543491, "learning_rate": 1.9970102661618088e-05, "loss": 0.4502, "step": 519 }, { "epoch": 0.10252365930599369, "grad_norm": 0.7202414114786588, "learning_rate": 1.9969982772582354e-05, "loss": 0.4452, "step": 520 }, { "epoch": 0.10272082018927445, "grad_norm": 0.8648373894327899, "learning_rate": 1.996986264400975e-05, "loss": 0.4757, "step": 521 }, { "epoch": 0.1029179810725552, "grad_norm": 0.7629352205940431, "learning_rate": 1.9969742275903172e-05, "loss": 0.477, "step": 522 }, { "epoch": 0.10311514195583596, "grad_norm": 0.8526046130077743, "learning_rate": 1.996962166826551e-05, "loss": 0.4341, "step": 523 }, { "epoch": 0.10331230283911672, "grad_norm": 5.257272773165352, "learning_rate": 1.9969500821099654e-05, "loss": 0.4958, "step": 524 }, { "epoch": 0.10350946372239747, "grad_norm": 0.8477826266411921, "learning_rate": 1.996937973440851e-05, "loss": 0.43, "step": 525 }, { "epoch": 0.10370662460567823, "grad_norm": 1.4050180036912192, "learning_rate": 1.9969258408194997e-05, "loss": 0.4446, "step": 526 }, { "epoch": 0.103903785488959, "grad_norm": 0.976079511985993, "learning_rate": 1.996913684246202e-05, "loss": 0.4718, "step": 527 }, { "epoch": 0.10410094637223975, "grad_norm": 1.460058823361328, "learning_rate": 1.99690150372125e-05, "loss": 0.4904, "step": 528 }, { "epoch": 0.10429810725552051, "grad_norm": 1.2260644333639692, "learning_rate": 1.9968892992449364e-05, "loss": 0.4757, "step": 529 }, { "epoch": 0.10449526813880126, "grad_norm": 0.820353657627711, "learning_rate": 1.9968770708175552e-05, "loss": 0.4823, "step": 530 }, { "epoch": 0.10469242902208202, "grad_norm": 0.7536246666392749, "learning_rate": 1.996864818439399e-05, "loss": 0.4212, "step": 531 }, { "epoch": 0.10488958990536278, "grad_norm": 0.8026979740475652, "learning_rate": 1.9968525421107633e-05, "loss": 0.4606, "step": 532 }, { "epoch": 0.10508675078864353, "grad_norm": 0.8687858454813581, "learning_rate": 1.996840241831942e-05, "loss": 0.4862, "step": 533 }, { "epoch": 0.10528391167192429, "grad_norm": 3.2595441481445944, "learning_rate": 1.9968279176032314e-05, "loss": 0.4919, "step": 534 }, { "epoch": 0.10548107255520504, "grad_norm": 0.9637701357273671, "learning_rate": 1.9968155694249274e-05, "loss": 0.4574, "step": 535 }, { "epoch": 0.1056782334384858, "grad_norm": 0.9015755854427224, "learning_rate": 1.9968031972973266e-05, "loss": 0.4892, "step": 536 }, { "epoch": 0.10587539432176656, "grad_norm": 0.811289401193313, "learning_rate": 1.996790801220726e-05, "loss": 0.4393, "step": 537 }, { "epoch": 0.10607255520504733, "grad_norm": 1.15420005303407, "learning_rate": 1.996778381195424e-05, "loss": 0.5066, "step": 538 }, { "epoch": 0.10626971608832808, "grad_norm": 0.8200798308419749, "learning_rate": 1.9967659372217187e-05, "loss": 0.5189, "step": 539 }, { "epoch": 0.10646687697160884, "grad_norm": 0.8867823143154604, "learning_rate": 1.9967534692999085e-05, "loss": 0.4365, "step": 540 }, { "epoch": 0.10666403785488959, "grad_norm": 0.8669753830991764, "learning_rate": 1.996740977430294e-05, "loss": 0.457, "step": 541 }, { "epoch": 0.10686119873817035, "grad_norm": 0.8954903277161942, "learning_rate": 1.996728461613175e-05, "loss": 0.4597, "step": 542 }, { "epoch": 0.1070583596214511, "grad_norm": 0.8258256049709248, "learning_rate": 1.9967159218488515e-05, "loss": 0.428, "step": 543 }, { "epoch": 0.10725552050473186, "grad_norm": 0.811522877066346, "learning_rate": 1.996703358137626e-05, "loss": 0.4708, "step": 544 }, { "epoch": 0.10745268138801262, "grad_norm": 0.9746081819733365, "learning_rate": 1.996690770479799e-05, "loss": 0.5035, "step": 545 }, { "epoch": 0.10764984227129337, "grad_norm": 0.8151447726161154, "learning_rate": 1.9966781588756743e-05, "loss": 0.4748, "step": 546 }, { "epoch": 0.10784700315457413, "grad_norm": 0.9244734430462046, "learning_rate": 1.996665523325554e-05, "loss": 0.4811, "step": 547 }, { "epoch": 0.10804416403785488, "grad_norm": 0.8967791767272244, "learning_rate": 1.9966528638297415e-05, "loss": 0.4329, "step": 548 }, { "epoch": 0.10824132492113564, "grad_norm": 1.184825474138752, "learning_rate": 1.9966401803885413e-05, "loss": 0.4622, "step": 549 }, { "epoch": 0.10843848580441641, "grad_norm": 0.8122484029135365, "learning_rate": 1.9966274730022587e-05, "loss": 0.473, "step": 550 }, { "epoch": 0.10863564668769717, "grad_norm": 1.1993820808729505, "learning_rate": 1.996614741671198e-05, "loss": 0.4356, "step": 551 }, { "epoch": 0.10883280757097792, "grad_norm": 0.767742171284427, "learning_rate": 1.996601986395666e-05, "loss": 0.4302, "step": 552 }, { "epoch": 0.10902996845425868, "grad_norm": 1.1421804231314927, "learning_rate": 1.9965892071759685e-05, "loss": 0.4594, "step": 553 }, { "epoch": 0.10922712933753943, "grad_norm": 0.8790816673408636, "learning_rate": 1.9965764040124126e-05, "loss": 0.4877, "step": 554 }, { "epoch": 0.10942429022082019, "grad_norm": 1.2969027619478393, "learning_rate": 1.9965635769053064e-05, "loss": 0.474, "step": 555 }, { "epoch": 0.10962145110410094, "grad_norm": 0.8359668233842006, "learning_rate": 1.9965507258549573e-05, "loss": 0.4586, "step": 556 }, { "epoch": 0.1098186119873817, "grad_norm": 0.9318305367356925, "learning_rate": 1.996537850861675e-05, "loss": 0.4439, "step": 557 }, { "epoch": 0.11001577287066246, "grad_norm": 0.9793179279182511, "learning_rate": 1.9965249519257682e-05, "loss": 0.4953, "step": 558 }, { "epoch": 0.11021293375394321, "grad_norm": 0.7714402928720476, "learning_rate": 1.9965120290475466e-05, "loss": 0.4635, "step": 559 }, { "epoch": 0.11041009463722397, "grad_norm": 1.0153326096876347, "learning_rate": 1.996499082227321e-05, "loss": 0.465, "step": 560 }, { "epoch": 0.11060725552050474, "grad_norm": 0.7664943405741772, "learning_rate": 1.996486111465403e-05, "loss": 0.4849, "step": 561 }, { "epoch": 0.1108044164037855, "grad_norm": 0.8468927415011162, "learning_rate": 1.996473116762103e-05, "loss": 0.4897, "step": 562 }, { "epoch": 0.11100157728706625, "grad_norm": 0.7604431065724107, "learning_rate": 1.9964600981177344e-05, "loss": 0.4737, "step": 563 }, { "epoch": 0.111198738170347, "grad_norm": 0.7359161844087391, "learning_rate": 1.9964470555326096e-05, "loss": 0.4508, "step": 564 }, { "epoch": 0.11139589905362776, "grad_norm": 1.3795069909449265, "learning_rate": 1.9964339890070415e-05, "loss": 0.4826, "step": 565 }, { "epoch": 0.11159305993690852, "grad_norm": 0.9524081314639894, "learning_rate": 1.9964208985413448e-05, "loss": 0.4497, "step": 566 }, { "epoch": 0.11179022082018927, "grad_norm": 0.7435304463746423, "learning_rate": 1.9964077841358333e-05, "loss": 0.4797, "step": 567 }, { "epoch": 0.11198738170347003, "grad_norm": 2.6303972907019326, "learning_rate": 1.9963946457908223e-05, "loss": 0.4954, "step": 568 }, { "epoch": 0.11218454258675079, "grad_norm": 0.8886897209525809, "learning_rate": 1.9963814835066274e-05, "loss": 0.4991, "step": 569 }, { "epoch": 0.11238170347003154, "grad_norm": 16.23439623915258, "learning_rate": 1.9963682972835654e-05, "loss": 0.4808, "step": 570 }, { "epoch": 0.1125788643533123, "grad_norm": 1.0835312648993332, "learning_rate": 1.9963550871219522e-05, "loss": 0.5025, "step": 571 }, { "epoch": 0.11277602523659307, "grad_norm": 2.8501049449687157, "learning_rate": 1.996341853022106e-05, "loss": 0.4398, "step": 572 }, { "epoch": 0.11297318611987382, "grad_norm": 1.4391188096590775, "learning_rate": 1.9963285949843446e-05, "loss": 0.4883, "step": 573 }, { "epoch": 0.11317034700315458, "grad_norm": 0.8468547462854358, "learning_rate": 1.996315313008986e-05, "loss": 0.4499, "step": 574 }, { "epoch": 0.11336750788643533, "grad_norm": 1.1620214814742198, "learning_rate": 1.99630200709635e-05, "loss": 0.4489, "step": 575 }, { "epoch": 0.11356466876971609, "grad_norm": 1.9853935899449358, "learning_rate": 1.996288677246756e-05, "loss": 0.4849, "step": 576 }, { "epoch": 0.11376182965299685, "grad_norm": 1.8186145471704431, "learning_rate": 1.996275323460524e-05, "loss": 0.5109, "step": 577 }, { "epoch": 0.1139589905362776, "grad_norm": 0.8478050994792304, "learning_rate": 1.996261945737975e-05, "loss": 0.5038, "step": 578 }, { "epoch": 0.11415615141955836, "grad_norm": 1.1104246850094956, "learning_rate": 1.9962485440794306e-05, "loss": 0.4551, "step": 579 }, { "epoch": 0.11435331230283911, "grad_norm": 2.048919052058303, "learning_rate": 1.9962351184852123e-05, "loss": 0.4642, "step": 580 }, { "epoch": 0.11455047318611987, "grad_norm": 0.9669037613683414, "learning_rate": 1.9962216689556435e-05, "loss": 0.4547, "step": 581 }, { "epoch": 0.11474763406940063, "grad_norm": 0.7967676382705094, "learning_rate": 1.996208195491047e-05, "loss": 0.4732, "step": 582 }, { "epoch": 0.11494479495268138, "grad_norm": 0.973744486308741, "learning_rate": 1.9961946980917457e-05, "loss": 0.4612, "step": 583 }, { "epoch": 0.11514195583596215, "grad_norm": 0.8065923330637983, "learning_rate": 1.9961811767580646e-05, "loss": 0.4467, "step": 584 }, { "epoch": 0.1153391167192429, "grad_norm": 2.7671844074244127, "learning_rate": 1.996167631490329e-05, "loss": 0.493, "step": 585 }, { "epoch": 0.11553627760252366, "grad_norm": 0.8616456402628487, "learning_rate": 1.9961540622888637e-05, "loss": 0.4738, "step": 586 }, { "epoch": 0.11573343848580442, "grad_norm": 4.8009523346043945, "learning_rate": 1.9961404691539947e-05, "loss": 0.4741, "step": 587 }, { "epoch": 0.11593059936908517, "grad_norm": 1.2289335426336983, "learning_rate": 1.996126852086049e-05, "loss": 0.4626, "step": 588 }, { "epoch": 0.11612776025236593, "grad_norm": 0.8888193606458007, "learning_rate": 1.996113211085353e-05, "loss": 0.469, "step": 589 }, { "epoch": 0.11632492113564669, "grad_norm": 1.020387123983096, "learning_rate": 1.996099546152235e-05, "loss": 0.4977, "step": 590 }, { "epoch": 0.11652208201892744, "grad_norm": 1.5821331097425058, "learning_rate": 1.9960858572870238e-05, "loss": 0.5181, "step": 591 }, { "epoch": 0.1167192429022082, "grad_norm": 1.406346825248364, "learning_rate": 1.9960721444900475e-05, "loss": 0.5002, "step": 592 }, { "epoch": 0.11691640378548895, "grad_norm": 0.8333326596917328, "learning_rate": 1.9960584077616356e-05, "loss": 0.4882, "step": 593 }, { "epoch": 0.11711356466876971, "grad_norm": 0.9769660099184305, "learning_rate": 1.9960446471021187e-05, "loss": 0.4375, "step": 594 }, { "epoch": 0.11731072555205048, "grad_norm": 0.8908916939407923, "learning_rate": 1.9960308625118265e-05, "loss": 0.475, "step": 595 }, { "epoch": 0.11750788643533124, "grad_norm": 0.8760104320431786, "learning_rate": 1.996017053991091e-05, "loss": 0.461, "step": 596 }, { "epoch": 0.11770504731861199, "grad_norm": 1.0317263432759556, "learning_rate": 1.9960032215402436e-05, "loss": 0.4877, "step": 597 }, { "epoch": 0.11790220820189275, "grad_norm": 1.3949799089229347, "learning_rate": 1.995989365159617e-05, "loss": 0.4721, "step": 598 }, { "epoch": 0.1180993690851735, "grad_norm": 3.854325351616305, "learning_rate": 1.9959754848495437e-05, "loss": 0.4897, "step": 599 }, { "epoch": 0.11829652996845426, "grad_norm": 0.9559253651339167, "learning_rate": 1.9959615806103572e-05, "loss": 0.4632, "step": 600 }, { "epoch": 0.11849369085173501, "grad_norm": 1.864457347756339, "learning_rate": 1.9959476524423917e-05, "loss": 0.4725, "step": 601 }, { "epoch": 0.11869085173501577, "grad_norm": 1.03878334430699, "learning_rate": 1.9959337003459816e-05, "loss": 0.4476, "step": 602 }, { "epoch": 0.11888801261829653, "grad_norm": 0.9045258083599287, "learning_rate": 1.995919724321463e-05, "loss": 0.4791, "step": 603 }, { "epoch": 0.11908517350157728, "grad_norm": 0.9747663255202897, "learning_rate": 1.9959057243691707e-05, "loss": 0.4271, "step": 604 }, { "epoch": 0.11928233438485804, "grad_norm": 1.2080345292681676, "learning_rate": 1.995891700489441e-05, "loss": 0.4782, "step": 605 }, { "epoch": 0.11947949526813881, "grad_norm": 0.8131818958865078, "learning_rate": 1.9958776526826115e-05, "loss": 0.4596, "step": 606 }, { "epoch": 0.11967665615141956, "grad_norm": 0.9375228525284777, "learning_rate": 1.9958635809490195e-05, "loss": 0.4583, "step": 607 }, { "epoch": 0.11987381703470032, "grad_norm": 0.9475574679425915, "learning_rate": 1.995849485289003e-05, "loss": 0.4766, "step": 608 }, { "epoch": 0.12007097791798108, "grad_norm": 1.0121761969425598, "learning_rate": 1.9958353657029007e-05, "loss": 0.4816, "step": 609 }, { "epoch": 0.12026813880126183, "grad_norm": 1.0654177765828532, "learning_rate": 1.9958212221910514e-05, "loss": 0.4907, "step": 610 }, { "epoch": 0.12046529968454259, "grad_norm": 0.8477027332986309, "learning_rate": 1.9958070547537956e-05, "loss": 0.4706, "step": 611 }, { "epoch": 0.12066246056782334, "grad_norm": 0.8677643052383288, "learning_rate": 1.9957928633914735e-05, "loss": 0.4858, "step": 612 }, { "epoch": 0.1208596214511041, "grad_norm": 0.9201595715495308, "learning_rate": 1.9957786481044253e-05, "loss": 0.4733, "step": 613 }, { "epoch": 0.12105678233438485, "grad_norm": 1.0931868890217196, "learning_rate": 1.995764408892994e-05, "loss": 0.4507, "step": 614 }, { "epoch": 0.12125394321766561, "grad_norm": 0.9329354196032422, "learning_rate": 1.9957501457575207e-05, "loss": 0.481, "step": 615 }, { "epoch": 0.12145110410094637, "grad_norm": 0.8796940111635481, "learning_rate": 1.995735858698348e-05, "loss": 0.4416, "step": 616 }, { "epoch": 0.12164826498422712, "grad_norm": 0.9276302620532716, "learning_rate": 1.9957215477158196e-05, "loss": 0.5071, "step": 617 }, { "epoch": 0.12184542586750789, "grad_norm": 0.9200272790666039, "learning_rate": 1.9957072128102792e-05, "loss": 0.4609, "step": 618 }, { "epoch": 0.12204258675078865, "grad_norm": 0.8595424766519794, "learning_rate": 1.995692853982071e-05, "loss": 0.4935, "step": 619 }, { "epoch": 0.1222397476340694, "grad_norm": 0.8508720329773822, "learning_rate": 1.99567847123154e-05, "loss": 0.4703, "step": 620 }, { "epoch": 0.12243690851735016, "grad_norm": 0.7662885988170952, "learning_rate": 1.9956640645590326e-05, "loss": 0.4285, "step": 621 }, { "epoch": 0.12263406940063092, "grad_norm": 0.9173829148052663, "learning_rate": 1.9956496339648936e-05, "loss": 0.4573, "step": 622 }, { "epoch": 0.12283123028391167, "grad_norm": 0.7616629858014796, "learning_rate": 1.9956351794494706e-05, "loss": 0.5152, "step": 623 }, { "epoch": 0.12302839116719243, "grad_norm": 0.8272088646379836, "learning_rate": 1.9956207010131107e-05, "loss": 0.477, "step": 624 }, { "epoch": 0.12322555205047318, "grad_norm": 0.7612317285779934, "learning_rate": 1.9956061986561615e-05, "loss": 0.4668, "step": 625 }, { "epoch": 0.12342271293375394, "grad_norm": 0.7612795525681632, "learning_rate": 1.9955916723789718e-05, "loss": 0.4562, "step": 626 }, { "epoch": 0.1236198738170347, "grad_norm": 0.8179624963182074, "learning_rate": 1.99557712218189e-05, "loss": 0.4713, "step": 627 }, { "epoch": 0.12381703470031545, "grad_norm": 0.8528861188969025, "learning_rate": 1.995562548065267e-05, "loss": 0.4664, "step": 628 }, { "epoch": 0.12401419558359622, "grad_norm": 0.7690605067619397, "learning_rate": 1.995547950029451e-05, "loss": 0.455, "step": 629 }, { "epoch": 0.12421135646687698, "grad_norm": 0.7483628212376229, "learning_rate": 1.9955333280747944e-05, "loss": 0.4652, "step": 630 }, { "epoch": 0.12440851735015773, "grad_norm": 0.9221392473701614, "learning_rate": 1.995518682201648e-05, "loss": 0.4776, "step": 631 }, { "epoch": 0.12460567823343849, "grad_norm": 0.851528134608556, "learning_rate": 1.995504012410363e-05, "loss": 0.4963, "step": 632 }, { "epoch": 0.12480283911671924, "grad_norm": 0.7424830012420283, "learning_rate": 1.9954893187012927e-05, "loss": 0.4266, "step": 633 }, { "epoch": 0.125, "grad_norm": 0.8242224350981598, "learning_rate": 1.99547460107479e-05, "loss": 0.4744, "step": 634 }, { "epoch": 0.12519716088328076, "grad_norm": 0.7923912180491203, "learning_rate": 1.9954598595312084e-05, "loss": 0.4633, "step": 635 }, { "epoch": 0.1253943217665615, "grad_norm": 0.8875572382982255, "learning_rate": 1.9954450940709018e-05, "loss": 0.4813, "step": 636 }, { "epoch": 0.12559148264984227, "grad_norm": 0.7926285972835072, "learning_rate": 1.9954303046942255e-05, "loss": 0.4878, "step": 637 }, { "epoch": 0.12578864353312302, "grad_norm": 0.7499862912600862, "learning_rate": 1.995415491401534e-05, "loss": 0.4734, "step": 638 }, { "epoch": 0.12598580441640378, "grad_norm": 0.7691964904703829, "learning_rate": 1.9954006541931844e-05, "loss": 0.4509, "step": 639 }, { "epoch": 0.12618296529968454, "grad_norm": 0.7532129288589031, "learning_rate": 1.9953857930695318e-05, "loss": 0.4733, "step": 640 }, { "epoch": 0.1263801261829653, "grad_norm": 0.8835688736071808, "learning_rate": 1.995370908030934e-05, "loss": 0.4813, "step": 641 }, { "epoch": 0.12657728706624605, "grad_norm": 0.8280331392534872, "learning_rate": 1.995355999077749e-05, "loss": 0.5209, "step": 642 }, { "epoch": 0.1267744479495268, "grad_norm": 0.7790589964453112, "learning_rate": 1.9953410662103346e-05, "loss": 0.468, "step": 643 }, { "epoch": 0.12697160883280756, "grad_norm": 1.1955818097157254, "learning_rate": 1.995326109429049e-05, "loss": 0.5219, "step": 644 }, { "epoch": 0.12716876971608831, "grad_norm": 0.760756053231397, "learning_rate": 1.9953111287342524e-05, "loss": 0.4269, "step": 645 }, { "epoch": 0.1273659305993691, "grad_norm": 0.801104959032305, "learning_rate": 1.9952961241263047e-05, "loss": 0.4947, "step": 646 }, { "epoch": 0.12756309148264985, "grad_norm": 0.9451949931761754, "learning_rate": 1.9952810956055656e-05, "loss": 0.4904, "step": 647 }, { "epoch": 0.1277602523659306, "grad_norm": 0.8441360454771364, "learning_rate": 1.995266043172397e-05, "loss": 0.4505, "step": 648 }, { "epoch": 0.12795741324921137, "grad_norm": 1.975864774694993, "learning_rate": 1.99525096682716e-05, "loss": 0.4824, "step": 649 }, { "epoch": 0.12815457413249212, "grad_norm": 0.9293646943980033, "learning_rate": 1.995235866570217e-05, "loss": 0.4646, "step": 650 }, { "epoch": 0.12835173501577288, "grad_norm": 0.7685956450339866, "learning_rate": 1.9952207424019314e-05, "loss": 0.4259, "step": 651 }, { "epoch": 0.12854889589905363, "grad_norm": 1.0613268056457563, "learning_rate": 1.9952055943226656e-05, "loss": 0.4783, "step": 652 }, { "epoch": 0.1287460567823344, "grad_norm": 0.7669978348644308, "learning_rate": 1.995190422332784e-05, "loss": 0.4342, "step": 653 }, { "epoch": 0.12894321766561515, "grad_norm": 1.3019510608457148, "learning_rate": 1.995175226432651e-05, "loss": 0.5192, "step": 654 }, { "epoch": 0.1291403785488959, "grad_norm": 1.0625567624596923, "learning_rate": 1.995160006622632e-05, "loss": 0.483, "step": 655 }, { "epoch": 0.12933753943217666, "grad_norm": 0.7982339743597725, "learning_rate": 1.995144762903092e-05, "loss": 0.4548, "step": 656 }, { "epoch": 0.1295347003154574, "grad_norm": 2.15419728357655, "learning_rate": 1.995129495274398e-05, "loss": 0.4486, "step": 657 }, { "epoch": 0.12973186119873817, "grad_norm": 0.7953532235642334, "learning_rate": 1.9951142037369163e-05, "loss": 0.5146, "step": 658 }, { "epoch": 0.12992902208201892, "grad_norm": 0.7733286355003084, "learning_rate": 1.995098888291015e-05, "loss": 0.4509, "step": 659 }, { "epoch": 0.13012618296529968, "grad_norm": 0.7219639448869223, "learning_rate": 1.995083548937061e-05, "loss": 0.4339, "step": 660 }, { "epoch": 0.13032334384858044, "grad_norm": 0.6876863470552866, "learning_rate": 1.9950681856754236e-05, "loss": 0.4397, "step": 661 }, { "epoch": 0.1305205047318612, "grad_norm": 0.7330567511407265, "learning_rate": 1.9950527985064717e-05, "loss": 0.4403, "step": 662 }, { "epoch": 0.13071766561514195, "grad_norm": 0.6967061391482817, "learning_rate": 1.9950373874305752e-05, "loss": 0.4402, "step": 663 }, { "epoch": 0.1309148264984227, "grad_norm": 1.2938777572194384, "learning_rate": 1.9950219524481042e-05, "loss": 0.5203, "step": 664 }, { "epoch": 0.13111198738170346, "grad_norm": 0.9539568996661243, "learning_rate": 1.995006493559429e-05, "loss": 0.4834, "step": 665 }, { "epoch": 0.13130914826498422, "grad_norm": 0.8042726712344785, "learning_rate": 1.9949910107649218e-05, "loss": 0.4626, "step": 666 }, { "epoch": 0.13150630914826497, "grad_norm": 1.6472739683316433, "learning_rate": 1.9949755040649545e-05, "loss": 0.479, "step": 667 }, { "epoch": 0.13170347003154576, "grad_norm": 1.0783243407313121, "learning_rate": 1.9949599734598993e-05, "loss": 0.4656, "step": 668 }, { "epoch": 0.1319006309148265, "grad_norm": 0.8310407005161574, "learning_rate": 1.9949444189501294e-05, "loss": 0.4254, "step": 669 }, { "epoch": 0.13209779179810727, "grad_norm": 0.7957824614600361, "learning_rate": 1.9949288405360186e-05, "loss": 0.4683, "step": 670 }, { "epoch": 0.13229495268138802, "grad_norm": 0.7451707613081128, "learning_rate": 1.9949132382179415e-05, "loss": 0.4632, "step": 671 }, { "epoch": 0.13249211356466878, "grad_norm": 0.7738134822360583, "learning_rate": 1.9948976119962724e-05, "loss": 0.4252, "step": 672 }, { "epoch": 0.13268927444794953, "grad_norm": 1.2563956712818225, "learning_rate": 1.9948819618713868e-05, "loss": 0.4297, "step": 673 }, { "epoch": 0.1328864353312303, "grad_norm": 0.7251684476228085, "learning_rate": 1.994866287843661e-05, "loss": 0.4308, "step": 674 }, { "epoch": 0.13308359621451105, "grad_norm": 0.8186852540206319, "learning_rate": 1.9948505899134717e-05, "loss": 0.4646, "step": 675 }, { "epoch": 0.1332807570977918, "grad_norm": 1.7860304743465076, "learning_rate": 1.994834868081196e-05, "loss": 0.4876, "step": 676 }, { "epoch": 0.13347791798107256, "grad_norm": 0.850580467492657, "learning_rate": 1.9948191223472108e-05, "loss": 0.4828, "step": 677 }, { "epoch": 0.13367507886435331, "grad_norm": 0.7130834372780571, "learning_rate": 1.9948033527118954e-05, "loss": 0.4874, "step": 678 }, { "epoch": 0.13387223974763407, "grad_norm": 0.7595923377554168, "learning_rate": 1.9947875591756286e-05, "loss": 0.4508, "step": 679 }, { "epoch": 0.13406940063091483, "grad_norm": 0.841937797265363, "learning_rate": 1.9947717417387894e-05, "loss": 0.4789, "step": 680 }, { "epoch": 0.13426656151419558, "grad_norm": 0.7865575362210725, "learning_rate": 1.994755900401758e-05, "loss": 0.439, "step": 681 }, { "epoch": 0.13446372239747634, "grad_norm": 0.7464667503739694, "learning_rate": 1.9947400351649148e-05, "loss": 0.4574, "step": 682 }, { "epoch": 0.1346608832807571, "grad_norm": 0.765573758240295, "learning_rate": 1.9947241460286414e-05, "loss": 0.4661, "step": 683 }, { "epoch": 0.13485804416403785, "grad_norm": 0.8053849370801137, "learning_rate": 1.9947082329933192e-05, "loss": 0.4902, "step": 684 }, { "epoch": 0.1350552050473186, "grad_norm": 0.9079668954607747, "learning_rate": 1.9946922960593307e-05, "loss": 0.4344, "step": 685 }, { "epoch": 0.13525236593059936, "grad_norm": 0.7910386751156379, "learning_rate": 1.994676335227059e-05, "loss": 0.4423, "step": 686 }, { "epoch": 0.13544952681388012, "grad_norm": 1.5549413245532762, "learning_rate": 1.994660350496887e-05, "loss": 0.5308, "step": 687 }, { "epoch": 0.13564668769716087, "grad_norm": 0.9255915463007742, "learning_rate": 1.9946443418691994e-05, "loss": 0.4618, "step": 688 }, { "epoch": 0.13584384858044163, "grad_norm": 0.7804360864352426, "learning_rate": 1.9946283093443803e-05, "loss": 0.449, "step": 689 }, { "epoch": 0.13604100946372238, "grad_norm": 0.7475663046747916, "learning_rate": 1.9946122529228153e-05, "loss": 0.4503, "step": 690 }, { "epoch": 0.13623817034700317, "grad_norm": 1.2554380433560701, "learning_rate": 1.9945961726048895e-05, "loss": 0.5168, "step": 691 }, { "epoch": 0.13643533123028392, "grad_norm": 0.7879053546798648, "learning_rate": 1.9945800683909904e-05, "loss": 0.4771, "step": 692 }, { "epoch": 0.13663249211356468, "grad_norm": 0.7487562573568673, "learning_rate": 1.9945639402815037e-05, "loss": 0.4672, "step": 693 }, { "epoch": 0.13682965299684544, "grad_norm": 0.7202621138555081, "learning_rate": 1.9945477882768177e-05, "loss": 0.447, "step": 694 }, { "epoch": 0.1370268138801262, "grad_norm": 0.8717913765163078, "learning_rate": 1.99453161237732e-05, "loss": 0.4599, "step": 695 }, { "epoch": 0.13722397476340695, "grad_norm": 0.8569531287456702, "learning_rate": 1.9945154125833996e-05, "loss": 0.4604, "step": 696 }, { "epoch": 0.1374211356466877, "grad_norm": 0.8364641774305389, "learning_rate": 1.9944991888954453e-05, "loss": 0.48, "step": 697 }, { "epoch": 0.13761829652996846, "grad_norm": 0.7618210867976017, "learning_rate": 1.9944829413138472e-05, "loss": 0.4546, "step": 698 }, { "epoch": 0.13781545741324921, "grad_norm": 0.745175412220073, "learning_rate": 1.9944666698389957e-05, "loss": 0.465, "step": 699 }, { "epoch": 0.13801261829652997, "grad_norm": 0.8186541094239488, "learning_rate": 1.9944503744712814e-05, "loss": 0.5081, "step": 700 }, { "epoch": 0.13820977917981073, "grad_norm": 0.7800566021733093, "learning_rate": 1.994434055211096e-05, "loss": 0.4692, "step": 701 }, { "epoch": 0.13840694006309148, "grad_norm": 2.337423114442513, "learning_rate": 1.9944177120588318e-05, "loss": 0.4922, "step": 702 }, { "epoch": 0.13860410094637224, "grad_norm": 0.9603122253059109, "learning_rate": 1.9944013450148812e-05, "loss": 0.5075, "step": 703 }, { "epoch": 0.138801261829653, "grad_norm": 0.7119216198791689, "learning_rate": 1.9943849540796375e-05, "loss": 0.4626, "step": 704 }, { "epoch": 0.13899842271293375, "grad_norm": 0.8234426695741333, "learning_rate": 1.9943685392534945e-05, "loss": 0.4274, "step": 705 }, { "epoch": 0.1391955835962145, "grad_norm": 0.8841914182356732, "learning_rate": 1.9943521005368468e-05, "loss": 0.5169, "step": 706 }, { "epoch": 0.13939274447949526, "grad_norm": 0.9754273189473419, "learning_rate": 1.994335637930089e-05, "loss": 0.4822, "step": 707 }, { "epoch": 0.13958990536277602, "grad_norm": 0.8312745222051547, "learning_rate": 1.9943191514336164e-05, "loss": 0.4614, "step": 708 }, { "epoch": 0.13978706624605677, "grad_norm": 0.8430236639900083, "learning_rate": 1.9943026410478258e-05, "loss": 0.4339, "step": 709 }, { "epoch": 0.13998422712933753, "grad_norm": 1.0814961430956602, "learning_rate": 1.9942861067731135e-05, "loss": 0.4825, "step": 710 }, { "epoch": 0.14018138801261829, "grad_norm": 0.77685594526798, "learning_rate": 1.994269548609877e-05, "loss": 0.4801, "step": 711 }, { "epoch": 0.14037854889589904, "grad_norm": 0.8376210695048821, "learning_rate": 1.9942529665585134e-05, "loss": 0.4742, "step": 712 }, { "epoch": 0.1405757097791798, "grad_norm": 1.0443709227200295, "learning_rate": 1.994236360619422e-05, "loss": 0.4616, "step": 713 }, { "epoch": 0.14077287066246058, "grad_norm": 0.7104447806333858, "learning_rate": 1.9942197307930014e-05, "loss": 0.4193, "step": 714 }, { "epoch": 0.14097003154574134, "grad_norm": 0.7967791879544678, "learning_rate": 1.994203077079651e-05, "loss": 0.4377, "step": 715 }, { "epoch": 0.1411671924290221, "grad_norm": 0.6852662543141802, "learning_rate": 1.994186399479771e-05, "loss": 0.4558, "step": 716 }, { "epoch": 0.14136435331230285, "grad_norm": 0.7293696917628472, "learning_rate": 1.9941696979937622e-05, "loss": 0.4695, "step": 717 }, { "epoch": 0.1415615141955836, "grad_norm": 1.0106248411363608, "learning_rate": 1.994152972622026e-05, "loss": 0.4496, "step": 718 }, { "epoch": 0.14175867507886436, "grad_norm": 2.069396673415166, "learning_rate": 1.994136223364964e-05, "loss": 0.5479, "step": 719 }, { "epoch": 0.14195583596214512, "grad_norm": 0.8625581928141414, "learning_rate": 1.994119450222978e-05, "loss": 0.5064, "step": 720 }, { "epoch": 0.14215299684542587, "grad_norm": 0.9612101953431659, "learning_rate": 1.9941026531964723e-05, "loss": 0.4676, "step": 721 }, { "epoch": 0.14235015772870663, "grad_norm": 0.775181863734654, "learning_rate": 1.9940858322858493e-05, "loss": 0.4736, "step": 722 }, { "epoch": 0.14254731861198738, "grad_norm": 1.0608368229238194, "learning_rate": 1.994068987491514e-05, "loss": 0.4761, "step": 723 }, { "epoch": 0.14274447949526814, "grad_norm": 0.7660677802463719, "learning_rate": 1.9940521188138707e-05, "loss": 0.4652, "step": 724 }, { "epoch": 0.1429416403785489, "grad_norm": 0.8853653109759493, "learning_rate": 1.9940352262533253e-05, "loss": 0.4351, "step": 725 }, { "epoch": 0.14313880126182965, "grad_norm": 0.8053400341791106, "learning_rate": 1.9940183098102823e-05, "loss": 0.4688, "step": 726 }, { "epoch": 0.1433359621451104, "grad_norm": 0.8403515646943758, "learning_rate": 1.9940013694851492e-05, "loss": 0.4709, "step": 727 }, { "epoch": 0.14353312302839116, "grad_norm": 0.8639620985700686, "learning_rate": 1.9939844052783328e-05, "loss": 0.44, "step": 728 }, { "epoch": 0.14373028391167192, "grad_norm": 0.7864708085272261, "learning_rate": 1.9939674171902406e-05, "loss": 0.4835, "step": 729 }, { "epoch": 0.14392744479495267, "grad_norm": 1.5378342164534347, "learning_rate": 1.9939504052212807e-05, "loss": 0.4371, "step": 730 }, { "epoch": 0.14412460567823343, "grad_norm": 0.8960131433484737, "learning_rate": 1.993933369371862e-05, "loss": 0.4814, "step": 731 }, { "epoch": 0.1443217665615142, "grad_norm": 1.2989860336198154, "learning_rate": 1.9939163096423936e-05, "loss": 0.4499, "step": 732 }, { "epoch": 0.14451892744479494, "grad_norm": 0.8858611738667077, "learning_rate": 1.9938992260332854e-05, "loss": 0.4327, "step": 733 }, { "epoch": 0.1447160883280757, "grad_norm": 1.1182251531253669, "learning_rate": 1.993882118544948e-05, "loss": 0.4575, "step": 734 }, { "epoch": 0.14491324921135645, "grad_norm": 1.181366460583882, "learning_rate": 1.993864987177792e-05, "loss": 0.4997, "step": 735 }, { "epoch": 0.14511041009463724, "grad_norm": 0.8730586316239893, "learning_rate": 1.9938478319322296e-05, "loss": 0.4279, "step": 736 }, { "epoch": 0.145307570977918, "grad_norm": 0.8622249699446333, "learning_rate": 1.9938306528086728e-05, "loss": 0.4689, "step": 737 }, { "epoch": 0.14550473186119875, "grad_norm": 0.856694039815468, "learning_rate": 1.9938134498075344e-05, "loss": 0.495, "step": 738 }, { "epoch": 0.1457018927444795, "grad_norm": 1.076447016915175, "learning_rate": 1.993796222929227e-05, "loss": 0.4981, "step": 739 }, { "epoch": 0.14589905362776026, "grad_norm": 1.1443003392978084, "learning_rate": 1.9937789721741654e-05, "loss": 0.4818, "step": 740 }, { "epoch": 0.14609621451104102, "grad_norm": 0.7913177157915059, "learning_rate": 1.9937616975427635e-05, "loss": 0.4423, "step": 741 }, { "epoch": 0.14629337539432177, "grad_norm": 0.8690811074204894, "learning_rate": 1.993744399035437e-05, "loss": 0.4583, "step": 742 }, { "epoch": 0.14649053627760253, "grad_norm": 0.8018130690999904, "learning_rate": 1.9937270766526007e-05, "loss": 0.4581, "step": 743 }, { "epoch": 0.14668769716088328, "grad_norm": 0.7068458870399968, "learning_rate": 1.9937097303946712e-05, "loss": 0.4365, "step": 744 }, { "epoch": 0.14688485804416404, "grad_norm": 1.106160247073608, "learning_rate": 1.993692360262065e-05, "loss": 0.4621, "step": 745 }, { "epoch": 0.1470820189274448, "grad_norm": 1.2660319824619375, "learning_rate": 1.9936749662552e-05, "loss": 0.4606, "step": 746 }, { "epoch": 0.14727917981072555, "grad_norm": 1.0379124369378552, "learning_rate": 1.9936575483744934e-05, "loss": 0.5044, "step": 747 }, { "epoch": 0.1474763406940063, "grad_norm": 2.817773387208584, "learning_rate": 1.993640106620364e-05, "loss": 0.4786, "step": 748 }, { "epoch": 0.14767350157728706, "grad_norm": 1.2455773315165135, "learning_rate": 1.993622640993231e-05, "loss": 0.4406, "step": 749 }, { "epoch": 0.14787066246056782, "grad_norm": 0.8090742340724816, "learning_rate": 1.993605151493514e-05, "loss": 0.4581, "step": 750 }, { "epoch": 0.14806782334384858, "grad_norm": 1.079672829303502, "learning_rate": 1.9935876381216327e-05, "loss": 0.4869, "step": 751 }, { "epoch": 0.14826498422712933, "grad_norm": 1.1108900962579644, "learning_rate": 1.993570100878009e-05, "loss": 0.4919, "step": 752 }, { "epoch": 0.1484621451104101, "grad_norm": 0.9754359159817712, "learning_rate": 1.993552539763063e-05, "loss": 0.4823, "step": 753 }, { "epoch": 0.14865930599369084, "grad_norm": 0.9888856468493855, "learning_rate": 1.9935349547772168e-05, "loss": 0.4685, "step": 754 }, { "epoch": 0.1488564668769716, "grad_norm": 0.8379931331598425, "learning_rate": 1.993517345920894e-05, "loss": 0.4936, "step": 755 }, { "epoch": 0.14905362776025236, "grad_norm": 1.0605810276155365, "learning_rate": 1.9934997131945165e-05, "loss": 0.5102, "step": 756 }, { "epoch": 0.1492507886435331, "grad_norm": 0.7915803452960315, "learning_rate": 1.993482056598508e-05, "loss": 0.4434, "step": 757 }, { "epoch": 0.14944794952681387, "grad_norm": 0.9745418084289177, "learning_rate": 1.9934643761332933e-05, "loss": 0.493, "step": 758 }, { "epoch": 0.14964511041009465, "grad_norm": 0.8122670056237417, "learning_rate": 1.993446671799297e-05, "loss": 0.4966, "step": 759 }, { "epoch": 0.1498422712933754, "grad_norm": 0.8242151071688861, "learning_rate": 1.9934289435969443e-05, "loss": 0.4771, "step": 760 }, { "epoch": 0.15003943217665616, "grad_norm": 0.8669205650067113, "learning_rate": 1.9934111915266614e-05, "loss": 0.4658, "step": 761 }, { "epoch": 0.15023659305993692, "grad_norm": 0.8210061083466603, "learning_rate": 1.9933934155888745e-05, "loss": 0.4503, "step": 762 }, { "epoch": 0.15043375394321767, "grad_norm": 0.8739389286616153, "learning_rate": 1.993375615784011e-05, "loss": 0.4426, "step": 763 }, { "epoch": 0.15063091482649843, "grad_norm": 0.8857407360915495, "learning_rate": 1.993357792112498e-05, "loss": 0.4337, "step": 764 }, { "epoch": 0.15082807570977919, "grad_norm": 0.8810531719213002, "learning_rate": 1.9933399445747645e-05, "loss": 0.4614, "step": 765 }, { "epoch": 0.15102523659305994, "grad_norm": 0.8157853115519594, "learning_rate": 1.9933220731712385e-05, "loss": 0.4531, "step": 766 }, { "epoch": 0.1512223974763407, "grad_norm": 0.6694213590950058, "learning_rate": 1.9933041779023502e-05, "loss": 0.4429, "step": 767 }, { "epoch": 0.15141955835962145, "grad_norm": 0.8193309247418029, "learning_rate": 1.993286258768529e-05, "loss": 0.4641, "step": 768 }, { "epoch": 0.1516167192429022, "grad_norm": 1.13684291197781, "learning_rate": 1.9932683157702054e-05, "loss": 0.4807, "step": 769 }, { "epoch": 0.15181388012618297, "grad_norm": 0.7712724217918977, "learning_rate": 1.9932503489078105e-05, "loss": 0.496, "step": 770 }, { "epoch": 0.15201104100946372, "grad_norm": 0.7302957702591549, "learning_rate": 1.993232358181776e-05, "loss": 0.4562, "step": 771 }, { "epoch": 0.15220820189274448, "grad_norm": 2.0853914712696477, "learning_rate": 1.9932143435925346e-05, "loss": 0.4809, "step": 772 }, { "epoch": 0.15240536277602523, "grad_norm": 0.7477688111923385, "learning_rate": 1.993196305140519e-05, "loss": 0.4518, "step": 773 }, { "epoch": 0.152602523659306, "grad_norm": 0.7646481115742062, "learning_rate": 1.993178242826162e-05, "loss": 0.4686, "step": 774 }, { "epoch": 0.15279968454258674, "grad_norm": 0.7688921576232522, "learning_rate": 1.9931601566498976e-05, "loss": 0.4965, "step": 775 }, { "epoch": 0.1529968454258675, "grad_norm": 0.749864201600683, "learning_rate": 1.9931420466121613e-05, "loss": 0.435, "step": 776 }, { "epoch": 0.15319400630914826, "grad_norm": 0.856076678547743, "learning_rate": 1.993123912713387e-05, "loss": 0.5236, "step": 777 }, { "epoch": 0.153391167192429, "grad_norm": 1.1208704076088636, "learning_rate": 1.9931057549540114e-05, "loss": 0.5461, "step": 778 }, { "epoch": 0.15358832807570977, "grad_norm": 1.6176725348893128, "learning_rate": 1.9930875733344698e-05, "loss": 0.4927, "step": 779 }, { "epoch": 0.15378548895899052, "grad_norm": 0.6955630821889409, "learning_rate": 1.9930693678552e-05, "loss": 0.4298, "step": 780 }, { "epoch": 0.15398264984227128, "grad_norm": 0.7928697755514822, "learning_rate": 1.9930511385166388e-05, "loss": 0.4368, "step": 781 }, { "epoch": 0.15417981072555206, "grad_norm": 0.9287503152149722, "learning_rate": 1.9930328853192243e-05, "loss": 0.4308, "step": 782 }, { "epoch": 0.15437697160883282, "grad_norm": 0.9144564030468558, "learning_rate": 1.993014608263395e-05, "loss": 0.4802, "step": 783 }, { "epoch": 0.15457413249211358, "grad_norm": 0.7843727384732332, "learning_rate": 1.9929963073495896e-05, "loss": 0.4635, "step": 784 }, { "epoch": 0.15477129337539433, "grad_norm": 0.8191880589199644, "learning_rate": 1.992977982578249e-05, "loss": 0.4816, "step": 785 }, { "epoch": 0.1549684542586751, "grad_norm": 0.8904807870661935, "learning_rate": 1.9929596339498122e-05, "loss": 0.47, "step": 786 }, { "epoch": 0.15516561514195584, "grad_norm": 1.8183390384125857, "learning_rate": 1.9929412614647207e-05, "loss": 0.4631, "step": 787 }, { "epoch": 0.1553627760252366, "grad_norm": 0.8216800794838671, "learning_rate": 1.992922865123416e-05, "loss": 0.4264, "step": 788 }, { "epoch": 0.15555993690851735, "grad_norm": 0.7165148280953453, "learning_rate": 1.9929044449263397e-05, "loss": 0.4674, "step": 789 }, { "epoch": 0.1557570977917981, "grad_norm": 1.52791161060643, "learning_rate": 1.9928860008739343e-05, "loss": 0.5036, "step": 790 }, { "epoch": 0.15595425867507887, "grad_norm": 0.9923711638162429, "learning_rate": 1.9928675329666435e-05, "loss": 0.4778, "step": 791 }, { "epoch": 0.15615141955835962, "grad_norm": 0.8551576282277356, "learning_rate": 1.9928490412049108e-05, "loss": 0.464, "step": 792 }, { "epoch": 0.15634858044164038, "grad_norm": 0.7732867427931409, "learning_rate": 1.99283052558918e-05, "loss": 0.4623, "step": 793 }, { "epoch": 0.15654574132492113, "grad_norm": 0.9096270847699062, "learning_rate": 1.9928119861198962e-05, "loss": 0.5047, "step": 794 }, { "epoch": 0.1567429022082019, "grad_norm": 0.913028519166887, "learning_rate": 1.9927934227975054e-05, "loss": 0.4832, "step": 795 }, { "epoch": 0.15694006309148265, "grad_norm": 0.7830739067460275, "learning_rate": 1.9927748356224528e-05, "loss": 0.4744, "step": 796 }, { "epoch": 0.1571372239747634, "grad_norm": 0.8632625237500243, "learning_rate": 1.9927562245951854e-05, "loss": 0.4458, "step": 797 }, { "epoch": 0.15733438485804416, "grad_norm": 0.7175002952446166, "learning_rate": 1.9927375897161502e-05, "loss": 0.4602, "step": 798 }, { "epoch": 0.1575315457413249, "grad_norm": 0.8959680573938172, "learning_rate": 1.9927189309857948e-05, "loss": 0.4611, "step": 799 }, { "epoch": 0.15772870662460567, "grad_norm": 0.8010317830596815, "learning_rate": 1.9927002484045678e-05, "loss": 0.4761, "step": 800 }, { "epoch": 0.15792586750788642, "grad_norm": 0.704519354931981, "learning_rate": 1.992681541972918e-05, "loss": 0.4654, "step": 801 }, { "epoch": 0.15812302839116718, "grad_norm": 1.0431620409267255, "learning_rate": 1.9926628116912946e-05, "loss": 0.519, "step": 802 }, { "epoch": 0.15832018927444794, "grad_norm": 0.743437599527334, "learning_rate": 1.992644057560148e-05, "loss": 0.4452, "step": 803 }, { "epoch": 0.15851735015772872, "grad_norm": 2.131396450457408, "learning_rate": 1.992625279579928e-05, "loss": 0.4587, "step": 804 }, { "epoch": 0.15871451104100948, "grad_norm": 1.6197122809814877, "learning_rate": 1.992606477751087e-05, "loss": 0.4668, "step": 805 }, { "epoch": 0.15891167192429023, "grad_norm": 0.9266315678990761, "learning_rate": 1.9925876520740758e-05, "loss": 0.5003, "step": 806 }, { "epoch": 0.159108832807571, "grad_norm": 1.3138014974549306, "learning_rate": 1.9925688025493468e-05, "loss": 0.4816, "step": 807 }, { "epoch": 0.15930599369085174, "grad_norm": 0.7340535577058777, "learning_rate": 1.9925499291773528e-05, "loss": 0.4804, "step": 808 }, { "epoch": 0.1595031545741325, "grad_norm": 1.1487683565124704, "learning_rate": 1.9925310319585475e-05, "loss": 0.4804, "step": 809 }, { "epoch": 0.15970031545741326, "grad_norm": 0.7305026939171207, "learning_rate": 1.9925121108933852e-05, "loss": 0.4774, "step": 810 }, { "epoch": 0.159897476340694, "grad_norm": 0.9950230997430713, "learning_rate": 1.99249316598232e-05, "loss": 0.4638, "step": 811 }, { "epoch": 0.16009463722397477, "grad_norm": 0.857287078084844, "learning_rate": 1.9924741972258076e-05, "loss": 0.4742, "step": 812 }, { "epoch": 0.16029179810725552, "grad_norm": 0.8005250545507363, "learning_rate": 1.9924552046243026e-05, "loss": 0.488, "step": 813 }, { "epoch": 0.16048895899053628, "grad_norm": 0.90398378601464, "learning_rate": 1.9924361881782625e-05, "loss": 0.4677, "step": 814 }, { "epoch": 0.16068611987381703, "grad_norm": 0.7372247300869629, "learning_rate": 1.992417147888144e-05, "loss": 0.4529, "step": 815 }, { "epoch": 0.1608832807570978, "grad_norm": 0.8887941980027961, "learning_rate": 1.992398083754404e-05, "loss": 0.4629, "step": 816 }, { "epoch": 0.16108044164037855, "grad_norm": 0.6535394579923204, "learning_rate": 1.992378995777501e-05, "loss": 0.4515, "step": 817 }, { "epoch": 0.1612776025236593, "grad_norm": 0.84734882482852, "learning_rate": 1.9923598839578937e-05, "loss": 0.4263, "step": 818 }, { "epoch": 0.16147476340694006, "grad_norm": 0.7925717387308715, "learning_rate": 1.9923407482960408e-05, "loss": 0.4775, "step": 819 }, { "epoch": 0.16167192429022081, "grad_norm": 0.8742951540598778, "learning_rate": 1.9923215887924022e-05, "loss": 0.4637, "step": 820 }, { "epoch": 0.16186908517350157, "grad_norm": 0.7279557612080059, "learning_rate": 1.9923024054474384e-05, "loss": 0.4636, "step": 821 }, { "epoch": 0.16206624605678233, "grad_norm": 0.8534513364783904, "learning_rate": 1.99228319826161e-05, "loss": 0.4857, "step": 822 }, { "epoch": 0.16226340694006308, "grad_norm": 0.7402713942104387, "learning_rate": 1.992263967235379e-05, "loss": 0.481, "step": 823 }, { "epoch": 0.16246056782334384, "grad_norm": 0.7153338719389193, "learning_rate": 1.992244712369207e-05, "loss": 0.4457, "step": 824 }, { "epoch": 0.1626577287066246, "grad_norm": 1.4728339073471215, "learning_rate": 1.9922254336635567e-05, "loss": 0.4514, "step": 825 }, { "epoch": 0.16285488958990535, "grad_norm": 0.8791659930736712, "learning_rate": 1.9922061311188914e-05, "loss": 0.4852, "step": 826 }, { "epoch": 0.16305205047318613, "grad_norm": 0.7681632672109859, "learning_rate": 1.9921868047356747e-05, "loss": 0.4869, "step": 827 }, { "epoch": 0.1632492113564669, "grad_norm": 0.8229171295277476, "learning_rate": 1.992167454514371e-05, "loss": 0.4828, "step": 828 }, { "epoch": 0.16344637223974764, "grad_norm": 0.7529692381939216, "learning_rate": 1.9921480804554453e-05, "loss": 0.46, "step": 829 }, { "epoch": 0.1636435331230284, "grad_norm": 0.9423934940595431, "learning_rate": 1.9921286825593632e-05, "loss": 0.4418, "step": 830 }, { "epoch": 0.16384069400630916, "grad_norm": 0.8322161382525551, "learning_rate": 1.9921092608265902e-05, "loss": 0.4489, "step": 831 }, { "epoch": 0.1640378548895899, "grad_norm": 0.8265863211632591, "learning_rate": 1.9920898152575932e-05, "loss": 0.457, "step": 832 }, { "epoch": 0.16423501577287067, "grad_norm": 0.8016187387195418, "learning_rate": 1.99207034585284e-05, "loss": 0.4935, "step": 833 }, { "epoch": 0.16443217665615142, "grad_norm": 0.7562870006268851, "learning_rate": 1.992050852612797e-05, "loss": 0.4901, "step": 834 }, { "epoch": 0.16462933753943218, "grad_norm": 0.8037484145010387, "learning_rate": 1.992031335537934e-05, "loss": 0.4905, "step": 835 }, { "epoch": 0.16482649842271294, "grad_norm": 0.8098114037544843, "learning_rate": 1.9920117946287193e-05, "loss": 0.465, "step": 836 }, { "epoch": 0.1650236593059937, "grad_norm": 0.8269800376599079, "learning_rate": 1.991992229885622e-05, "loss": 0.4735, "step": 837 }, { "epoch": 0.16522082018927445, "grad_norm": 0.7558534353840859, "learning_rate": 1.9919726413091127e-05, "loss": 0.4824, "step": 838 }, { "epoch": 0.1654179810725552, "grad_norm": 0.766057843711096, "learning_rate": 1.9919530288996617e-05, "loss": 0.4509, "step": 839 }, { "epoch": 0.16561514195583596, "grad_norm": 0.8748898217978727, "learning_rate": 1.9919333926577406e-05, "loss": 0.442, "step": 840 }, { "epoch": 0.16581230283911672, "grad_norm": 0.8934360442693702, "learning_rate": 1.9919137325838208e-05, "loss": 0.5239, "step": 841 }, { "epoch": 0.16600946372239747, "grad_norm": 0.8552477164679089, "learning_rate": 1.9918940486783752e-05, "loss": 0.4645, "step": 842 }, { "epoch": 0.16620662460567823, "grad_norm": 0.7957360554655829, "learning_rate": 1.9918743409418756e-05, "loss": 0.4883, "step": 843 }, { "epoch": 0.16640378548895898, "grad_norm": 0.6874695944020062, "learning_rate": 1.9918546093747965e-05, "loss": 0.4179, "step": 844 }, { "epoch": 0.16660094637223974, "grad_norm": 0.7762889366838219, "learning_rate": 1.991834853977612e-05, "loss": 0.4905, "step": 845 }, { "epoch": 0.1667981072555205, "grad_norm": 0.7696965320259346, "learning_rate": 1.9918150747507963e-05, "loss": 0.4404, "step": 846 }, { "epoch": 0.16699526813880125, "grad_norm": 0.8072545032949946, "learning_rate": 1.9917952716948243e-05, "loss": 0.4681, "step": 847 }, { "epoch": 0.167192429022082, "grad_norm": 0.8777353512586558, "learning_rate": 1.9917754448101725e-05, "loss": 0.4939, "step": 848 }, { "epoch": 0.16738958990536276, "grad_norm": 0.7082853265302725, "learning_rate": 1.991755594097317e-05, "loss": 0.4543, "step": 849 }, { "epoch": 0.16758675078864355, "grad_norm": 0.8604796366799073, "learning_rate": 1.9917357195567347e-05, "loss": 0.4454, "step": 850 }, { "epoch": 0.1677839116719243, "grad_norm": 0.6898391954794304, "learning_rate": 1.991715821188903e-05, "loss": 0.4238, "step": 851 }, { "epoch": 0.16798107255520506, "grad_norm": 0.8800493119077631, "learning_rate": 1.9916958989943002e-05, "loss": 0.4569, "step": 852 }, { "epoch": 0.1681782334384858, "grad_norm": 0.8417073275173806, "learning_rate": 1.9916759529734046e-05, "loss": 0.4777, "step": 853 }, { "epoch": 0.16837539432176657, "grad_norm": 0.7791244231651858, "learning_rate": 1.9916559831266957e-05, "loss": 0.4544, "step": 854 }, { "epoch": 0.16857255520504733, "grad_norm": 0.8140536453201463, "learning_rate": 1.9916359894546534e-05, "loss": 0.4634, "step": 855 }, { "epoch": 0.16876971608832808, "grad_norm": 0.7303022471726613, "learning_rate": 1.9916159719577577e-05, "loss": 0.4519, "step": 856 }, { "epoch": 0.16896687697160884, "grad_norm": 0.7729780959678046, "learning_rate": 1.9915959306364897e-05, "loss": 0.422, "step": 857 }, { "epoch": 0.1691640378548896, "grad_norm": 0.8034659806730786, "learning_rate": 1.9915758654913313e-05, "loss": 0.4942, "step": 858 }, { "epoch": 0.16936119873817035, "grad_norm": 0.7897233162629976, "learning_rate": 1.991555776522764e-05, "loss": 0.4762, "step": 859 }, { "epoch": 0.1695583596214511, "grad_norm": 0.7803962061080867, "learning_rate": 1.9915356637312704e-05, "loss": 0.4649, "step": 860 }, { "epoch": 0.16975552050473186, "grad_norm": 0.9547947469437102, "learning_rate": 1.991515527117334e-05, "loss": 0.4652, "step": 861 }, { "epoch": 0.16995268138801262, "grad_norm": 0.8072759008329805, "learning_rate": 1.9914953666814392e-05, "loss": 0.4665, "step": 862 }, { "epoch": 0.17014984227129337, "grad_norm": 0.8077676711668577, "learning_rate": 1.9914751824240694e-05, "loss": 0.4617, "step": 863 }, { "epoch": 0.17034700315457413, "grad_norm": 1.293222408833284, "learning_rate": 1.9914549743457096e-05, "loss": 0.4666, "step": 864 }, { "epoch": 0.17054416403785488, "grad_norm": 0.7923729068722024, "learning_rate": 1.991434742446846e-05, "loss": 0.4261, "step": 865 }, { "epoch": 0.17074132492113564, "grad_norm": 14.718649448920804, "learning_rate": 1.9914144867279644e-05, "loss": 0.5072, "step": 866 }, { "epoch": 0.1709384858044164, "grad_norm": 0.831483953129002, "learning_rate": 1.991394207189551e-05, "loss": 0.4281, "step": 867 }, { "epoch": 0.17113564668769715, "grad_norm": 0.8157367156824404, "learning_rate": 1.9913739038320935e-05, "loss": 0.4886, "step": 868 }, { "epoch": 0.1713328075709779, "grad_norm": 0.747934401454802, "learning_rate": 1.99135357665608e-05, "loss": 0.464, "step": 869 }, { "epoch": 0.17152996845425866, "grad_norm": 0.7940635147564067, "learning_rate": 1.991333225661998e-05, "loss": 0.446, "step": 870 }, { "epoch": 0.17172712933753942, "grad_norm": 0.7556559635824, "learning_rate": 1.9913128508503373e-05, "loss": 0.4187, "step": 871 }, { "epoch": 0.1719242902208202, "grad_norm": 0.937585969395132, "learning_rate": 1.991292452221587e-05, "loss": 0.4627, "step": 872 }, { "epoch": 0.17212145110410096, "grad_norm": 0.7287727570911343, "learning_rate": 1.9912720297762372e-05, "loss": 0.4412, "step": 873 }, { "epoch": 0.17231861198738171, "grad_norm": 1.4483761847244239, "learning_rate": 1.9912515835147785e-05, "loss": 0.4702, "step": 874 }, { "epoch": 0.17251577287066247, "grad_norm": 0.7776251644712487, "learning_rate": 1.9912311134377023e-05, "loss": 0.4827, "step": 875 }, { "epoch": 0.17271293375394323, "grad_norm": 0.811467179063533, "learning_rate": 1.9912106195455002e-05, "loss": 0.4303, "step": 876 }, { "epoch": 0.17291009463722398, "grad_norm": 0.7347947732597557, "learning_rate": 1.991190101838665e-05, "loss": 0.48, "step": 877 }, { "epoch": 0.17310725552050474, "grad_norm": 0.7700860751914179, "learning_rate": 1.9911695603176896e-05, "loss": 0.4418, "step": 878 }, { "epoch": 0.1733044164037855, "grad_norm": 0.7424955632937327, "learning_rate": 1.9911489949830665e-05, "loss": 0.4778, "step": 879 }, { "epoch": 0.17350157728706625, "grad_norm": 0.6988442966765853, "learning_rate": 1.9911284058352916e-05, "loss": 0.4955, "step": 880 }, { "epoch": 0.173698738170347, "grad_norm": 1.201129785880802, "learning_rate": 1.9911077928748577e-05, "loss": 0.5094, "step": 881 }, { "epoch": 0.17389589905362776, "grad_norm": 0.7077413544925765, "learning_rate": 1.9910871561022617e-05, "loss": 0.4489, "step": 882 }, { "epoch": 0.17409305993690852, "grad_norm": 0.7332451772736706, "learning_rate": 1.9910664955179983e-05, "loss": 0.4479, "step": 883 }, { "epoch": 0.17429022082018927, "grad_norm": 0.7177585522779164, "learning_rate": 1.9910458111225645e-05, "loss": 0.428, "step": 884 }, { "epoch": 0.17448738170347003, "grad_norm": 0.6737153923574553, "learning_rate": 1.9910251029164568e-05, "loss": 0.4288, "step": 885 }, { "epoch": 0.17468454258675079, "grad_norm": 0.8450869761757991, "learning_rate": 1.9910043709001727e-05, "loss": 0.4537, "step": 886 }, { "epoch": 0.17488170347003154, "grad_norm": 0.6872411051621798, "learning_rate": 1.990983615074211e-05, "loss": 0.4449, "step": 887 }, { "epoch": 0.1750788643533123, "grad_norm": 0.8721136747808786, "learning_rate": 1.9909628354390697e-05, "loss": 0.4752, "step": 888 }, { "epoch": 0.17527602523659305, "grad_norm": 0.7655080513181773, "learning_rate": 1.990942031995248e-05, "loss": 0.4868, "step": 889 }, { "epoch": 0.1754731861198738, "grad_norm": 0.7199528165114193, "learning_rate": 1.9909212047432465e-05, "loss": 0.4528, "step": 890 }, { "epoch": 0.17567034700315456, "grad_norm": 0.7061497459915089, "learning_rate": 1.990900353683565e-05, "loss": 0.455, "step": 891 }, { "epoch": 0.17586750788643532, "grad_norm": 2.291375288631725, "learning_rate": 1.990879478816704e-05, "loss": 0.4722, "step": 892 }, { "epoch": 0.17606466876971608, "grad_norm": 0.8123591618774287, "learning_rate": 1.9908585801431658e-05, "loss": 0.4706, "step": 893 }, { "epoch": 0.17626182965299683, "grad_norm": 0.785222829757174, "learning_rate": 1.9908376576634526e-05, "loss": 0.4608, "step": 894 }, { "epoch": 0.17645899053627762, "grad_norm": 0.7633221752691605, "learning_rate": 1.9908167113780665e-05, "loss": 0.4663, "step": 895 }, { "epoch": 0.17665615141955837, "grad_norm": 0.7931603437941884, "learning_rate": 1.990795741287511e-05, "loss": 0.4979, "step": 896 }, { "epoch": 0.17685331230283913, "grad_norm": 0.740593821102, "learning_rate": 1.99077474739229e-05, "loss": 0.4595, "step": 897 }, { "epoch": 0.17705047318611988, "grad_norm": 0.7137127142445181, "learning_rate": 1.9907537296929077e-05, "loss": 0.468, "step": 898 }, { "epoch": 0.17724763406940064, "grad_norm": 0.7856236565024629, "learning_rate": 1.9907326881898693e-05, "loss": 0.4532, "step": 899 }, { "epoch": 0.1774447949526814, "grad_norm": 0.8304106213478027, "learning_rate": 1.99071162288368e-05, "loss": 0.4835, "step": 900 }, { "epoch": 0.17764195583596215, "grad_norm": 0.7075946238835652, "learning_rate": 1.9906905337748466e-05, "loss": 0.4415, "step": 901 }, { "epoch": 0.1778391167192429, "grad_norm": 0.7255892333472377, "learning_rate": 1.990669420863875e-05, "loss": 0.462, "step": 902 }, { "epoch": 0.17803627760252366, "grad_norm": 0.7551817312285064, "learning_rate": 1.990648284151273e-05, "loss": 0.4352, "step": 903 }, { "epoch": 0.17823343848580442, "grad_norm": 0.7462637275359422, "learning_rate": 1.9906271236375478e-05, "loss": 0.4635, "step": 904 }, { "epoch": 0.17843059936908517, "grad_norm": 0.7285489399185598, "learning_rate": 1.9906059393232088e-05, "loss": 0.4696, "step": 905 }, { "epoch": 0.17862776025236593, "grad_norm": 0.789466645420734, "learning_rate": 1.990584731208764e-05, "loss": 0.4968, "step": 906 }, { "epoch": 0.17882492113564669, "grad_norm": 2.968306758137648, "learning_rate": 1.9905634992947235e-05, "loss": 0.491, "step": 907 }, { "epoch": 0.17902208201892744, "grad_norm": 0.7750070374318527, "learning_rate": 1.990542243581597e-05, "loss": 0.4703, "step": 908 }, { "epoch": 0.1792192429022082, "grad_norm": 0.695028794150739, "learning_rate": 1.9905209640698952e-05, "loss": 0.4642, "step": 909 }, { "epoch": 0.17941640378548895, "grad_norm": 0.7695918165395215, "learning_rate": 1.9904996607601303e-05, "loss": 0.4552, "step": 910 }, { "epoch": 0.1796135646687697, "grad_norm": 0.7202892429327876, "learning_rate": 1.9904783336528128e-05, "loss": 0.4646, "step": 911 }, { "epoch": 0.17981072555205047, "grad_norm": 0.8135132238328896, "learning_rate": 1.9904569827484556e-05, "loss": 0.4594, "step": 912 }, { "epoch": 0.18000788643533122, "grad_norm": 0.7554729773822189, "learning_rate": 1.990435608047572e-05, "loss": 0.4008, "step": 913 }, { "epoch": 0.18020504731861198, "grad_norm": 0.737373759645686, "learning_rate": 1.9904142095506756e-05, "loss": 0.4335, "step": 914 }, { "epoch": 0.18040220820189273, "grad_norm": 0.8409703840361105, "learning_rate": 1.99039278725828e-05, "loss": 0.4612, "step": 915 }, { "epoch": 0.1805993690851735, "grad_norm": 0.6825714924533132, "learning_rate": 1.9903713411709003e-05, "loss": 0.4225, "step": 916 }, { "epoch": 0.18079652996845424, "grad_norm": 0.9821803862497531, "learning_rate": 1.9903498712890516e-05, "loss": 0.4981, "step": 917 }, { "epoch": 0.18099369085173503, "grad_norm": 0.7273123016078351, "learning_rate": 1.9903283776132495e-05, "loss": 0.4461, "step": 918 }, { "epoch": 0.18119085173501578, "grad_norm": 0.8665342753995058, "learning_rate": 1.9903068601440106e-05, "loss": 0.4452, "step": 919 }, { "epoch": 0.18138801261829654, "grad_norm": 0.7822855298339977, "learning_rate": 1.9902853188818518e-05, "loss": 0.4655, "step": 920 }, { "epoch": 0.1815851735015773, "grad_norm": 0.8357899870781356, "learning_rate": 1.990263753827291e-05, "loss": 0.4416, "step": 921 }, { "epoch": 0.18178233438485805, "grad_norm": 0.6936838258206445, "learning_rate": 1.990242164980846e-05, "loss": 0.4612, "step": 922 }, { "epoch": 0.1819794952681388, "grad_norm": 0.8050591566486436, "learning_rate": 1.9902205523430353e-05, "loss": 0.4465, "step": 923 }, { "epoch": 0.18217665615141956, "grad_norm": 1.0415997120130642, "learning_rate": 1.9901989159143786e-05, "loss": 0.4404, "step": 924 }, { "epoch": 0.18237381703470032, "grad_norm": 0.7976243404123774, "learning_rate": 1.9901772556953958e-05, "loss": 0.4767, "step": 925 }, { "epoch": 0.18257097791798108, "grad_norm": 0.7036095691664724, "learning_rate": 1.9901555716866067e-05, "loss": 0.4479, "step": 926 }, { "epoch": 0.18276813880126183, "grad_norm": 0.7746405790517535, "learning_rate": 1.9901338638885327e-05, "loss": 0.4514, "step": 927 }, { "epoch": 0.1829652996845426, "grad_norm": 0.7905763277511705, "learning_rate": 1.9901121323016955e-05, "loss": 0.4947, "step": 928 }, { "epoch": 0.18316246056782334, "grad_norm": 0.8011819612935877, "learning_rate": 1.9900903769266167e-05, "loss": 0.4711, "step": 929 }, { "epoch": 0.1833596214511041, "grad_norm": 0.7885618953274046, "learning_rate": 1.9900685977638194e-05, "loss": 0.4474, "step": 930 }, { "epoch": 0.18355678233438485, "grad_norm": 0.8628719762309652, "learning_rate": 1.9900467948138266e-05, "loss": 0.484, "step": 931 }, { "epoch": 0.1837539432176656, "grad_norm": 0.8426852896992455, "learning_rate": 1.9900249680771622e-05, "loss": 0.4597, "step": 932 }, { "epoch": 0.18395110410094637, "grad_norm": 0.8844622895009938, "learning_rate": 1.990003117554351e-05, "loss": 0.4837, "step": 933 }, { "epoch": 0.18414826498422712, "grad_norm": 0.7099824020200362, "learning_rate": 1.9899812432459175e-05, "loss": 0.4425, "step": 934 }, { "epoch": 0.18434542586750788, "grad_norm": 0.7629633321327819, "learning_rate": 1.9899593451523875e-05, "loss": 0.4526, "step": 935 }, { "epoch": 0.18454258675078863, "grad_norm": 0.8792149544075311, "learning_rate": 1.989937423274287e-05, "loss": 0.4682, "step": 936 }, { "epoch": 0.1847397476340694, "grad_norm": 0.6838639839900289, "learning_rate": 1.9899154776121424e-05, "loss": 0.4261, "step": 937 }, { "epoch": 0.18493690851735015, "grad_norm": 0.706142676214522, "learning_rate": 1.9898935081664814e-05, "loss": 0.4805, "step": 938 }, { "epoch": 0.1851340694006309, "grad_norm": 0.6859539690000207, "learning_rate": 1.9898715149378317e-05, "loss": 0.4706, "step": 939 }, { "epoch": 0.18533123028391169, "grad_norm": 0.6812899026189113, "learning_rate": 1.989849497926722e-05, "loss": 0.4256, "step": 940 }, { "epoch": 0.18552839116719244, "grad_norm": 0.6577900927420254, "learning_rate": 1.989827457133681e-05, "loss": 0.4463, "step": 941 }, { "epoch": 0.1857255520504732, "grad_norm": 0.7514857045339612, "learning_rate": 1.9898053925592376e-05, "loss": 0.4591, "step": 942 }, { "epoch": 0.18592271293375395, "grad_norm": 0.6549094044973097, "learning_rate": 1.9897833042039233e-05, "loss": 0.4623, "step": 943 }, { "epoch": 0.1861198738170347, "grad_norm": 0.7310174851562488, "learning_rate": 1.9897611920682676e-05, "loss": 0.4487, "step": 944 }, { "epoch": 0.18631703470031546, "grad_norm": 0.6963083495505222, "learning_rate": 1.9897390561528024e-05, "loss": 0.4947, "step": 945 }, { "epoch": 0.18651419558359622, "grad_norm": 0.6460049210995387, "learning_rate": 1.9897168964580594e-05, "loss": 0.4631, "step": 946 }, { "epoch": 0.18671135646687698, "grad_norm": 0.7279929742381652, "learning_rate": 1.9896947129845707e-05, "loss": 0.454, "step": 947 }, { "epoch": 0.18690851735015773, "grad_norm": 0.7314455893730071, "learning_rate": 1.9896725057328695e-05, "loss": 0.468, "step": 948 }, { "epoch": 0.1871056782334385, "grad_norm": 0.707927540023582, "learning_rate": 1.9896502747034894e-05, "loss": 0.475, "step": 949 }, { "epoch": 0.18730283911671924, "grad_norm": 0.8331091743901959, "learning_rate": 1.989628019896965e-05, "loss": 0.467, "step": 950 }, { "epoch": 0.1875, "grad_norm": 0.7225918783951755, "learning_rate": 1.98960574131383e-05, "loss": 0.477, "step": 951 }, { "epoch": 0.18769716088328076, "grad_norm": 0.6757638681754824, "learning_rate": 1.9895834389546204e-05, "loss": 0.4572, "step": 952 }, { "epoch": 0.1878943217665615, "grad_norm": 0.7206616057098645, "learning_rate": 1.9895611128198714e-05, "loss": 0.4644, "step": 953 }, { "epoch": 0.18809148264984227, "grad_norm": 0.6474073350668482, "learning_rate": 1.9895387629101203e-05, "loss": 0.4484, "step": 954 }, { "epoch": 0.18828864353312302, "grad_norm": 0.7027079831141624, "learning_rate": 1.989516389225903e-05, "loss": 0.4845, "step": 955 }, { "epoch": 0.18848580441640378, "grad_norm": 0.6725377762031143, "learning_rate": 1.9894939917677577e-05, "loss": 0.4592, "step": 956 }, { "epoch": 0.18868296529968454, "grad_norm": 0.6747407598105398, "learning_rate": 1.9894715705362227e-05, "loss": 0.4316, "step": 957 }, { "epoch": 0.1888801261829653, "grad_norm": 0.990350135133828, "learning_rate": 1.9894491255318362e-05, "loss": 0.4676, "step": 958 }, { "epoch": 0.18907728706624605, "grad_norm": 0.6877906835243988, "learning_rate": 1.9894266567551378e-05, "loss": 0.469, "step": 959 }, { "epoch": 0.1892744479495268, "grad_norm": 0.6882946457373598, "learning_rate": 1.989404164206667e-05, "loss": 0.4932, "step": 960 }, { "epoch": 0.18947160883280756, "grad_norm": 0.7061372635020192, "learning_rate": 1.9893816478869646e-05, "loss": 0.484, "step": 961 }, { "epoch": 0.18966876971608831, "grad_norm": 0.6724470559392719, "learning_rate": 1.989359107796571e-05, "loss": 0.4332, "step": 962 }, { "epoch": 0.1898659305993691, "grad_norm": 0.7625542937410119, "learning_rate": 1.9893365439360285e-05, "loss": 0.4763, "step": 963 }, { "epoch": 0.19006309148264985, "grad_norm": 0.6953307451809908, "learning_rate": 1.9893139563058786e-05, "loss": 0.4958, "step": 964 }, { "epoch": 0.1902602523659306, "grad_norm": 0.7466843034274665, "learning_rate": 1.9892913449066643e-05, "loss": 0.4958, "step": 965 }, { "epoch": 0.19045741324921137, "grad_norm": 0.6585282349015559, "learning_rate": 1.9892687097389288e-05, "loss": 0.4224, "step": 966 }, { "epoch": 0.19065457413249212, "grad_norm": 0.6883957756753183, "learning_rate": 1.9892460508032158e-05, "loss": 0.4603, "step": 967 }, { "epoch": 0.19085173501577288, "grad_norm": 0.7020929418484416, "learning_rate": 1.9892233681000696e-05, "loss": 0.4644, "step": 968 }, { "epoch": 0.19104889589905363, "grad_norm": 0.659651299875094, "learning_rate": 1.9892006616300358e-05, "loss": 0.4378, "step": 969 }, { "epoch": 0.1912460567823344, "grad_norm": 0.6871160793712359, "learning_rate": 1.989177931393659e-05, "loss": 0.4432, "step": 970 }, { "epoch": 0.19144321766561515, "grad_norm": 0.7875222747543351, "learning_rate": 1.989155177391486e-05, "loss": 0.4698, "step": 971 }, { "epoch": 0.1916403785488959, "grad_norm": 0.6688379722075266, "learning_rate": 1.9891323996240633e-05, "loss": 0.4608, "step": 972 }, { "epoch": 0.19183753943217666, "grad_norm": 0.7330583737954934, "learning_rate": 1.9891095980919383e-05, "loss": 0.4809, "step": 973 }, { "epoch": 0.1920347003154574, "grad_norm": 0.6958413705522188, "learning_rate": 1.9890867727956587e-05, "loss": 0.4627, "step": 974 }, { "epoch": 0.19223186119873817, "grad_norm": 1.5897290586527348, "learning_rate": 1.9890639237357726e-05, "loss": 0.5164, "step": 975 }, { "epoch": 0.19242902208201892, "grad_norm": 0.7313960505685592, "learning_rate": 1.98904105091283e-05, "loss": 0.4792, "step": 976 }, { "epoch": 0.19262618296529968, "grad_norm": 0.7063385670757377, "learning_rate": 1.989018154327379e-05, "loss": 0.4605, "step": 977 }, { "epoch": 0.19282334384858044, "grad_norm": 0.708185416892424, "learning_rate": 1.9889952339799704e-05, "loss": 0.439, "step": 978 }, { "epoch": 0.1930205047318612, "grad_norm": 0.6540312802061391, "learning_rate": 1.9889722898711546e-05, "loss": 0.4435, "step": 979 }, { "epoch": 0.19321766561514195, "grad_norm": 0.8450940057465913, "learning_rate": 1.9889493220014837e-05, "loss": 0.4531, "step": 980 }, { "epoch": 0.1934148264984227, "grad_norm": 0.7708701148620279, "learning_rate": 1.9889263303715086e-05, "loss": 0.4704, "step": 981 }, { "epoch": 0.19361198738170346, "grad_norm": 0.621772319575249, "learning_rate": 1.9889033149817823e-05, "loss": 0.4514, "step": 982 }, { "epoch": 0.19380914826498422, "grad_norm": 0.737304053648671, "learning_rate": 1.9888802758328574e-05, "loss": 0.458, "step": 983 }, { "epoch": 0.19400630914826497, "grad_norm": 0.6741704564219441, "learning_rate": 1.9888572129252875e-05, "loss": 0.4394, "step": 984 }, { "epoch": 0.19420347003154576, "grad_norm": 0.7931415567269723, "learning_rate": 1.9888341262596266e-05, "loss": 0.4729, "step": 985 }, { "epoch": 0.1944006309148265, "grad_norm": 0.731130849168293, "learning_rate": 1.9888110158364296e-05, "loss": 0.4546, "step": 986 }, { "epoch": 0.19459779179810727, "grad_norm": 0.6910048306098391, "learning_rate": 1.988787881656252e-05, "loss": 0.4799, "step": 987 }, { "epoch": 0.19479495268138802, "grad_norm": 0.6876331628207631, "learning_rate": 1.988764723719649e-05, "loss": 0.4642, "step": 988 }, { "epoch": 0.19499211356466878, "grad_norm": 0.6926517120324639, "learning_rate": 1.988741542027177e-05, "loss": 0.4482, "step": 989 }, { "epoch": 0.19518927444794953, "grad_norm": 0.6914878618737181, "learning_rate": 1.9887183365793935e-05, "loss": 0.4126, "step": 990 }, { "epoch": 0.1953864353312303, "grad_norm": 0.6866079399218992, "learning_rate": 1.9886951073768557e-05, "loss": 0.4373, "step": 991 }, { "epoch": 0.19558359621451105, "grad_norm": 0.6833596593294384, "learning_rate": 1.988671854420122e-05, "loss": 0.4332, "step": 992 }, { "epoch": 0.1957807570977918, "grad_norm": 0.7160110640102156, "learning_rate": 1.9886485777097505e-05, "loss": 0.472, "step": 993 }, { "epoch": 0.19597791798107256, "grad_norm": 0.701325188265206, "learning_rate": 1.9886252772463008e-05, "loss": 0.4854, "step": 994 }, { "epoch": 0.19617507886435331, "grad_norm": 0.6802161665541124, "learning_rate": 1.9886019530303328e-05, "loss": 0.4466, "step": 995 }, { "epoch": 0.19637223974763407, "grad_norm": 0.7478910314629341, "learning_rate": 1.9885786050624066e-05, "loss": 0.5023, "step": 996 }, { "epoch": 0.19656940063091483, "grad_norm": 0.7673069709477471, "learning_rate": 1.9885552333430834e-05, "loss": 0.5205, "step": 997 }, { "epoch": 0.19676656151419558, "grad_norm": 0.7036229450957218, "learning_rate": 1.9885318378729247e-05, "loss": 0.4693, "step": 998 }, { "epoch": 0.19696372239747634, "grad_norm": 0.7059304315094523, "learning_rate": 1.9885084186524922e-05, "loss": 0.4687, "step": 999 }, { "epoch": 0.1971608832807571, "grad_norm": 0.701201649068646, "learning_rate": 1.988484975682349e-05, "loss": 0.4727, "step": 1000 }, { "epoch": 0.19735804416403785, "grad_norm": 0.6840771386046793, "learning_rate": 1.9884615089630584e-05, "loss": 0.4636, "step": 1001 }, { "epoch": 0.1975552050473186, "grad_norm": 0.7119449126029943, "learning_rate": 1.988438018495184e-05, "loss": 0.465, "step": 1002 }, { "epoch": 0.19775236593059936, "grad_norm": 0.7547620511614711, "learning_rate": 1.9884145042792905e-05, "loss": 0.4701, "step": 1003 }, { "epoch": 0.19794952681388012, "grad_norm": 0.6957906304621474, "learning_rate": 1.9883909663159424e-05, "loss": 0.4531, "step": 1004 }, { "epoch": 0.19814668769716087, "grad_norm": 0.8261687307427324, "learning_rate": 1.9883674046057054e-05, "loss": 0.4989, "step": 1005 }, { "epoch": 0.19834384858044163, "grad_norm": 0.6898115598695652, "learning_rate": 1.9883438191491453e-05, "loss": 0.4537, "step": 1006 }, { "epoch": 0.19854100946372238, "grad_norm": 0.818376757438343, "learning_rate": 1.9883202099468294e-05, "loss": 0.4706, "step": 1007 }, { "epoch": 0.19873817034700317, "grad_norm": 0.7807033836708956, "learning_rate": 1.988296576999324e-05, "loss": 0.4311, "step": 1008 }, { "epoch": 0.19893533123028392, "grad_norm": 0.6424414464365055, "learning_rate": 1.988272920307198e-05, "loss": 0.4303, "step": 1009 }, { "epoch": 0.19913249211356468, "grad_norm": 0.7799809923491292, "learning_rate": 1.9882492398710192e-05, "loss": 0.5081, "step": 1010 }, { "epoch": 0.19932965299684544, "grad_norm": 0.7784674048585015, "learning_rate": 1.9882255356913563e-05, "loss": 0.5344, "step": 1011 }, { "epoch": 0.1995268138801262, "grad_norm": 0.7082795210370787, "learning_rate": 1.988201807768779e-05, "loss": 0.4639, "step": 1012 }, { "epoch": 0.19972397476340695, "grad_norm": 0.6999909023367687, "learning_rate": 1.9881780561038583e-05, "loss": 0.4525, "step": 1013 }, { "epoch": 0.1999211356466877, "grad_norm": 0.7285326225615099, "learning_rate": 1.988154280697163e-05, "loss": 0.4278, "step": 1014 }, { "epoch": 0.20011829652996846, "grad_norm": 0.7638833349383104, "learning_rate": 1.988130481549266e-05, "loss": 0.4622, "step": 1015 }, { "epoch": 0.20031545741324921, "grad_norm": 0.6387623204916645, "learning_rate": 1.9881066586607384e-05, "loss": 0.4361, "step": 1016 }, { "epoch": 0.20051261829652997, "grad_norm": 0.7537122531684395, "learning_rate": 1.9880828120321523e-05, "loss": 0.5057, "step": 1017 }, { "epoch": 0.20070977917981073, "grad_norm": 0.6862604234144699, "learning_rate": 1.988058941664081e-05, "loss": 0.4416, "step": 1018 }, { "epoch": 0.20090694006309148, "grad_norm": 0.7489284571072279, "learning_rate": 1.988035047557098e-05, "loss": 0.4998, "step": 1019 }, { "epoch": 0.20110410094637224, "grad_norm": 0.7523322609228381, "learning_rate": 1.9880111297117772e-05, "loss": 0.4551, "step": 1020 }, { "epoch": 0.201301261829653, "grad_norm": 0.7501370176061963, "learning_rate": 1.9879871881286936e-05, "loss": 0.4416, "step": 1021 }, { "epoch": 0.20149842271293375, "grad_norm": 0.7351622592876997, "learning_rate": 1.9879632228084224e-05, "loss": 0.4639, "step": 1022 }, { "epoch": 0.2016955835962145, "grad_norm": 0.7423971116335808, "learning_rate": 1.9879392337515385e-05, "loss": 0.488, "step": 1023 }, { "epoch": 0.20189274447949526, "grad_norm": 0.6751923754084928, "learning_rate": 1.9879152209586193e-05, "loss": 0.3938, "step": 1024 }, { "epoch": 0.20208990536277602, "grad_norm": 0.722495594329759, "learning_rate": 1.987891184430241e-05, "loss": 0.4837, "step": 1025 }, { "epoch": 0.20228706624605677, "grad_norm": 0.6782801414805263, "learning_rate": 1.9878671241669824e-05, "loss": 0.4781, "step": 1026 }, { "epoch": 0.20248422712933753, "grad_norm": 0.7053528722513924, "learning_rate": 1.98784304016942e-05, "loss": 0.4424, "step": 1027 }, { "epoch": 0.20268138801261829, "grad_norm": 0.6877822028898056, "learning_rate": 1.987818932438133e-05, "loss": 0.4585, "step": 1028 }, { "epoch": 0.20287854889589904, "grad_norm": 0.7414751335330734, "learning_rate": 1.9877948009737006e-05, "loss": 0.4581, "step": 1029 }, { "epoch": 0.2030757097791798, "grad_norm": 0.6447511500376699, "learning_rate": 1.9877706457767028e-05, "loss": 0.4246, "step": 1030 }, { "epoch": 0.20327287066246058, "grad_norm": 0.7084092307640751, "learning_rate": 1.9877464668477195e-05, "loss": 0.4779, "step": 1031 }, { "epoch": 0.20347003154574134, "grad_norm": 0.753567669793824, "learning_rate": 1.987722264187332e-05, "loss": 0.4658, "step": 1032 }, { "epoch": 0.2036671924290221, "grad_norm": 0.733202685122525, "learning_rate": 1.987698037796122e-05, "loss": 0.4506, "step": 1033 }, { "epoch": 0.20386435331230285, "grad_norm": 0.6827619467500508, "learning_rate": 1.987673787674671e-05, "loss": 0.4124, "step": 1034 }, { "epoch": 0.2040615141955836, "grad_norm": 0.7015654555624304, "learning_rate": 1.987649513823562e-05, "loss": 0.4498, "step": 1035 }, { "epoch": 0.20425867507886436, "grad_norm": 0.6474610550353069, "learning_rate": 1.987625216243378e-05, "loss": 0.4092, "step": 1036 }, { "epoch": 0.20445583596214512, "grad_norm": 0.7064160426450053, "learning_rate": 1.987600894934703e-05, "loss": 0.4379, "step": 1037 }, { "epoch": 0.20465299684542587, "grad_norm": 0.6676749684842929, "learning_rate": 1.987576549898121e-05, "loss": 0.439, "step": 1038 }, { "epoch": 0.20485015772870663, "grad_norm": 0.7077339162356132, "learning_rate": 1.987552181134217e-05, "loss": 0.4433, "step": 1039 }, { "epoch": 0.20504731861198738, "grad_norm": 0.7923537698393385, "learning_rate": 1.9875277886435768e-05, "loss": 0.4747, "step": 1040 }, { "epoch": 0.20524447949526814, "grad_norm": 0.7264062737086919, "learning_rate": 1.9875033724267863e-05, "loss": 0.4479, "step": 1041 }, { "epoch": 0.2054416403785489, "grad_norm": 0.7252182386797893, "learning_rate": 1.987478932484432e-05, "loss": 0.4701, "step": 1042 }, { "epoch": 0.20563880126182965, "grad_norm": 0.7474427366745425, "learning_rate": 1.9874544688171008e-05, "loss": 0.4749, "step": 1043 }, { "epoch": 0.2058359621451104, "grad_norm": 0.8821899810782728, "learning_rate": 1.9874299814253813e-05, "loss": 0.512, "step": 1044 }, { "epoch": 0.20603312302839116, "grad_norm": 0.7196304240914877, "learning_rate": 1.9874054703098608e-05, "loss": 0.44, "step": 1045 }, { "epoch": 0.20623028391167192, "grad_norm": 0.6964106812809121, "learning_rate": 1.987380935471129e-05, "loss": 0.4283, "step": 1046 }, { "epoch": 0.20642744479495267, "grad_norm": 0.7918473693269067, "learning_rate": 1.9873563769097752e-05, "loss": 0.4519, "step": 1047 }, { "epoch": 0.20662460567823343, "grad_norm": 0.8505419812339252, "learning_rate": 1.9873317946263892e-05, "loss": 0.476, "step": 1048 }, { "epoch": 0.2068217665615142, "grad_norm": 0.6864258063010205, "learning_rate": 1.9873071886215616e-05, "loss": 0.463, "step": 1049 }, { "epoch": 0.20701892744479494, "grad_norm": 0.9553769219710571, "learning_rate": 1.987282558895884e-05, "loss": 0.4925, "step": 1050 }, { "epoch": 0.2072160883280757, "grad_norm": 0.6980654478796005, "learning_rate": 1.9872579054499478e-05, "loss": 0.426, "step": 1051 }, { "epoch": 0.20741324921135645, "grad_norm": 0.719479062368989, "learning_rate": 1.987233228284345e-05, "loss": 0.4504, "step": 1052 }, { "epoch": 0.20761041009463724, "grad_norm": 0.7018789823838913, "learning_rate": 1.9872085273996694e-05, "loss": 0.4546, "step": 1053 }, { "epoch": 0.207807570977918, "grad_norm": 0.7802910640814608, "learning_rate": 1.9871838027965134e-05, "loss": 0.4972, "step": 1054 }, { "epoch": 0.20800473186119875, "grad_norm": 0.7844445505713344, "learning_rate": 1.987159054475472e-05, "loss": 0.4492, "step": 1055 }, { "epoch": 0.2082018927444795, "grad_norm": 0.692043038159667, "learning_rate": 1.9871342824371393e-05, "loss": 0.4406, "step": 1056 }, { "epoch": 0.20839905362776026, "grad_norm": 0.775005815750936, "learning_rate": 1.9871094866821104e-05, "loss": 0.4282, "step": 1057 }, { "epoch": 0.20859621451104102, "grad_norm": 0.7449051373918187, "learning_rate": 1.987084667210981e-05, "loss": 0.46, "step": 1058 }, { "epoch": 0.20879337539432177, "grad_norm": 0.7073489451850218, "learning_rate": 1.987059824024348e-05, "loss": 0.429, "step": 1059 }, { "epoch": 0.20899053627760253, "grad_norm": 0.7309470583978874, "learning_rate": 1.9870349571228075e-05, "loss": 0.4798, "step": 1060 }, { "epoch": 0.20918769716088328, "grad_norm": 0.7718451336210451, "learning_rate": 1.9870100665069577e-05, "loss": 0.4688, "step": 1061 }, { "epoch": 0.20938485804416404, "grad_norm": 0.6817754353610815, "learning_rate": 1.9869851521773956e-05, "loss": 0.4495, "step": 1062 }, { "epoch": 0.2095820189274448, "grad_norm": 0.7665443527171363, "learning_rate": 1.986960214134721e-05, "loss": 0.4802, "step": 1063 }, { "epoch": 0.20977917981072555, "grad_norm": 0.6633898852299499, "learning_rate": 1.986935252379532e-05, "loss": 0.4538, "step": 1064 }, { "epoch": 0.2099763406940063, "grad_norm": 0.68853230845626, "learning_rate": 1.9869102669124293e-05, "loss": 0.4882, "step": 1065 }, { "epoch": 0.21017350157728706, "grad_norm": 0.7017829715484818, "learning_rate": 1.986885257734012e-05, "loss": 0.4855, "step": 1066 }, { "epoch": 0.21037066246056782, "grad_norm": 0.7815202320893195, "learning_rate": 1.986860224844882e-05, "loss": 0.4763, "step": 1067 }, { "epoch": 0.21056782334384858, "grad_norm": 0.7066784986762935, "learning_rate": 1.9868351682456408e-05, "loss": 0.481, "step": 1068 }, { "epoch": 0.21076498422712933, "grad_norm": 0.7041493363838017, "learning_rate": 1.986810087936889e-05, "loss": 0.4637, "step": 1069 }, { "epoch": 0.2109621451104101, "grad_norm": 0.72824829966811, "learning_rate": 1.9867849839192313e-05, "loss": 0.4651, "step": 1070 }, { "epoch": 0.21115930599369084, "grad_norm": 0.6680455738505751, "learning_rate": 1.986759856193269e-05, "loss": 0.4665, "step": 1071 }, { "epoch": 0.2113564668769716, "grad_norm": 0.9281048268306978, "learning_rate": 1.9867347047596066e-05, "loss": 0.471, "step": 1072 }, { "epoch": 0.21155362776025236, "grad_norm": 0.7395272448325264, "learning_rate": 1.9867095296188483e-05, "loss": 0.4715, "step": 1073 }, { "epoch": 0.2117507886435331, "grad_norm": 0.7830224178550704, "learning_rate": 1.986684330771599e-05, "loss": 0.4641, "step": 1074 }, { "epoch": 0.21194794952681387, "grad_norm": 0.7574110213860703, "learning_rate": 1.986659108218464e-05, "loss": 0.4474, "step": 1075 }, { "epoch": 0.21214511041009465, "grad_norm": 0.6927377615200532, "learning_rate": 1.98663386196005e-05, "loss": 0.4287, "step": 1076 }, { "epoch": 0.2123422712933754, "grad_norm": 0.7232907585511054, "learning_rate": 1.986608591996962e-05, "loss": 0.471, "step": 1077 }, { "epoch": 0.21253943217665616, "grad_norm": 0.6491519990201463, "learning_rate": 1.9865832983298085e-05, "loss": 0.4228, "step": 1078 }, { "epoch": 0.21273659305993692, "grad_norm": 0.6831967457936221, "learning_rate": 1.986557980959197e-05, "loss": 0.4732, "step": 1079 }, { "epoch": 0.21293375394321767, "grad_norm": 0.6954600579093602, "learning_rate": 1.986532639885735e-05, "loss": 0.4764, "step": 1080 }, { "epoch": 0.21313091482649843, "grad_norm": 0.6995051067856928, "learning_rate": 1.9865072751100324e-05, "loss": 0.4769, "step": 1081 }, { "epoch": 0.21332807570977919, "grad_norm": 0.6775668825484679, "learning_rate": 1.9864818866326978e-05, "loss": 0.4461, "step": 1082 }, { "epoch": 0.21352523659305994, "grad_norm": 0.7519792388180498, "learning_rate": 1.9864564744543412e-05, "loss": 0.4812, "step": 1083 }, { "epoch": 0.2137223974763407, "grad_norm": 0.7107462899081852, "learning_rate": 1.986431038575574e-05, "loss": 0.5028, "step": 1084 }, { "epoch": 0.21391955835962145, "grad_norm": 0.6753961183203763, "learning_rate": 1.9864055789970064e-05, "loss": 0.4237, "step": 1085 }, { "epoch": 0.2141167192429022, "grad_norm": 0.6737584152485184, "learning_rate": 1.9863800957192504e-05, "loss": 0.4461, "step": 1086 }, { "epoch": 0.21431388012618297, "grad_norm": 0.6556918876340901, "learning_rate": 1.9863545887429185e-05, "loss": 0.4506, "step": 1087 }, { "epoch": 0.21451104100946372, "grad_norm": 0.6877170697107042, "learning_rate": 1.9863290580686228e-05, "loss": 0.4058, "step": 1088 }, { "epoch": 0.21470820189274448, "grad_norm": 0.7037870318662259, "learning_rate": 1.9863035036969775e-05, "loss": 0.4576, "step": 1089 }, { "epoch": 0.21490536277602523, "grad_norm": 0.7399278087323496, "learning_rate": 1.9862779256285964e-05, "loss": 0.4907, "step": 1090 }, { "epoch": 0.215102523659306, "grad_norm": 0.688883167018551, "learning_rate": 1.986252323864094e-05, "loss": 0.4597, "step": 1091 }, { "epoch": 0.21529968454258674, "grad_norm": 0.7010288184037431, "learning_rate": 1.9862266984040847e-05, "loss": 0.4519, "step": 1092 }, { "epoch": 0.2154968454258675, "grad_norm": 0.6241563971147653, "learning_rate": 1.9862010492491852e-05, "loss": 0.3987, "step": 1093 }, { "epoch": 0.21569400630914826, "grad_norm": 0.6700336161149536, "learning_rate": 1.9861753764000115e-05, "loss": 0.4728, "step": 1094 }, { "epoch": 0.215891167192429, "grad_norm": 0.7471341036922323, "learning_rate": 1.98614967985718e-05, "loss": 0.446, "step": 1095 }, { "epoch": 0.21608832807570977, "grad_norm": 0.6506229824272028, "learning_rate": 1.986123959621308e-05, "loss": 0.416, "step": 1096 }, { "epoch": 0.21628548895899052, "grad_norm": 1.2484822940321072, "learning_rate": 1.986098215693014e-05, "loss": 0.4903, "step": 1097 }, { "epoch": 0.21648264984227128, "grad_norm": 0.9678245848808749, "learning_rate": 1.986072448072916e-05, "loss": 0.4802, "step": 1098 }, { "epoch": 0.21667981072555206, "grad_norm": 0.7448588276706483, "learning_rate": 1.9860466567616335e-05, "loss": 0.4231, "step": 1099 }, { "epoch": 0.21687697160883282, "grad_norm": 0.7225398921155327, "learning_rate": 1.9860208417597863e-05, "loss": 0.4666, "step": 1100 }, { "epoch": 0.21707413249211358, "grad_norm": 0.8881220700758276, "learning_rate": 1.9859950030679943e-05, "loss": 0.4587, "step": 1101 }, { "epoch": 0.21727129337539433, "grad_norm": 0.6824253304142237, "learning_rate": 1.985969140686878e-05, "loss": 0.4671, "step": 1102 }, { "epoch": 0.2174684542586751, "grad_norm": 0.822081525030592, "learning_rate": 1.9859432546170594e-05, "loss": 0.4309, "step": 1103 }, { "epoch": 0.21766561514195584, "grad_norm": 0.7177176238573195, "learning_rate": 1.98591734485916e-05, "loss": 0.4662, "step": 1104 }, { "epoch": 0.2178627760252366, "grad_norm": 23.933597214678645, "learning_rate": 1.9858914114138024e-05, "loss": 0.4718, "step": 1105 }, { "epoch": 0.21805993690851735, "grad_norm": 0.9295012594770319, "learning_rate": 1.9858654542816098e-05, "loss": 0.4374, "step": 1106 }, { "epoch": 0.2182570977917981, "grad_norm": 0.7110886368537057, "learning_rate": 1.9858394734632054e-05, "loss": 0.4292, "step": 1107 }, { "epoch": 0.21845425867507887, "grad_norm": 0.745899081526924, "learning_rate": 1.9858134689592143e-05, "loss": 0.4494, "step": 1108 }, { "epoch": 0.21865141955835962, "grad_norm": 3.2936984686195667, "learning_rate": 1.9857874407702606e-05, "loss": 0.436, "step": 1109 }, { "epoch": 0.21884858044164038, "grad_norm": 0.9426735384515953, "learning_rate": 1.9857613888969694e-05, "loss": 0.4761, "step": 1110 }, { "epoch": 0.21904574132492113, "grad_norm": 0.9274354892950216, "learning_rate": 1.9857353133399675e-05, "loss": 0.4885, "step": 1111 }, { "epoch": 0.2192429022082019, "grad_norm": 0.6753675988750968, "learning_rate": 1.9857092140998807e-05, "loss": 0.4496, "step": 1112 }, { "epoch": 0.21944006309148265, "grad_norm": 0.9644822228389445, "learning_rate": 1.985683091177336e-05, "loss": 0.4677, "step": 1113 }, { "epoch": 0.2196372239747634, "grad_norm": 0.7186407032609544, "learning_rate": 1.9856569445729615e-05, "loss": 0.4894, "step": 1114 }, { "epoch": 0.21983438485804416, "grad_norm": 0.7974174876233592, "learning_rate": 1.9856307742873852e-05, "loss": 0.4593, "step": 1115 }, { "epoch": 0.2200315457413249, "grad_norm": 0.7199784472630298, "learning_rate": 1.9856045803212356e-05, "loss": 0.4488, "step": 1116 }, { "epoch": 0.22022870662460567, "grad_norm": 0.7059488613258683, "learning_rate": 1.9855783626751425e-05, "loss": 0.423, "step": 1117 }, { "epoch": 0.22042586750788642, "grad_norm": 0.6784311746671168, "learning_rate": 1.9855521213497355e-05, "loss": 0.4455, "step": 1118 }, { "epoch": 0.22062302839116718, "grad_norm": 0.6987440905493829, "learning_rate": 1.9855258563456448e-05, "loss": 0.4458, "step": 1119 }, { "epoch": 0.22082018927444794, "grad_norm": 0.7453334954062374, "learning_rate": 1.985499567663502e-05, "loss": 0.472, "step": 1120 }, { "epoch": 0.22101735015772872, "grad_norm": 0.8208236623644202, "learning_rate": 1.9854732553039388e-05, "loss": 0.4327, "step": 1121 }, { "epoch": 0.22121451104100948, "grad_norm": 0.8076041122842882, "learning_rate": 1.9854469192675868e-05, "loss": 0.4488, "step": 1122 }, { "epoch": 0.22141167192429023, "grad_norm": 0.6509600595961608, "learning_rate": 1.9854205595550787e-05, "loss": 0.4253, "step": 1123 }, { "epoch": 0.221608832807571, "grad_norm": 0.8076858203915139, "learning_rate": 1.9853941761670483e-05, "loss": 0.4793, "step": 1124 }, { "epoch": 0.22180599369085174, "grad_norm": 0.6925472433506414, "learning_rate": 1.9853677691041293e-05, "loss": 0.4703, "step": 1125 }, { "epoch": 0.2220031545741325, "grad_norm": 0.68014215224375, "learning_rate": 1.985341338366956e-05, "loss": 0.4668, "step": 1126 }, { "epoch": 0.22220031545741326, "grad_norm": 0.7501920154620356, "learning_rate": 1.9853148839561638e-05, "loss": 0.4611, "step": 1127 }, { "epoch": 0.222397476340694, "grad_norm": 0.7031267527114912, "learning_rate": 1.985288405872388e-05, "loss": 0.4603, "step": 1128 }, { "epoch": 0.22259463722397477, "grad_norm": 0.6372019068678325, "learning_rate": 1.9852619041162646e-05, "loss": 0.41, "step": 1129 }, { "epoch": 0.22279179810725552, "grad_norm": 0.6979715600361694, "learning_rate": 1.9852353786884306e-05, "loss": 0.4666, "step": 1130 }, { "epoch": 0.22298895899053628, "grad_norm": 0.7323097747430808, "learning_rate": 1.9852088295895232e-05, "loss": 0.4468, "step": 1131 }, { "epoch": 0.22318611987381703, "grad_norm": 0.6991485532926531, "learning_rate": 1.9851822568201806e-05, "loss": 0.4634, "step": 1132 }, { "epoch": 0.2233832807570978, "grad_norm": 0.7591783665648759, "learning_rate": 1.9851556603810406e-05, "loss": 0.4419, "step": 1133 }, { "epoch": 0.22358044164037855, "grad_norm": 1.1746824112571197, "learning_rate": 1.9851290402727426e-05, "loss": 0.4639, "step": 1134 }, { "epoch": 0.2237776025236593, "grad_norm": 0.7081731495754315, "learning_rate": 1.985102396495926e-05, "loss": 0.4881, "step": 1135 }, { "epoch": 0.22397476340694006, "grad_norm": 0.6590193639944613, "learning_rate": 1.9850757290512313e-05, "loss": 0.4536, "step": 1136 }, { "epoch": 0.22417192429022081, "grad_norm": 8.480821491410778, "learning_rate": 1.9850490379392988e-05, "loss": 0.4433, "step": 1137 }, { "epoch": 0.22436908517350157, "grad_norm": 0.7990748970455287, "learning_rate": 1.9850223231607696e-05, "loss": 0.4704, "step": 1138 }, { "epoch": 0.22456624605678233, "grad_norm": 0.6257113394097338, "learning_rate": 1.984995584716286e-05, "loss": 0.4377, "step": 1139 }, { "epoch": 0.22476340694006308, "grad_norm": 4.598690932717382, "learning_rate": 1.9849688226064906e-05, "loss": 0.5025, "step": 1140 }, { "epoch": 0.22496056782334384, "grad_norm": 0.8210582553644494, "learning_rate": 1.9849420368320254e-05, "loss": 0.4854, "step": 1141 }, { "epoch": 0.2251577287066246, "grad_norm": 0.7646003921775568, "learning_rate": 1.9849152273935353e-05, "loss": 0.4344, "step": 1142 }, { "epoch": 0.22535488958990535, "grad_norm": 0.7228017400656176, "learning_rate": 1.9848883942916632e-05, "loss": 0.4785, "step": 1143 }, { "epoch": 0.22555205047318613, "grad_norm": 0.7985110474365666, "learning_rate": 1.9848615375270547e-05, "loss": 0.4461, "step": 1144 }, { "epoch": 0.2257492113564669, "grad_norm": 0.8935580670763312, "learning_rate": 1.984834657100354e-05, "loss": 0.4658, "step": 1145 }, { "epoch": 0.22594637223974764, "grad_norm": 0.8552039095318358, "learning_rate": 1.9848077530122083e-05, "loss": 0.4574, "step": 1146 }, { "epoch": 0.2261435331230284, "grad_norm": 1.2094865238135273, "learning_rate": 1.984780825263263e-05, "loss": 0.4461, "step": 1147 }, { "epoch": 0.22634069400630916, "grad_norm": 0.8645616019682253, "learning_rate": 1.984753873854165e-05, "loss": 0.5116, "step": 1148 }, { "epoch": 0.2265378548895899, "grad_norm": 3.127099963117224, "learning_rate": 1.984726898785563e-05, "loss": 0.4398, "step": 1149 }, { "epoch": 0.22673501577287067, "grad_norm": 1.1378328088647094, "learning_rate": 1.9846999000581033e-05, "loss": 0.4529, "step": 1150 }, { "epoch": 0.22693217665615142, "grad_norm": 1.2150716840072067, "learning_rate": 1.9846728776724358e-05, "loss": 0.5032, "step": 1151 }, { "epoch": 0.22712933753943218, "grad_norm": 1.0286587973333643, "learning_rate": 1.98464583162921e-05, "loss": 0.4525, "step": 1152 }, { "epoch": 0.22732649842271294, "grad_norm": 1.126989898873081, "learning_rate": 1.9846187619290746e-05, "loss": 0.4292, "step": 1153 }, { "epoch": 0.2275236593059937, "grad_norm": 1.2199916483600795, "learning_rate": 1.9845916685726808e-05, "loss": 0.4732, "step": 1154 }, { "epoch": 0.22772082018927445, "grad_norm": 0.9371106577012303, "learning_rate": 1.9845645515606792e-05, "loss": 0.4872, "step": 1155 }, { "epoch": 0.2279179810725552, "grad_norm": 0.9056546747302068, "learning_rate": 1.9845374108937213e-05, "loss": 0.4975, "step": 1156 }, { "epoch": 0.22811514195583596, "grad_norm": 1.5812769964252382, "learning_rate": 1.9845102465724593e-05, "loss": 0.4733, "step": 1157 }, { "epoch": 0.22831230283911672, "grad_norm": 1.1843221653360212, "learning_rate": 1.984483058597546e-05, "loss": 0.4624, "step": 1158 }, { "epoch": 0.22850946372239747, "grad_norm": 0.9821603418098558, "learning_rate": 1.9844558469696342e-05, "loss": 0.4413, "step": 1159 }, { "epoch": 0.22870662460567823, "grad_norm": 0.9806072784995279, "learning_rate": 1.984428611689378e-05, "loss": 0.4868, "step": 1160 }, { "epoch": 0.22890378548895898, "grad_norm": 0.8793984774617989, "learning_rate": 1.9844013527574316e-05, "loss": 0.4267, "step": 1161 }, { "epoch": 0.22910094637223974, "grad_norm": 0.8495785803021494, "learning_rate": 1.9843740701744497e-05, "loss": 0.4354, "step": 1162 }, { "epoch": 0.2292981072555205, "grad_norm": 1.7057039922514965, "learning_rate": 1.9843467639410885e-05, "loss": 0.4447, "step": 1163 }, { "epoch": 0.22949526813880125, "grad_norm": 1.075911659015585, "learning_rate": 1.9843194340580032e-05, "loss": 0.4962, "step": 1164 }, { "epoch": 0.229692429022082, "grad_norm": 0.8457228409931461, "learning_rate": 1.9842920805258513e-05, "loss": 0.4552, "step": 1165 }, { "epoch": 0.22988958990536276, "grad_norm": 1.008444001726399, "learning_rate": 1.9842647033452893e-05, "loss": 0.4462, "step": 1166 }, { "epoch": 0.23008675078864355, "grad_norm": 0.8092829859053057, "learning_rate": 1.984237302516975e-05, "loss": 0.4559, "step": 1167 }, { "epoch": 0.2302839116719243, "grad_norm": 0.9687384781928995, "learning_rate": 1.984209878041567e-05, "loss": 0.5053, "step": 1168 }, { "epoch": 0.23048107255520506, "grad_norm": 0.8297737211847974, "learning_rate": 1.984182429919724e-05, "loss": 0.462, "step": 1169 }, { "epoch": 0.2306782334384858, "grad_norm": 0.7460078380279647, "learning_rate": 1.9841549581521058e-05, "loss": 0.4538, "step": 1170 }, { "epoch": 0.23087539432176657, "grad_norm": 0.9658528590503032, "learning_rate": 1.984127462739372e-05, "loss": 0.4759, "step": 1171 }, { "epoch": 0.23107255520504733, "grad_norm": 0.7016223428827921, "learning_rate": 1.9840999436821836e-05, "loss": 0.4328, "step": 1172 }, { "epoch": 0.23126971608832808, "grad_norm": 0.6671082456539488, "learning_rate": 1.9840724009812013e-05, "loss": 0.4259, "step": 1173 }, { "epoch": 0.23146687697160884, "grad_norm": 0.7264492795866501, "learning_rate": 1.9840448346370873e-05, "loss": 0.4251, "step": 1174 }, { "epoch": 0.2316640378548896, "grad_norm": 0.726746284722804, "learning_rate": 1.9840172446505036e-05, "loss": 0.4838, "step": 1175 }, { "epoch": 0.23186119873817035, "grad_norm": 1.151370972769961, "learning_rate": 1.9839896310221133e-05, "loss": 0.4793, "step": 1176 }, { "epoch": 0.2320583596214511, "grad_norm": 0.8135672403953714, "learning_rate": 1.9839619937525794e-05, "loss": 0.4496, "step": 1177 }, { "epoch": 0.23225552050473186, "grad_norm": 0.6927163568862448, "learning_rate": 1.9839343328425668e-05, "loss": 0.4628, "step": 1178 }, { "epoch": 0.23245268138801262, "grad_norm": 0.6732578753958156, "learning_rate": 1.983906648292739e-05, "loss": 0.43, "step": 1179 }, { "epoch": 0.23264984227129337, "grad_norm": 0.7086446749133889, "learning_rate": 1.9838789401037616e-05, "loss": 0.4291, "step": 1180 }, { "epoch": 0.23284700315457413, "grad_norm": 0.7334824055009613, "learning_rate": 1.9838512082763002e-05, "loss": 0.4342, "step": 1181 }, { "epoch": 0.23304416403785488, "grad_norm": 0.7653457648798104, "learning_rate": 1.983823452811022e-05, "loss": 0.4805, "step": 1182 }, { "epoch": 0.23324132492113564, "grad_norm": 0.9262252445526374, "learning_rate": 1.9837956737085924e-05, "loss": 0.4788, "step": 1183 }, { "epoch": 0.2334384858044164, "grad_norm": 0.7504820328186347, "learning_rate": 1.98376787096968e-05, "loss": 0.4555, "step": 1184 }, { "epoch": 0.23363564668769715, "grad_norm": 0.721933776706364, "learning_rate": 1.983740044594952e-05, "loss": 0.4848, "step": 1185 }, { "epoch": 0.2338328075709779, "grad_norm": 2.846732366688398, "learning_rate": 1.9837121945850766e-05, "loss": 0.4638, "step": 1186 }, { "epoch": 0.23402996845425866, "grad_norm": 0.8306542747582979, "learning_rate": 1.9836843209407247e-05, "loss": 0.4082, "step": 1187 }, { "epoch": 0.23422712933753942, "grad_norm": 0.736142320584185, "learning_rate": 1.983656423662564e-05, "loss": 0.4569, "step": 1188 }, { "epoch": 0.2344242902208202, "grad_norm": 0.7039751490055907, "learning_rate": 1.983628502751266e-05, "loss": 0.4182, "step": 1189 }, { "epoch": 0.23462145110410096, "grad_norm": 0.9525090224636836, "learning_rate": 1.983600558207501e-05, "loss": 0.4414, "step": 1190 }, { "epoch": 0.23481861198738171, "grad_norm": 1.4964168121471113, "learning_rate": 1.9835725900319406e-05, "loss": 0.4468, "step": 1191 }, { "epoch": 0.23501577287066247, "grad_norm": 0.8670187458357161, "learning_rate": 1.9835445982252565e-05, "loss": 0.4159, "step": 1192 }, { "epoch": 0.23521293375394323, "grad_norm": 0.6948661600832843, "learning_rate": 1.983516582788121e-05, "loss": 0.4451, "step": 1193 }, { "epoch": 0.23541009463722398, "grad_norm": 0.7812431346409022, "learning_rate": 1.9834885437212083e-05, "loss": 0.4613, "step": 1194 }, { "epoch": 0.23560725552050474, "grad_norm": 0.7970118513981926, "learning_rate": 1.983460481025191e-05, "loss": 0.4761, "step": 1195 }, { "epoch": 0.2358044164037855, "grad_norm": 0.7490679736689834, "learning_rate": 1.9834323947007433e-05, "loss": 0.4931, "step": 1196 }, { "epoch": 0.23600157728706625, "grad_norm": 0.7282970951897094, "learning_rate": 1.983404284748541e-05, "loss": 0.4184, "step": 1197 }, { "epoch": 0.236198738170347, "grad_norm": 0.7024789178171861, "learning_rate": 1.9833761511692583e-05, "loss": 0.4604, "step": 1198 }, { "epoch": 0.23639589905362776, "grad_norm": 0.7562914768111765, "learning_rate": 1.9833479939635724e-05, "loss": 0.4835, "step": 1199 }, { "epoch": 0.23659305993690852, "grad_norm": 0.6905900737165961, "learning_rate": 1.9833198131321582e-05, "loss": 0.4362, "step": 1200 }, { "epoch": 0.23679022082018927, "grad_norm": 0.666922032665526, "learning_rate": 1.9832916086756938e-05, "loss": 0.4619, "step": 1201 }, { "epoch": 0.23698738170347003, "grad_norm": 0.6837662204239521, "learning_rate": 1.983263380594857e-05, "loss": 0.4373, "step": 1202 }, { "epoch": 0.23718454258675079, "grad_norm": 0.6414003295820169, "learning_rate": 1.9832351288903256e-05, "loss": 0.4317, "step": 1203 }, { "epoch": 0.23738170347003154, "grad_norm": 0.7460692979898119, "learning_rate": 1.9832068535627785e-05, "loss": 0.479, "step": 1204 }, { "epoch": 0.2375788643533123, "grad_norm": 0.6428078211747335, "learning_rate": 1.983178554612895e-05, "loss": 0.4469, "step": 1205 }, { "epoch": 0.23777602523659305, "grad_norm": 0.7517122154912249, "learning_rate": 1.9831502320413543e-05, "loss": 0.4858, "step": 1206 }, { "epoch": 0.2379731861198738, "grad_norm": 1.3463826441881264, "learning_rate": 1.983121885848838e-05, "loss": 0.4735, "step": 1207 }, { "epoch": 0.23817034700315456, "grad_norm": 0.7102923990366657, "learning_rate": 1.983093516036027e-05, "loss": 0.487, "step": 1208 }, { "epoch": 0.23836750788643532, "grad_norm": 2.290103684996036, "learning_rate": 1.9830651226036023e-05, "loss": 0.4727, "step": 1209 }, { "epoch": 0.23856466876971608, "grad_norm": 0.6811005595914935, "learning_rate": 1.9830367055522463e-05, "loss": 0.4263, "step": 1210 }, { "epoch": 0.23876182965299683, "grad_norm": 0.799809715831858, "learning_rate": 1.983008264882642e-05, "loss": 0.4097, "step": 1211 }, { "epoch": 0.23895899053627762, "grad_norm": 0.7354195020597284, "learning_rate": 1.9829798005954725e-05, "loss": 0.4724, "step": 1212 }, { "epoch": 0.23915615141955837, "grad_norm": 0.8296463030390158, "learning_rate": 1.982951312691422e-05, "loss": 0.4904, "step": 1213 }, { "epoch": 0.23935331230283913, "grad_norm": 0.6229789713014955, "learning_rate": 1.9829228011711738e-05, "loss": 0.4451, "step": 1214 }, { "epoch": 0.23955047318611988, "grad_norm": 0.7037507970309669, "learning_rate": 1.9828942660354144e-05, "loss": 0.4382, "step": 1215 }, { "epoch": 0.23974763406940064, "grad_norm": 1.7862177754634971, "learning_rate": 1.9828657072848284e-05, "loss": 0.4677, "step": 1216 }, { "epoch": 0.2399447949526814, "grad_norm": 0.7511599646811729, "learning_rate": 1.9828371249201025e-05, "loss": 0.4432, "step": 1217 }, { "epoch": 0.24014195583596215, "grad_norm": 0.6663595319040087, "learning_rate": 1.982808518941923e-05, "loss": 0.4535, "step": 1218 }, { "epoch": 0.2403391167192429, "grad_norm": 0.7505465175906688, "learning_rate": 1.982779889350978e-05, "loss": 0.483, "step": 1219 }, { "epoch": 0.24053627760252366, "grad_norm": 0.7052786101408759, "learning_rate": 1.982751236147954e-05, "loss": 0.4568, "step": 1220 }, { "epoch": 0.24073343848580442, "grad_norm": 1.0567060689641148, "learning_rate": 1.9827225593335403e-05, "loss": 0.4725, "step": 1221 }, { "epoch": 0.24093059936908517, "grad_norm": 0.668419640423508, "learning_rate": 1.9826938589084258e-05, "loss": 0.462, "step": 1222 }, { "epoch": 0.24112776025236593, "grad_norm": 1.2570484391370291, "learning_rate": 1.9826651348733e-05, "loss": 0.5211, "step": 1223 }, { "epoch": 0.24132492113564669, "grad_norm": 0.7349089292621471, "learning_rate": 1.982636387228853e-05, "loss": 0.4528, "step": 1224 }, { "epoch": 0.24152208201892744, "grad_norm": 0.8253540404906965, "learning_rate": 1.9826076159757753e-05, "loss": 0.5093, "step": 1225 }, { "epoch": 0.2417192429022082, "grad_norm": 2.1092492780074745, "learning_rate": 1.9825788211147587e-05, "loss": 0.4845, "step": 1226 }, { "epoch": 0.24191640378548895, "grad_norm": 0.8850457162579004, "learning_rate": 1.9825500026464947e-05, "loss": 0.4805, "step": 1227 }, { "epoch": 0.2421135646687697, "grad_norm": 0.6781748488422774, "learning_rate": 1.9825211605716748e-05, "loss": 0.4738, "step": 1228 }, { "epoch": 0.24231072555205047, "grad_norm": 1.0845607789046796, "learning_rate": 1.9824922948909937e-05, "loss": 0.4449, "step": 1229 }, { "epoch": 0.24250788643533122, "grad_norm": 1.2815030508313785, "learning_rate": 1.9824634056051436e-05, "loss": 0.473, "step": 1230 }, { "epoch": 0.24270504731861198, "grad_norm": 0.8080992979642333, "learning_rate": 1.9824344927148193e-05, "loss": 0.4731, "step": 1231 }, { "epoch": 0.24290220820189273, "grad_norm": 0.7437272267133168, "learning_rate": 1.982405556220715e-05, "loss": 0.4519, "step": 1232 }, { "epoch": 0.2430993690851735, "grad_norm": 0.7480610069853911, "learning_rate": 1.982376596123526e-05, "loss": 0.4417, "step": 1233 }, { "epoch": 0.24329652996845424, "grad_norm": 0.7550320130527832, "learning_rate": 1.982347612423948e-05, "loss": 0.439, "step": 1234 }, { "epoch": 0.24349369085173503, "grad_norm": 0.6749020950952507, "learning_rate": 1.9823186051226783e-05, "loss": 0.4201, "step": 1235 }, { "epoch": 0.24369085173501578, "grad_norm": 0.8142599734476462, "learning_rate": 1.9822895742204122e-05, "loss": 0.4621, "step": 1236 }, { "epoch": 0.24388801261829654, "grad_norm": 0.6700713256447526, "learning_rate": 1.9822605197178485e-05, "loss": 0.4387, "step": 1237 }, { "epoch": 0.2440851735015773, "grad_norm": 0.7745146153654806, "learning_rate": 1.9822314416156848e-05, "loss": 0.4605, "step": 1238 }, { "epoch": 0.24428233438485805, "grad_norm": 0.9942732385767507, "learning_rate": 1.9822023399146194e-05, "loss": 0.4773, "step": 1239 }, { "epoch": 0.2444794952681388, "grad_norm": 0.7228782340223426, "learning_rate": 1.9821732146153517e-05, "loss": 0.4754, "step": 1240 }, { "epoch": 0.24467665615141956, "grad_norm": 0.7905102450931651, "learning_rate": 1.9821440657185822e-05, "loss": 0.4596, "step": 1241 }, { "epoch": 0.24487381703470032, "grad_norm": 0.8374670342807091, "learning_rate": 1.9821148932250103e-05, "loss": 0.4102, "step": 1242 }, { "epoch": 0.24507097791798108, "grad_norm": 0.9557468893603367, "learning_rate": 1.9820856971353374e-05, "loss": 0.441, "step": 1243 }, { "epoch": 0.24526813880126183, "grad_norm": 0.9884255566988156, "learning_rate": 1.9820564774502644e-05, "loss": 0.4866, "step": 1244 }, { "epoch": 0.2454652996845426, "grad_norm": 0.6985509946646631, "learning_rate": 1.9820272341704937e-05, "loss": 0.4441, "step": 1245 }, { "epoch": 0.24566246056782334, "grad_norm": 0.7274501226462767, "learning_rate": 1.981997967296728e-05, "loss": 0.4387, "step": 1246 }, { "epoch": 0.2458596214511041, "grad_norm": 0.7388406161273386, "learning_rate": 1.9819686768296706e-05, "loss": 0.4613, "step": 1247 }, { "epoch": 0.24605678233438485, "grad_norm": 0.7240588593494661, "learning_rate": 1.9819393627700247e-05, "loss": 0.4782, "step": 1248 }, { "epoch": 0.2462539432176656, "grad_norm": 0.7023054870349664, "learning_rate": 1.9819100251184945e-05, "loss": 0.4687, "step": 1249 }, { "epoch": 0.24645110410094637, "grad_norm": 0.7018924187358255, "learning_rate": 1.9818806638757856e-05, "loss": 0.4643, "step": 1250 }, { "epoch": 0.24664826498422712, "grad_norm": 0.7507947096176073, "learning_rate": 1.981851279042603e-05, "loss": 0.4471, "step": 1251 }, { "epoch": 0.24684542586750788, "grad_norm": 0.7236171294016701, "learning_rate": 1.9818218706196527e-05, "loss": 0.486, "step": 1252 }, { "epoch": 0.24704258675078863, "grad_norm": 0.7074399537871575, "learning_rate": 1.9817924386076416e-05, "loss": 0.4696, "step": 1253 }, { "epoch": 0.2472397476340694, "grad_norm": 0.6990187430370988, "learning_rate": 1.981762983007276e-05, "loss": 0.4539, "step": 1254 }, { "epoch": 0.24743690851735015, "grad_norm": 0.8201430818139063, "learning_rate": 1.9817335038192644e-05, "loss": 0.4644, "step": 1255 }, { "epoch": 0.2476340694006309, "grad_norm": 0.6857424544658502, "learning_rate": 1.981704001044315e-05, "loss": 0.4737, "step": 1256 }, { "epoch": 0.24783123028391169, "grad_norm": 0.6640652585450276, "learning_rate": 1.981674474683136e-05, "loss": 0.4464, "step": 1257 }, { "epoch": 0.24802839116719244, "grad_norm": 0.7345704399108267, "learning_rate": 1.9816449247364374e-05, "loss": 0.4625, "step": 1258 }, { "epoch": 0.2482255520504732, "grad_norm": 0.6686587208589408, "learning_rate": 1.981615351204929e-05, "loss": 0.4369, "step": 1259 }, { "epoch": 0.24842271293375395, "grad_norm": 0.6762802311646812, "learning_rate": 1.9815857540893214e-05, "loss": 0.432, "step": 1260 }, { "epoch": 0.2486198738170347, "grad_norm": 0.6562041100893717, "learning_rate": 1.9815561333903255e-05, "loss": 0.43, "step": 1261 }, { "epoch": 0.24881703470031546, "grad_norm": 0.6532239829668289, "learning_rate": 1.981526489108653e-05, "loss": 0.4351, "step": 1262 }, { "epoch": 0.24901419558359622, "grad_norm": 0.6996616378046743, "learning_rate": 1.981496821245016e-05, "loss": 0.4881, "step": 1263 }, { "epoch": 0.24921135646687698, "grad_norm": 0.6705242832654518, "learning_rate": 1.9814671298001274e-05, "loss": 0.4536, "step": 1264 }, { "epoch": 0.24940851735015773, "grad_norm": 0.7366373896084766, "learning_rate": 1.9814374147747012e-05, "loss": 0.4733, "step": 1265 }, { "epoch": 0.2496056782334385, "grad_norm": 0.6716024398128168, "learning_rate": 1.98140767616945e-05, "loss": 0.4879, "step": 1266 }, { "epoch": 0.24980283911671924, "grad_norm": 0.7567180353841482, "learning_rate": 1.98137791398509e-05, "loss": 0.4714, "step": 1267 }, { "epoch": 0.25, "grad_norm": 0.6280236993661752, "learning_rate": 1.981348128222335e-05, "loss": 0.4467, "step": 1268 }, { "epoch": 0.25, "eval_loss": 0.45942220091819763, "eval_runtime": 344.9533, "eval_samples_per_second": 23.568, "eval_steps_per_second": 1.476, "step": 1268 }, { "epoch": 0.25019716088328076, "grad_norm": 0.7858090304991342, "learning_rate": 1.9813183188819005e-05, "loss": 0.515, "step": 1269 }, { "epoch": 0.2503943217665615, "grad_norm": 0.7486528699456562, "learning_rate": 1.981288485964503e-05, "loss": 0.482, "step": 1270 }, { "epoch": 0.25059148264984227, "grad_norm": 0.6374094423910737, "learning_rate": 1.98125862947086e-05, "loss": 0.4309, "step": 1271 }, { "epoch": 0.250788643533123, "grad_norm": 0.7096340783400006, "learning_rate": 1.981228749401688e-05, "loss": 0.483, "step": 1272 }, { "epoch": 0.2509858044164038, "grad_norm": 0.7444158649998527, "learning_rate": 1.9811988457577054e-05, "loss": 0.4592, "step": 1273 }, { "epoch": 0.25118296529968454, "grad_norm": 0.7501735940555235, "learning_rate": 1.98116891853963e-05, "loss": 0.4499, "step": 1274 }, { "epoch": 0.2513801261829653, "grad_norm": 0.6782176587394123, "learning_rate": 1.9811389677481815e-05, "loss": 0.4538, "step": 1275 }, { "epoch": 0.25157728706624605, "grad_norm": 0.6415201673231513, "learning_rate": 1.9811089933840788e-05, "loss": 0.3966, "step": 1276 }, { "epoch": 0.2517744479495268, "grad_norm": 0.7902039683370593, "learning_rate": 1.9810789954480425e-05, "loss": 0.4445, "step": 1277 }, { "epoch": 0.25197160883280756, "grad_norm": 0.7144540560155214, "learning_rate": 1.9810489739407934e-05, "loss": 0.4711, "step": 1278 }, { "epoch": 0.2521687697160883, "grad_norm": 0.6516750917925597, "learning_rate": 1.9810189288630524e-05, "loss": 0.4704, "step": 1279 }, { "epoch": 0.25236593059936907, "grad_norm": 0.6422474397006303, "learning_rate": 1.980988860215542e-05, "loss": 0.4565, "step": 1280 }, { "epoch": 0.2525630914826498, "grad_norm": 0.6600661011809978, "learning_rate": 1.9809587679989843e-05, "loss": 0.4603, "step": 1281 }, { "epoch": 0.2527602523659306, "grad_norm": 0.6621495554696686, "learning_rate": 1.980928652214102e-05, "loss": 0.4539, "step": 1282 }, { "epoch": 0.25295741324921134, "grad_norm": 0.6210665496532219, "learning_rate": 1.980898512861619e-05, "loss": 0.388, "step": 1283 }, { "epoch": 0.2531545741324921, "grad_norm": 0.6372480467697316, "learning_rate": 1.9808683499422595e-05, "loss": 0.4625, "step": 1284 }, { "epoch": 0.25335173501577285, "grad_norm": 0.7040049125741846, "learning_rate": 1.9808381634567478e-05, "loss": 0.4644, "step": 1285 }, { "epoch": 0.2535488958990536, "grad_norm": 0.6788863637030159, "learning_rate": 1.9808079534058092e-05, "loss": 0.4233, "step": 1286 }, { "epoch": 0.25374605678233436, "grad_norm": 0.7084313524121193, "learning_rate": 1.9807777197901697e-05, "loss": 0.4692, "step": 1287 }, { "epoch": 0.2539432176656151, "grad_norm": 0.7122952126645031, "learning_rate": 1.9807474626105557e-05, "loss": 0.4687, "step": 1288 }, { "epoch": 0.2541403785488959, "grad_norm": 0.9877933281913743, "learning_rate": 1.9807171818676944e-05, "loss": 0.4496, "step": 1289 }, { "epoch": 0.25433753943217663, "grad_norm": 0.6865217783581581, "learning_rate": 1.9806868775623127e-05, "loss": 0.4448, "step": 1290 }, { "epoch": 0.25453470031545744, "grad_norm": 0.6823194099522876, "learning_rate": 1.980656549695139e-05, "loss": 0.4556, "step": 1291 }, { "epoch": 0.2547318611987382, "grad_norm": 0.6647345972741581, "learning_rate": 1.9806261982669025e-05, "loss": 0.4491, "step": 1292 }, { "epoch": 0.25492902208201895, "grad_norm": 0.6302502151295573, "learning_rate": 1.9805958232783314e-05, "loss": 0.4464, "step": 1293 }, { "epoch": 0.2551261829652997, "grad_norm": 0.6579780881631992, "learning_rate": 1.980565424730156e-05, "loss": 0.4485, "step": 1294 }, { "epoch": 0.25532334384858046, "grad_norm": 0.7655010402371665, "learning_rate": 1.9805350026231067e-05, "loss": 0.4669, "step": 1295 }, { "epoch": 0.2555205047318612, "grad_norm": 0.6625750789917858, "learning_rate": 1.9805045569579144e-05, "loss": 0.463, "step": 1296 }, { "epoch": 0.255717665615142, "grad_norm": 0.6662591258788151, "learning_rate": 1.9804740877353105e-05, "loss": 0.4367, "step": 1297 }, { "epoch": 0.25591482649842273, "grad_norm": 0.6108948522622889, "learning_rate": 1.980443594956027e-05, "loss": 0.4241, "step": 1298 }, { "epoch": 0.2561119873817035, "grad_norm": 0.7914240432743016, "learning_rate": 1.9804130786207966e-05, "loss": 0.4388, "step": 1299 }, { "epoch": 0.25630914826498424, "grad_norm": 0.6627711788576277, "learning_rate": 1.9803825387303525e-05, "loss": 0.4356, "step": 1300 }, { "epoch": 0.256506309148265, "grad_norm": 0.6421042720478715, "learning_rate": 1.9803519752854284e-05, "loss": 0.4434, "step": 1301 }, { "epoch": 0.25670347003154576, "grad_norm": 0.9154766392932525, "learning_rate": 1.9803213882867583e-05, "loss": 0.5037, "step": 1302 }, { "epoch": 0.2569006309148265, "grad_norm": 0.6412059503472081, "learning_rate": 1.9802907777350778e-05, "loss": 0.4555, "step": 1303 }, { "epoch": 0.25709779179810727, "grad_norm": 0.7328818019918474, "learning_rate": 1.9802601436311223e-05, "loss": 0.4795, "step": 1304 }, { "epoch": 0.257294952681388, "grad_norm": 1.290582323709545, "learning_rate": 1.980229485975627e-05, "loss": 0.4574, "step": 1305 }, { "epoch": 0.2574921135646688, "grad_norm": 0.7047549042877954, "learning_rate": 1.980198804769329e-05, "loss": 0.4441, "step": 1306 }, { "epoch": 0.25768927444794953, "grad_norm": 0.6450143398070649, "learning_rate": 1.9801681000129652e-05, "loss": 0.4523, "step": 1307 }, { "epoch": 0.2578864353312303, "grad_norm": 0.7236989507032727, "learning_rate": 1.9801373717072732e-05, "loss": 0.4811, "step": 1308 }, { "epoch": 0.25808359621451105, "grad_norm": 0.5982776267715486, "learning_rate": 1.980106619852992e-05, "loss": 0.4257, "step": 1309 }, { "epoch": 0.2582807570977918, "grad_norm": 1.5437331705738089, "learning_rate": 1.98007584445086e-05, "loss": 0.4755, "step": 1310 }, { "epoch": 0.25847791798107256, "grad_norm": 0.665819086891947, "learning_rate": 1.9800450455016163e-05, "loss": 0.426, "step": 1311 }, { "epoch": 0.2586750788643533, "grad_norm": 0.6986032017043294, "learning_rate": 1.9800142230060012e-05, "loss": 0.4677, "step": 1312 }, { "epoch": 0.25887223974763407, "grad_norm": 0.7076440147684866, "learning_rate": 1.9799833769647553e-05, "loss": 0.4849, "step": 1313 }, { "epoch": 0.2590694006309148, "grad_norm": 0.6993982811871999, "learning_rate": 1.9799525073786196e-05, "loss": 0.4636, "step": 1314 }, { "epoch": 0.2592665615141956, "grad_norm": 0.6663223299060342, "learning_rate": 1.9799216142483358e-05, "loss": 0.4374, "step": 1315 }, { "epoch": 0.25946372239747634, "grad_norm": 0.7322544020861116, "learning_rate": 1.9798906975746462e-05, "loss": 0.4616, "step": 1316 }, { "epoch": 0.2596608832807571, "grad_norm": 4.868764964094287, "learning_rate": 1.9798597573582935e-05, "loss": 0.5068, "step": 1317 }, { "epoch": 0.25985804416403785, "grad_norm": 1.6121049926148328, "learning_rate": 1.9798287936000203e-05, "loss": 0.4858, "step": 1318 }, { "epoch": 0.2600552050473186, "grad_norm": 0.7881657434369355, "learning_rate": 1.979797806300572e-05, "loss": 0.4707, "step": 1319 }, { "epoch": 0.26025236593059936, "grad_norm": 0.6719396543796176, "learning_rate": 1.9797667954606923e-05, "loss": 0.4375, "step": 1320 }, { "epoch": 0.2604495268138801, "grad_norm": 0.677646758521938, "learning_rate": 1.9797357610811264e-05, "loss": 0.3941, "step": 1321 }, { "epoch": 0.2606466876971609, "grad_norm": 0.6629789255172303, "learning_rate": 1.9797047031626197e-05, "loss": 0.4666, "step": 1322 }, { "epoch": 0.26084384858044163, "grad_norm": 0.7016369371591297, "learning_rate": 1.9796736217059184e-05, "loss": 0.4176, "step": 1323 }, { "epoch": 0.2610410094637224, "grad_norm": 0.7392651241817684, "learning_rate": 1.9796425167117697e-05, "loss": 0.4569, "step": 1324 }, { "epoch": 0.26123817034700314, "grad_norm": 3.4278364452599943, "learning_rate": 1.9796113881809207e-05, "loss": 0.4897, "step": 1325 }, { "epoch": 0.2614353312302839, "grad_norm": 0.7430113958710807, "learning_rate": 1.979580236114119e-05, "loss": 0.4281, "step": 1326 }, { "epoch": 0.26163249211356465, "grad_norm": 0.7574764251542814, "learning_rate": 1.9795490605121133e-05, "loss": 0.4285, "step": 1327 }, { "epoch": 0.2618296529968454, "grad_norm": 0.7573150663369252, "learning_rate": 1.9795178613756526e-05, "loss": 0.4214, "step": 1328 }, { "epoch": 0.26202681388012616, "grad_norm": 0.7233219537322484, "learning_rate": 1.9794866387054866e-05, "loss": 0.4333, "step": 1329 }, { "epoch": 0.2622239747634069, "grad_norm": 0.6536227209487654, "learning_rate": 1.9794553925023648e-05, "loss": 0.508, "step": 1330 }, { "epoch": 0.2624211356466877, "grad_norm": 0.697150695998406, "learning_rate": 1.979424122767039e-05, "loss": 0.4402, "step": 1331 }, { "epoch": 0.26261829652996843, "grad_norm": 1.1510688800982456, "learning_rate": 1.97939282950026e-05, "loss": 0.4935, "step": 1332 }, { "epoch": 0.2628154574132492, "grad_norm": 0.6936122849738883, "learning_rate": 1.979361512702779e-05, "loss": 0.4369, "step": 1333 }, { "epoch": 0.26301261829652994, "grad_norm": 0.7316037732825653, "learning_rate": 1.9793301723753494e-05, "loss": 0.4774, "step": 1334 }, { "epoch": 0.2632097791798107, "grad_norm": 0.7046667225142733, "learning_rate": 1.9792988085187237e-05, "loss": 0.4586, "step": 1335 }, { "epoch": 0.2634069400630915, "grad_norm": 0.7576252182556432, "learning_rate": 1.9792674211336557e-05, "loss": 0.4444, "step": 1336 }, { "epoch": 0.26360410094637227, "grad_norm": 0.702436953338292, "learning_rate": 1.9792360102208987e-05, "loss": 0.4508, "step": 1337 }, { "epoch": 0.263801261829653, "grad_norm": 0.8581168449077242, "learning_rate": 1.9792045757812083e-05, "loss": 0.4536, "step": 1338 }, { "epoch": 0.2639984227129338, "grad_norm": 0.7421141473102278, "learning_rate": 1.9791731178153398e-05, "loss": 0.4884, "step": 1339 }, { "epoch": 0.26419558359621453, "grad_norm": 0.8029593819817289, "learning_rate": 1.979141636324048e-05, "loss": 0.5005, "step": 1340 }, { "epoch": 0.2643927444794953, "grad_norm": 0.7626034173756989, "learning_rate": 1.97911013130809e-05, "loss": 0.4701, "step": 1341 }, { "epoch": 0.26458990536277605, "grad_norm": 0.6947755523676065, "learning_rate": 1.979078602768223e-05, "loss": 0.4649, "step": 1342 }, { "epoch": 0.2647870662460568, "grad_norm": 0.7173606924037355, "learning_rate": 1.9790470507052043e-05, "loss": 0.4636, "step": 1343 }, { "epoch": 0.26498422712933756, "grad_norm": 0.7174536980291077, "learning_rate": 1.979015475119791e-05, "loss": 0.4722, "step": 1344 }, { "epoch": 0.2651813880126183, "grad_norm": 0.7091313571434286, "learning_rate": 1.978983876012743e-05, "loss": 0.4604, "step": 1345 }, { "epoch": 0.26537854889589907, "grad_norm": 0.7938427813870562, "learning_rate": 1.978952253384819e-05, "loss": 0.4683, "step": 1346 }, { "epoch": 0.2655757097791798, "grad_norm": 0.8665000592475282, "learning_rate": 1.9789206072367788e-05, "loss": 0.4651, "step": 1347 }, { "epoch": 0.2657728706624606, "grad_norm": 0.7476112206662554, "learning_rate": 1.9788889375693826e-05, "loss": 0.4501, "step": 1348 }, { "epoch": 0.26597003154574134, "grad_norm": 0.6411119915064069, "learning_rate": 1.978857244383391e-05, "loss": 0.4332, "step": 1349 }, { "epoch": 0.2661671924290221, "grad_norm": 0.7501426692435633, "learning_rate": 1.9788255276795665e-05, "loss": 0.4482, "step": 1350 }, { "epoch": 0.26636435331230285, "grad_norm": 0.6924050647879918, "learning_rate": 1.97879378745867e-05, "loss": 0.4781, "step": 1351 }, { "epoch": 0.2665615141955836, "grad_norm": 0.7692951834407704, "learning_rate": 1.9787620237214648e-05, "loss": 0.4492, "step": 1352 }, { "epoch": 0.26675867507886436, "grad_norm": 0.7874785369772107, "learning_rate": 1.9787302364687137e-05, "loss": 0.4625, "step": 1353 }, { "epoch": 0.2669558359621451, "grad_norm": 0.7706538665841983, "learning_rate": 1.9786984257011804e-05, "loss": 0.5065, "step": 1354 }, { "epoch": 0.26715299684542587, "grad_norm": 0.9023943582130787, "learning_rate": 1.9786665914196293e-05, "loss": 0.4269, "step": 1355 }, { "epoch": 0.26735015772870663, "grad_norm": 0.7444077480799488, "learning_rate": 1.978634733624825e-05, "loss": 0.4632, "step": 1356 }, { "epoch": 0.2675473186119874, "grad_norm": 0.750725217959198, "learning_rate": 1.9786028523175334e-05, "loss": 0.4813, "step": 1357 }, { "epoch": 0.26774447949526814, "grad_norm": 0.6159139836796179, "learning_rate": 1.9785709474985205e-05, "loss": 0.4647, "step": 1358 }, { "epoch": 0.2679416403785489, "grad_norm": 0.7129947513385421, "learning_rate": 1.978539019168552e-05, "loss": 0.4177, "step": 1359 }, { "epoch": 0.26813880126182965, "grad_norm": 0.6516268614502504, "learning_rate": 1.9785070673283958e-05, "loss": 0.4365, "step": 1360 }, { "epoch": 0.2683359621451104, "grad_norm": 0.7106387839347237, "learning_rate": 1.9784750919788192e-05, "loss": 0.4908, "step": 1361 }, { "epoch": 0.26853312302839116, "grad_norm": 0.6645867370230059, "learning_rate": 1.978443093120591e-05, "loss": 0.4646, "step": 1362 }, { "epoch": 0.2687302839116719, "grad_norm": 0.6609654446353767, "learning_rate": 1.978411070754479e-05, "loss": 0.4464, "step": 1363 }, { "epoch": 0.2689274447949527, "grad_norm": 0.6509684679600655, "learning_rate": 1.9783790248812535e-05, "loss": 0.4378, "step": 1364 }, { "epoch": 0.26912460567823343, "grad_norm": 0.7410255193953944, "learning_rate": 1.9783469555016838e-05, "loss": 0.4883, "step": 1365 }, { "epoch": 0.2693217665615142, "grad_norm": 0.6410457849255176, "learning_rate": 1.9783148626165408e-05, "loss": 0.4566, "step": 1366 }, { "epoch": 0.26951892744479494, "grad_norm": 0.7212189325599111, "learning_rate": 1.978282746226595e-05, "loss": 0.4748, "step": 1367 }, { "epoch": 0.2697160883280757, "grad_norm": 0.6386044383277297, "learning_rate": 1.9782506063326188e-05, "loss": 0.4525, "step": 1368 }, { "epoch": 0.26991324921135645, "grad_norm": 0.6183024303790638, "learning_rate": 1.978218442935384e-05, "loss": 0.4419, "step": 1369 }, { "epoch": 0.2701104100946372, "grad_norm": 1.0311747282997497, "learning_rate": 1.9781862560356632e-05, "loss": 0.4705, "step": 1370 }, { "epoch": 0.27030757097791797, "grad_norm": 0.8876601008055358, "learning_rate": 1.97815404563423e-05, "loss": 0.4952, "step": 1371 }, { "epoch": 0.2705047318611987, "grad_norm": 0.6707553837334197, "learning_rate": 1.978121811731858e-05, "loss": 0.4727, "step": 1372 }, { "epoch": 0.2707018927444795, "grad_norm": 0.6324861998325033, "learning_rate": 1.978089554329322e-05, "loss": 0.4512, "step": 1373 }, { "epoch": 0.27089905362776023, "grad_norm": 0.7208718808928518, "learning_rate": 1.9780572734273965e-05, "loss": 0.4708, "step": 1374 }, { "epoch": 0.271096214511041, "grad_norm": 0.6289916939191427, "learning_rate": 1.9780249690268577e-05, "loss": 0.46, "step": 1375 }, { "epoch": 0.27129337539432175, "grad_norm": 0.6255312002347381, "learning_rate": 1.977992641128481e-05, "loss": 0.4113, "step": 1376 }, { "epoch": 0.2714905362776025, "grad_norm": 0.7538032993514353, "learning_rate": 1.977960289733044e-05, "loss": 0.5171, "step": 1377 }, { "epoch": 0.27168769716088326, "grad_norm": 0.6617762757068099, "learning_rate": 1.977927914841323e-05, "loss": 0.4574, "step": 1378 }, { "epoch": 0.271884858044164, "grad_norm": 0.6317670145204217, "learning_rate": 1.9778955164540966e-05, "loss": 0.4416, "step": 1379 }, { "epoch": 0.27208201892744477, "grad_norm": 0.6625339760031199, "learning_rate": 1.977863094572143e-05, "loss": 0.4695, "step": 1380 }, { "epoch": 0.2722791798107255, "grad_norm": 0.8851653534578081, "learning_rate": 1.977830649196241e-05, "loss": 0.469, "step": 1381 }, { "epoch": 0.27247634069400634, "grad_norm": 0.6525212576179507, "learning_rate": 1.9777981803271702e-05, "loss": 0.4607, "step": 1382 }, { "epoch": 0.2726735015772871, "grad_norm": 0.687794207764544, "learning_rate": 1.9777656879657104e-05, "loss": 0.4329, "step": 1383 }, { "epoch": 0.27287066246056785, "grad_norm": 0.6477780040394759, "learning_rate": 1.9777331721126432e-05, "loss": 0.455, "step": 1384 }, { "epoch": 0.2730678233438486, "grad_norm": 0.6704764509430108, "learning_rate": 1.9777006327687486e-05, "loss": 0.4756, "step": 1385 }, { "epoch": 0.27326498422712936, "grad_norm": 0.6673187768283187, "learning_rate": 1.9776680699348093e-05, "loss": 0.4525, "step": 1386 }, { "epoch": 0.2734621451104101, "grad_norm": 0.6725761328570758, "learning_rate": 1.977635483611607e-05, "loss": 0.4465, "step": 1387 }, { "epoch": 0.27365930599369087, "grad_norm": 0.6036295529404659, "learning_rate": 1.9776028737999256e-05, "loss": 0.4371, "step": 1388 }, { "epoch": 0.2738564668769716, "grad_norm": 0.649936023702026, "learning_rate": 1.9775702405005473e-05, "loss": 0.4419, "step": 1389 }, { "epoch": 0.2740536277602524, "grad_norm": 0.6259329731865518, "learning_rate": 1.977537583714257e-05, "loss": 0.4607, "step": 1390 }, { "epoch": 0.27425078864353314, "grad_norm": 0.9337258788421762, "learning_rate": 1.9775049034418384e-05, "loss": 0.4314, "step": 1391 }, { "epoch": 0.2744479495268139, "grad_norm": 0.7553643495249596, "learning_rate": 1.977472199684078e-05, "loss": 0.4019, "step": 1392 }, { "epoch": 0.27464511041009465, "grad_norm": 0.6681682048717218, "learning_rate": 1.9774394724417608e-05, "loss": 0.4669, "step": 1393 }, { "epoch": 0.2748422712933754, "grad_norm": 0.6660846989527527, "learning_rate": 1.977406721715673e-05, "loss": 0.4592, "step": 1394 }, { "epoch": 0.27503943217665616, "grad_norm": 1.030125307014766, "learning_rate": 1.9773739475066015e-05, "loss": 0.4519, "step": 1395 }, { "epoch": 0.2752365930599369, "grad_norm": 0.6165906390246885, "learning_rate": 1.977341149815334e-05, "loss": 0.4611, "step": 1396 }, { "epoch": 0.2754337539432177, "grad_norm": 0.6301071810608176, "learning_rate": 1.977308328642658e-05, "loss": 0.4728, "step": 1397 }, { "epoch": 0.27563091482649843, "grad_norm": 0.603328270231981, "learning_rate": 1.9772754839893627e-05, "loss": 0.434, "step": 1398 }, { "epoch": 0.2758280757097792, "grad_norm": 0.6648768357451612, "learning_rate": 1.9772426158562367e-05, "loss": 0.4612, "step": 1399 }, { "epoch": 0.27602523659305994, "grad_norm": 0.6209334691515703, "learning_rate": 1.9772097242440703e-05, "loss": 0.4629, "step": 1400 }, { "epoch": 0.2762223974763407, "grad_norm": 0.6615598121420424, "learning_rate": 1.9771768091536528e-05, "loss": 0.4546, "step": 1401 }, { "epoch": 0.27641955835962145, "grad_norm": 0.6824317459563148, "learning_rate": 1.977143870585776e-05, "loss": 0.4428, "step": 1402 }, { "epoch": 0.2766167192429022, "grad_norm": 0.6670787550837416, "learning_rate": 1.9771109085412304e-05, "loss": 0.4997, "step": 1403 }, { "epoch": 0.27681388012618297, "grad_norm": 0.8229581570264787, "learning_rate": 1.9770779230208088e-05, "loss": 0.4375, "step": 1404 }, { "epoch": 0.2770110410094637, "grad_norm": 0.637951151429009, "learning_rate": 1.977044914025303e-05, "loss": 0.4322, "step": 1405 }, { "epoch": 0.2772082018927445, "grad_norm": 0.607442608155513, "learning_rate": 1.9770118815555063e-05, "loss": 0.4242, "step": 1406 }, { "epoch": 0.27740536277602523, "grad_norm": 0.6993192647998349, "learning_rate": 1.9769788256122125e-05, "loss": 0.4741, "step": 1407 }, { "epoch": 0.277602523659306, "grad_norm": 0.6691537581034921, "learning_rate": 1.9769457461962154e-05, "loss": 0.4504, "step": 1408 }, { "epoch": 0.27779968454258674, "grad_norm": 0.7485422674380534, "learning_rate": 1.9769126433083102e-05, "loss": 0.4825, "step": 1409 }, { "epoch": 0.2779968454258675, "grad_norm": 0.6486895377627935, "learning_rate": 1.976879516949292e-05, "loss": 0.459, "step": 1410 }, { "epoch": 0.27819400630914826, "grad_norm": 0.669976702587913, "learning_rate": 1.976846367119957e-05, "loss": 0.4618, "step": 1411 }, { "epoch": 0.278391167192429, "grad_norm": 0.6947541489605775, "learning_rate": 1.976813193821101e-05, "loss": 0.4782, "step": 1412 }, { "epoch": 0.27858832807570977, "grad_norm": 0.7279085002000397, "learning_rate": 1.9767799970535214e-05, "loss": 0.4584, "step": 1413 }, { "epoch": 0.2787854889589905, "grad_norm": 0.6324612775847488, "learning_rate": 1.9767467768180163e-05, "loss": 0.4429, "step": 1414 }, { "epoch": 0.2789826498422713, "grad_norm": 0.7075686828831383, "learning_rate": 1.9767135331153827e-05, "loss": 0.497, "step": 1415 }, { "epoch": 0.27917981072555204, "grad_norm": 0.677366770732698, "learning_rate": 1.97668026594642e-05, "loss": 0.4739, "step": 1416 }, { "epoch": 0.2793769716088328, "grad_norm": 0.6413070384790073, "learning_rate": 1.9766469753119274e-05, "loss": 0.4609, "step": 1417 }, { "epoch": 0.27957413249211355, "grad_norm": 0.6875558936981296, "learning_rate": 1.976613661212705e-05, "loss": 0.5027, "step": 1418 }, { "epoch": 0.2797712933753943, "grad_norm": 0.7069012738884121, "learning_rate": 1.976580323649553e-05, "loss": 0.4675, "step": 1419 }, { "epoch": 0.27996845425867506, "grad_norm": 0.6999152312103506, "learning_rate": 1.976546962623272e-05, "loss": 0.479, "step": 1420 }, { "epoch": 0.2801656151419558, "grad_norm": 0.632966968386772, "learning_rate": 1.9765135781346637e-05, "loss": 0.4458, "step": 1421 }, { "epoch": 0.28036277602523657, "grad_norm": 0.680032576952019, "learning_rate": 1.9764801701845307e-05, "loss": 0.4904, "step": 1422 }, { "epoch": 0.2805599369085173, "grad_norm": 0.6793474835680345, "learning_rate": 1.9764467387736748e-05, "loss": 0.4594, "step": 1423 }, { "epoch": 0.2807570977917981, "grad_norm": 0.6990046169084455, "learning_rate": 1.9764132839029e-05, "loss": 0.4825, "step": 1424 }, { "epoch": 0.28095425867507884, "grad_norm": 0.7012958434364689, "learning_rate": 1.9763798055730096e-05, "loss": 0.4394, "step": 1425 }, { "epoch": 0.2811514195583596, "grad_norm": 0.649381673277496, "learning_rate": 1.9763463037848082e-05, "loss": 0.4832, "step": 1426 }, { "epoch": 0.2813485804416404, "grad_norm": 0.7051628624005888, "learning_rate": 1.9763127785391007e-05, "loss": 0.4962, "step": 1427 }, { "epoch": 0.28154574132492116, "grad_norm": 0.6757095316378556, "learning_rate": 1.976279229836692e-05, "loss": 0.4448, "step": 1428 }, { "epoch": 0.2817429022082019, "grad_norm": 0.7505169408393606, "learning_rate": 1.976245657678389e-05, "loss": 0.4875, "step": 1429 }, { "epoch": 0.2819400630914827, "grad_norm": 0.6482256189057635, "learning_rate": 1.9762120620649978e-05, "loss": 0.4457, "step": 1430 }, { "epoch": 0.28213722397476343, "grad_norm": 0.6341748454996773, "learning_rate": 1.9761784429973257e-05, "loss": 0.4298, "step": 1431 }, { "epoch": 0.2823343848580442, "grad_norm": 0.6426740598337791, "learning_rate": 1.9761448004761804e-05, "loss": 0.4351, "step": 1432 }, { "epoch": 0.28253154574132494, "grad_norm": 0.6893390849187452, "learning_rate": 1.9761111345023702e-05, "loss": 0.4528, "step": 1433 }, { "epoch": 0.2827287066246057, "grad_norm": 0.6931202899611968, "learning_rate": 1.976077445076704e-05, "loss": 0.4504, "step": 1434 }, { "epoch": 0.28292586750788645, "grad_norm": 0.6396492394646054, "learning_rate": 1.976043732199991e-05, "loss": 0.5051, "step": 1435 }, { "epoch": 0.2831230283911672, "grad_norm": 0.6068638024331832, "learning_rate": 1.9760099958730414e-05, "loss": 0.3965, "step": 1436 }, { "epoch": 0.28332018927444796, "grad_norm": 0.6793441643898733, "learning_rate": 1.9759762360966658e-05, "loss": 0.4565, "step": 1437 }, { "epoch": 0.2835173501577287, "grad_norm": 0.6338152033705464, "learning_rate": 1.9759424528716748e-05, "loss": 0.3908, "step": 1438 }, { "epoch": 0.2837145110410095, "grad_norm": 0.6375149590618007, "learning_rate": 1.975908646198881e-05, "loss": 0.4457, "step": 1439 }, { "epoch": 0.28391167192429023, "grad_norm": 0.6912444261554432, "learning_rate": 1.9758748160790956e-05, "loss": 0.4556, "step": 1440 }, { "epoch": 0.284108832807571, "grad_norm": 0.7308297162675815, "learning_rate": 1.975840962513132e-05, "loss": 0.484, "step": 1441 }, { "epoch": 0.28430599369085174, "grad_norm": 0.7358253544207752, "learning_rate": 1.9758070855018033e-05, "loss": 0.4667, "step": 1442 }, { "epoch": 0.2845031545741325, "grad_norm": 0.6240361195624918, "learning_rate": 1.975773185045924e-05, "loss": 0.4549, "step": 1443 }, { "epoch": 0.28470031545741326, "grad_norm": 0.6895402014641637, "learning_rate": 1.975739261146308e-05, "loss": 0.4234, "step": 1444 }, { "epoch": 0.284897476340694, "grad_norm": 2.677764220478418, "learning_rate": 1.97570531380377e-05, "loss": 0.4986, "step": 1445 }, { "epoch": 0.28509463722397477, "grad_norm": 0.7134750571138294, "learning_rate": 1.975671343019126e-05, "loss": 0.4143, "step": 1446 }, { "epoch": 0.2852917981072555, "grad_norm": 0.7374402505710781, "learning_rate": 1.9756373487931932e-05, "loss": 0.478, "step": 1447 }, { "epoch": 0.2854889589905363, "grad_norm": 0.7841802091710707, "learning_rate": 1.975603331126787e-05, "loss": 0.4791, "step": 1448 }, { "epoch": 0.28568611987381703, "grad_norm": 0.7851659427951097, "learning_rate": 1.975569290020725e-05, "loss": 0.4445, "step": 1449 }, { "epoch": 0.2858832807570978, "grad_norm": 0.7016574816261865, "learning_rate": 1.9755352254758253e-05, "loss": 0.4718, "step": 1450 }, { "epoch": 0.28608044164037855, "grad_norm": 1.360154947822406, "learning_rate": 1.975501137492906e-05, "loss": 0.4778, "step": 1451 }, { "epoch": 0.2862776025236593, "grad_norm": 0.7088540961818571, "learning_rate": 1.9754670260727865e-05, "loss": 0.4956, "step": 1452 }, { "epoch": 0.28647476340694006, "grad_norm": 0.6877943073595973, "learning_rate": 1.9754328912162864e-05, "loss": 0.4666, "step": 1453 }, { "epoch": 0.2866719242902208, "grad_norm": 0.7632821008022346, "learning_rate": 1.975398732924225e-05, "loss": 0.4624, "step": 1454 }, { "epoch": 0.28686908517350157, "grad_norm": 0.649955168958308, "learning_rate": 1.975364551197424e-05, "loss": 0.4327, "step": 1455 }, { "epoch": 0.2870662460567823, "grad_norm": 1.5748278853806321, "learning_rate": 1.975330346036704e-05, "loss": 0.4463, "step": 1456 }, { "epoch": 0.2872634069400631, "grad_norm": 0.6418235523968513, "learning_rate": 1.975296117442887e-05, "loss": 0.4426, "step": 1457 }, { "epoch": 0.28746056782334384, "grad_norm": 0.7529789256896934, "learning_rate": 1.9752618654167954e-05, "loss": 0.4525, "step": 1458 }, { "epoch": 0.2876577287066246, "grad_norm": 0.6837140757349681, "learning_rate": 1.975227589959252e-05, "loss": 0.4664, "step": 1459 }, { "epoch": 0.28785488958990535, "grad_norm": 0.7302417926946705, "learning_rate": 1.9751932910710808e-05, "loss": 0.4344, "step": 1460 }, { "epoch": 0.2880520504731861, "grad_norm": 0.6549023096371199, "learning_rate": 1.9751589687531052e-05, "loss": 0.4637, "step": 1461 }, { "epoch": 0.28824921135646686, "grad_norm": 0.7572816037486845, "learning_rate": 1.97512462300615e-05, "loss": 0.4808, "step": 1462 }, { "epoch": 0.2884463722397476, "grad_norm": 0.7438685296075004, "learning_rate": 1.9750902538310407e-05, "loss": 0.4554, "step": 1463 }, { "epoch": 0.2886435331230284, "grad_norm": 0.7921043989775165, "learning_rate": 1.9750558612286025e-05, "loss": 0.4669, "step": 1464 }, { "epoch": 0.28884069400630913, "grad_norm": 1.7620430759198427, "learning_rate": 1.9750214451996623e-05, "loss": 0.4356, "step": 1465 }, { "epoch": 0.2890378548895899, "grad_norm": 0.7869312505011672, "learning_rate": 1.9749870057450464e-05, "loss": 0.4502, "step": 1466 }, { "epoch": 0.28923501577287064, "grad_norm": 0.7845532647336225, "learning_rate": 1.974952542865583e-05, "loss": 0.4835, "step": 1467 }, { "epoch": 0.2894321766561514, "grad_norm": 0.7515938063415738, "learning_rate": 1.9749180565620995e-05, "loss": 0.4507, "step": 1468 }, { "epoch": 0.28962933753943215, "grad_norm": 0.8913235246679625, "learning_rate": 1.9748835468354243e-05, "loss": 0.4729, "step": 1469 }, { "epoch": 0.2898264984227129, "grad_norm": 0.8870979739457769, "learning_rate": 1.9748490136863867e-05, "loss": 0.4391, "step": 1470 }, { "epoch": 0.29002365930599366, "grad_norm": 0.7727887896133023, "learning_rate": 1.9748144571158167e-05, "loss": 0.501, "step": 1471 }, { "epoch": 0.2902208201892745, "grad_norm": 0.6725824982109148, "learning_rate": 1.9747798771245446e-05, "loss": 0.4284, "step": 1472 }, { "epoch": 0.29041798107255523, "grad_norm": 0.7636840941258795, "learning_rate": 1.9747452737134006e-05, "loss": 0.4975, "step": 1473 }, { "epoch": 0.290615141955836, "grad_norm": 0.695534933075417, "learning_rate": 1.974710646883217e-05, "loss": 0.4367, "step": 1474 }, { "epoch": 0.29081230283911674, "grad_norm": 0.7808275437629759, "learning_rate": 1.9746759966348244e-05, "loss": 0.4821, "step": 1475 }, { "epoch": 0.2910094637223975, "grad_norm": 0.6839691016507893, "learning_rate": 1.9746413229690565e-05, "loss": 0.4496, "step": 1476 }, { "epoch": 0.29120662460567825, "grad_norm": 0.7291630955736097, "learning_rate": 1.974606625886746e-05, "loss": 0.467, "step": 1477 }, { "epoch": 0.291403785488959, "grad_norm": 0.7415071271998195, "learning_rate": 1.9745719053887265e-05, "loss": 0.4563, "step": 1478 }, { "epoch": 0.29160094637223977, "grad_norm": 0.721134342027533, "learning_rate": 1.974537161475832e-05, "loss": 0.4157, "step": 1479 }, { "epoch": 0.2917981072555205, "grad_norm": 0.6815829464296383, "learning_rate": 1.9745023941488974e-05, "loss": 0.4959, "step": 1480 }, { "epoch": 0.2919952681388013, "grad_norm": 0.6455560207981099, "learning_rate": 1.974467603408758e-05, "loss": 0.4131, "step": 1481 }, { "epoch": 0.29219242902208203, "grad_norm": 0.7495965931748421, "learning_rate": 1.9744327892562497e-05, "loss": 0.4536, "step": 1482 }, { "epoch": 0.2923895899053628, "grad_norm": 4.622108646120691, "learning_rate": 1.974397951692209e-05, "loss": 0.4472, "step": 1483 }, { "epoch": 0.29258675078864355, "grad_norm": 0.7671628175372462, "learning_rate": 1.9743630907174727e-05, "loss": 0.4463, "step": 1484 }, { "epoch": 0.2927839116719243, "grad_norm": 0.6306822829326908, "learning_rate": 1.9743282063328786e-05, "loss": 0.4321, "step": 1485 }, { "epoch": 0.29298107255520506, "grad_norm": 0.8026977169526986, "learning_rate": 1.9742932985392646e-05, "loss": 0.44, "step": 1486 }, { "epoch": 0.2931782334384858, "grad_norm": 0.6458962336741704, "learning_rate": 1.97425836733747e-05, "loss": 0.415, "step": 1487 }, { "epoch": 0.29337539432176657, "grad_norm": 0.8366441003492903, "learning_rate": 1.9742234127283328e-05, "loss": 0.5109, "step": 1488 }, { "epoch": 0.2935725552050473, "grad_norm": 3.206384511793217, "learning_rate": 1.9741884347126937e-05, "loss": 0.4654, "step": 1489 }, { "epoch": 0.2937697160883281, "grad_norm": 1.0905491570369534, "learning_rate": 1.9741534332913934e-05, "loss": 0.4734, "step": 1490 }, { "epoch": 0.29396687697160884, "grad_norm": 0.7932149212181082, "learning_rate": 1.9741184084652723e-05, "loss": 0.4809, "step": 1491 }, { "epoch": 0.2941640378548896, "grad_norm": 0.907352399489384, "learning_rate": 1.9740833602351718e-05, "loss": 0.4856, "step": 1492 }, { "epoch": 0.29436119873817035, "grad_norm": 0.8172339427555916, "learning_rate": 1.9740482886019342e-05, "loss": 0.4348, "step": 1493 }, { "epoch": 0.2945583596214511, "grad_norm": 0.7379386874605652, "learning_rate": 1.974013193566402e-05, "loss": 0.4315, "step": 1494 }, { "epoch": 0.29475552050473186, "grad_norm": 0.7315832037117764, "learning_rate": 1.9739780751294188e-05, "loss": 0.4302, "step": 1495 }, { "epoch": 0.2949526813880126, "grad_norm": 0.7363421312105899, "learning_rate": 1.9739429332918276e-05, "loss": 0.4428, "step": 1496 }, { "epoch": 0.29514984227129337, "grad_norm": 0.7954453286705275, "learning_rate": 1.973907768054473e-05, "loss": 0.4826, "step": 1497 }, { "epoch": 0.29534700315457413, "grad_norm": 0.6920665293293689, "learning_rate": 1.9738725794182004e-05, "loss": 0.45, "step": 1498 }, { "epoch": 0.2955441640378549, "grad_norm": 0.7476711485594634, "learning_rate": 1.9738373673838545e-05, "loss": 0.47, "step": 1499 }, { "epoch": 0.29574132492113564, "grad_norm": 0.6378986468629754, "learning_rate": 1.9738021319522817e-05, "loss": 0.446, "step": 1500 }, { "epoch": 0.2959384858044164, "grad_norm": 0.7449299773188399, "learning_rate": 1.9737668731243284e-05, "loss": 0.4622, "step": 1501 }, { "epoch": 0.29613564668769715, "grad_norm": 0.6648614812967949, "learning_rate": 1.973731590900842e-05, "loss": 0.4512, "step": 1502 }, { "epoch": 0.2963328075709779, "grad_norm": 0.7321775927414682, "learning_rate": 1.97369628528267e-05, "loss": 0.423, "step": 1503 }, { "epoch": 0.29652996845425866, "grad_norm": 0.6801575733467247, "learning_rate": 1.9736609562706604e-05, "loss": 0.4424, "step": 1504 }, { "epoch": 0.2967271293375394, "grad_norm": 0.6497325650317418, "learning_rate": 1.9736256038656624e-05, "loss": 0.4494, "step": 1505 }, { "epoch": 0.2969242902208202, "grad_norm": 0.7063493502541413, "learning_rate": 1.9735902280685252e-05, "loss": 0.4375, "step": 1506 }, { "epoch": 0.29712145110410093, "grad_norm": 0.6907055254378035, "learning_rate": 1.9735548288800988e-05, "loss": 0.4723, "step": 1507 }, { "epoch": 0.2973186119873817, "grad_norm": 0.6955235821677667, "learning_rate": 1.9735194063012337e-05, "loss": 0.4377, "step": 1508 }, { "epoch": 0.29751577287066244, "grad_norm": 0.6505153342650509, "learning_rate": 1.9734839603327805e-05, "loss": 0.4741, "step": 1509 }, { "epoch": 0.2977129337539432, "grad_norm": 1.21645699588291, "learning_rate": 1.973448490975592e-05, "loss": 0.488, "step": 1510 }, { "epoch": 0.29791009463722395, "grad_norm": 5.633561734216787, "learning_rate": 1.9734129982305187e-05, "loss": 0.5188, "step": 1511 }, { "epoch": 0.2981072555205047, "grad_norm": 5.960949833229553, "learning_rate": 1.9733774820984146e-05, "loss": 0.4864, "step": 1512 }, { "epoch": 0.29830441640378547, "grad_norm": 0.8936987496926002, "learning_rate": 1.9733419425801326e-05, "loss": 0.4904, "step": 1513 }, { "epoch": 0.2985015772870662, "grad_norm": 0.8736184927683982, "learning_rate": 1.9733063796765267e-05, "loss": 0.4807, "step": 1514 }, { "epoch": 0.298698738170347, "grad_norm": 0.8389206829886541, "learning_rate": 1.9732707933884508e-05, "loss": 0.4708, "step": 1515 }, { "epoch": 0.29889589905362773, "grad_norm": 0.877518939540052, "learning_rate": 1.973235183716761e-05, "loss": 0.4064, "step": 1516 }, { "epoch": 0.2990930599369085, "grad_norm": 1.0406389624738088, "learning_rate": 1.9731995506623118e-05, "loss": 0.4824, "step": 1517 }, { "epoch": 0.2992902208201893, "grad_norm": 0.8112544551340672, "learning_rate": 1.9731638942259596e-05, "loss": 0.4748, "step": 1518 }, { "epoch": 0.29948738170347006, "grad_norm": 0.7560476486010564, "learning_rate": 1.9731282144085613e-05, "loss": 0.4459, "step": 1519 }, { "epoch": 0.2996845425867508, "grad_norm": 0.8150605495041554, "learning_rate": 1.973092511210974e-05, "loss": 0.4697, "step": 1520 }, { "epoch": 0.29988170347003157, "grad_norm": 0.7564808190893473, "learning_rate": 1.9730567846340552e-05, "loss": 0.4705, "step": 1521 }, { "epoch": 0.3000788643533123, "grad_norm": 0.7262145297310585, "learning_rate": 1.973021034678664e-05, "loss": 0.3978, "step": 1522 }, { "epoch": 0.3002760252365931, "grad_norm": 0.7038980359439974, "learning_rate": 1.9729852613456586e-05, "loss": 0.4584, "step": 1523 }, { "epoch": 0.30047318611987384, "grad_norm": 0.7588921782146265, "learning_rate": 1.972949464635899e-05, "loss": 0.4819, "step": 1524 }, { "epoch": 0.3006703470031546, "grad_norm": 0.7168024227685293, "learning_rate": 1.9729136445502446e-05, "loss": 0.4864, "step": 1525 }, { "epoch": 0.30086750788643535, "grad_norm": 0.6493407202654529, "learning_rate": 1.9728778010895567e-05, "loss": 0.4107, "step": 1526 }, { "epoch": 0.3010646687697161, "grad_norm": 0.6952700541408848, "learning_rate": 1.9728419342546962e-05, "loss": 0.495, "step": 1527 }, { "epoch": 0.30126182965299686, "grad_norm": 1.1339617708695726, "learning_rate": 1.9728060440465246e-05, "loss": 0.4262, "step": 1528 }, { "epoch": 0.3014589905362776, "grad_norm": 0.6443436669507463, "learning_rate": 1.9727701304659046e-05, "loss": 0.4323, "step": 1529 }, { "epoch": 0.30165615141955837, "grad_norm": 0.7193540103567199, "learning_rate": 1.9727341935136987e-05, "loss": 0.4688, "step": 1530 }, { "epoch": 0.3018533123028391, "grad_norm": 0.6808229865935738, "learning_rate": 1.9726982331907706e-05, "loss": 0.4637, "step": 1531 }, { "epoch": 0.3020504731861199, "grad_norm": 0.6469885340908015, "learning_rate": 1.972662249497984e-05, "loss": 0.4533, "step": 1532 }, { "epoch": 0.30224763406940064, "grad_norm": 0.8050241386096714, "learning_rate": 1.9726262424362033e-05, "loss": 0.4629, "step": 1533 }, { "epoch": 0.3024447949526814, "grad_norm": 1.1279481880937756, "learning_rate": 1.9725902120062942e-05, "loss": 0.4769, "step": 1534 }, { "epoch": 0.30264195583596215, "grad_norm": 0.9362607518901818, "learning_rate": 1.972554158209122e-05, "loss": 0.4499, "step": 1535 }, { "epoch": 0.3028391167192429, "grad_norm": 0.6311429250558253, "learning_rate": 1.972518081045553e-05, "loss": 0.4707, "step": 1536 }, { "epoch": 0.30303627760252366, "grad_norm": 0.6908119659769627, "learning_rate": 1.9724819805164542e-05, "loss": 0.459, "step": 1537 }, { "epoch": 0.3032334384858044, "grad_norm": 1.3140084577134628, "learning_rate": 1.972445856622692e-05, "loss": 0.4604, "step": 1538 }, { "epoch": 0.3034305993690852, "grad_norm": 1.4336266646144273, "learning_rate": 1.9724097093651356e-05, "loss": 0.5054, "step": 1539 }, { "epoch": 0.30362776025236593, "grad_norm": 0.6266235358734742, "learning_rate": 1.9723735387446526e-05, "loss": 0.4195, "step": 1540 }, { "epoch": 0.3038249211356467, "grad_norm": 0.664253082557445, "learning_rate": 1.9723373447621125e-05, "loss": 0.4466, "step": 1541 }, { "epoch": 0.30402208201892744, "grad_norm": 0.6631894672531906, "learning_rate": 1.9723011274183844e-05, "loss": 0.4613, "step": 1542 }, { "epoch": 0.3042192429022082, "grad_norm": 0.9552950397058307, "learning_rate": 1.9722648867143384e-05, "loss": 0.4328, "step": 1543 }, { "epoch": 0.30441640378548895, "grad_norm": 0.6534330380880412, "learning_rate": 1.972228622650846e-05, "loss": 0.472, "step": 1544 }, { "epoch": 0.3046135646687697, "grad_norm": 0.6792854850313309, "learning_rate": 1.972192335228778e-05, "loss": 0.4639, "step": 1545 }, { "epoch": 0.30481072555205047, "grad_norm": 0.8246065548514672, "learning_rate": 1.972156024449006e-05, "loss": 0.4678, "step": 1546 }, { "epoch": 0.3050078864353312, "grad_norm": 0.6805673028086667, "learning_rate": 1.972119690312403e-05, "loss": 0.4392, "step": 1547 }, { "epoch": 0.305205047318612, "grad_norm": 1.1098363616208942, "learning_rate": 1.9720833328198416e-05, "loss": 0.457, "step": 1548 }, { "epoch": 0.30540220820189273, "grad_norm": 1.9882754170201284, "learning_rate": 1.972046951972195e-05, "loss": 0.4782, "step": 1549 }, { "epoch": 0.3055993690851735, "grad_norm": 0.6655712983680275, "learning_rate": 1.972010547770338e-05, "loss": 0.4654, "step": 1550 }, { "epoch": 0.30579652996845424, "grad_norm": 0.6357244449306213, "learning_rate": 1.9719741202151442e-05, "loss": 0.4337, "step": 1551 }, { "epoch": 0.305993690851735, "grad_norm": 0.6432848249907509, "learning_rate": 1.9719376693074898e-05, "loss": 0.4656, "step": 1552 }, { "epoch": 0.30619085173501576, "grad_norm": 0.6490834027479839, "learning_rate": 1.97190119504825e-05, "loss": 0.4442, "step": 1553 }, { "epoch": 0.3063880126182965, "grad_norm": 0.9593162347229904, "learning_rate": 1.9718646974383016e-05, "loss": 0.417, "step": 1554 }, { "epoch": 0.30658517350157727, "grad_norm": 0.8582061855411773, "learning_rate": 1.9718281764785213e-05, "loss": 0.4533, "step": 1555 }, { "epoch": 0.306782334384858, "grad_norm": 0.6829232907025286, "learning_rate": 1.9717916321697862e-05, "loss": 0.4531, "step": 1556 }, { "epoch": 0.3069794952681388, "grad_norm": 0.6946751006171858, "learning_rate": 1.9717550645129745e-05, "loss": 0.4332, "step": 1557 }, { "epoch": 0.30717665615141954, "grad_norm": 0.7343732521374964, "learning_rate": 1.971718473508965e-05, "loss": 0.4563, "step": 1558 }, { "epoch": 0.3073738170347003, "grad_norm": 0.7092985413488407, "learning_rate": 1.9716818591586367e-05, "loss": 0.4427, "step": 1559 }, { "epoch": 0.30757097791798105, "grad_norm": 0.6920199000103088, "learning_rate": 1.9716452214628688e-05, "loss": 0.4867, "step": 1560 }, { "epoch": 0.3077681388012618, "grad_norm": 0.7518768098411801, "learning_rate": 1.9716085604225425e-05, "loss": 0.4288, "step": 1561 }, { "epoch": 0.30796529968454256, "grad_norm": 0.9921646912510204, "learning_rate": 1.9715718760385377e-05, "loss": 0.458, "step": 1562 }, { "epoch": 0.30816246056782337, "grad_norm": 0.723626350748604, "learning_rate": 1.9715351683117364e-05, "loss": 0.4865, "step": 1563 }, { "epoch": 0.3083596214511041, "grad_norm": 0.6699114054096069, "learning_rate": 1.9714984372430205e-05, "loss": 0.4719, "step": 1564 }, { "epoch": 0.3085567823343849, "grad_norm": 0.7208356036921162, "learning_rate": 1.971461682833272e-05, "loss": 0.4499, "step": 1565 }, { "epoch": 0.30875394321766564, "grad_norm": 0.696266504957194, "learning_rate": 1.9714249050833743e-05, "loss": 0.4685, "step": 1566 }, { "epoch": 0.3089511041009464, "grad_norm": 0.7045591753323159, "learning_rate": 1.971388103994211e-05, "loss": 0.4693, "step": 1567 }, { "epoch": 0.30914826498422715, "grad_norm": 0.9972061656036422, "learning_rate": 1.9713512795666663e-05, "loss": 0.4655, "step": 1568 }, { "epoch": 0.3093454258675079, "grad_norm": 0.6343439339945319, "learning_rate": 1.971314431801625e-05, "loss": 0.4607, "step": 1569 }, { "epoch": 0.30954258675078866, "grad_norm": 1.3214688571967796, "learning_rate": 1.9712775606999718e-05, "loss": 0.5253, "step": 1570 }, { "epoch": 0.3097397476340694, "grad_norm": 1.1643733839819645, "learning_rate": 1.9712406662625934e-05, "loss": 0.4845, "step": 1571 }, { "epoch": 0.3099369085173502, "grad_norm": 0.6395778049163567, "learning_rate": 1.9712037484903758e-05, "loss": 0.4646, "step": 1572 }, { "epoch": 0.31013406940063093, "grad_norm": 0.728804625037493, "learning_rate": 1.971166807384206e-05, "loss": 0.4829, "step": 1573 }, { "epoch": 0.3103312302839117, "grad_norm": 0.6692642203500485, "learning_rate": 1.9711298429449716e-05, "loss": 0.4628, "step": 1574 }, { "epoch": 0.31052839116719244, "grad_norm": 0.6801411829078509, "learning_rate": 1.9710928551735606e-05, "loss": 0.4806, "step": 1575 }, { "epoch": 0.3107255520504732, "grad_norm": 0.9147951787329351, "learning_rate": 1.971055844070862e-05, "loss": 0.4823, "step": 1576 }, { "epoch": 0.31092271293375395, "grad_norm": 0.6704142482301383, "learning_rate": 1.9710188096377645e-05, "loss": 0.493, "step": 1577 }, { "epoch": 0.3111198738170347, "grad_norm": 0.7934181252723825, "learning_rate": 1.9709817518751585e-05, "loss": 0.4711, "step": 1578 }, { "epoch": 0.31131703470031546, "grad_norm": 0.9638101618156172, "learning_rate": 1.9709446707839336e-05, "loss": 0.4734, "step": 1579 }, { "epoch": 0.3115141955835962, "grad_norm": 0.8058930264480674, "learning_rate": 1.9709075663649812e-05, "loss": 0.4672, "step": 1580 }, { "epoch": 0.311711356466877, "grad_norm": 0.648347607286343, "learning_rate": 1.9708704386191924e-05, "loss": 0.4387, "step": 1581 }, { "epoch": 0.31190851735015773, "grad_norm": 0.677310939518794, "learning_rate": 1.97083328754746e-05, "loss": 0.4441, "step": 1582 }, { "epoch": 0.3121056782334385, "grad_norm": 1.102580749565265, "learning_rate": 1.9707961131506756e-05, "loss": 0.4659, "step": 1583 }, { "epoch": 0.31230283911671924, "grad_norm": 0.7198499814401207, "learning_rate": 1.9707589154297328e-05, "loss": 0.4805, "step": 1584 }, { "epoch": 0.3125, "grad_norm": 0.6444043482783797, "learning_rate": 1.9707216943855258e-05, "loss": 0.4437, "step": 1585 }, { "epoch": 0.31269716088328076, "grad_norm": 0.8244715296197911, "learning_rate": 1.970684450018948e-05, "loss": 0.4363, "step": 1586 }, { "epoch": 0.3128943217665615, "grad_norm": 0.6445385325086617, "learning_rate": 1.9706471823308946e-05, "loss": 0.4599, "step": 1587 }, { "epoch": 0.31309148264984227, "grad_norm": 0.699849167744654, "learning_rate": 1.9706098913222608e-05, "loss": 0.5236, "step": 1588 }, { "epoch": 0.313288643533123, "grad_norm": 0.7229780587068287, "learning_rate": 1.970572576993943e-05, "loss": 0.412, "step": 1589 }, { "epoch": 0.3134858044164038, "grad_norm": 0.6479915833004943, "learning_rate": 1.9705352393468374e-05, "loss": 0.4697, "step": 1590 }, { "epoch": 0.31368296529968454, "grad_norm": 0.650021857370611, "learning_rate": 1.9704978783818413e-05, "loss": 0.4326, "step": 1591 }, { "epoch": 0.3138801261829653, "grad_norm": 0.6861324174548465, "learning_rate": 1.970460494099852e-05, "loss": 0.4125, "step": 1592 }, { "epoch": 0.31407728706624605, "grad_norm": 0.6801449769884733, "learning_rate": 1.9704230865017675e-05, "loss": 0.5069, "step": 1593 }, { "epoch": 0.3142744479495268, "grad_norm": 0.7205207674630276, "learning_rate": 1.970385655588487e-05, "loss": 0.4757, "step": 1594 }, { "epoch": 0.31447160883280756, "grad_norm": 0.6629052999763434, "learning_rate": 1.9703482013609098e-05, "loss": 0.4351, "step": 1595 }, { "epoch": 0.3146687697160883, "grad_norm": 0.7088009223433047, "learning_rate": 1.9703107238199356e-05, "loss": 0.4652, "step": 1596 }, { "epoch": 0.31486593059936907, "grad_norm": 0.9099735140613966, "learning_rate": 1.9702732229664653e-05, "loss": 0.4856, "step": 1597 }, { "epoch": 0.3150630914826498, "grad_norm": 0.744041311387149, "learning_rate": 1.9702356988013988e-05, "loss": 0.4675, "step": 1598 }, { "epoch": 0.3152602523659306, "grad_norm": 0.6721850270536037, "learning_rate": 1.970198151325639e-05, "loss": 0.4876, "step": 1599 }, { "epoch": 0.31545741324921134, "grad_norm": 0.7183831817138312, "learning_rate": 1.9701605805400866e-05, "loss": 0.4715, "step": 1600 }, { "epoch": 0.3156545741324921, "grad_norm": 0.6393423636617389, "learning_rate": 1.9701229864456452e-05, "loss": 0.4591, "step": 1601 }, { "epoch": 0.31585173501577285, "grad_norm": 0.6363184919995188, "learning_rate": 1.970085369043218e-05, "loss": 0.4134, "step": 1602 }, { "epoch": 0.3160488958990536, "grad_norm": 0.703734451223413, "learning_rate": 1.9700477283337084e-05, "loss": 0.4589, "step": 1603 }, { "epoch": 0.31624605678233436, "grad_norm": 0.6584994125713037, "learning_rate": 1.9700100643180213e-05, "loss": 0.4482, "step": 1604 }, { "epoch": 0.3164432176656151, "grad_norm": 0.6078730627425877, "learning_rate": 1.9699723769970608e-05, "loss": 0.4399, "step": 1605 }, { "epoch": 0.3166403785488959, "grad_norm": 0.6961873263732445, "learning_rate": 1.969934666371733e-05, "loss": 0.4775, "step": 1606 }, { "epoch": 0.31683753943217663, "grad_norm": 0.724397941037564, "learning_rate": 1.969896932442944e-05, "loss": 0.4981, "step": 1607 }, { "epoch": 0.31703470031545744, "grad_norm": 0.7312028182555175, "learning_rate": 1.9698591752115997e-05, "loss": 0.4688, "step": 1608 }, { "epoch": 0.3172318611987382, "grad_norm": 0.6505489803123736, "learning_rate": 1.969821394678608e-05, "loss": 0.4298, "step": 1609 }, { "epoch": 0.31742902208201895, "grad_norm": 0.6779628156309334, "learning_rate": 1.969783590844876e-05, "loss": 0.4806, "step": 1610 }, { "epoch": 0.3176261829652997, "grad_norm": 0.7180529086461475, "learning_rate": 1.9697457637113126e-05, "loss": 0.5021, "step": 1611 }, { "epoch": 0.31782334384858046, "grad_norm": 0.6009368475283028, "learning_rate": 1.969707913278826e-05, "loss": 0.4411, "step": 1612 }, { "epoch": 0.3180205047318612, "grad_norm": 0.6337105416643031, "learning_rate": 1.969670039548326e-05, "loss": 0.4554, "step": 1613 }, { "epoch": 0.318217665615142, "grad_norm": 0.7204205483322568, "learning_rate": 1.9696321425207227e-05, "loss": 0.4912, "step": 1614 }, { "epoch": 0.31841482649842273, "grad_norm": 0.6061996997155742, "learning_rate": 1.969594222196926e-05, "loss": 0.4442, "step": 1615 }, { "epoch": 0.3186119873817035, "grad_norm": 0.6433833110520849, "learning_rate": 1.9695562785778473e-05, "loss": 0.4289, "step": 1616 }, { "epoch": 0.31880914826498424, "grad_norm": 0.5906452185455755, "learning_rate": 1.9695183116643983e-05, "loss": 0.408, "step": 1617 }, { "epoch": 0.319006309148265, "grad_norm": 0.6795767414105931, "learning_rate": 1.9694803214574914e-05, "loss": 0.4325, "step": 1618 }, { "epoch": 0.31920347003154576, "grad_norm": 0.6337536961202738, "learning_rate": 1.9694423079580387e-05, "loss": 0.4426, "step": 1619 }, { "epoch": 0.3194006309148265, "grad_norm": 0.6793514535697415, "learning_rate": 1.969404271166954e-05, "loss": 0.4756, "step": 1620 }, { "epoch": 0.31959779179810727, "grad_norm": 0.5997185524987442, "learning_rate": 1.9693662110851507e-05, "loss": 0.4417, "step": 1621 }, { "epoch": 0.319794952681388, "grad_norm": 0.6223038049361463, "learning_rate": 1.969328127713544e-05, "loss": 0.4317, "step": 1622 }, { "epoch": 0.3199921135646688, "grad_norm": 0.6472194848635398, "learning_rate": 1.9692900210530482e-05, "loss": 0.4702, "step": 1623 }, { "epoch": 0.32018927444794953, "grad_norm": 0.5962256875505306, "learning_rate": 1.9692518911045793e-05, "loss": 0.4499, "step": 1624 }, { "epoch": 0.3203864353312303, "grad_norm": 0.6998497786192203, "learning_rate": 1.969213737869053e-05, "loss": 0.4395, "step": 1625 }, { "epoch": 0.32058359621451105, "grad_norm": 0.6753566663287628, "learning_rate": 1.969175561347386e-05, "loss": 0.4741, "step": 1626 }, { "epoch": 0.3207807570977918, "grad_norm": 0.6364586995100593, "learning_rate": 1.969137361540496e-05, "loss": 0.425, "step": 1627 }, { "epoch": 0.32097791798107256, "grad_norm": 0.6316173890013236, "learning_rate": 1.9690991384493002e-05, "loss": 0.426, "step": 1628 }, { "epoch": 0.3211750788643533, "grad_norm": 0.6592976432535022, "learning_rate": 1.969060892074717e-05, "loss": 0.4435, "step": 1629 }, { "epoch": 0.32137223974763407, "grad_norm": 0.6057087728738385, "learning_rate": 1.969022622417666e-05, "loss": 0.4434, "step": 1630 }, { "epoch": 0.3215694006309148, "grad_norm": 0.6341926570691082, "learning_rate": 1.968984329479066e-05, "loss": 0.4479, "step": 1631 }, { "epoch": 0.3217665615141956, "grad_norm": 0.6729916605171744, "learning_rate": 1.9689460132598372e-05, "loss": 0.4832, "step": 1632 }, { "epoch": 0.32196372239747634, "grad_norm": 0.8010790117508052, "learning_rate": 1.9689076737608998e-05, "loss": 0.4815, "step": 1633 }, { "epoch": 0.3221608832807571, "grad_norm": 0.6342908894048237, "learning_rate": 1.9688693109831755e-05, "loss": 0.4412, "step": 1634 }, { "epoch": 0.32235804416403785, "grad_norm": 0.6574543494629974, "learning_rate": 1.9688309249275857e-05, "loss": 0.4337, "step": 1635 }, { "epoch": 0.3225552050473186, "grad_norm": 0.6700412322589491, "learning_rate": 1.9687925155950526e-05, "loss": 0.4612, "step": 1636 }, { "epoch": 0.32275236593059936, "grad_norm": 0.6880798043825762, "learning_rate": 1.9687540829864996e-05, "loss": 0.4633, "step": 1637 }, { "epoch": 0.3229495268138801, "grad_norm": 0.5858751037584682, "learning_rate": 1.9687156271028493e-05, "loss": 0.4147, "step": 1638 }, { "epoch": 0.3231466876971609, "grad_norm": 1.1648261296219227, "learning_rate": 1.968677147945026e-05, "loss": 0.4663, "step": 1639 }, { "epoch": 0.32334384858044163, "grad_norm": 0.6393378112398456, "learning_rate": 1.9686386455139544e-05, "loss": 0.4392, "step": 1640 }, { "epoch": 0.3235410094637224, "grad_norm": 0.6736562294968578, "learning_rate": 1.9686001198105587e-05, "loss": 0.459, "step": 1641 }, { "epoch": 0.32373817034700314, "grad_norm": 0.6291265298426209, "learning_rate": 1.9685615708357656e-05, "loss": 0.455, "step": 1642 }, { "epoch": 0.3239353312302839, "grad_norm": 0.6956696368636405, "learning_rate": 1.9685229985905007e-05, "loss": 0.4735, "step": 1643 }, { "epoch": 0.32413249211356465, "grad_norm": 0.7248600768104795, "learning_rate": 1.9684844030756907e-05, "loss": 0.4599, "step": 1644 }, { "epoch": 0.3243296529968454, "grad_norm": 0.8868152265197408, "learning_rate": 1.9684457842922632e-05, "loss": 0.4353, "step": 1645 }, { "epoch": 0.32452681388012616, "grad_norm": 0.7520432959600722, "learning_rate": 1.9684071422411456e-05, "loss": 0.4873, "step": 1646 }, { "epoch": 0.3247239747634069, "grad_norm": 0.7377406807353308, "learning_rate": 1.968368476923267e-05, "loss": 0.461, "step": 1647 }, { "epoch": 0.3249211356466877, "grad_norm": 0.708141024519448, "learning_rate": 1.968329788339555e-05, "loss": 0.467, "step": 1648 }, { "epoch": 0.32511829652996843, "grad_norm": 0.6848367537784837, "learning_rate": 1.9682910764909405e-05, "loss": 0.496, "step": 1649 }, { "epoch": 0.3253154574132492, "grad_norm": 0.6665467412273421, "learning_rate": 1.9682523413783533e-05, "loss": 0.4732, "step": 1650 }, { "epoch": 0.32551261829652994, "grad_norm": 0.6953139108715455, "learning_rate": 1.968213583002724e-05, "loss": 0.4518, "step": 1651 }, { "epoch": 0.3257097791798107, "grad_norm": 0.6427842151833715, "learning_rate": 1.9681748013649834e-05, "loss": 0.4153, "step": 1652 }, { "epoch": 0.3259069400630915, "grad_norm": 1.183472131884011, "learning_rate": 1.968135996466064e-05, "loss": 0.4764, "step": 1653 }, { "epoch": 0.32610410094637227, "grad_norm": 0.617392531814384, "learning_rate": 1.968097168306897e-05, "loss": 0.4081, "step": 1654 }, { "epoch": 0.326301261829653, "grad_norm": 0.6869167388782502, "learning_rate": 1.9680583168884163e-05, "loss": 0.4505, "step": 1655 }, { "epoch": 0.3264984227129338, "grad_norm": 0.8700641935343537, "learning_rate": 1.9680194422115548e-05, "loss": 0.5009, "step": 1656 }, { "epoch": 0.32669558359621453, "grad_norm": 0.696926074465654, "learning_rate": 1.9679805442772464e-05, "loss": 0.4871, "step": 1657 }, { "epoch": 0.3268927444794953, "grad_norm": 0.7205797312738156, "learning_rate": 1.9679416230864265e-05, "loss": 0.487, "step": 1658 }, { "epoch": 0.32708990536277605, "grad_norm": 0.7034418967452446, "learning_rate": 1.967902678640029e-05, "loss": 0.4442, "step": 1659 }, { "epoch": 0.3272870662460568, "grad_norm": 0.6205957767275062, "learning_rate": 1.9678637109389903e-05, "loss": 0.4606, "step": 1660 }, { "epoch": 0.32748422712933756, "grad_norm": 0.6767913682774107, "learning_rate": 1.967824719984247e-05, "loss": 0.4449, "step": 1661 }, { "epoch": 0.3276813880126183, "grad_norm": 0.7628685405323068, "learning_rate": 1.967785705776735e-05, "loss": 0.4912, "step": 1662 }, { "epoch": 0.32787854889589907, "grad_norm": 0.6449573923061704, "learning_rate": 1.9677466683173922e-05, "loss": 0.4424, "step": 1663 }, { "epoch": 0.3280757097791798, "grad_norm": 0.6596295374884681, "learning_rate": 1.9677076076071568e-05, "loss": 0.4491, "step": 1664 }, { "epoch": 0.3282728706624606, "grad_norm": 0.6027707481490366, "learning_rate": 1.967668523646966e-05, "loss": 0.4272, "step": 1665 }, { "epoch": 0.32847003154574134, "grad_norm": 0.6919043992802008, "learning_rate": 1.9676294164377603e-05, "loss": 0.4597, "step": 1666 }, { "epoch": 0.3286671924290221, "grad_norm": 0.6975499115349002, "learning_rate": 1.9675902859804786e-05, "loss": 0.4978, "step": 1667 }, { "epoch": 0.32886435331230285, "grad_norm": 0.6526383582936955, "learning_rate": 1.967551132276061e-05, "loss": 0.4506, "step": 1668 }, { "epoch": 0.3290615141955836, "grad_norm": 0.6951437907766501, "learning_rate": 1.9675119553254477e-05, "loss": 0.4678, "step": 1669 }, { "epoch": 0.32925867507886436, "grad_norm": 0.6405156580206511, "learning_rate": 1.9674727551295812e-05, "loss": 0.4657, "step": 1670 }, { "epoch": 0.3294558359621451, "grad_norm": 0.6686701711260229, "learning_rate": 1.9674335316894024e-05, "loss": 0.4424, "step": 1671 }, { "epoch": 0.32965299684542587, "grad_norm": 0.6364715264193335, "learning_rate": 1.9673942850058542e-05, "loss": 0.4423, "step": 1672 }, { "epoch": 0.32985015772870663, "grad_norm": 0.6132748220879023, "learning_rate": 1.9673550150798787e-05, "loss": 0.4247, "step": 1673 }, { "epoch": 0.3300473186119874, "grad_norm": 0.6645141377088348, "learning_rate": 1.9673157219124207e-05, "loss": 0.466, "step": 1674 }, { "epoch": 0.33024447949526814, "grad_norm": 0.6213535676703986, "learning_rate": 1.967276405504423e-05, "loss": 0.4326, "step": 1675 }, { "epoch": 0.3304416403785489, "grad_norm": 1.0248391227718814, "learning_rate": 1.9672370658568306e-05, "loss": 0.5144, "step": 1676 }, { "epoch": 0.33063880126182965, "grad_norm": 0.6705592825340083, "learning_rate": 1.967197702970589e-05, "loss": 0.4504, "step": 1677 }, { "epoch": 0.3308359621451104, "grad_norm": 0.6159547925544582, "learning_rate": 1.967158316846644e-05, "loss": 0.4262, "step": 1678 }, { "epoch": 0.33103312302839116, "grad_norm": 0.6625745295364192, "learning_rate": 1.9671189074859412e-05, "loss": 0.463, "step": 1679 }, { "epoch": 0.3312302839116719, "grad_norm": 0.6192642001745661, "learning_rate": 1.967079474889428e-05, "loss": 0.4283, "step": 1680 }, { "epoch": 0.3314274447949527, "grad_norm": 0.659809255678557, "learning_rate": 1.9670400190580516e-05, "loss": 0.4443, "step": 1681 }, { "epoch": 0.33162460567823343, "grad_norm": 0.60127325810791, "learning_rate": 1.9670005399927602e-05, "loss": 0.4579, "step": 1682 }, { "epoch": 0.3318217665615142, "grad_norm": 0.7413899493606506, "learning_rate": 1.9669610376945013e-05, "loss": 0.487, "step": 1683 }, { "epoch": 0.33201892744479494, "grad_norm": 0.6582626993097654, "learning_rate": 1.9669215121642255e-05, "loss": 0.4734, "step": 1684 }, { "epoch": 0.3322160883280757, "grad_norm": 0.7151538898984898, "learning_rate": 1.9668819634028816e-05, "loss": 0.4446, "step": 1685 }, { "epoch": 0.33241324921135645, "grad_norm": 0.6913048824848471, "learning_rate": 1.96684239141142e-05, "loss": 0.4253, "step": 1686 }, { "epoch": 0.3326104100946372, "grad_norm": 0.6414540359989238, "learning_rate": 1.966802796190791e-05, "loss": 0.4317, "step": 1687 }, { "epoch": 0.33280757097791797, "grad_norm": 0.9207986171363428, "learning_rate": 1.9667631777419466e-05, "loss": 0.4627, "step": 1688 }, { "epoch": 0.3330047318611987, "grad_norm": 0.616029959208574, "learning_rate": 1.966723536065838e-05, "loss": 0.4655, "step": 1689 }, { "epoch": 0.3332018927444795, "grad_norm": 0.6883300541317802, "learning_rate": 1.9666838711634182e-05, "loss": 0.4618, "step": 1690 }, { "epoch": 0.33339905362776023, "grad_norm": 0.650342941791099, "learning_rate": 1.9666441830356397e-05, "loss": 0.444, "step": 1691 }, { "epoch": 0.333596214511041, "grad_norm": 0.6491057879424494, "learning_rate": 1.9666044716834566e-05, "loss": 0.4624, "step": 1692 }, { "epoch": 0.33379337539432175, "grad_norm": 0.6311071726696637, "learning_rate": 1.9665647371078225e-05, "loss": 0.4329, "step": 1693 }, { "epoch": 0.3339905362776025, "grad_norm": 0.7246225471538845, "learning_rate": 1.966524979309692e-05, "loss": 0.5341, "step": 1694 }, { "epoch": 0.33418769716088326, "grad_norm": 0.6987766778208062, "learning_rate": 1.966485198290021e-05, "loss": 0.48, "step": 1695 }, { "epoch": 0.334384858044164, "grad_norm": 0.6138348348118343, "learning_rate": 1.9664453940497642e-05, "loss": 0.4447, "step": 1696 }, { "epoch": 0.33458201892744477, "grad_norm": 0.6567138637243869, "learning_rate": 1.966405566589879e-05, "loss": 0.4991, "step": 1697 }, { "epoch": 0.3347791798107255, "grad_norm": 0.6352662846456245, "learning_rate": 1.9663657159113217e-05, "loss": 0.4652, "step": 1698 }, { "epoch": 0.33497634069400634, "grad_norm": 0.8883699236623674, "learning_rate": 1.96632584201505e-05, "loss": 0.4797, "step": 1699 }, { "epoch": 0.3351735015772871, "grad_norm": 0.6256688722873787, "learning_rate": 1.9662859449020214e-05, "loss": 0.4771, "step": 1700 }, { "epoch": 0.33537066246056785, "grad_norm": 0.6792106596589087, "learning_rate": 1.966246024573195e-05, "loss": 0.4561, "step": 1701 }, { "epoch": 0.3355678233438486, "grad_norm": 0.5975526642006157, "learning_rate": 1.96620608102953e-05, "loss": 0.4372, "step": 1702 }, { "epoch": 0.33576498422712936, "grad_norm": 0.6481968588062909, "learning_rate": 1.9661661142719856e-05, "loss": 0.4679, "step": 1703 }, { "epoch": 0.3359621451104101, "grad_norm": 0.625117310912806, "learning_rate": 1.966126124301522e-05, "loss": 0.4722, "step": 1704 }, { "epoch": 0.33615930599369087, "grad_norm": 0.6779932286491652, "learning_rate": 1.9660861111191004e-05, "loss": 0.4548, "step": 1705 }, { "epoch": 0.3363564668769716, "grad_norm": 0.5897827532389534, "learning_rate": 1.9660460747256823e-05, "loss": 0.4221, "step": 1706 }, { "epoch": 0.3365536277602524, "grad_norm": 0.600745494776992, "learning_rate": 1.9660060151222292e-05, "loss": 0.4359, "step": 1707 }, { "epoch": 0.33675078864353314, "grad_norm": 0.6263245381562469, "learning_rate": 1.9659659323097037e-05, "loss": 0.4537, "step": 1708 }, { "epoch": 0.3369479495268139, "grad_norm": 0.6059447411600158, "learning_rate": 1.9659258262890683e-05, "loss": 0.4557, "step": 1709 }, { "epoch": 0.33714511041009465, "grad_norm": 0.6276333332545005, "learning_rate": 1.9658856970612878e-05, "loss": 0.4539, "step": 1710 }, { "epoch": 0.3373422712933754, "grad_norm": 0.61826217784891, "learning_rate": 1.965845544627325e-05, "loss": 0.441, "step": 1711 }, { "epoch": 0.33753943217665616, "grad_norm": 0.6053223097223316, "learning_rate": 1.9658053689881453e-05, "loss": 0.4813, "step": 1712 }, { "epoch": 0.3377365930599369, "grad_norm": 0.5730530402368444, "learning_rate": 1.965765170144714e-05, "loss": 0.4452, "step": 1713 }, { "epoch": 0.3379337539432177, "grad_norm": 4.5489083531739265, "learning_rate": 1.9657249480979968e-05, "loss": 0.4672, "step": 1714 }, { "epoch": 0.33813091482649843, "grad_norm": 0.6468829198177741, "learning_rate": 1.9656847028489597e-05, "loss": 0.447, "step": 1715 }, { "epoch": 0.3383280757097792, "grad_norm": 0.6197378264065176, "learning_rate": 1.9656444343985705e-05, "loss": 0.4681, "step": 1716 }, { "epoch": 0.33852523659305994, "grad_norm": 0.7104123212898853, "learning_rate": 1.9656041427477957e-05, "loss": 0.4457, "step": 1717 }, { "epoch": 0.3387223974763407, "grad_norm": 0.7070319729266388, "learning_rate": 1.965563827897604e-05, "loss": 0.4517, "step": 1718 }, { "epoch": 0.33891955835962145, "grad_norm": 0.6119590288529166, "learning_rate": 1.9655234898489634e-05, "loss": 0.4346, "step": 1719 }, { "epoch": 0.3391167192429022, "grad_norm": 0.6245842477692922, "learning_rate": 1.965483128602844e-05, "loss": 0.4604, "step": 1720 }, { "epoch": 0.33931388012618297, "grad_norm": 1.2443841217690506, "learning_rate": 1.9654427441602145e-05, "loss": 0.479, "step": 1721 }, { "epoch": 0.3395110410094637, "grad_norm": 0.6732229801004805, "learning_rate": 1.9654023365220456e-05, "loss": 0.4513, "step": 1722 }, { "epoch": 0.3397082018927445, "grad_norm": 0.6568708942903064, "learning_rate": 1.9653619056893082e-05, "loss": 0.4406, "step": 1723 }, { "epoch": 0.33990536277602523, "grad_norm": 0.663524577710516, "learning_rate": 1.9653214516629737e-05, "loss": 0.4763, "step": 1724 }, { "epoch": 0.340102523659306, "grad_norm": 0.6675810755775939, "learning_rate": 1.965280974444014e-05, "loss": 0.4489, "step": 1725 }, { "epoch": 0.34029968454258674, "grad_norm": 0.6563303589975222, "learning_rate": 1.9652404740334015e-05, "loss": 0.4309, "step": 1726 }, { "epoch": 0.3404968454258675, "grad_norm": 0.7598373649184584, "learning_rate": 1.9651999504321094e-05, "loss": 0.4272, "step": 1727 }, { "epoch": 0.34069400630914826, "grad_norm": 0.651084326470726, "learning_rate": 1.9651594036411107e-05, "loss": 0.4475, "step": 1728 }, { "epoch": 0.340891167192429, "grad_norm": 0.6799855930682017, "learning_rate": 1.9651188336613807e-05, "loss": 0.4524, "step": 1729 }, { "epoch": 0.34108832807570977, "grad_norm": 0.8147708964689288, "learning_rate": 1.9650782404938933e-05, "loss": 0.4748, "step": 1730 }, { "epoch": 0.3412854889589905, "grad_norm": 1.9599798652041918, "learning_rate": 1.965037624139624e-05, "loss": 0.4634, "step": 1731 }, { "epoch": 0.3414826498422713, "grad_norm": 0.9750946996661596, "learning_rate": 1.9649969845995486e-05, "loss": 0.4522, "step": 1732 }, { "epoch": 0.34167981072555204, "grad_norm": 0.7830583556096765, "learning_rate": 1.9649563218746436e-05, "loss": 0.4837, "step": 1733 }, { "epoch": 0.3418769716088328, "grad_norm": 1.8522800709343359, "learning_rate": 1.9649156359658857e-05, "loss": 0.4907, "step": 1734 }, { "epoch": 0.34207413249211355, "grad_norm": 0.7087784601927121, "learning_rate": 1.964874926874253e-05, "loss": 0.4537, "step": 1735 }, { "epoch": 0.3422712933753943, "grad_norm": 0.9263867760746262, "learning_rate": 1.9648341946007228e-05, "loss": 0.4481, "step": 1736 }, { "epoch": 0.34246845425867506, "grad_norm": 0.7131592632953431, "learning_rate": 1.9647934391462743e-05, "loss": 0.4859, "step": 1737 }, { "epoch": 0.3426656151419558, "grad_norm": 0.6812545398255945, "learning_rate": 1.9647526605118863e-05, "loss": 0.4419, "step": 1738 }, { "epoch": 0.34286277602523657, "grad_norm": 0.8324328718435976, "learning_rate": 1.964711858698539e-05, "loss": 0.4495, "step": 1739 }, { "epoch": 0.3430599369085173, "grad_norm": 0.7569149729504874, "learning_rate": 1.964671033707212e-05, "loss": 0.4779, "step": 1740 }, { "epoch": 0.3432570977917981, "grad_norm": 0.7011727969185421, "learning_rate": 1.9646301855388868e-05, "loss": 0.486, "step": 1741 }, { "epoch": 0.34345425867507884, "grad_norm": 0.6487547062067612, "learning_rate": 1.9645893141945444e-05, "loss": 0.4265, "step": 1742 }, { "epoch": 0.3436514195583596, "grad_norm": 0.7657920846629185, "learning_rate": 1.9645484196751676e-05, "loss": 0.455, "step": 1743 }, { "epoch": 0.3438485804416404, "grad_norm": 0.696631675500191, "learning_rate": 1.9645075019817374e-05, "loss": 0.4743, "step": 1744 }, { "epoch": 0.34404574132492116, "grad_norm": 0.6735160176131795, "learning_rate": 1.9644665611152384e-05, "loss": 0.458, "step": 1745 }, { "epoch": 0.3442429022082019, "grad_norm": 0.7721263917052835, "learning_rate": 1.964425597076653e-05, "loss": 0.4765, "step": 1746 }, { "epoch": 0.3444400630914827, "grad_norm": 0.7497548232948051, "learning_rate": 1.9643846098669664e-05, "loss": 0.4772, "step": 1747 }, { "epoch": 0.34463722397476343, "grad_norm": 0.6847650811355418, "learning_rate": 1.9643435994871626e-05, "loss": 0.4417, "step": 1748 }, { "epoch": 0.3448343848580442, "grad_norm": 0.6433657002066264, "learning_rate": 1.9643025659382274e-05, "loss": 0.415, "step": 1749 }, { "epoch": 0.34503154574132494, "grad_norm": 0.6855538495003168, "learning_rate": 1.9642615092211468e-05, "loss": 0.4665, "step": 1750 }, { "epoch": 0.3452287066246057, "grad_norm": 0.6890543003455781, "learning_rate": 1.9642204293369066e-05, "loss": 0.4571, "step": 1751 }, { "epoch": 0.34542586750788645, "grad_norm": 0.8006291058144541, "learning_rate": 1.9641793262864942e-05, "loss": 0.4617, "step": 1752 }, { "epoch": 0.3456230283911672, "grad_norm": 0.7454470217038145, "learning_rate": 1.9641382000708972e-05, "loss": 0.4697, "step": 1753 }, { "epoch": 0.34582018927444796, "grad_norm": 0.6090095712411363, "learning_rate": 1.9640970506911033e-05, "loss": 0.4375, "step": 1754 }, { "epoch": 0.3460173501577287, "grad_norm": 0.7015624149655699, "learning_rate": 1.9640558781481015e-05, "loss": 0.4219, "step": 1755 }, { "epoch": 0.3462145110410095, "grad_norm": 0.7670637934340552, "learning_rate": 1.9640146824428807e-05, "loss": 0.4899, "step": 1756 }, { "epoch": 0.34641167192429023, "grad_norm": 0.7363863325209535, "learning_rate": 1.963973463576431e-05, "loss": 0.4664, "step": 1757 }, { "epoch": 0.346608832807571, "grad_norm": 0.6131339077035906, "learning_rate": 1.9639322215497423e-05, "loss": 0.4072, "step": 1758 }, { "epoch": 0.34680599369085174, "grad_norm": 0.6707877768295646, "learning_rate": 1.963890956363806e-05, "loss": 0.4662, "step": 1759 }, { "epoch": 0.3470031545741325, "grad_norm": 0.677317205988406, "learning_rate": 1.9638496680196135e-05, "loss": 0.4106, "step": 1760 }, { "epoch": 0.34720031545741326, "grad_norm": 0.6492787706072033, "learning_rate": 1.963808356518156e-05, "loss": 0.4077, "step": 1761 }, { "epoch": 0.347397476340694, "grad_norm": 0.7823981215932518, "learning_rate": 1.9637670218604267e-05, "loss": 0.4573, "step": 1762 }, { "epoch": 0.34759463722397477, "grad_norm": 0.5953310114048408, "learning_rate": 1.9637256640474187e-05, "loss": 0.4539, "step": 1763 }, { "epoch": 0.3477917981072555, "grad_norm": 0.759527295176903, "learning_rate": 1.9636842830801255e-05, "loss": 0.4354, "step": 1764 }, { "epoch": 0.3479889589905363, "grad_norm": 0.8180906973487583, "learning_rate": 1.9636428789595413e-05, "loss": 0.4312, "step": 1765 }, { "epoch": 0.34818611987381703, "grad_norm": 4.382079962402747, "learning_rate": 1.963601451686661e-05, "loss": 0.4406, "step": 1766 }, { "epoch": 0.3483832807570978, "grad_norm": 0.7560610305482834, "learning_rate": 1.9635600012624798e-05, "loss": 0.4313, "step": 1767 }, { "epoch": 0.34858044164037855, "grad_norm": 0.7418288958717549, "learning_rate": 1.9635185276879936e-05, "loss": 0.4793, "step": 1768 }, { "epoch": 0.3487776025236593, "grad_norm": 1.072851573671909, "learning_rate": 1.963477030964199e-05, "loss": 0.485, "step": 1769 }, { "epoch": 0.34897476340694006, "grad_norm": 0.7920038346678471, "learning_rate": 1.963435511092093e-05, "loss": 0.4696, "step": 1770 }, { "epoch": 0.3491719242902208, "grad_norm": 0.7093452794943754, "learning_rate": 1.9633939680726724e-05, "loss": 0.4727, "step": 1771 }, { "epoch": 0.34936908517350157, "grad_norm": 0.8112163451470932, "learning_rate": 1.9633524019069365e-05, "loss": 0.5013, "step": 1772 }, { "epoch": 0.3495662460567823, "grad_norm": 3.917584519790698, "learning_rate": 1.963310812595883e-05, "loss": 0.5098, "step": 1773 }, { "epoch": 0.3497634069400631, "grad_norm": 1.000199323692884, "learning_rate": 1.9632692001405113e-05, "loss": 0.4594, "step": 1774 }, { "epoch": 0.34996056782334384, "grad_norm": 0.8133085348010322, "learning_rate": 1.9632275645418218e-05, "loss": 0.4881, "step": 1775 }, { "epoch": 0.3501577287066246, "grad_norm": 0.9472822270595948, "learning_rate": 1.963185905800814e-05, "loss": 0.4973, "step": 1776 }, { "epoch": 0.35035488958990535, "grad_norm": 0.9453081319239314, "learning_rate": 1.9631442239184894e-05, "loss": 0.4266, "step": 1777 }, { "epoch": 0.3505520504731861, "grad_norm": 0.7602206983614878, "learning_rate": 1.9631025188958492e-05, "loss": 0.4783, "step": 1778 }, { "epoch": 0.35074921135646686, "grad_norm": 0.8604246184240586, "learning_rate": 1.9630607907338953e-05, "loss": 0.4293, "step": 1779 }, { "epoch": 0.3509463722397476, "grad_norm": 0.7616354505596147, "learning_rate": 1.9630190394336304e-05, "loss": 0.5098, "step": 1780 }, { "epoch": 0.3511435331230284, "grad_norm": 0.819713708870105, "learning_rate": 1.9629772649960574e-05, "loss": 0.4656, "step": 1781 }, { "epoch": 0.35134069400630913, "grad_norm": 0.6981497101426539, "learning_rate": 1.9629354674221803e-05, "loss": 0.4555, "step": 1782 }, { "epoch": 0.3515378548895899, "grad_norm": 0.8239076401594878, "learning_rate": 1.962893646713003e-05, "loss": 0.4619, "step": 1783 }, { "epoch": 0.35173501577287064, "grad_norm": 0.6349622015636447, "learning_rate": 1.9628518028695307e-05, "loss": 0.4508, "step": 1784 }, { "epoch": 0.3519321766561514, "grad_norm": 0.7787614675612314, "learning_rate": 1.9628099358927684e-05, "loss": 0.4399, "step": 1785 }, { "epoch": 0.35212933753943215, "grad_norm": 0.6213692400849354, "learning_rate": 1.962768045783722e-05, "loss": 0.4302, "step": 1786 }, { "epoch": 0.3523264984227129, "grad_norm": 0.8089918611577429, "learning_rate": 1.9627261325433976e-05, "loss": 0.4447, "step": 1787 }, { "epoch": 0.35252365930599366, "grad_norm": 0.8637850568343813, "learning_rate": 1.962684196172803e-05, "loss": 0.4683, "step": 1788 }, { "epoch": 0.3527208201892745, "grad_norm": 0.7190171498413642, "learning_rate": 1.9626422366729453e-05, "loss": 0.4591, "step": 1789 }, { "epoch": 0.35291798107255523, "grad_norm": 0.7746352347399187, "learning_rate": 1.9626002540448325e-05, "loss": 0.4545, "step": 1790 }, { "epoch": 0.353115141955836, "grad_norm": 0.7705463288801266, "learning_rate": 1.9625582482894735e-05, "loss": 0.4505, "step": 1791 }, { "epoch": 0.35331230283911674, "grad_norm": 0.6701387506672304, "learning_rate": 1.9625162194078775e-05, "loss": 0.4823, "step": 1792 }, { "epoch": 0.3535094637223975, "grad_norm": 0.7068914753075993, "learning_rate": 1.9624741674010544e-05, "loss": 0.4654, "step": 1793 }, { "epoch": 0.35370662460567825, "grad_norm": 0.6064047478157758, "learning_rate": 1.9624320922700138e-05, "loss": 0.4057, "step": 1794 }, { "epoch": 0.353903785488959, "grad_norm": 0.6898360183497596, "learning_rate": 1.9623899940157675e-05, "loss": 0.4221, "step": 1795 }, { "epoch": 0.35410094637223977, "grad_norm": 0.6629057087304656, "learning_rate": 1.9623478726393266e-05, "loss": 0.5155, "step": 1796 }, { "epoch": 0.3542981072555205, "grad_norm": 0.6794292374990497, "learning_rate": 1.9623057281417028e-05, "loss": 0.4402, "step": 1797 }, { "epoch": 0.3544952681388013, "grad_norm": 0.6122789190306797, "learning_rate": 1.9622635605239095e-05, "loss": 0.4431, "step": 1798 }, { "epoch": 0.35469242902208203, "grad_norm": 0.6459796432517372, "learning_rate": 1.9622213697869587e-05, "loss": 0.4388, "step": 1799 }, { "epoch": 0.3548895899053628, "grad_norm": 0.6623695426508792, "learning_rate": 1.9621791559318648e-05, "loss": 0.4812, "step": 1800 }, { "epoch": 0.35508675078864355, "grad_norm": 0.6324064996658313, "learning_rate": 1.962136918959642e-05, "loss": 0.4675, "step": 1801 }, { "epoch": 0.3552839116719243, "grad_norm": 0.5963028662353506, "learning_rate": 1.9620946588713048e-05, "loss": 0.4412, "step": 1802 }, { "epoch": 0.35548107255520506, "grad_norm": 0.6744345317892008, "learning_rate": 1.9620523756678685e-05, "loss": 0.4177, "step": 1803 }, { "epoch": 0.3556782334384858, "grad_norm": 0.645630963745389, "learning_rate": 1.9620100693503494e-05, "loss": 0.4511, "step": 1804 }, { "epoch": 0.35587539432176657, "grad_norm": 0.6672682741456414, "learning_rate": 1.9619677399197634e-05, "loss": 0.4382, "step": 1805 }, { "epoch": 0.3560725552050473, "grad_norm": 0.7740606862475723, "learning_rate": 1.961925387377128e-05, "loss": 0.4462, "step": 1806 }, { "epoch": 0.3562697160883281, "grad_norm": 0.6497305621170518, "learning_rate": 1.9618830117234603e-05, "loss": 0.4725, "step": 1807 }, { "epoch": 0.35646687697160884, "grad_norm": 0.640815305534194, "learning_rate": 1.9618406129597787e-05, "loss": 0.4651, "step": 1808 }, { "epoch": 0.3566640378548896, "grad_norm": 0.6155816175110397, "learning_rate": 1.961798191087102e-05, "loss": 0.4281, "step": 1809 }, { "epoch": 0.35686119873817035, "grad_norm": 0.633839320877963, "learning_rate": 1.9617557461064495e-05, "loss": 0.4938, "step": 1810 }, { "epoch": 0.3570583596214511, "grad_norm": 0.635922394462134, "learning_rate": 1.96171327801884e-05, "loss": 0.4393, "step": 1811 }, { "epoch": 0.35725552050473186, "grad_norm": 0.6410247655705709, "learning_rate": 1.961670786825295e-05, "loss": 0.4629, "step": 1812 }, { "epoch": 0.3574526813880126, "grad_norm": 1.9279852657649488, "learning_rate": 1.9616282725268347e-05, "loss": 0.4694, "step": 1813 }, { "epoch": 0.35764984227129337, "grad_norm": 0.6710119594137577, "learning_rate": 1.961585735124481e-05, "loss": 0.4594, "step": 1814 }, { "epoch": 0.35784700315457413, "grad_norm": 0.7397595757858347, "learning_rate": 1.9615431746192553e-05, "loss": 0.4534, "step": 1815 }, { "epoch": 0.3580441640378549, "grad_norm": 0.8225787141793098, "learning_rate": 1.9615005910121806e-05, "loss": 0.4832, "step": 1816 }, { "epoch": 0.35824132492113564, "grad_norm": 0.6983577011027828, "learning_rate": 1.96145798430428e-05, "loss": 0.4377, "step": 1817 }, { "epoch": 0.3584384858044164, "grad_norm": 0.7361587716765654, "learning_rate": 1.9614153544965773e-05, "loss": 0.4893, "step": 1818 }, { "epoch": 0.35863564668769715, "grad_norm": 0.6203265764684508, "learning_rate": 1.9613727015900962e-05, "loss": 0.3938, "step": 1819 }, { "epoch": 0.3588328075709779, "grad_norm": 0.6980283936814929, "learning_rate": 1.9613300255858615e-05, "loss": 0.4953, "step": 1820 }, { "epoch": 0.35902996845425866, "grad_norm": 0.7026401510766678, "learning_rate": 1.9612873264848994e-05, "loss": 0.4594, "step": 1821 }, { "epoch": 0.3592271293375394, "grad_norm": 0.6405607709544402, "learning_rate": 1.9612446042882345e-05, "loss": 0.4146, "step": 1822 }, { "epoch": 0.3594242902208202, "grad_norm": 0.8137624725348499, "learning_rate": 1.961201858996894e-05, "loss": 0.4603, "step": 1823 }, { "epoch": 0.35962145110410093, "grad_norm": 0.6238197035171101, "learning_rate": 1.9611590906119055e-05, "loss": 0.4301, "step": 1824 }, { "epoch": 0.3598186119873817, "grad_norm": 0.7765705822707861, "learning_rate": 1.9611162991342952e-05, "loss": 0.4622, "step": 1825 }, { "epoch": 0.36001577287066244, "grad_norm": 0.8129569752866966, "learning_rate": 1.961073484565092e-05, "loss": 0.4634, "step": 1826 }, { "epoch": 0.3602129337539432, "grad_norm": 0.7674729263580945, "learning_rate": 1.9610306469053243e-05, "loss": 0.4797, "step": 1827 }, { "epoch": 0.36041009463722395, "grad_norm": 0.6433782160365153, "learning_rate": 1.9609877861560213e-05, "loss": 0.4427, "step": 1828 }, { "epoch": 0.3606072555205047, "grad_norm": 0.8189751748622848, "learning_rate": 1.9609449023182133e-05, "loss": 0.4925, "step": 1829 }, { "epoch": 0.36080441640378547, "grad_norm": 0.6603921192162837, "learning_rate": 1.9609019953929298e-05, "loss": 0.4328, "step": 1830 }, { "epoch": 0.3610015772870662, "grad_norm": 0.7020104873363077, "learning_rate": 1.960859065381202e-05, "loss": 0.4532, "step": 1831 }, { "epoch": 0.361198738170347, "grad_norm": 0.6514968871084654, "learning_rate": 1.9608161122840614e-05, "loss": 0.4262, "step": 1832 }, { "epoch": 0.36139589905362773, "grad_norm": 1.3363500449543768, "learning_rate": 1.9607731361025402e-05, "loss": 0.4738, "step": 1833 }, { "epoch": 0.3615930599369085, "grad_norm": 0.6353691592363815, "learning_rate": 1.9607301368376706e-05, "loss": 0.4918, "step": 1834 }, { "epoch": 0.3617902208201893, "grad_norm": 0.6784040176469287, "learning_rate": 1.9606871144904855e-05, "loss": 0.4827, "step": 1835 }, { "epoch": 0.36198738170347006, "grad_norm": 0.6264342538936625, "learning_rate": 1.960644069062019e-05, "loss": 0.4172, "step": 1836 }, { "epoch": 0.3621845425867508, "grad_norm": 3.5120658505114277, "learning_rate": 1.9606010005533055e-05, "loss": 0.4948, "step": 1837 }, { "epoch": 0.36238170347003157, "grad_norm": 0.9363587178765608, "learning_rate": 1.960557908965379e-05, "loss": 0.468, "step": 1838 }, { "epoch": 0.3625788643533123, "grad_norm": 0.6948665441777858, "learning_rate": 1.9605147942992752e-05, "loss": 0.4493, "step": 1839 }, { "epoch": 0.3627760252365931, "grad_norm": 0.7506323131912949, "learning_rate": 1.9604716565560303e-05, "loss": 0.4361, "step": 1840 }, { "epoch": 0.36297318611987384, "grad_norm": 0.6646663396446123, "learning_rate": 1.96042849573668e-05, "loss": 0.4565, "step": 1841 }, { "epoch": 0.3631703470031546, "grad_norm": 0.7021081874099678, "learning_rate": 1.9603853118422618e-05, "loss": 0.4791, "step": 1842 }, { "epoch": 0.36336750788643535, "grad_norm": 0.7405667369095288, "learning_rate": 1.960342104873813e-05, "loss": 0.4507, "step": 1843 }, { "epoch": 0.3635646687697161, "grad_norm": 0.8937542644763167, "learning_rate": 1.9602988748323718e-05, "loss": 0.4739, "step": 1844 }, { "epoch": 0.36376182965299686, "grad_norm": 0.647962965489675, "learning_rate": 1.960255621718977e-05, "loss": 0.4505, "step": 1845 }, { "epoch": 0.3639589905362776, "grad_norm": 0.6679476483113629, "learning_rate": 1.9602123455346677e-05, "loss": 0.4594, "step": 1846 }, { "epoch": 0.36415615141955837, "grad_norm": 0.6248714424291023, "learning_rate": 1.960169046280483e-05, "loss": 0.4452, "step": 1847 }, { "epoch": 0.3643533123028391, "grad_norm": 0.5629821941055176, "learning_rate": 1.960125723957464e-05, "loss": 0.4021, "step": 1848 }, { "epoch": 0.3645504731861199, "grad_norm": 0.6196696074939825, "learning_rate": 1.9600823785666515e-05, "loss": 0.4365, "step": 1849 }, { "epoch": 0.36474763406940064, "grad_norm": 0.6115029014398423, "learning_rate": 1.9600390101090867e-05, "loss": 0.4279, "step": 1850 }, { "epoch": 0.3649447949526814, "grad_norm": 0.6120747583019098, "learning_rate": 1.9599956185858112e-05, "loss": 0.4548, "step": 1851 }, { "epoch": 0.36514195583596215, "grad_norm": 0.7496535911475231, "learning_rate": 1.959952203997868e-05, "loss": 0.4646, "step": 1852 }, { "epoch": 0.3653391167192429, "grad_norm": 0.6382300755564924, "learning_rate": 1.9599087663463003e-05, "loss": 0.4355, "step": 1853 }, { "epoch": 0.36553627760252366, "grad_norm": 0.6602319601851913, "learning_rate": 1.9598653056321512e-05, "loss": 0.4632, "step": 1854 }, { "epoch": 0.3657334384858044, "grad_norm": 0.6191066463069663, "learning_rate": 1.9598218218564656e-05, "loss": 0.4238, "step": 1855 }, { "epoch": 0.3659305993690852, "grad_norm": 0.6072202775912791, "learning_rate": 1.9597783150202873e-05, "loss": 0.4683, "step": 1856 }, { "epoch": 0.36612776025236593, "grad_norm": 0.5689905034550738, "learning_rate": 1.9597347851246623e-05, "loss": 0.4102, "step": 1857 }, { "epoch": 0.3663249211356467, "grad_norm": 0.6023153592091132, "learning_rate": 1.959691232170636e-05, "loss": 0.4748, "step": 1858 }, { "epoch": 0.36652208201892744, "grad_norm": 0.6035103505300697, "learning_rate": 1.9596476561592553e-05, "loss": 0.429, "step": 1859 }, { "epoch": 0.3667192429022082, "grad_norm": 0.6246219996358824, "learning_rate": 1.9596040570915666e-05, "loss": 0.4179, "step": 1860 }, { "epoch": 0.36691640378548895, "grad_norm": 0.6764471802280777, "learning_rate": 1.959560434968618e-05, "loss": 0.5002, "step": 1861 }, { "epoch": 0.3671135646687697, "grad_norm": 0.6218434897962343, "learning_rate": 1.959516789791457e-05, "loss": 0.4436, "step": 1862 }, { "epoch": 0.36731072555205047, "grad_norm": 0.6206678178233864, "learning_rate": 1.959473121561132e-05, "loss": 0.4553, "step": 1863 }, { "epoch": 0.3675078864353312, "grad_norm": 0.5901550517207009, "learning_rate": 1.9594294302786933e-05, "loss": 0.4377, "step": 1864 }, { "epoch": 0.367705047318612, "grad_norm": 0.6112016781481284, "learning_rate": 1.9593857159451897e-05, "loss": 0.4622, "step": 1865 }, { "epoch": 0.36790220820189273, "grad_norm": 0.6154064090788753, "learning_rate": 1.9593419785616716e-05, "loss": 0.4587, "step": 1866 }, { "epoch": 0.3680993690851735, "grad_norm": 0.6350370151618049, "learning_rate": 1.95929821812919e-05, "loss": 0.4683, "step": 1867 }, { "epoch": 0.36829652996845424, "grad_norm": 1.6840479608127483, "learning_rate": 1.9592544346487958e-05, "loss": 0.4761, "step": 1868 }, { "epoch": 0.368493690851735, "grad_norm": 0.586315544529278, "learning_rate": 1.9592106281215418e-05, "loss": 0.4171, "step": 1869 }, { "epoch": 0.36869085173501576, "grad_norm": 0.6098943093456206, "learning_rate": 1.95916679854848e-05, "loss": 0.4483, "step": 1870 }, { "epoch": 0.3688880126182965, "grad_norm": 0.6030006559111836, "learning_rate": 1.959122945930663e-05, "loss": 0.4379, "step": 1871 }, { "epoch": 0.36908517350157727, "grad_norm": 0.6788148490316216, "learning_rate": 1.9590790702691453e-05, "loss": 0.4691, "step": 1872 }, { "epoch": 0.369282334384858, "grad_norm": 0.6629668903573882, "learning_rate": 1.9590351715649803e-05, "loss": 0.4551, "step": 1873 }, { "epoch": 0.3694794952681388, "grad_norm": 0.7978043884977309, "learning_rate": 1.9589912498192233e-05, "loss": 0.4405, "step": 1874 }, { "epoch": 0.36967665615141954, "grad_norm": 0.7049104610666904, "learning_rate": 1.958947305032929e-05, "loss": 0.4673, "step": 1875 }, { "epoch": 0.3698738170347003, "grad_norm": 0.6949512497820435, "learning_rate": 1.9589033372071537e-05, "loss": 0.4147, "step": 1876 }, { "epoch": 0.37007097791798105, "grad_norm": 0.7972703080682879, "learning_rate": 1.9588593463429532e-05, "loss": 0.4636, "step": 1877 }, { "epoch": 0.3702681388012618, "grad_norm": 0.6733154115167292, "learning_rate": 1.958815332441385e-05, "loss": 0.4556, "step": 1878 }, { "epoch": 0.37046529968454256, "grad_norm": 0.633932691284626, "learning_rate": 1.9587712955035064e-05, "loss": 0.457, "step": 1879 }, { "epoch": 0.37066246056782337, "grad_norm": 0.6626177934715735, "learning_rate": 1.958727235530375e-05, "loss": 0.4682, "step": 1880 }, { "epoch": 0.3708596214511041, "grad_norm": 0.6359733697611749, "learning_rate": 1.9586831525230496e-05, "loss": 0.4482, "step": 1881 }, { "epoch": 0.3710567823343849, "grad_norm": 0.8674782473950938, "learning_rate": 1.9586390464825896e-05, "loss": 0.447, "step": 1882 }, { "epoch": 0.37125394321766564, "grad_norm": 0.7212842297017934, "learning_rate": 1.958594917410055e-05, "loss": 0.4928, "step": 1883 }, { "epoch": 0.3714511041009464, "grad_norm": 0.5994454432815358, "learning_rate": 1.958550765306505e-05, "loss": 0.4354, "step": 1884 }, { "epoch": 0.37164826498422715, "grad_norm": 0.5881488736842267, "learning_rate": 1.9585065901730013e-05, "loss": 0.4114, "step": 1885 }, { "epoch": 0.3718454258675079, "grad_norm": 0.5729776216743723, "learning_rate": 1.9584623920106044e-05, "loss": 0.3996, "step": 1886 }, { "epoch": 0.37204258675078866, "grad_norm": 0.6292024150236367, "learning_rate": 1.9584181708203772e-05, "loss": 0.4452, "step": 1887 }, { "epoch": 0.3722397476340694, "grad_norm": 0.6434967873079545, "learning_rate": 1.958373926603381e-05, "loss": 0.4864, "step": 1888 }, { "epoch": 0.3724369085173502, "grad_norm": 0.6189683570278633, "learning_rate": 1.95832965936068e-05, "loss": 0.4288, "step": 1889 }, { "epoch": 0.37263406940063093, "grad_norm": 0.6925481427276183, "learning_rate": 1.958285369093337e-05, "loss": 0.4982, "step": 1890 }, { "epoch": 0.3728312302839117, "grad_norm": 0.5954823776943323, "learning_rate": 1.9582410558024162e-05, "loss": 0.4159, "step": 1891 }, { "epoch": 0.37302839116719244, "grad_norm": 0.6671992939469052, "learning_rate": 1.9581967194889826e-05, "loss": 0.4524, "step": 1892 }, { "epoch": 0.3732255520504732, "grad_norm": 0.6322337439769842, "learning_rate": 1.9581523601541012e-05, "loss": 0.4909, "step": 1893 }, { "epoch": 0.37342271293375395, "grad_norm": 0.6712642730299693, "learning_rate": 1.9581079777988375e-05, "loss": 0.4483, "step": 1894 }, { "epoch": 0.3736198738170347, "grad_norm": 0.6673822790768886, "learning_rate": 1.958063572424258e-05, "loss": 0.4431, "step": 1895 }, { "epoch": 0.37381703470031546, "grad_norm": 2.28916156009429, "learning_rate": 1.9580191440314304e-05, "loss": 0.5002, "step": 1896 }, { "epoch": 0.3740141955835962, "grad_norm": 0.6439890915303407, "learning_rate": 1.9579746926214205e-05, "loss": 0.4325, "step": 1897 }, { "epoch": 0.374211356466877, "grad_norm": 0.7309784046021603, "learning_rate": 1.9579302181952977e-05, "loss": 0.4615, "step": 1898 }, { "epoch": 0.37440851735015773, "grad_norm": 0.5998628834963219, "learning_rate": 1.9578857207541296e-05, "loss": 0.4563, "step": 1899 }, { "epoch": 0.3746056782334385, "grad_norm": 0.6925977755644835, "learning_rate": 1.957841200298986e-05, "loss": 0.4705, "step": 1900 }, { "epoch": 0.37480283911671924, "grad_norm": 0.7297338025869237, "learning_rate": 1.9577966568309358e-05, "loss": 0.4749, "step": 1901 }, { "epoch": 0.375, "grad_norm": 0.7547412508318946, "learning_rate": 1.9577520903510497e-05, "loss": 0.4826, "step": 1902 }, { "epoch": 0.37519716088328076, "grad_norm": 0.6220978689967857, "learning_rate": 1.957707500860399e-05, "loss": 0.4348, "step": 1903 }, { "epoch": 0.3753943217665615, "grad_norm": 0.6472282954783044, "learning_rate": 1.9576628883600533e-05, "loss": 0.4102, "step": 1904 }, { "epoch": 0.37559148264984227, "grad_norm": 0.6841423764886676, "learning_rate": 1.9576182528510864e-05, "loss": 0.4776, "step": 1905 }, { "epoch": 0.375788643533123, "grad_norm": 0.5900859903843134, "learning_rate": 1.957573594334569e-05, "loss": 0.4334, "step": 1906 }, { "epoch": 0.3759858044164038, "grad_norm": 0.6428081199582281, "learning_rate": 1.9575289128115758e-05, "loss": 0.4459, "step": 1907 }, { "epoch": 0.37618296529968454, "grad_norm": 0.6123822425477046, "learning_rate": 1.9574842082831788e-05, "loss": 0.4674, "step": 1908 }, { "epoch": 0.3763801261829653, "grad_norm": 0.7460401880955564, "learning_rate": 1.957439480750453e-05, "loss": 0.4364, "step": 1909 }, { "epoch": 0.37657728706624605, "grad_norm": 0.6613935489626954, "learning_rate": 1.957394730214472e-05, "loss": 0.4688, "step": 1910 }, { "epoch": 0.3767744479495268, "grad_norm": 1.1706846880413952, "learning_rate": 1.9573499566763124e-05, "loss": 0.4627, "step": 1911 }, { "epoch": 0.37697160883280756, "grad_norm": 1.4249063300767362, "learning_rate": 1.9573051601370485e-05, "loss": 0.4546, "step": 1912 }, { "epoch": 0.3771687697160883, "grad_norm": 0.7269862868607329, "learning_rate": 1.9572603405977573e-05, "loss": 0.4929, "step": 1913 }, { "epoch": 0.37736593059936907, "grad_norm": 0.7060411924408715, "learning_rate": 1.957215498059516e-05, "loss": 0.4406, "step": 1914 }, { "epoch": 0.3775630914826498, "grad_norm": 0.6494093122802824, "learning_rate": 1.957170632523401e-05, "loss": 0.4622, "step": 1915 }, { "epoch": 0.3777602523659306, "grad_norm": 0.6622590367838171, "learning_rate": 1.957125743990491e-05, "loss": 0.5108, "step": 1916 }, { "epoch": 0.37795741324921134, "grad_norm": 0.6744814131396353, "learning_rate": 1.9570808324618646e-05, "loss": 0.4583, "step": 1917 }, { "epoch": 0.3781545741324921, "grad_norm": 3.197758463117182, "learning_rate": 1.9570358979386e-05, "loss": 0.4479, "step": 1918 }, { "epoch": 0.37835173501577285, "grad_norm": 0.6725127235656607, "learning_rate": 1.956990940421777e-05, "loss": 0.4218, "step": 1919 }, { "epoch": 0.3785488958990536, "grad_norm": 0.6774677552967868, "learning_rate": 1.9569459599124765e-05, "loss": 0.4654, "step": 1920 }, { "epoch": 0.37874605678233436, "grad_norm": 2.1023918912871715, "learning_rate": 1.9569009564117783e-05, "loss": 0.4601, "step": 1921 }, { "epoch": 0.3789432176656151, "grad_norm": 0.8729529815113425, "learning_rate": 1.9568559299207645e-05, "loss": 0.4609, "step": 1922 }, { "epoch": 0.3791403785488959, "grad_norm": 0.6586715530887697, "learning_rate": 1.9568108804405162e-05, "loss": 0.454, "step": 1923 }, { "epoch": 0.37933753943217663, "grad_norm": 0.6950591232290906, "learning_rate": 1.956765807972116e-05, "loss": 0.4587, "step": 1924 }, { "epoch": 0.37953470031545744, "grad_norm": 0.5859553428507086, "learning_rate": 1.9567207125166466e-05, "loss": 0.4559, "step": 1925 }, { "epoch": 0.3797318611987382, "grad_norm": 0.654061925363833, "learning_rate": 1.9566755940751916e-05, "loss": 0.4539, "step": 1926 }, { "epoch": 0.37992902208201895, "grad_norm": 0.7394858768473541, "learning_rate": 1.956630452648835e-05, "loss": 0.4135, "step": 1927 }, { "epoch": 0.3801261829652997, "grad_norm": 0.7096797120999595, "learning_rate": 1.956585288238662e-05, "loss": 0.4421, "step": 1928 }, { "epoch": 0.38032334384858046, "grad_norm": 0.8603308022382222, "learning_rate": 1.9565401008457567e-05, "loss": 0.4694, "step": 1929 }, { "epoch": 0.3805205047318612, "grad_norm": 0.6288376088275459, "learning_rate": 1.956494890471205e-05, "loss": 0.4178, "step": 1930 }, { "epoch": 0.380717665615142, "grad_norm": 0.6656485291036316, "learning_rate": 1.9564496571160935e-05, "loss": 0.4571, "step": 1931 }, { "epoch": 0.38091482649842273, "grad_norm": 0.6168171412436806, "learning_rate": 1.9564044007815087e-05, "loss": 0.4142, "step": 1932 }, { "epoch": 0.3811119873817035, "grad_norm": 0.7052282174762466, "learning_rate": 1.956359121468538e-05, "loss": 0.4445, "step": 1933 }, { "epoch": 0.38130914826498424, "grad_norm": 0.7120124227298402, "learning_rate": 1.9563138191782692e-05, "loss": 0.4557, "step": 1934 }, { "epoch": 0.381506309148265, "grad_norm": 0.6317595911299724, "learning_rate": 1.956268493911791e-05, "loss": 0.4587, "step": 1935 }, { "epoch": 0.38170347003154576, "grad_norm": 0.6798856792123635, "learning_rate": 1.9562231456701922e-05, "loss": 0.4612, "step": 1936 }, { "epoch": 0.3819006309148265, "grad_norm": 1.1002750501181588, "learning_rate": 1.9561777744545616e-05, "loss": 0.4516, "step": 1937 }, { "epoch": 0.38209779179810727, "grad_norm": 0.7801889450244034, "learning_rate": 1.9561323802659908e-05, "loss": 0.4882, "step": 1938 }, { "epoch": 0.382294952681388, "grad_norm": 0.6312650051830359, "learning_rate": 1.9560869631055693e-05, "loss": 0.4616, "step": 1939 }, { "epoch": 0.3824921135646688, "grad_norm": 0.802737212887057, "learning_rate": 1.9560415229743885e-05, "loss": 0.4417, "step": 1940 }, { "epoch": 0.38268927444794953, "grad_norm": 0.7357740131308222, "learning_rate": 1.9559960598735403e-05, "loss": 0.4701, "step": 1941 }, { "epoch": 0.3828864353312303, "grad_norm": 0.6284042286654438, "learning_rate": 1.9559505738041167e-05, "loss": 0.4459, "step": 1942 }, { "epoch": 0.38308359621451105, "grad_norm": 0.6451867717261572, "learning_rate": 1.955905064767211e-05, "loss": 0.4818, "step": 1943 }, { "epoch": 0.3832807570977918, "grad_norm": 0.9841866252584824, "learning_rate": 1.955859532763916e-05, "loss": 0.412, "step": 1944 }, { "epoch": 0.38347791798107256, "grad_norm": 0.7692893239650029, "learning_rate": 1.955813977795326e-05, "loss": 0.4328, "step": 1945 }, { "epoch": 0.3836750788643533, "grad_norm": 0.740511429503883, "learning_rate": 1.955768399862536e-05, "loss": 0.4491, "step": 1946 }, { "epoch": 0.38387223974763407, "grad_norm": 0.6674798746464747, "learning_rate": 1.95572279896664e-05, "loss": 0.499, "step": 1947 }, { "epoch": 0.3840694006309148, "grad_norm": 0.8324473358303699, "learning_rate": 1.9556771751087343e-05, "loss": 0.4653, "step": 1948 }, { "epoch": 0.3842665615141956, "grad_norm": 0.7000802793104866, "learning_rate": 1.955631528289915e-05, "loss": 0.4386, "step": 1949 }, { "epoch": 0.38446372239747634, "grad_norm": 0.7005116186670807, "learning_rate": 1.9555858585112784e-05, "loss": 0.4615, "step": 1950 }, { "epoch": 0.3846608832807571, "grad_norm": 0.7023205716896679, "learning_rate": 1.9555401657739222e-05, "loss": 0.4507, "step": 1951 }, { "epoch": 0.38485804416403785, "grad_norm": 0.6267525065077723, "learning_rate": 1.9554944500789438e-05, "loss": 0.4291, "step": 1952 }, { "epoch": 0.3850552050473186, "grad_norm": 0.6719159244755133, "learning_rate": 1.955448711427442e-05, "loss": 0.4891, "step": 1953 }, { "epoch": 0.38525236593059936, "grad_norm": 0.6908923759078575, "learning_rate": 1.9554029498205154e-05, "loss": 0.4753, "step": 1954 }, { "epoch": 0.3854495268138801, "grad_norm": 0.6560268963325017, "learning_rate": 1.9553571652592637e-05, "loss": 0.4271, "step": 1955 }, { "epoch": 0.3856466876971609, "grad_norm": 0.8242232954813055, "learning_rate": 1.9553113577447866e-05, "loss": 0.4337, "step": 1956 }, { "epoch": 0.38584384858044163, "grad_norm": 0.5786503721027892, "learning_rate": 1.9552655272781848e-05, "loss": 0.4491, "step": 1957 }, { "epoch": 0.3860410094637224, "grad_norm": 0.6178851746400039, "learning_rate": 1.9552196738605596e-05, "loss": 0.4676, "step": 1958 }, { "epoch": 0.38623817034700314, "grad_norm": 0.6433261446448528, "learning_rate": 1.9551737974930124e-05, "loss": 0.4405, "step": 1959 }, { "epoch": 0.3864353312302839, "grad_norm": 0.657702346180845, "learning_rate": 1.9551278981766453e-05, "loss": 0.4697, "step": 1960 }, { "epoch": 0.38663249211356465, "grad_norm": 0.5617264106547595, "learning_rate": 1.9550819759125613e-05, "loss": 0.4263, "step": 1961 }, { "epoch": 0.3868296529968454, "grad_norm": 0.6078848131857936, "learning_rate": 1.955036030701864e-05, "loss": 0.4693, "step": 1962 }, { "epoch": 0.38702681388012616, "grad_norm": 0.6098840021530041, "learning_rate": 1.954990062545657e-05, "loss": 0.4535, "step": 1963 }, { "epoch": 0.3872239747634069, "grad_norm": 0.6666148228118189, "learning_rate": 1.9549440714450447e-05, "loss": 0.4376, "step": 1964 }, { "epoch": 0.3874211356466877, "grad_norm": 0.6007383801169028, "learning_rate": 1.9548980574011315e-05, "loss": 0.4488, "step": 1965 }, { "epoch": 0.38761829652996843, "grad_norm": 1.1165576178172691, "learning_rate": 1.954852020415024e-05, "loss": 0.4622, "step": 1966 }, { "epoch": 0.3878154574132492, "grad_norm": 0.6069174688430108, "learning_rate": 1.9548059604878277e-05, "loss": 0.414, "step": 1967 }, { "epoch": 0.38801261829652994, "grad_norm": 0.6426555376380005, "learning_rate": 1.9547598776206492e-05, "loss": 0.4859, "step": 1968 }, { "epoch": 0.3882097791798107, "grad_norm": 0.643323906059335, "learning_rate": 1.954713771814596e-05, "loss": 0.476, "step": 1969 }, { "epoch": 0.3884069400630915, "grad_norm": 0.6214422402840049, "learning_rate": 1.9546676430707758e-05, "loss": 0.456, "step": 1970 }, { "epoch": 0.38860410094637227, "grad_norm": 0.5933166205495725, "learning_rate": 1.954621491390296e-05, "loss": 0.4282, "step": 1971 }, { "epoch": 0.388801261829653, "grad_norm": 0.6469075563555098, "learning_rate": 1.9545753167742664e-05, "loss": 0.4395, "step": 1972 }, { "epoch": 0.3889984227129338, "grad_norm": 0.575746232416297, "learning_rate": 1.9545291192237962e-05, "loss": 0.4426, "step": 1973 }, { "epoch": 0.38919558359621453, "grad_norm": 0.8936881259787512, "learning_rate": 1.954482898739995e-05, "loss": 0.4218, "step": 1974 }, { "epoch": 0.3893927444794953, "grad_norm": 0.5907560428830476, "learning_rate": 1.9544366553239738e-05, "loss": 0.4408, "step": 1975 }, { "epoch": 0.38958990536277605, "grad_norm": 0.6531849605246557, "learning_rate": 1.9543903889768435e-05, "loss": 0.4718, "step": 1976 }, { "epoch": 0.3897870662460568, "grad_norm": 0.5847696666259158, "learning_rate": 1.9543440996997152e-05, "loss": 0.4284, "step": 1977 }, { "epoch": 0.38998422712933756, "grad_norm": 0.6685285347892534, "learning_rate": 1.9542977874937014e-05, "loss": 0.3962, "step": 1978 }, { "epoch": 0.3901813880126183, "grad_norm": 0.5988664018630944, "learning_rate": 1.954251452359915e-05, "loss": 0.4522, "step": 1979 }, { "epoch": 0.39037854889589907, "grad_norm": 0.683915321776376, "learning_rate": 1.9542050942994686e-05, "loss": 0.4443, "step": 1980 }, { "epoch": 0.3905757097791798, "grad_norm": 0.5965994237087414, "learning_rate": 1.9541587133134766e-05, "loss": 0.4836, "step": 1981 }, { "epoch": 0.3907728706624606, "grad_norm": 0.6874247628508368, "learning_rate": 1.9541123094030528e-05, "loss": 0.4478, "step": 1982 }, { "epoch": 0.39097003154574134, "grad_norm": 0.6164585865243617, "learning_rate": 1.954065882569313e-05, "loss": 0.4458, "step": 1983 }, { "epoch": 0.3911671924290221, "grad_norm": 0.6370965104931681, "learning_rate": 1.954019432813372e-05, "loss": 0.421, "step": 1984 }, { "epoch": 0.39136435331230285, "grad_norm": 0.6849623127670889, "learning_rate": 1.9539729601363456e-05, "loss": 0.4799, "step": 1985 }, { "epoch": 0.3915615141955836, "grad_norm": 0.6885520124409992, "learning_rate": 1.9539264645393508e-05, "loss": 0.4901, "step": 1986 }, { "epoch": 0.39175867507886436, "grad_norm": 0.6261317061235863, "learning_rate": 1.9538799460235044e-05, "loss": 0.4266, "step": 1987 }, { "epoch": 0.3919558359621451, "grad_norm": 0.6441025202938847, "learning_rate": 1.953833404589924e-05, "loss": 0.4428, "step": 1988 }, { "epoch": 0.39215299684542587, "grad_norm": 0.7945693849241009, "learning_rate": 1.953786840239728e-05, "loss": 0.4498, "step": 1989 }, { "epoch": 0.39235015772870663, "grad_norm": 0.6236426936646374, "learning_rate": 1.953740252974035e-05, "loss": 0.4433, "step": 1990 }, { "epoch": 0.3925473186119874, "grad_norm": 0.7354383225024252, "learning_rate": 1.9536936427939647e-05, "loss": 0.4586, "step": 1991 }, { "epoch": 0.39274447949526814, "grad_norm": 0.7831532504579899, "learning_rate": 1.9536470097006363e-05, "loss": 0.4887, "step": 1992 }, { "epoch": 0.3929416403785489, "grad_norm": 0.6872639246256627, "learning_rate": 1.9536003536951708e-05, "loss": 0.4405, "step": 1993 }, { "epoch": 0.39313880126182965, "grad_norm": 0.6736696437372744, "learning_rate": 1.9535536747786884e-05, "loss": 0.441, "step": 1994 }, { "epoch": 0.3933359621451104, "grad_norm": 0.7082789400644977, "learning_rate": 1.953506972952312e-05, "loss": 0.4596, "step": 1995 }, { "epoch": 0.39353312302839116, "grad_norm": 0.7181262186981547, "learning_rate": 1.9534602482171618e-05, "loss": 0.4626, "step": 1996 }, { "epoch": 0.3937302839116719, "grad_norm": 0.7063551366210826, "learning_rate": 1.9534135005743614e-05, "loss": 0.4472, "step": 1997 }, { "epoch": 0.3939274447949527, "grad_norm": 0.7562123798608619, "learning_rate": 1.9533667300250343e-05, "loss": 0.4832, "step": 1998 }, { "epoch": 0.39412460567823343, "grad_norm": 0.6940975477174752, "learning_rate": 1.9533199365703035e-05, "loss": 0.4811, "step": 1999 }, { "epoch": 0.3943217665615142, "grad_norm": 0.6260579790621669, "learning_rate": 1.9532731202112935e-05, "loss": 0.4616, "step": 2000 }, { "epoch": 0.39451892744479494, "grad_norm": 0.6557298785672231, "learning_rate": 1.9532262809491294e-05, "loss": 0.4785, "step": 2001 }, { "epoch": 0.3947160883280757, "grad_norm": 0.6533941335015038, "learning_rate": 1.953179418784936e-05, "loss": 0.4448, "step": 2002 }, { "epoch": 0.39491324921135645, "grad_norm": 0.6709234409900684, "learning_rate": 1.9531325337198394e-05, "loss": 0.4481, "step": 2003 }, { "epoch": 0.3951104100946372, "grad_norm": 1.528459654964046, "learning_rate": 1.9530856257549664e-05, "loss": 0.4861, "step": 2004 }, { "epoch": 0.39530757097791797, "grad_norm": 0.6914635119888252, "learning_rate": 1.9530386948914436e-05, "loss": 0.4359, "step": 2005 }, { "epoch": 0.3955047318611987, "grad_norm": 0.8898433190590056, "learning_rate": 1.9529917411303984e-05, "loss": 0.4643, "step": 2006 }, { "epoch": 0.3957018927444795, "grad_norm": 0.6810967882315899, "learning_rate": 1.95294476447296e-05, "loss": 0.458, "step": 2007 }, { "epoch": 0.39589905362776023, "grad_norm": 0.6805859515758788, "learning_rate": 1.9528977649202554e-05, "loss": 0.4199, "step": 2008 }, { "epoch": 0.396096214511041, "grad_norm": 0.5998994373884956, "learning_rate": 1.9528507424734148e-05, "loss": 0.4118, "step": 2009 }, { "epoch": 0.39629337539432175, "grad_norm": 0.8444604178654211, "learning_rate": 1.9528036971335678e-05, "loss": 0.4933, "step": 2010 }, { "epoch": 0.3964905362776025, "grad_norm": 1.2768307772665541, "learning_rate": 1.952756628901845e-05, "loss": 0.4256, "step": 2011 }, { "epoch": 0.39668769716088326, "grad_norm": 0.7512293164398522, "learning_rate": 1.9527095377793765e-05, "loss": 0.4721, "step": 2012 }, { "epoch": 0.396884858044164, "grad_norm": 0.9372345492820653, "learning_rate": 1.9526624237672945e-05, "loss": 0.4319, "step": 2013 }, { "epoch": 0.39708201892744477, "grad_norm": 3.610388601557778, "learning_rate": 1.9526152868667302e-05, "loss": 0.511, "step": 2014 }, { "epoch": 0.3972791798107255, "grad_norm": 0.9133527465332176, "learning_rate": 1.952568127078817e-05, "loss": 0.4852, "step": 2015 }, { "epoch": 0.39747634069400634, "grad_norm": 0.801460794349205, "learning_rate": 1.952520944404687e-05, "loss": 0.4679, "step": 2016 }, { "epoch": 0.3976735015772871, "grad_norm": 0.6184192490263251, "learning_rate": 1.9524737388454745e-05, "loss": 0.4306, "step": 2017 }, { "epoch": 0.39787066246056785, "grad_norm": 0.7074190008986967, "learning_rate": 1.9524265104023133e-05, "loss": 0.4233, "step": 2018 }, { "epoch": 0.3980678233438486, "grad_norm": 0.7029283154800328, "learning_rate": 1.9523792590763382e-05, "loss": 0.4627, "step": 2019 }, { "epoch": 0.39826498422712936, "grad_norm": 0.851532590870496, "learning_rate": 1.9523319848686845e-05, "loss": 0.4506, "step": 2020 }, { "epoch": 0.3984621451104101, "grad_norm": 0.7525221362585885, "learning_rate": 1.952284687780488e-05, "loss": 0.4434, "step": 2021 }, { "epoch": 0.39865930599369087, "grad_norm": 0.8241310038169775, "learning_rate": 1.952237367812885e-05, "loss": 0.4552, "step": 2022 }, { "epoch": 0.3988564668769716, "grad_norm": 0.6291466115452881, "learning_rate": 1.952190024967012e-05, "loss": 0.441, "step": 2023 }, { "epoch": 0.3990536277602524, "grad_norm": 0.8316147796097879, "learning_rate": 1.9521426592440075e-05, "loss": 0.4394, "step": 2024 }, { "epoch": 0.39925078864353314, "grad_norm": 0.6195605062293554, "learning_rate": 1.9520952706450083e-05, "loss": 0.4547, "step": 2025 }, { "epoch": 0.3994479495268139, "grad_norm": 1.0728222016893565, "learning_rate": 1.952047859171154e-05, "loss": 0.4847, "step": 2026 }, { "epoch": 0.39964511041009465, "grad_norm": 0.6543029079987064, "learning_rate": 1.9520004248235826e-05, "loss": 0.4412, "step": 2027 }, { "epoch": 0.3998422712933754, "grad_norm": 0.8215362022698259, "learning_rate": 1.9519529676034347e-05, "loss": 0.4413, "step": 2028 }, { "epoch": 0.40003943217665616, "grad_norm": 0.6394436878246803, "learning_rate": 1.95190548751185e-05, "loss": 0.4091, "step": 2029 }, { "epoch": 0.4002365930599369, "grad_norm": 0.8545402719858461, "learning_rate": 1.9518579845499698e-05, "loss": 0.4713, "step": 2030 }, { "epoch": 0.4004337539432177, "grad_norm": 0.6930745549889441, "learning_rate": 1.9518104587189348e-05, "loss": 0.4474, "step": 2031 }, { "epoch": 0.40063091482649843, "grad_norm": 0.8730350351232401, "learning_rate": 1.951762910019887e-05, "loss": 0.4678, "step": 2032 }, { "epoch": 0.4008280757097792, "grad_norm": 0.7083655229046756, "learning_rate": 1.9517153384539685e-05, "loss": 0.4452, "step": 2033 }, { "epoch": 0.40102523659305994, "grad_norm": 0.9617195558090388, "learning_rate": 1.9516677440223232e-05, "loss": 0.4631, "step": 2034 }, { "epoch": 0.4012223974763407, "grad_norm": 0.7580517534109538, "learning_rate": 1.9516201267260935e-05, "loss": 0.4548, "step": 2035 }, { "epoch": 0.40141955835962145, "grad_norm": 1.0525747613425824, "learning_rate": 1.9515724865664242e-05, "loss": 0.4953, "step": 2036 }, { "epoch": 0.4016167192429022, "grad_norm": 0.7820613688472805, "learning_rate": 1.9515248235444595e-05, "loss": 0.4339, "step": 2037 }, { "epoch": 0.40181388012618297, "grad_norm": 0.7403224002007617, "learning_rate": 1.9514771376613446e-05, "loss": 0.4766, "step": 2038 }, { "epoch": 0.4020110410094637, "grad_norm": 0.8359871396836571, "learning_rate": 1.9514294289182253e-05, "loss": 0.5007, "step": 2039 }, { "epoch": 0.4022082018927445, "grad_norm": 51.06976499844091, "learning_rate": 1.9513816973162475e-05, "loss": 0.7096, "step": 2040 }, { "epoch": 0.40240536277602523, "grad_norm": 0.9110749969850523, "learning_rate": 1.9513339428565588e-05, "loss": 0.4644, "step": 2041 }, { "epoch": 0.402602523659306, "grad_norm": 0.7178947368074214, "learning_rate": 1.9512861655403057e-05, "loss": 0.4586, "step": 2042 }, { "epoch": 0.40279968454258674, "grad_norm": 0.6762975622616101, "learning_rate": 1.9512383653686364e-05, "loss": 0.4217, "step": 2043 }, { "epoch": 0.4029968454258675, "grad_norm": 0.7412728040347322, "learning_rate": 1.9511905423426992e-05, "loss": 0.5072, "step": 2044 }, { "epoch": 0.40319400630914826, "grad_norm": 0.8922820307079298, "learning_rate": 1.9511426964636437e-05, "loss": 0.4553, "step": 2045 }, { "epoch": 0.403391167192429, "grad_norm": 0.6604569080074368, "learning_rate": 1.9510948277326188e-05, "loss": 0.4388, "step": 2046 }, { "epoch": 0.40358832807570977, "grad_norm": 0.7657116520818048, "learning_rate": 1.9510469361507747e-05, "loss": 0.4703, "step": 2047 }, { "epoch": 0.4037854889589905, "grad_norm": 0.6403472748402534, "learning_rate": 1.950999021719262e-05, "loss": 0.4426, "step": 2048 }, { "epoch": 0.4039826498422713, "grad_norm": 0.7942729480613527, "learning_rate": 1.950951084439232e-05, "loss": 0.4722, "step": 2049 }, { "epoch": 0.40417981072555204, "grad_norm": 0.6777646106474043, "learning_rate": 1.9509031243118365e-05, "loss": 0.4596, "step": 2050 }, { "epoch": 0.4043769716088328, "grad_norm": 0.6657983977712271, "learning_rate": 1.9508551413382274e-05, "loss": 0.4649, "step": 2051 }, { "epoch": 0.40457413249211355, "grad_norm": 0.7409227648041087, "learning_rate": 1.950807135519558e-05, "loss": 0.4949, "step": 2052 }, { "epoch": 0.4047712933753943, "grad_norm": 0.6979782066122284, "learning_rate": 1.9507591068569812e-05, "loss": 0.519, "step": 2053 }, { "epoch": 0.40496845425867506, "grad_norm": 0.6505454029015588, "learning_rate": 1.9507110553516518e-05, "loss": 0.4148, "step": 2054 }, { "epoch": 0.4051656151419558, "grad_norm": 0.6905829191499583, "learning_rate": 1.9506629810047233e-05, "loss": 0.4269, "step": 2055 }, { "epoch": 0.40536277602523657, "grad_norm": 0.6726973855018206, "learning_rate": 1.9506148838173512e-05, "loss": 0.4714, "step": 2056 }, { "epoch": 0.4055599369085173, "grad_norm": 0.6449181697582981, "learning_rate": 1.950566763790691e-05, "loss": 0.4611, "step": 2057 }, { "epoch": 0.4057570977917981, "grad_norm": 0.6541125973886486, "learning_rate": 1.9505186209258987e-05, "loss": 0.4657, "step": 2058 }, { "epoch": 0.40595425867507884, "grad_norm": 0.6492031501355185, "learning_rate": 1.950470455224131e-05, "loss": 0.419, "step": 2059 }, { "epoch": 0.4061514195583596, "grad_norm": 0.6958217286443721, "learning_rate": 1.9504222666865457e-05, "loss": 0.4559, "step": 2060 }, { "epoch": 0.4063485804416404, "grad_norm": 0.6660544919270931, "learning_rate": 1.9503740553142995e-05, "loss": 0.4614, "step": 2061 }, { "epoch": 0.40654574132492116, "grad_norm": 0.6441549325271921, "learning_rate": 1.9503258211085515e-05, "loss": 0.4832, "step": 2062 }, { "epoch": 0.4067429022082019, "grad_norm": 1.5150959012964547, "learning_rate": 1.9502775640704606e-05, "loss": 0.434, "step": 2063 }, { "epoch": 0.4069400630914827, "grad_norm": 0.6244719601196084, "learning_rate": 1.9502292842011857e-05, "loss": 0.4173, "step": 2064 }, { "epoch": 0.40713722397476343, "grad_norm": 0.6628533283416533, "learning_rate": 1.950180981501887e-05, "loss": 0.4414, "step": 2065 }, { "epoch": 0.4073343848580442, "grad_norm": 0.7233298009650888, "learning_rate": 1.950132655973725e-05, "loss": 0.4296, "step": 2066 }, { "epoch": 0.40753154574132494, "grad_norm": 0.6648377046224346, "learning_rate": 1.9500843076178612e-05, "loss": 0.4455, "step": 2067 }, { "epoch": 0.4077287066246057, "grad_norm": 0.6564892733023274, "learning_rate": 1.9500359364354565e-05, "loss": 0.4545, "step": 2068 }, { "epoch": 0.40792586750788645, "grad_norm": 0.6435233243344421, "learning_rate": 1.9499875424276734e-05, "loss": 0.4551, "step": 2069 }, { "epoch": 0.4081230283911672, "grad_norm": 0.6478000940884499, "learning_rate": 1.9499391255956745e-05, "loss": 0.4262, "step": 2070 }, { "epoch": 0.40832018927444796, "grad_norm": 2.6194886693717487, "learning_rate": 1.949890685940623e-05, "loss": 0.4949, "step": 2071 }, { "epoch": 0.4085173501577287, "grad_norm": 0.7796501553012304, "learning_rate": 1.949842223463683e-05, "loss": 0.4521, "step": 2072 }, { "epoch": 0.4087145110410095, "grad_norm": 1.1718471052149675, "learning_rate": 1.9497937381660188e-05, "loss": 0.43, "step": 2073 }, { "epoch": 0.40891167192429023, "grad_norm": 0.7069733785890165, "learning_rate": 1.949745230048795e-05, "loss": 0.4567, "step": 2074 }, { "epoch": 0.409108832807571, "grad_norm": 0.7134975969386784, "learning_rate": 1.9496966991131775e-05, "loss": 0.3937, "step": 2075 }, { "epoch": 0.40930599369085174, "grad_norm": 0.6204131343159649, "learning_rate": 1.9496481453603318e-05, "loss": 0.4501, "step": 2076 }, { "epoch": 0.4095031545741325, "grad_norm": 0.6233092334560004, "learning_rate": 1.9495995687914244e-05, "loss": 0.4122, "step": 2077 }, { "epoch": 0.40970031545741326, "grad_norm": 0.6614510142665343, "learning_rate": 1.949550969407623e-05, "loss": 0.4269, "step": 2078 }, { "epoch": 0.409897476340694, "grad_norm": 0.6647109624994366, "learning_rate": 1.949502347210095e-05, "loss": 0.464, "step": 2079 }, { "epoch": 0.41009463722397477, "grad_norm": 0.6704844205652758, "learning_rate": 1.949453702200008e-05, "loss": 0.471, "step": 2080 }, { "epoch": 0.4102917981072555, "grad_norm": 0.6946952276485558, "learning_rate": 1.9494050343785317e-05, "loss": 0.461, "step": 2081 }, { "epoch": 0.4104889589905363, "grad_norm": 0.6724334955144293, "learning_rate": 1.9493563437468344e-05, "loss": 0.4693, "step": 2082 }, { "epoch": 0.41068611987381703, "grad_norm": 0.7089092005187811, "learning_rate": 1.9493076303060866e-05, "loss": 0.4482, "step": 2083 }, { "epoch": 0.4108832807570978, "grad_norm": 0.5996155537534351, "learning_rate": 1.9492588940574588e-05, "loss": 0.445, "step": 2084 }, { "epoch": 0.41108044164037855, "grad_norm": 0.6632658160746489, "learning_rate": 1.9492101350021216e-05, "loss": 0.42, "step": 2085 }, { "epoch": 0.4112776025236593, "grad_norm": 0.6093300916331452, "learning_rate": 1.9491613531412463e-05, "loss": 0.4507, "step": 2086 }, { "epoch": 0.41147476340694006, "grad_norm": 0.6358547202310659, "learning_rate": 1.949112548476005e-05, "loss": 0.4588, "step": 2087 }, { "epoch": 0.4116719242902208, "grad_norm": 0.6371219707058599, "learning_rate": 1.9490637210075708e-05, "loss": 0.4772, "step": 2088 }, { "epoch": 0.41186908517350157, "grad_norm": 0.5999352611847947, "learning_rate": 1.9490148707371163e-05, "loss": 0.4604, "step": 2089 }, { "epoch": 0.4120662460567823, "grad_norm": 0.653380052430745, "learning_rate": 1.9489659976658152e-05, "loss": 0.4377, "step": 2090 }, { "epoch": 0.4122634069400631, "grad_norm": 0.6174059523962631, "learning_rate": 1.948917101794842e-05, "loss": 0.4545, "step": 2091 }, { "epoch": 0.41246056782334384, "grad_norm": 0.6989529903953323, "learning_rate": 1.9488681831253706e-05, "loss": 0.4615, "step": 2092 }, { "epoch": 0.4126577287066246, "grad_norm": 0.6751206895361119, "learning_rate": 1.9488192416585775e-05, "loss": 0.4749, "step": 2093 }, { "epoch": 0.41285488958990535, "grad_norm": 0.6067234928335872, "learning_rate": 1.948770277395638e-05, "loss": 0.4495, "step": 2094 }, { "epoch": 0.4130520504731861, "grad_norm": 0.6038861471266463, "learning_rate": 1.9487212903377286e-05, "loss": 0.4318, "step": 2095 }, { "epoch": 0.41324921135646686, "grad_norm": 0.6099951703516203, "learning_rate": 1.9486722804860262e-05, "loss": 0.4232, "step": 2096 }, { "epoch": 0.4134463722397476, "grad_norm": 0.6018481400930096, "learning_rate": 1.948623247841708e-05, "loss": 0.4583, "step": 2097 }, { "epoch": 0.4136435331230284, "grad_norm": 0.6300586347763767, "learning_rate": 1.948574192405953e-05, "loss": 0.4805, "step": 2098 }, { "epoch": 0.41384069400630913, "grad_norm": 0.6261753743075679, "learning_rate": 1.9485251141799387e-05, "loss": 0.4929, "step": 2099 }, { "epoch": 0.4140378548895899, "grad_norm": 0.5502783264260944, "learning_rate": 1.9484760131648447e-05, "loss": 0.4201, "step": 2100 }, { "epoch": 0.41423501577287064, "grad_norm": 0.6373152594526875, "learning_rate": 1.9484268893618504e-05, "loss": 0.5034, "step": 2101 }, { "epoch": 0.4144321766561514, "grad_norm": 0.550511658521269, "learning_rate": 1.9483777427721367e-05, "loss": 0.4215, "step": 2102 }, { "epoch": 0.41462933753943215, "grad_norm": 0.600033811832785, "learning_rate": 1.948328573396884e-05, "loss": 0.4536, "step": 2103 }, { "epoch": 0.4148264984227129, "grad_norm": 0.591267981029353, "learning_rate": 1.9482793812372732e-05, "loss": 0.4195, "step": 2104 }, { "epoch": 0.41502365930599366, "grad_norm": 0.6355093625177733, "learning_rate": 1.9482301662944872e-05, "loss": 0.45, "step": 2105 }, { "epoch": 0.4152208201892745, "grad_norm": 0.6066083667212502, "learning_rate": 1.9481809285697076e-05, "loss": 0.438, "step": 2106 }, { "epoch": 0.41541798107255523, "grad_norm": 0.6268489081451546, "learning_rate": 1.9481316680641175e-05, "loss": 0.4482, "step": 2107 }, { "epoch": 0.415615141955836, "grad_norm": 0.6292327520694636, "learning_rate": 1.9480823847789007e-05, "loss": 0.4657, "step": 2108 }, { "epoch": 0.41581230283911674, "grad_norm": 0.5829052402848848, "learning_rate": 1.9480330787152413e-05, "loss": 0.429, "step": 2109 }, { "epoch": 0.4160094637223975, "grad_norm": 1.3748165275525082, "learning_rate": 1.9479837498743236e-05, "loss": 0.4193, "step": 2110 }, { "epoch": 0.41620662460567825, "grad_norm": 0.5682917102110292, "learning_rate": 1.9479343982573326e-05, "loss": 0.4422, "step": 2111 }, { "epoch": 0.416403785488959, "grad_norm": 0.5947136497114629, "learning_rate": 1.9478850238654546e-05, "loss": 0.4614, "step": 2112 }, { "epoch": 0.41660094637223977, "grad_norm": 0.7037292516298015, "learning_rate": 1.9478356266998757e-05, "loss": 0.5048, "step": 2113 }, { "epoch": 0.4167981072555205, "grad_norm": 0.6478893055908316, "learning_rate": 1.947786206761782e-05, "loss": 0.4463, "step": 2114 }, { "epoch": 0.4169952681388013, "grad_norm": 0.6013780035939855, "learning_rate": 1.9477367640523622e-05, "loss": 0.4595, "step": 2115 }, { "epoch": 0.41719242902208203, "grad_norm": 0.5984366322194706, "learning_rate": 1.947687298572803e-05, "loss": 0.4312, "step": 2116 }, { "epoch": 0.4173895899053628, "grad_norm": 0.6057400198881939, "learning_rate": 1.9476378103242934e-05, "loss": 0.4423, "step": 2117 }, { "epoch": 0.41758675078864355, "grad_norm": 0.5985571921288283, "learning_rate": 1.9475882993080223e-05, "loss": 0.4192, "step": 2118 }, { "epoch": 0.4177839116719243, "grad_norm": 0.6313104451272632, "learning_rate": 1.947538765525179e-05, "loss": 0.4382, "step": 2119 }, { "epoch": 0.41798107255520506, "grad_norm": 0.6434996318524585, "learning_rate": 1.9474892089769538e-05, "loss": 0.4643, "step": 2120 }, { "epoch": 0.4181782334384858, "grad_norm": 0.5983507556937149, "learning_rate": 1.947439629664538e-05, "loss": 0.4306, "step": 2121 }, { "epoch": 0.41837539432176657, "grad_norm": 0.6227894910009829, "learning_rate": 1.9473900275891214e-05, "loss": 0.4185, "step": 2122 }, { "epoch": 0.4185725552050473, "grad_norm": 0.6249459358371175, "learning_rate": 1.9473404027518965e-05, "loss": 0.4422, "step": 2123 }, { "epoch": 0.4187697160883281, "grad_norm": 0.5581857659991755, "learning_rate": 1.9472907551540557e-05, "loss": 0.3957, "step": 2124 }, { "epoch": 0.41896687697160884, "grad_norm": 0.6086539772510696, "learning_rate": 1.9472410847967917e-05, "loss": 0.4538, "step": 2125 }, { "epoch": 0.4191640378548896, "grad_norm": 0.6001107475689484, "learning_rate": 1.947191391681298e-05, "loss": 0.4105, "step": 2126 }, { "epoch": 0.41936119873817035, "grad_norm": 0.66517888101447, "learning_rate": 1.947141675808768e-05, "loss": 0.4399, "step": 2127 }, { "epoch": 0.4195583596214511, "grad_norm": 0.5752990303617271, "learning_rate": 1.9470919371803966e-05, "loss": 0.4208, "step": 2128 }, { "epoch": 0.41975552050473186, "grad_norm": 0.6156099256870735, "learning_rate": 1.947042175797379e-05, "loss": 0.4578, "step": 2129 }, { "epoch": 0.4199526813880126, "grad_norm": 0.5825517100633804, "learning_rate": 1.94699239166091e-05, "loss": 0.45, "step": 2130 }, { "epoch": 0.42014984227129337, "grad_norm": 0.6011744479891673, "learning_rate": 1.9469425847721865e-05, "loss": 0.4734, "step": 2131 }, { "epoch": 0.42034700315457413, "grad_norm": 0.5606446063107278, "learning_rate": 1.9468927551324045e-05, "loss": 0.4242, "step": 2132 }, { "epoch": 0.4205441640378549, "grad_norm": 0.5730880385070863, "learning_rate": 1.946842902742762e-05, "loss": 0.4439, "step": 2133 }, { "epoch": 0.42074132492113564, "grad_norm": 0.6125765323790818, "learning_rate": 1.9467930276044557e-05, "loss": 0.4655, "step": 2134 }, { "epoch": 0.4209384858044164, "grad_norm": 0.5806851297859154, "learning_rate": 1.946743129718685e-05, "loss": 0.441, "step": 2135 }, { "epoch": 0.42113564668769715, "grad_norm": 0.6077365774621822, "learning_rate": 1.946693209086648e-05, "loss": 0.4479, "step": 2136 }, { "epoch": 0.4213328075709779, "grad_norm": 0.6092599482658368, "learning_rate": 1.9466432657095443e-05, "loss": 0.4439, "step": 2137 }, { "epoch": 0.42152996845425866, "grad_norm": 0.5972879209489058, "learning_rate": 1.9465932995885737e-05, "loss": 0.4174, "step": 2138 }, { "epoch": 0.4217271293375394, "grad_norm": 0.6448412899637194, "learning_rate": 1.946543310724937e-05, "loss": 0.4802, "step": 2139 }, { "epoch": 0.4219242902208202, "grad_norm": 0.5919021815947381, "learning_rate": 1.946493299119835e-05, "loss": 0.428, "step": 2140 }, { "epoch": 0.42212145110410093, "grad_norm": 0.5832897545671972, "learning_rate": 1.9464432647744693e-05, "loss": 0.4152, "step": 2141 }, { "epoch": 0.4223186119873817, "grad_norm": 0.7594410044159404, "learning_rate": 1.9463932076900416e-05, "loss": 0.4921, "step": 2142 }, { "epoch": 0.42251577287066244, "grad_norm": 0.5750713155262214, "learning_rate": 1.9463431278677552e-05, "loss": 0.4064, "step": 2143 }, { "epoch": 0.4227129337539432, "grad_norm": 0.6628109681634553, "learning_rate": 1.946293025308813e-05, "loss": 0.4372, "step": 2144 }, { "epoch": 0.42291009463722395, "grad_norm": 0.6309603811899096, "learning_rate": 1.946242900014419e-05, "loss": 0.4862, "step": 2145 }, { "epoch": 0.4231072555205047, "grad_norm": 0.6540676115612354, "learning_rate": 1.9461927519857772e-05, "loss": 0.4509, "step": 2146 }, { "epoch": 0.42330441640378547, "grad_norm": 0.6395341421094025, "learning_rate": 1.9461425812240925e-05, "loss": 0.4526, "step": 2147 }, { "epoch": 0.4235015772870662, "grad_norm": 0.6394713150271438, "learning_rate": 1.9460923877305706e-05, "loss": 0.4616, "step": 2148 }, { "epoch": 0.423698738170347, "grad_norm": 0.6069065769872102, "learning_rate": 1.9460421715064172e-05, "loss": 0.4289, "step": 2149 }, { "epoch": 0.42389589905362773, "grad_norm": 0.6064632740648996, "learning_rate": 1.9459919325528384e-05, "loss": 0.4635, "step": 2150 }, { "epoch": 0.4240930599369085, "grad_norm": 0.616873552338082, "learning_rate": 1.945941670871042e-05, "loss": 0.4648, "step": 2151 }, { "epoch": 0.4242902208201893, "grad_norm": 0.5805094134522223, "learning_rate": 1.945891386462235e-05, "loss": 0.4254, "step": 2152 }, { "epoch": 0.42448738170347006, "grad_norm": 0.6016630618010061, "learning_rate": 1.9458410793276256e-05, "loss": 0.4537, "step": 2153 }, { "epoch": 0.4246845425867508, "grad_norm": 0.6221143855785904, "learning_rate": 1.9457907494684227e-05, "loss": 0.4689, "step": 2154 }, { "epoch": 0.42488170347003157, "grad_norm": 0.786382722019145, "learning_rate": 1.9457403968858358e-05, "loss": 0.4614, "step": 2155 }, { "epoch": 0.4250788643533123, "grad_norm": 0.5847320244340704, "learning_rate": 1.9456900215810737e-05, "loss": 0.4293, "step": 2156 }, { "epoch": 0.4252760252365931, "grad_norm": 0.695033418240993, "learning_rate": 1.9456396235553474e-05, "loss": 0.4215, "step": 2157 }, { "epoch": 0.42547318611987384, "grad_norm": 0.6844045999840678, "learning_rate": 1.9455892028098677e-05, "loss": 0.5193, "step": 2158 }, { "epoch": 0.4256703470031546, "grad_norm": 0.6184226453391258, "learning_rate": 1.945538759345846e-05, "loss": 0.486, "step": 2159 }, { "epoch": 0.42586750788643535, "grad_norm": 0.6324228885355624, "learning_rate": 1.9454882931644942e-05, "loss": 0.4585, "step": 2160 }, { "epoch": 0.4260646687697161, "grad_norm": 0.6165127471239128, "learning_rate": 1.9454378042670245e-05, "loss": 0.46, "step": 2161 }, { "epoch": 0.42626182965299686, "grad_norm": 0.6347931854232857, "learning_rate": 1.9453872926546505e-05, "loss": 0.4598, "step": 2162 }, { "epoch": 0.4264589905362776, "grad_norm": 0.6539072568560251, "learning_rate": 1.9453367583285853e-05, "loss": 0.4291, "step": 2163 }, { "epoch": 0.42665615141955837, "grad_norm": 0.6184889881753938, "learning_rate": 1.945286201290043e-05, "loss": 0.4624, "step": 2164 }, { "epoch": 0.4268533123028391, "grad_norm": 0.619366970782029, "learning_rate": 1.945235621540239e-05, "loss": 0.4838, "step": 2165 }, { "epoch": 0.4270504731861199, "grad_norm": 0.5940273163643314, "learning_rate": 1.9451850190803877e-05, "loss": 0.4219, "step": 2166 }, { "epoch": 0.42724763406940064, "grad_norm": 0.6055311252553117, "learning_rate": 1.9451343939117052e-05, "loss": 0.4413, "step": 2167 }, { "epoch": 0.4274447949526814, "grad_norm": 0.6326779232777279, "learning_rate": 1.9450837460354073e-05, "loss": 0.4506, "step": 2168 }, { "epoch": 0.42764195583596215, "grad_norm": 0.6835756822811451, "learning_rate": 1.9450330754527118e-05, "loss": 0.4916, "step": 2169 }, { "epoch": 0.4278391167192429, "grad_norm": 0.6561532078291141, "learning_rate": 1.9449823821648357e-05, "loss": 0.461, "step": 2170 }, { "epoch": 0.42803627760252366, "grad_norm": 0.7237386411398435, "learning_rate": 1.944931666172997e-05, "loss": 0.4694, "step": 2171 }, { "epoch": 0.4282334384858044, "grad_norm": 0.615056095691249, "learning_rate": 1.9448809274784136e-05, "loss": 0.4374, "step": 2172 }, { "epoch": 0.4284305993690852, "grad_norm": 0.5965296460751116, "learning_rate": 1.944830166082305e-05, "loss": 0.4651, "step": 2173 }, { "epoch": 0.42862776025236593, "grad_norm": 0.6166157125058764, "learning_rate": 1.9447793819858912e-05, "loss": 0.4427, "step": 2174 }, { "epoch": 0.4288249211356467, "grad_norm": 0.5827660473699446, "learning_rate": 1.944728575190392e-05, "loss": 0.4428, "step": 2175 }, { "epoch": 0.42902208201892744, "grad_norm": 0.5867427682120541, "learning_rate": 1.9446777456970276e-05, "loss": 0.4346, "step": 2176 }, { "epoch": 0.4292192429022082, "grad_norm": 0.5549950519191269, "learning_rate": 1.9446268935070197e-05, "loss": 0.4119, "step": 2177 }, { "epoch": 0.42941640378548895, "grad_norm": 0.6125305517287657, "learning_rate": 1.94457601862159e-05, "loss": 0.4829, "step": 2178 }, { "epoch": 0.4296135646687697, "grad_norm": 0.6921490966397458, "learning_rate": 1.944525121041961e-05, "loss": 0.4758, "step": 2179 }, { "epoch": 0.42981072555205047, "grad_norm": 0.585352142286126, "learning_rate": 1.944474200769355e-05, "loss": 0.4497, "step": 2180 }, { "epoch": 0.4300078864353312, "grad_norm": 0.6266546137858079, "learning_rate": 1.944423257804996e-05, "loss": 0.4481, "step": 2181 }, { "epoch": 0.430205047318612, "grad_norm": 0.6516582163069267, "learning_rate": 1.9443722921501074e-05, "loss": 0.4473, "step": 2182 }, { "epoch": 0.43040220820189273, "grad_norm": 0.6099356219217796, "learning_rate": 1.9443213038059145e-05, "loss": 0.4546, "step": 2183 }, { "epoch": 0.4305993690851735, "grad_norm": 0.6526605435968623, "learning_rate": 1.944270292773641e-05, "loss": 0.4225, "step": 2184 }, { "epoch": 0.43079652996845424, "grad_norm": 0.7044967095269495, "learning_rate": 1.944219259054514e-05, "loss": 0.4483, "step": 2185 }, { "epoch": 0.430993690851735, "grad_norm": 0.6635399522882317, "learning_rate": 1.9441682026497587e-05, "loss": 0.4396, "step": 2186 }, { "epoch": 0.43119085173501576, "grad_norm": 0.7462506165933283, "learning_rate": 1.944117123560602e-05, "loss": 0.4184, "step": 2187 }, { "epoch": 0.4313880126182965, "grad_norm": 0.630701248717489, "learning_rate": 1.944066021788271e-05, "loss": 0.4723, "step": 2188 }, { "epoch": 0.43158517350157727, "grad_norm": 0.6815590189822918, "learning_rate": 1.9440148973339937e-05, "loss": 0.461, "step": 2189 }, { "epoch": 0.431782334384858, "grad_norm": 0.5608877290281826, "learning_rate": 1.9439637501989984e-05, "loss": 0.4261, "step": 2190 }, { "epoch": 0.4319794952681388, "grad_norm": 0.5678482544734272, "learning_rate": 1.9439125803845136e-05, "loss": 0.4343, "step": 2191 }, { "epoch": 0.43217665615141954, "grad_norm": 0.6142946467397039, "learning_rate": 1.9438613878917693e-05, "loss": 0.4446, "step": 2192 }, { "epoch": 0.4323738170347003, "grad_norm": 0.5554804879141013, "learning_rate": 1.9438101727219946e-05, "loss": 0.4212, "step": 2193 }, { "epoch": 0.43257097791798105, "grad_norm": 0.7084789316007144, "learning_rate": 1.943758934876421e-05, "loss": 0.4591, "step": 2194 }, { "epoch": 0.4327681388012618, "grad_norm": 0.615339701668569, "learning_rate": 1.9437076743562785e-05, "loss": 0.4361, "step": 2195 }, { "epoch": 0.43296529968454256, "grad_norm": 0.790061368854975, "learning_rate": 1.943656391162799e-05, "loss": 0.4531, "step": 2196 }, { "epoch": 0.43316246056782337, "grad_norm": 0.6198525493844337, "learning_rate": 1.943605085297215e-05, "loss": 0.4438, "step": 2197 }, { "epoch": 0.4333596214511041, "grad_norm": 0.6215188641437128, "learning_rate": 1.943553756760759e-05, "loss": 0.4106, "step": 2198 }, { "epoch": 0.4335567823343849, "grad_norm": 0.6051622234791362, "learning_rate": 1.9435024055546644e-05, "loss": 0.4059, "step": 2199 }, { "epoch": 0.43375394321766564, "grad_norm": 0.6070081944728062, "learning_rate": 1.9434510316801644e-05, "loss": 0.4448, "step": 2200 }, { "epoch": 0.4339511041009464, "grad_norm": 0.6052231890529391, "learning_rate": 1.9433996351384936e-05, "loss": 0.4059, "step": 2201 }, { "epoch": 0.43414826498422715, "grad_norm": 0.654268310381098, "learning_rate": 1.943348215930887e-05, "loss": 0.4578, "step": 2202 }, { "epoch": 0.4343454258675079, "grad_norm": 0.5896345157357851, "learning_rate": 1.9432967740585797e-05, "loss": 0.4017, "step": 2203 }, { "epoch": 0.43454258675078866, "grad_norm": 0.7026359807455905, "learning_rate": 1.9432453095228078e-05, "loss": 0.4789, "step": 2204 }, { "epoch": 0.4347397476340694, "grad_norm": 0.9860127509953628, "learning_rate": 1.9431938223248076e-05, "loss": 0.4495, "step": 2205 }, { "epoch": 0.4349369085173502, "grad_norm": 0.5609344606036918, "learning_rate": 1.9431423124658165e-05, "loss": 0.4384, "step": 2206 }, { "epoch": 0.43513406940063093, "grad_norm": 0.6369183512758136, "learning_rate": 1.943090779947072e-05, "loss": 0.4943, "step": 2207 }, { "epoch": 0.4353312302839117, "grad_norm": 0.5765316628970028, "learning_rate": 1.9430392247698117e-05, "loss": 0.4622, "step": 2208 }, { "epoch": 0.43552839116719244, "grad_norm": 0.5960810357622368, "learning_rate": 1.9429876469352746e-05, "loss": 0.4399, "step": 2209 }, { "epoch": 0.4357255520504732, "grad_norm": 0.6096812488671478, "learning_rate": 1.9429360464447e-05, "loss": 0.4627, "step": 2210 }, { "epoch": 0.43592271293375395, "grad_norm": 0.5776557605282492, "learning_rate": 1.9428844232993275e-05, "loss": 0.411, "step": 2211 }, { "epoch": 0.4361198738170347, "grad_norm": 0.6052522052120832, "learning_rate": 1.9428327775003978e-05, "loss": 0.4448, "step": 2212 }, { "epoch": 0.43631703470031546, "grad_norm": 0.5994361538832661, "learning_rate": 1.942781109049151e-05, "loss": 0.4331, "step": 2213 }, { "epoch": 0.4365141955835962, "grad_norm": 0.6941586070053137, "learning_rate": 1.9427294179468287e-05, "loss": 0.457, "step": 2214 }, { "epoch": 0.436711356466877, "grad_norm": 0.6424583665064718, "learning_rate": 1.942677704194673e-05, "loss": 0.4651, "step": 2215 }, { "epoch": 0.43690851735015773, "grad_norm": 0.6074306424288837, "learning_rate": 1.9426259677939264e-05, "loss": 0.4518, "step": 2216 }, { "epoch": 0.4371056782334385, "grad_norm": 0.6404681725403938, "learning_rate": 1.9425742087458318e-05, "loss": 0.4514, "step": 2217 }, { "epoch": 0.43730283911671924, "grad_norm": 0.656741496330048, "learning_rate": 1.942522427051633e-05, "loss": 0.4355, "step": 2218 }, { "epoch": 0.4375, "grad_norm": 0.6260232241706599, "learning_rate": 1.942470622712574e-05, "loss": 0.4314, "step": 2219 }, { "epoch": 0.43769716088328076, "grad_norm": 0.6238147702128097, "learning_rate": 1.942418795729899e-05, "loss": 0.3956, "step": 2220 }, { "epoch": 0.4378943217665615, "grad_norm": 0.6274596830509961, "learning_rate": 1.9423669461048534e-05, "loss": 0.4483, "step": 2221 }, { "epoch": 0.43809148264984227, "grad_norm": 0.6495347166026656, "learning_rate": 1.942315073838683e-05, "loss": 0.4945, "step": 2222 }, { "epoch": 0.438288643533123, "grad_norm": 0.6005997995972219, "learning_rate": 1.942263178932634e-05, "loss": 0.4541, "step": 2223 }, { "epoch": 0.4384858044164038, "grad_norm": 0.5741164876920474, "learning_rate": 1.942211261387954e-05, "loss": 0.4174, "step": 2224 }, { "epoch": 0.43868296529968454, "grad_norm": 0.6438713859666909, "learning_rate": 1.9421593212058894e-05, "loss": 0.4801, "step": 2225 }, { "epoch": 0.4388801261829653, "grad_norm": 0.6248222261459098, "learning_rate": 1.9421073583876882e-05, "loss": 0.4509, "step": 2226 }, { "epoch": 0.43907728706624605, "grad_norm": 0.6012676799031479, "learning_rate": 1.9420553729345993e-05, "loss": 0.4304, "step": 2227 }, { "epoch": 0.4392744479495268, "grad_norm": 0.6083239790788654, "learning_rate": 1.942003364847871e-05, "loss": 0.4619, "step": 2228 }, { "epoch": 0.43947160883280756, "grad_norm": 0.624739792282654, "learning_rate": 1.9419513341287537e-05, "loss": 0.4327, "step": 2229 }, { "epoch": 0.4396687697160883, "grad_norm": 0.6099088599325223, "learning_rate": 1.9418992807784967e-05, "loss": 0.4361, "step": 2230 }, { "epoch": 0.43986593059936907, "grad_norm": 0.6184689759512897, "learning_rate": 1.9418472047983512e-05, "loss": 0.4254, "step": 2231 }, { "epoch": 0.4400630914826498, "grad_norm": 0.6336984491426801, "learning_rate": 1.941795106189568e-05, "loss": 0.4799, "step": 2232 }, { "epoch": 0.4402602523659306, "grad_norm": 0.6363183171959824, "learning_rate": 1.9417429849533992e-05, "loss": 0.4786, "step": 2233 }, { "epoch": 0.44045741324921134, "grad_norm": 0.634185491973065, "learning_rate": 1.9416908410910965e-05, "loss": 0.4645, "step": 2234 }, { "epoch": 0.4406545741324921, "grad_norm": 0.5890796293233364, "learning_rate": 1.941638674603913e-05, "loss": 0.4377, "step": 2235 }, { "epoch": 0.44085173501577285, "grad_norm": 0.7416948900678109, "learning_rate": 1.9415864854931024e-05, "loss": 0.4953, "step": 2236 }, { "epoch": 0.4410488958990536, "grad_norm": 0.6100278936853679, "learning_rate": 1.941534273759918e-05, "loss": 0.4839, "step": 2237 }, { "epoch": 0.44124605678233436, "grad_norm": 0.679877854057984, "learning_rate": 1.9414820394056143e-05, "loss": 0.4574, "step": 2238 }, { "epoch": 0.4414432176656151, "grad_norm": 0.6337313987110674, "learning_rate": 1.9414297824314466e-05, "loss": 0.4798, "step": 2239 }, { "epoch": 0.4416403785488959, "grad_norm": 0.602381577533317, "learning_rate": 1.9413775028386702e-05, "loss": 0.4421, "step": 2240 }, { "epoch": 0.44183753943217663, "grad_norm": 0.7468962543654029, "learning_rate": 1.9413252006285416e-05, "loss": 0.4569, "step": 2241 }, { "epoch": 0.44203470031545744, "grad_norm": 0.6131581613338967, "learning_rate": 1.9412728758023166e-05, "loss": 0.469, "step": 2242 }, { "epoch": 0.4422318611987382, "grad_norm": 0.6247638637761316, "learning_rate": 1.9412205283612527e-05, "loss": 0.4668, "step": 2243 }, { "epoch": 0.44242902208201895, "grad_norm": 0.6074735973961677, "learning_rate": 1.9411681583066077e-05, "loss": 0.4293, "step": 2244 }, { "epoch": 0.4426261829652997, "grad_norm": 0.5960980998564303, "learning_rate": 1.94111576563964e-05, "loss": 0.3827, "step": 2245 }, { "epoch": 0.44282334384858046, "grad_norm": 0.5711582764920223, "learning_rate": 1.9410633503616077e-05, "loss": 0.4133, "step": 2246 }, { "epoch": 0.4430205047318612, "grad_norm": 0.572834729047988, "learning_rate": 1.9410109124737708e-05, "loss": 0.4487, "step": 2247 }, { "epoch": 0.443217665615142, "grad_norm": 0.5941534079486432, "learning_rate": 1.940958451977389e-05, "loss": 0.439, "step": 2248 }, { "epoch": 0.44341482649842273, "grad_norm": 0.6057683342246314, "learning_rate": 1.9409059688737226e-05, "loss": 0.466, "step": 2249 }, { "epoch": 0.4436119873817035, "grad_norm": 0.6128435575114525, "learning_rate": 1.9408534631640328e-05, "loss": 0.4392, "step": 2250 }, { "epoch": 0.44380914826498424, "grad_norm": 0.593633225129246, "learning_rate": 1.9408009348495808e-05, "loss": 0.4084, "step": 2251 }, { "epoch": 0.444006309148265, "grad_norm": 0.656165917461389, "learning_rate": 1.9407483839316284e-05, "loss": 0.4554, "step": 2252 }, { "epoch": 0.44420347003154576, "grad_norm": 0.5874000065946977, "learning_rate": 1.9406958104114387e-05, "loss": 0.4118, "step": 2253 }, { "epoch": 0.4444006309148265, "grad_norm": 0.6161464173752417, "learning_rate": 1.9406432142902748e-05, "loss": 0.4291, "step": 2254 }, { "epoch": 0.44459779179810727, "grad_norm": 0.6113342594946223, "learning_rate": 1.9405905955694e-05, "loss": 0.4582, "step": 2255 }, { "epoch": 0.444794952681388, "grad_norm": 0.6825339340881406, "learning_rate": 1.9405379542500786e-05, "loss": 0.4354, "step": 2256 }, { "epoch": 0.4449921135646688, "grad_norm": 0.6663414389872298, "learning_rate": 1.9404852903335752e-05, "loss": 0.4717, "step": 2257 }, { "epoch": 0.44518927444794953, "grad_norm": 0.5510169867193097, "learning_rate": 1.9404326038211558e-05, "loss": 0.397, "step": 2258 }, { "epoch": 0.4453864353312303, "grad_norm": 0.6117720610593416, "learning_rate": 1.9403798947140857e-05, "loss": 0.4518, "step": 2259 }, { "epoch": 0.44558359621451105, "grad_norm": 0.6131687458644133, "learning_rate": 1.9403271630136312e-05, "loss": 0.4612, "step": 2260 }, { "epoch": 0.4457807570977918, "grad_norm": 0.5958608772380412, "learning_rate": 1.9402744087210594e-05, "loss": 0.4574, "step": 2261 }, { "epoch": 0.44597791798107256, "grad_norm": 0.6397300220383113, "learning_rate": 1.9402216318376377e-05, "loss": 0.4359, "step": 2262 }, { "epoch": 0.4461750788643533, "grad_norm": 0.6132315582223438, "learning_rate": 1.940168832364634e-05, "loss": 0.4525, "step": 2263 }, { "epoch": 0.44637223974763407, "grad_norm": 0.6126178978812497, "learning_rate": 1.9401160103033173e-05, "loss": 0.449, "step": 2264 }, { "epoch": 0.4465694006309148, "grad_norm": 0.6024472555100853, "learning_rate": 1.9400631656549566e-05, "loss": 0.4552, "step": 2265 }, { "epoch": 0.4467665615141956, "grad_norm": 0.6325409183391781, "learning_rate": 1.9400102984208208e-05, "loss": 0.4723, "step": 2266 }, { "epoch": 0.44696372239747634, "grad_norm": 0.5774023003574291, "learning_rate": 1.939957408602181e-05, "loss": 0.43, "step": 2267 }, { "epoch": 0.4471608832807571, "grad_norm": 0.5773695869045778, "learning_rate": 1.939904496200307e-05, "loss": 0.4507, "step": 2268 }, { "epoch": 0.44735804416403785, "grad_norm": 0.5873923629131955, "learning_rate": 1.939851561216471e-05, "loss": 0.426, "step": 2269 }, { "epoch": 0.4475552050473186, "grad_norm": 0.6124825928799854, "learning_rate": 1.939798603651944e-05, "loss": 0.4621, "step": 2270 }, { "epoch": 0.44775236593059936, "grad_norm": 0.6004472428898958, "learning_rate": 1.939745623507999e-05, "loss": 0.4213, "step": 2271 }, { "epoch": 0.4479495268138801, "grad_norm": 0.584595466954028, "learning_rate": 1.9396926207859085e-05, "loss": 0.4066, "step": 2272 }, { "epoch": 0.4481466876971609, "grad_norm": 0.6100782446987316, "learning_rate": 1.9396395954869463e-05, "loss": 0.4527, "step": 2273 }, { "epoch": 0.44834384858044163, "grad_norm": 0.5988478613742948, "learning_rate": 1.939586547612386e-05, "loss": 0.4301, "step": 2274 }, { "epoch": 0.4485410094637224, "grad_norm": 0.5892383164078985, "learning_rate": 1.939533477163502e-05, "loss": 0.4637, "step": 2275 }, { "epoch": 0.44873817034700314, "grad_norm": 0.6188258893089192, "learning_rate": 1.93948038414157e-05, "loss": 0.4286, "step": 2276 }, { "epoch": 0.4489353312302839, "grad_norm": 0.5859833886607909, "learning_rate": 1.9394272685478646e-05, "loss": 0.4393, "step": 2277 }, { "epoch": 0.44913249211356465, "grad_norm": 0.6243646580192782, "learning_rate": 1.9393741303836633e-05, "loss": 0.4472, "step": 2278 }, { "epoch": 0.4493296529968454, "grad_norm": 0.597880669331921, "learning_rate": 1.9393209696502416e-05, "loss": 0.4429, "step": 2279 }, { "epoch": 0.44952681388012616, "grad_norm": 0.6092038363303693, "learning_rate": 1.9392677863488773e-05, "loss": 0.4427, "step": 2280 }, { "epoch": 0.4497239747634069, "grad_norm": 0.5945045969064071, "learning_rate": 1.9392145804808484e-05, "loss": 0.4521, "step": 2281 }, { "epoch": 0.4499211356466877, "grad_norm": 0.5341262843103938, "learning_rate": 1.939161352047432e-05, "loss": 0.4044, "step": 2282 }, { "epoch": 0.45011829652996843, "grad_norm": 0.6003095082379514, "learning_rate": 1.9391081010499085e-05, "loss": 0.4407, "step": 2283 }, { "epoch": 0.4503154574132492, "grad_norm": 0.6015907908122791, "learning_rate": 1.9390548274895563e-05, "loss": 0.4399, "step": 2284 }, { "epoch": 0.45051261829652994, "grad_norm": 0.5889743466466599, "learning_rate": 1.9390015313676558e-05, "loss": 0.4748, "step": 2285 }, { "epoch": 0.4507097791798107, "grad_norm": 0.589434443034765, "learning_rate": 1.938948212685487e-05, "loss": 0.4502, "step": 2286 }, { "epoch": 0.4509069400630915, "grad_norm": 0.5794383012877807, "learning_rate": 1.9388948714443317e-05, "loss": 0.4249, "step": 2287 }, { "epoch": 0.45110410094637227, "grad_norm": 0.5975560713887517, "learning_rate": 1.938841507645471e-05, "loss": 0.463, "step": 2288 }, { "epoch": 0.451301261829653, "grad_norm": 0.5692154145280981, "learning_rate": 1.938788121290187e-05, "loss": 0.4282, "step": 2289 }, { "epoch": 0.4514984227129338, "grad_norm": 0.602494745609716, "learning_rate": 1.938734712379762e-05, "loss": 0.4488, "step": 2290 }, { "epoch": 0.45169558359621453, "grad_norm": 0.587913158355413, "learning_rate": 1.93868128091548e-05, "loss": 0.4679, "step": 2291 }, { "epoch": 0.4518927444794953, "grad_norm": 0.5718028537813307, "learning_rate": 1.9386278268986243e-05, "loss": 0.4573, "step": 2292 }, { "epoch": 0.45208990536277605, "grad_norm": 0.6226913772837778, "learning_rate": 1.938574350330479e-05, "loss": 0.4422, "step": 2293 }, { "epoch": 0.4522870662460568, "grad_norm": 0.6822157964341871, "learning_rate": 1.9385208512123293e-05, "loss": 0.4359, "step": 2294 }, { "epoch": 0.45248422712933756, "grad_norm": 0.631137044281597, "learning_rate": 1.9384673295454603e-05, "loss": 0.446, "step": 2295 }, { "epoch": 0.4526813880126183, "grad_norm": 0.6264948809573254, "learning_rate": 1.9384137853311576e-05, "loss": 0.4355, "step": 2296 }, { "epoch": 0.45287854889589907, "grad_norm": 0.566650305099432, "learning_rate": 1.9383602185707082e-05, "loss": 0.4404, "step": 2297 }, { "epoch": 0.4530757097791798, "grad_norm": 0.6365696893318133, "learning_rate": 1.938306629265399e-05, "loss": 0.4407, "step": 2298 }, { "epoch": 0.4532728706624606, "grad_norm": 0.6109514828404591, "learning_rate": 1.9382530174165175e-05, "loss": 0.4282, "step": 2299 }, { "epoch": 0.45347003154574134, "grad_norm": 0.5636586001477206, "learning_rate": 1.9381993830253515e-05, "loss": 0.4267, "step": 2300 }, { "epoch": 0.4536671924290221, "grad_norm": 0.705135133601523, "learning_rate": 1.93814572609319e-05, "loss": 0.4815, "step": 2301 }, { "epoch": 0.45386435331230285, "grad_norm": 0.6238025886975589, "learning_rate": 1.9380920466213217e-05, "loss": 0.4439, "step": 2302 }, { "epoch": 0.4540615141955836, "grad_norm": 0.7081859082809666, "learning_rate": 1.9380383446110368e-05, "loss": 0.4483, "step": 2303 }, { "epoch": 0.45425867507886436, "grad_norm": 0.6531999149924236, "learning_rate": 1.937984620063625e-05, "loss": 0.4331, "step": 2304 }, { "epoch": 0.4544558359621451, "grad_norm": 0.578550340908979, "learning_rate": 1.9379308729803775e-05, "loss": 0.3904, "step": 2305 }, { "epoch": 0.45465299684542587, "grad_norm": 0.638465715516517, "learning_rate": 1.9378771033625855e-05, "loss": 0.4311, "step": 2306 }, { "epoch": 0.45485015772870663, "grad_norm": 1.0881560912923343, "learning_rate": 1.9378233112115406e-05, "loss": 0.4634, "step": 2307 }, { "epoch": 0.4550473186119874, "grad_norm": 0.6713906883533458, "learning_rate": 1.9377694965285356e-05, "loss": 0.4347, "step": 2308 }, { "epoch": 0.45524447949526814, "grad_norm": 0.6120327107112462, "learning_rate": 1.9377156593148632e-05, "loss": 0.4135, "step": 2309 }, { "epoch": 0.4554416403785489, "grad_norm": 0.6652885718648691, "learning_rate": 1.937661799571817e-05, "loss": 0.4182, "step": 2310 }, { "epoch": 0.45563880126182965, "grad_norm": 0.8090629623799119, "learning_rate": 1.937607917300691e-05, "loss": 0.4534, "step": 2311 }, { "epoch": 0.4558359621451104, "grad_norm": 1.1834526081440842, "learning_rate": 1.9375540125027796e-05, "loss": 0.4866, "step": 2312 }, { "epoch": 0.45603312302839116, "grad_norm": 0.6151631247816741, "learning_rate": 1.937500085179378e-05, "loss": 0.4517, "step": 2313 }, { "epoch": 0.4562302839116719, "grad_norm": 1.0542335058904364, "learning_rate": 1.937446135331782e-05, "loss": 0.4423, "step": 2314 }, { "epoch": 0.4564274447949527, "grad_norm": 0.6255363893355489, "learning_rate": 1.9373921629612876e-05, "loss": 0.46, "step": 2315 }, { "epoch": 0.45662460567823343, "grad_norm": 0.5392418471066178, "learning_rate": 1.937338168069192e-05, "loss": 0.3992, "step": 2316 }, { "epoch": 0.4568217665615142, "grad_norm": 0.7140904386654281, "learning_rate": 1.9372841506567916e-05, "loss": 0.4168, "step": 2317 }, { "epoch": 0.45701892744479494, "grad_norm": 0.8124956257464673, "learning_rate": 1.937230110725385e-05, "loss": 0.4444, "step": 2318 }, { "epoch": 0.4572160883280757, "grad_norm": 0.5825397168265147, "learning_rate": 1.93717604827627e-05, "loss": 0.4125, "step": 2319 }, { "epoch": 0.45741324921135645, "grad_norm": 0.5926863567356023, "learning_rate": 1.937121963310746e-05, "loss": 0.4414, "step": 2320 }, { "epoch": 0.4576104100946372, "grad_norm": 0.649677750333782, "learning_rate": 1.9370678558301117e-05, "loss": 0.4315, "step": 2321 }, { "epoch": 0.45780757097791797, "grad_norm": 1.1050583010405093, "learning_rate": 1.937013725835668e-05, "loss": 0.4948, "step": 2322 }, { "epoch": 0.4580047318611987, "grad_norm": 0.6313289986933639, "learning_rate": 1.9369595733287147e-05, "loss": 0.4265, "step": 2323 }, { "epoch": 0.4582018927444795, "grad_norm": 0.6683692439342528, "learning_rate": 1.9369053983105533e-05, "loss": 0.4648, "step": 2324 }, { "epoch": 0.45839905362776023, "grad_norm": 0.7637595774268595, "learning_rate": 1.9368512007824852e-05, "loss": 0.4634, "step": 2325 }, { "epoch": 0.458596214511041, "grad_norm": 0.634965367240232, "learning_rate": 1.9367969807458125e-05, "loss": 0.4925, "step": 2326 }, { "epoch": 0.45879337539432175, "grad_norm": 0.6164240403541263, "learning_rate": 1.936742738201838e-05, "loss": 0.4655, "step": 2327 }, { "epoch": 0.4589905362776025, "grad_norm": 0.6481028520735399, "learning_rate": 1.9366884731518648e-05, "loss": 0.4842, "step": 2328 }, { "epoch": 0.45918769716088326, "grad_norm": 0.6211874377041051, "learning_rate": 1.9366341855971967e-05, "loss": 0.468, "step": 2329 }, { "epoch": 0.459384858044164, "grad_norm": 0.7567018049090279, "learning_rate": 1.936579875539138e-05, "loss": 0.4688, "step": 2330 }, { "epoch": 0.45958201892744477, "grad_norm": 0.6196167687850394, "learning_rate": 1.9365255429789934e-05, "loss": 0.4041, "step": 2331 }, { "epoch": 0.4597791798107255, "grad_norm": 0.5842148695471889, "learning_rate": 1.9364711879180688e-05, "loss": 0.4289, "step": 2332 }, { "epoch": 0.45997634069400634, "grad_norm": 0.6604436365980313, "learning_rate": 1.9364168103576696e-05, "loss": 0.4489, "step": 2333 }, { "epoch": 0.4601735015772871, "grad_norm": 1.3308649937601518, "learning_rate": 1.9363624102991022e-05, "loss": 0.451, "step": 2334 }, { "epoch": 0.46037066246056785, "grad_norm": 0.6093584018980908, "learning_rate": 1.9363079877436744e-05, "loss": 0.4565, "step": 2335 }, { "epoch": 0.4605678233438486, "grad_norm": 0.7358912383290898, "learning_rate": 1.936253542692693e-05, "loss": 0.4456, "step": 2336 }, { "epoch": 0.46076498422712936, "grad_norm": 0.5582505407627746, "learning_rate": 1.936199075147466e-05, "loss": 0.3969, "step": 2337 }, { "epoch": 0.4609621451104101, "grad_norm": 0.6033721253583829, "learning_rate": 1.936144585109302e-05, "loss": 0.4284, "step": 2338 }, { "epoch": 0.46115930599369087, "grad_norm": 0.6175274199097704, "learning_rate": 1.9360900725795112e-05, "loss": 0.4613, "step": 2339 }, { "epoch": 0.4613564668769716, "grad_norm": 0.5765237098211964, "learning_rate": 1.9360355375594025e-05, "loss": 0.4307, "step": 2340 }, { "epoch": 0.4615536277602524, "grad_norm": 0.7141205911743096, "learning_rate": 1.9359809800502858e-05, "loss": 0.4762, "step": 2341 }, { "epoch": 0.46175078864353314, "grad_norm": 0.8234992518594635, "learning_rate": 1.9359264000534726e-05, "loss": 0.4912, "step": 2342 }, { "epoch": 0.4619479495268139, "grad_norm": 0.5994202989618423, "learning_rate": 1.9358717975702735e-05, "loss": 0.4382, "step": 2343 }, { "epoch": 0.46214511041009465, "grad_norm": 0.6761671021504989, "learning_rate": 1.9358171726020014e-05, "loss": 0.447, "step": 2344 }, { "epoch": 0.4623422712933754, "grad_norm": 0.6365526106317112, "learning_rate": 1.9357625251499682e-05, "loss": 0.4319, "step": 2345 }, { "epoch": 0.46253943217665616, "grad_norm": 0.7520248964390419, "learning_rate": 1.9357078552154864e-05, "loss": 0.4967, "step": 2346 }, { "epoch": 0.4627365930599369, "grad_norm": 0.6558895684983119, "learning_rate": 1.9356531627998696e-05, "loss": 0.4489, "step": 2347 }, { "epoch": 0.4629337539432177, "grad_norm": 0.6828662741868297, "learning_rate": 1.9355984479044324e-05, "loss": 0.4477, "step": 2348 }, { "epoch": 0.46313091482649843, "grad_norm": 0.6163218102500873, "learning_rate": 1.9355437105304893e-05, "loss": 0.4666, "step": 2349 }, { "epoch": 0.4633280757097792, "grad_norm": 0.629723944986669, "learning_rate": 1.9354889506793548e-05, "loss": 0.4734, "step": 2350 }, { "epoch": 0.46352523659305994, "grad_norm": 0.5904354659114018, "learning_rate": 1.935434168352345e-05, "loss": 0.4522, "step": 2351 }, { "epoch": 0.4637223974763407, "grad_norm": 0.6172193362795846, "learning_rate": 1.935379363550776e-05, "loss": 0.4478, "step": 2352 }, { "epoch": 0.46391955835962145, "grad_norm": 0.6097296169978487, "learning_rate": 1.9353245362759647e-05, "loss": 0.4407, "step": 2353 }, { "epoch": 0.4641167192429022, "grad_norm": 0.6073643122176055, "learning_rate": 1.9352696865292278e-05, "loss": 0.4116, "step": 2354 }, { "epoch": 0.46431388012618297, "grad_norm": 0.6723779390449313, "learning_rate": 1.935214814311884e-05, "loss": 0.4183, "step": 2355 }, { "epoch": 0.4645110410094637, "grad_norm": 0.5416360235400502, "learning_rate": 1.935159919625251e-05, "loss": 0.4128, "step": 2356 }, { "epoch": 0.4647082018927445, "grad_norm": 0.6434910428180349, "learning_rate": 1.9351050024706476e-05, "loss": 0.3999, "step": 2357 }, { "epoch": 0.46490536277602523, "grad_norm": 0.5545962685011621, "learning_rate": 1.9350500628493938e-05, "loss": 0.4059, "step": 2358 }, { "epoch": 0.465102523659306, "grad_norm": 0.6598887221595904, "learning_rate": 1.9349951007628093e-05, "loss": 0.449, "step": 2359 }, { "epoch": 0.46529968454258674, "grad_norm": 0.6254882672594428, "learning_rate": 1.934940116212214e-05, "loss": 0.4422, "step": 2360 }, { "epoch": 0.4654968454258675, "grad_norm": 0.6643297973480146, "learning_rate": 1.93488510919893e-05, "loss": 0.4693, "step": 2361 }, { "epoch": 0.46569400630914826, "grad_norm": 0.6671960843874735, "learning_rate": 1.9348300797242784e-05, "loss": 0.4674, "step": 2362 }, { "epoch": 0.465891167192429, "grad_norm": 0.6583457703373533, "learning_rate": 1.934775027789581e-05, "loss": 0.4454, "step": 2363 }, { "epoch": 0.46608832807570977, "grad_norm": 0.6019895419204817, "learning_rate": 1.934719953396161e-05, "loss": 0.4585, "step": 2364 }, { "epoch": 0.4662854889589905, "grad_norm": 0.5539573629879283, "learning_rate": 1.9346648565453412e-05, "loss": 0.3792, "step": 2365 }, { "epoch": 0.4664826498422713, "grad_norm": 0.5851973307214345, "learning_rate": 1.934609737238446e-05, "loss": 0.4116, "step": 2366 }, { "epoch": 0.46667981072555204, "grad_norm": 0.5995849117001575, "learning_rate": 1.9345545954767985e-05, "loss": 0.4468, "step": 2367 }, { "epoch": 0.4668769716088328, "grad_norm": 0.713328117155134, "learning_rate": 1.934499431261725e-05, "loss": 0.4863, "step": 2368 }, { "epoch": 0.46707413249211355, "grad_norm": 0.6211541889706627, "learning_rate": 1.93444424459455e-05, "loss": 0.4355, "step": 2369 }, { "epoch": 0.4672712933753943, "grad_norm": 0.5811288166838467, "learning_rate": 1.934389035476599e-05, "loss": 0.4487, "step": 2370 }, { "epoch": 0.46746845425867506, "grad_norm": 0.7101811484585888, "learning_rate": 1.9343338039091992e-05, "loss": 0.4968, "step": 2371 }, { "epoch": 0.4676656151419558, "grad_norm": 0.6800579406115673, "learning_rate": 1.9342785498936775e-05, "loss": 0.4606, "step": 2372 }, { "epoch": 0.46786277602523657, "grad_norm": 0.6222849178571329, "learning_rate": 1.934223273431361e-05, "loss": 0.4408, "step": 2373 }, { "epoch": 0.4680599369085173, "grad_norm": 0.6463617017560804, "learning_rate": 1.9341679745235783e-05, "loss": 0.3811, "step": 2374 }, { "epoch": 0.4682570977917981, "grad_norm": 0.6104669769368634, "learning_rate": 1.9341126531716575e-05, "loss": 0.4963, "step": 2375 }, { "epoch": 0.46845425867507884, "grad_norm": 0.6755247672185276, "learning_rate": 1.934057309376928e-05, "loss": 0.4448, "step": 2376 }, { "epoch": 0.4686514195583596, "grad_norm": 0.5842414289182047, "learning_rate": 1.93400194314072e-05, "loss": 0.4339, "step": 2377 }, { "epoch": 0.4688485804416404, "grad_norm": 0.6069017142653811, "learning_rate": 1.9339465544643623e-05, "loss": 0.4667, "step": 2378 }, { "epoch": 0.46904574132492116, "grad_norm": 0.7521849096494124, "learning_rate": 1.9338911433491868e-05, "loss": 0.4821, "step": 2379 }, { "epoch": 0.4692429022082019, "grad_norm": 0.6450195680499727, "learning_rate": 1.933835709796525e-05, "loss": 0.451, "step": 2380 }, { "epoch": 0.4694400630914827, "grad_norm": 0.6039688998164772, "learning_rate": 1.933780253807708e-05, "loss": 0.4661, "step": 2381 }, { "epoch": 0.46963722397476343, "grad_norm": 0.5834276251343435, "learning_rate": 1.933724775384068e-05, "loss": 0.4327, "step": 2382 }, { "epoch": 0.4698343848580442, "grad_norm": 0.6142679272929852, "learning_rate": 1.9336692745269388e-05, "loss": 0.4644, "step": 2383 }, { "epoch": 0.47003154574132494, "grad_norm": 0.6124895653388481, "learning_rate": 1.9336137512376532e-05, "loss": 0.4099, "step": 2384 }, { "epoch": 0.4702287066246057, "grad_norm": 0.6384760441175886, "learning_rate": 1.9335582055175454e-05, "loss": 0.4585, "step": 2385 }, { "epoch": 0.47042586750788645, "grad_norm": 0.6605064367442711, "learning_rate": 1.9335026373679503e-05, "loss": 0.4624, "step": 2386 }, { "epoch": 0.4706230283911672, "grad_norm": 0.9621836476711323, "learning_rate": 1.9334470467902024e-05, "loss": 0.4828, "step": 2387 }, { "epoch": 0.47082018927444796, "grad_norm": 0.6671978180371507, "learning_rate": 1.9333914337856373e-05, "loss": 0.4651, "step": 2388 }, { "epoch": 0.4710173501577287, "grad_norm": 0.6234859549259715, "learning_rate": 1.933335798355591e-05, "loss": 0.4176, "step": 2389 }, { "epoch": 0.4712145110410095, "grad_norm": 0.7746006898875549, "learning_rate": 1.9332801405014013e-05, "loss": 0.4175, "step": 2390 }, { "epoch": 0.47141167192429023, "grad_norm": 0.6524991860281778, "learning_rate": 1.9332244602244042e-05, "loss": 0.4798, "step": 2391 }, { "epoch": 0.471608832807571, "grad_norm": 0.6671756189730206, "learning_rate": 1.9331687575259378e-05, "loss": 0.4623, "step": 2392 }, { "epoch": 0.47180599369085174, "grad_norm": 0.5980869066369106, "learning_rate": 1.933113032407341e-05, "loss": 0.3908, "step": 2393 }, { "epoch": 0.4720031545741325, "grad_norm": 0.6523513564819309, "learning_rate": 1.933057284869952e-05, "loss": 0.4516, "step": 2394 }, { "epoch": 0.47220031545741326, "grad_norm": 0.6761092970782154, "learning_rate": 1.93300151491511e-05, "loss": 0.4672, "step": 2395 }, { "epoch": 0.472397476340694, "grad_norm": 0.6145122564628309, "learning_rate": 1.9329457225441554e-05, "loss": 0.4365, "step": 2396 }, { "epoch": 0.47259463722397477, "grad_norm": 0.9246573499880514, "learning_rate": 1.932889907758429e-05, "loss": 0.4831, "step": 2397 }, { "epoch": 0.4727917981072555, "grad_norm": 3.0455979525914487, "learning_rate": 1.9328340705592708e-05, "loss": 0.4655, "step": 2398 }, { "epoch": 0.4729889589905363, "grad_norm": 3.2802273681352085, "learning_rate": 1.932778210948023e-05, "loss": 0.4443, "step": 2399 }, { "epoch": 0.47318611987381703, "grad_norm": 0.715855291735479, "learning_rate": 1.9327223289260274e-05, "loss": 0.4375, "step": 2400 }, { "epoch": 0.4733832807570978, "grad_norm": 0.7740268211710657, "learning_rate": 1.932666424494627e-05, "loss": 0.4653, "step": 2401 }, { "epoch": 0.47358044164037855, "grad_norm": 0.6204311682591072, "learning_rate": 1.9326104976551643e-05, "loss": 0.4134, "step": 2402 }, { "epoch": 0.4737776025236593, "grad_norm": 0.7091391867960442, "learning_rate": 1.932554548408984e-05, "loss": 0.449, "step": 2403 }, { "epoch": 0.47397476340694006, "grad_norm": 0.7803886372499086, "learning_rate": 1.932498576757429e-05, "loss": 0.4777, "step": 2404 }, { "epoch": 0.4741719242902208, "grad_norm": 0.716866961037204, "learning_rate": 1.9324425827018452e-05, "loss": 0.4242, "step": 2405 }, { "epoch": 0.47436908517350157, "grad_norm": 0.6489682185779229, "learning_rate": 1.932386566243577e-05, "loss": 0.4261, "step": 2406 }, { "epoch": 0.4745662460567823, "grad_norm": 0.6254026998694395, "learning_rate": 1.9323305273839713e-05, "loss": 0.4012, "step": 2407 }, { "epoch": 0.4747634069400631, "grad_norm": 0.664068211708124, "learning_rate": 1.9322744661243732e-05, "loss": 0.4401, "step": 2408 }, { "epoch": 0.47496056782334384, "grad_norm": 0.859745025842503, "learning_rate": 1.9322183824661306e-05, "loss": 0.4634, "step": 2409 }, { "epoch": 0.4751577287066246, "grad_norm": 0.7510084681382883, "learning_rate": 1.932162276410591e-05, "loss": 0.4403, "step": 2410 }, { "epoch": 0.47535488958990535, "grad_norm": 0.62038420709768, "learning_rate": 1.9321061479591017e-05, "loss": 0.4503, "step": 2411 }, { "epoch": 0.4755520504731861, "grad_norm": 0.6669051292027413, "learning_rate": 1.9320499971130114e-05, "loss": 0.4232, "step": 2412 }, { "epoch": 0.47574921135646686, "grad_norm": 0.6402332048996934, "learning_rate": 1.93199382387367e-05, "loss": 0.4647, "step": 2413 }, { "epoch": 0.4759463722397476, "grad_norm": 0.6407606660148513, "learning_rate": 1.9319376282424255e-05, "loss": 0.415, "step": 2414 }, { "epoch": 0.4761435331230284, "grad_norm": 0.6662928458835895, "learning_rate": 1.9318814102206296e-05, "loss": 0.4525, "step": 2415 }, { "epoch": 0.47634069400630913, "grad_norm": 0.66510230765238, "learning_rate": 1.9318251698096322e-05, "loss": 0.4325, "step": 2416 }, { "epoch": 0.4765378548895899, "grad_norm": 0.8178454623617458, "learning_rate": 1.931768907010785e-05, "loss": 0.448, "step": 2417 }, { "epoch": 0.47673501577287064, "grad_norm": 0.6008989268966388, "learning_rate": 1.931712621825439e-05, "loss": 0.4561, "step": 2418 }, { "epoch": 0.4769321766561514, "grad_norm": 0.6058146707232347, "learning_rate": 1.9316563142549475e-05, "loss": 0.4263, "step": 2419 }, { "epoch": 0.47712933753943215, "grad_norm": 0.6620728520314264, "learning_rate": 1.9315999843006624e-05, "loss": 0.4697, "step": 2420 }, { "epoch": 0.4773264984227129, "grad_norm": 0.6180305422847591, "learning_rate": 1.9315436319639375e-05, "loss": 0.4495, "step": 2421 }, { "epoch": 0.47752365930599366, "grad_norm": 0.7048061474007149, "learning_rate": 1.9314872572461265e-05, "loss": 0.4593, "step": 2422 }, { "epoch": 0.4777208201892745, "grad_norm": 1.0945355318730723, "learning_rate": 1.9314308601485842e-05, "loss": 0.4426, "step": 2423 }, { "epoch": 0.47791798107255523, "grad_norm": 0.5490282310707831, "learning_rate": 1.9313744406726656e-05, "loss": 0.4061, "step": 2424 }, { "epoch": 0.478115141955836, "grad_norm": 0.6400659060403348, "learning_rate": 1.931317998819726e-05, "loss": 0.4378, "step": 2425 }, { "epoch": 0.47831230283911674, "grad_norm": 0.6364254959448777, "learning_rate": 1.931261534591121e-05, "loss": 0.4796, "step": 2426 }, { "epoch": 0.4785094637223975, "grad_norm": 0.5901543416183237, "learning_rate": 1.9312050479882082e-05, "loss": 0.4326, "step": 2427 }, { "epoch": 0.47870662460567825, "grad_norm": 0.7359527860806958, "learning_rate": 1.9311485390123442e-05, "loss": 0.5009, "step": 2428 }, { "epoch": 0.478903785488959, "grad_norm": 0.9323925635649081, "learning_rate": 1.931092007664886e-05, "loss": 0.4826, "step": 2429 }, { "epoch": 0.47910094637223977, "grad_norm": 0.5787415894460493, "learning_rate": 1.9310354539471935e-05, "loss": 0.4639, "step": 2430 }, { "epoch": 0.4792981072555205, "grad_norm": 0.6479424198694343, "learning_rate": 1.930978877860624e-05, "loss": 0.4823, "step": 2431 }, { "epoch": 0.4794952681388013, "grad_norm": 0.5894770732012385, "learning_rate": 1.9309222794065373e-05, "loss": 0.4604, "step": 2432 }, { "epoch": 0.47969242902208203, "grad_norm": 0.639884540685977, "learning_rate": 1.930865658586293e-05, "loss": 0.4675, "step": 2433 }, { "epoch": 0.4798895899053628, "grad_norm": 0.6684635586126257, "learning_rate": 1.930809015401252e-05, "loss": 0.4585, "step": 2434 }, { "epoch": 0.48008675078864355, "grad_norm": 0.6336667692984291, "learning_rate": 1.9307523498527744e-05, "loss": 0.471, "step": 2435 }, { "epoch": 0.4802839116719243, "grad_norm": 0.6876919208132971, "learning_rate": 1.930695661942222e-05, "loss": 0.4582, "step": 2436 }, { "epoch": 0.48048107255520506, "grad_norm": 0.5960087219303766, "learning_rate": 1.9306389516709575e-05, "loss": 0.4476, "step": 2437 }, { "epoch": 0.4806782334384858, "grad_norm": 0.6171728207106327, "learning_rate": 1.9305822190403422e-05, "loss": 0.4362, "step": 2438 }, { "epoch": 0.48087539432176657, "grad_norm": 0.6355089188560651, "learning_rate": 1.9305254640517398e-05, "loss": 0.4392, "step": 2439 }, { "epoch": 0.4810725552050473, "grad_norm": 1.1033461138258283, "learning_rate": 1.9304686867065138e-05, "loss": 0.4564, "step": 2440 }, { "epoch": 0.4812697160883281, "grad_norm": 0.6067878466977591, "learning_rate": 1.9304118870060283e-05, "loss": 0.4234, "step": 2441 }, { "epoch": 0.48146687697160884, "grad_norm": 0.6684348483494865, "learning_rate": 1.930355064951648e-05, "loss": 0.4558, "step": 2442 }, { "epoch": 0.4816640378548896, "grad_norm": 0.62445972264211, "learning_rate": 1.930298220544738e-05, "loss": 0.4464, "step": 2443 }, { "epoch": 0.48186119873817035, "grad_norm": 0.6151015537788406, "learning_rate": 1.9302413537866642e-05, "loss": 0.4514, "step": 2444 }, { "epoch": 0.4820583596214511, "grad_norm": 0.6241079129743627, "learning_rate": 1.9301844646787927e-05, "loss": 0.4433, "step": 2445 }, { "epoch": 0.48225552050473186, "grad_norm": 0.7891240381146803, "learning_rate": 1.93012755322249e-05, "loss": 0.4893, "step": 2446 }, { "epoch": 0.4824526813880126, "grad_norm": 0.6404678342701038, "learning_rate": 1.9300706194191244e-05, "loss": 0.4532, "step": 2447 }, { "epoch": 0.48264984227129337, "grad_norm": 0.6493424413450147, "learning_rate": 1.930013663270063e-05, "loss": 0.4498, "step": 2448 }, { "epoch": 0.48284700315457413, "grad_norm": 0.6508616343699691, "learning_rate": 1.929956684776674e-05, "loss": 0.487, "step": 2449 }, { "epoch": 0.4830441640378549, "grad_norm": 0.6434094284010808, "learning_rate": 1.929899683940327e-05, "loss": 0.4437, "step": 2450 }, { "epoch": 0.48324132492113564, "grad_norm": 0.6735775372022499, "learning_rate": 1.9298426607623915e-05, "loss": 0.4301, "step": 2451 }, { "epoch": 0.4834384858044164, "grad_norm": 0.6116434104938371, "learning_rate": 1.929785615244237e-05, "loss": 0.433, "step": 2452 }, { "epoch": 0.48363564668769715, "grad_norm": 0.6272151580269819, "learning_rate": 1.9297285473872343e-05, "loss": 0.4447, "step": 2453 }, { "epoch": 0.4838328075709779, "grad_norm": 0.6438884702800719, "learning_rate": 1.929671457192755e-05, "loss": 0.475, "step": 2454 }, { "epoch": 0.48402996845425866, "grad_norm": 0.7535169667410828, "learning_rate": 1.9296143446621697e-05, "loss": 0.4784, "step": 2455 }, { "epoch": 0.4842271293375394, "grad_norm": 44.84808687603809, "learning_rate": 1.9295572097968514e-05, "loss": 0.7183, "step": 2456 }, { "epoch": 0.4844242902208202, "grad_norm": 0.6854267912996111, "learning_rate": 1.9295000525981725e-05, "loss": 0.4581, "step": 2457 }, { "epoch": 0.48462145110410093, "grad_norm": 0.7188232334712162, "learning_rate": 1.929442873067506e-05, "loss": 0.4354, "step": 2458 }, { "epoch": 0.4848186119873817, "grad_norm": 9.85290231143863, "learning_rate": 1.9293856712062267e-05, "loss": 0.5323, "step": 2459 }, { "epoch": 0.48501577287066244, "grad_norm": 0.6382321261076438, "learning_rate": 1.9293284470157082e-05, "loss": 0.4332, "step": 2460 }, { "epoch": 0.4852129337539432, "grad_norm": 0.8581690728691596, "learning_rate": 1.9292712004973248e-05, "loss": 0.4314, "step": 2461 }, { "epoch": 0.48541009463722395, "grad_norm": 0.6494937725982591, "learning_rate": 1.9292139316524528e-05, "loss": 0.4891, "step": 2462 }, { "epoch": 0.4856072555205047, "grad_norm": 0.6193320454689136, "learning_rate": 1.9291566404824676e-05, "loss": 0.4077, "step": 2463 }, { "epoch": 0.48580441640378547, "grad_norm": 0.6374169775943455, "learning_rate": 1.9290993269887458e-05, "loss": 0.4282, "step": 2464 }, { "epoch": 0.4860015772870662, "grad_norm": 0.6787679196471768, "learning_rate": 1.9290419911726647e-05, "loss": 0.4902, "step": 2465 }, { "epoch": 0.486198738170347, "grad_norm": 0.6598399921444862, "learning_rate": 1.9289846330356018e-05, "loss": 0.4991, "step": 2466 }, { "epoch": 0.48639589905362773, "grad_norm": 0.6206359700753303, "learning_rate": 1.9289272525789348e-05, "loss": 0.4457, "step": 2467 }, { "epoch": 0.4865930599369085, "grad_norm": 0.5930790685776189, "learning_rate": 1.9288698498040423e-05, "loss": 0.4066, "step": 2468 }, { "epoch": 0.4867902208201893, "grad_norm": 0.651999060628564, "learning_rate": 1.928812424712304e-05, "loss": 0.4191, "step": 2469 }, { "epoch": 0.48698738170347006, "grad_norm": 0.615287126559333, "learning_rate": 1.9287549773050988e-05, "loss": 0.4301, "step": 2470 }, { "epoch": 0.4871845425867508, "grad_norm": 0.7055975170195661, "learning_rate": 1.9286975075838077e-05, "loss": 0.4719, "step": 2471 }, { "epoch": 0.48738170347003157, "grad_norm": 0.5856890230925806, "learning_rate": 1.9286400155498107e-05, "loss": 0.3913, "step": 2472 }, { "epoch": 0.4875788643533123, "grad_norm": 0.8762126171240702, "learning_rate": 1.92858250120449e-05, "loss": 0.4059, "step": 2473 }, { "epoch": 0.4877760252365931, "grad_norm": 0.6128580611588724, "learning_rate": 1.9285249645492266e-05, "loss": 0.4451, "step": 2474 }, { "epoch": 0.48797318611987384, "grad_norm": 0.803850920215572, "learning_rate": 1.928467405585403e-05, "loss": 0.4162, "step": 2475 }, { "epoch": 0.4881703470031546, "grad_norm": 0.684760361276952, "learning_rate": 1.9284098243144028e-05, "loss": 0.4731, "step": 2476 }, { "epoch": 0.48836750788643535, "grad_norm": 0.6490585074155004, "learning_rate": 1.9283522207376088e-05, "loss": 0.4238, "step": 2477 }, { "epoch": 0.4885646687697161, "grad_norm": 0.6065001605718633, "learning_rate": 1.9282945948564047e-05, "loss": 0.435, "step": 2478 }, { "epoch": 0.48876182965299686, "grad_norm": 0.6557206451679547, "learning_rate": 1.9282369466721756e-05, "loss": 0.4611, "step": 2479 }, { "epoch": 0.4889589905362776, "grad_norm": 0.5885897744446494, "learning_rate": 1.9281792761863067e-05, "loss": 0.475, "step": 2480 }, { "epoch": 0.48915615141955837, "grad_norm": 0.6645075192181713, "learning_rate": 1.928121583400183e-05, "loss": 0.4432, "step": 2481 }, { "epoch": 0.4893533123028391, "grad_norm": 0.854625778831215, "learning_rate": 1.9280638683151903e-05, "loss": 0.4287, "step": 2482 }, { "epoch": 0.4895504731861199, "grad_norm": 10.207500341633537, "learning_rate": 1.9280061309327164e-05, "loss": 0.4656, "step": 2483 }, { "epoch": 0.48974763406940064, "grad_norm": 0.8188261610562382, "learning_rate": 1.9279483712541477e-05, "loss": 0.4747, "step": 2484 }, { "epoch": 0.4899447949526814, "grad_norm": 0.6708338710083666, "learning_rate": 1.9278905892808725e-05, "loss": 0.466, "step": 2485 }, { "epoch": 0.49014195583596215, "grad_norm": 0.827156096180289, "learning_rate": 1.9278327850142783e-05, "loss": 0.4859, "step": 2486 }, { "epoch": 0.4903391167192429, "grad_norm": 0.7039638642215147, "learning_rate": 1.9277749584557543e-05, "loss": 0.457, "step": 2487 }, { "epoch": 0.49053627760252366, "grad_norm": 0.6238491942723505, "learning_rate": 1.9277171096066895e-05, "loss": 0.4593, "step": 2488 }, { "epoch": 0.4907334384858044, "grad_norm": 0.6466067939563221, "learning_rate": 1.9276592384684745e-05, "loss": 0.4694, "step": 2489 }, { "epoch": 0.4909305993690852, "grad_norm": 0.6199822833188464, "learning_rate": 1.9276013450424995e-05, "loss": 0.4572, "step": 2490 }, { "epoch": 0.49112776025236593, "grad_norm": 1.053077823726099, "learning_rate": 1.9275434293301544e-05, "loss": 0.4559, "step": 2491 }, { "epoch": 0.4913249211356467, "grad_norm": 0.6349781493349106, "learning_rate": 1.9274854913328317e-05, "loss": 0.5007, "step": 2492 }, { "epoch": 0.49152208201892744, "grad_norm": 0.616100989000789, "learning_rate": 1.9274275310519234e-05, "loss": 0.417, "step": 2493 }, { "epoch": 0.4917192429022082, "grad_norm": 0.6027522466238368, "learning_rate": 1.9273695484888216e-05, "loss": 0.454, "step": 2494 }, { "epoch": 0.49191640378548895, "grad_norm": 0.6388698511914507, "learning_rate": 1.9273115436449198e-05, "loss": 0.4398, "step": 2495 }, { "epoch": 0.4921135646687697, "grad_norm": 1.088727821676831, "learning_rate": 1.9272535165216112e-05, "loss": 0.4679, "step": 2496 }, { "epoch": 0.49231072555205047, "grad_norm": 0.6152883070859394, "learning_rate": 1.9271954671202902e-05, "loss": 0.4152, "step": 2497 }, { "epoch": 0.4925078864353312, "grad_norm": 0.5214691152255505, "learning_rate": 1.9271373954423517e-05, "loss": 0.4056, "step": 2498 }, { "epoch": 0.492705047318612, "grad_norm": 0.641068964137141, "learning_rate": 1.9270793014891906e-05, "loss": 0.4716, "step": 2499 }, { "epoch": 0.49290220820189273, "grad_norm": 0.5667713763054852, "learning_rate": 1.9270211852622024e-05, "loss": 0.4235, "step": 2500 }, { "epoch": 0.4930993690851735, "grad_norm": 0.7075480774876152, "learning_rate": 1.926963046762784e-05, "loss": 0.4775, "step": 2501 }, { "epoch": 0.49329652996845424, "grad_norm": 0.5892770191488615, "learning_rate": 1.9269048859923318e-05, "loss": 0.4885, "step": 2502 }, { "epoch": 0.493493690851735, "grad_norm": 0.6792628358438927, "learning_rate": 1.9268467029522432e-05, "loss": 0.4683, "step": 2503 }, { "epoch": 0.49369085173501576, "grad_norm": 0.637559207810441, "learning_rate": 1.9267884976439163e-05, "loss": 0.4298, "step": 2504 }, { "epoch": 0.4938880126182965, "grad_norm": 0.5698260135323511, "learning_rate": 1.9267302700687494e-05, "loss": 0.4026, "step": 2505 }, { "epoch": 0.49408517350157727, "grad_norm": 0.5949689949186419, "learning_rate": 1.9266720202281413e-05, "loss": 0.4076, "step": 2506 }, { "epoch": 0.494282334384858, "grad_norm": 0.6042856982676498, "learning_rate": 1.9266137481234918e-05, "loss": 0.4727, "step": 2507 }, { "epoch": 0.4944794952681388, "grad_norm": 0.5738882939815504, "learning_rate": 1.9265554537562008e-05, "loss": 0.4585, "step": 2508 }, { "epoch": 0.49467665615141954, "grad_norm": 1.6832821361365304, "learning_rate": 1.926497137127669e-05, "loss": 0.435, "step": 2509 }, { "epoch": 0.4948738170347003, "grad_norm": 0.7344488196925597, "learning_rate": 1.9264387982392972e-05, "loss": 0.4663, "step": 2510 }, { "epoch": 0.49507097791798105, "grad_norm": 0.8273300571453689, "learning_rate": 1.926380437092487e-05, "loss": 0.4682, "step": 2511 }, { "epoch": 0.4952681388012618, "grad_norm": 0.6195232818493738, "learning_rate": 1.9263220536886413e-05, "loss": 0.4296, "step": 2512 }, { "epoch": 0.49546529968454256, "grad_norm": 0.6274418197133689, "learning_rate": 1.9262636480291618e-05, "loss": 0.4445, "step": 2513 }, { "epoch": 0.49566246056782337, "grad_norm": 0.6220175805229026, "learning_rate": 1.9262052201154525e-05, "loss": 0.4491, "step": 2514 }, { "epoch": 0.4958596214511041, "grad_norm": 1.2792193280662592, "learning_rate": 1.926146769948917e-05, "loss": 0.441, "step": 2515 }, { "epoch": 0.4960567823343849, "grad_norm": 0.6309907361680024, "learning_rate": 1.926088297530959e-05, "loss": 0.4213, "step": 2516 }, { "epoch": 0.49625394321766564, "grad_norm": 0.6327839276931332, "learning_rate": 1.9260298028629846e-05, "loss": 0.4576, "step": 2517 }, { "epoch": 0.4964511041009464, "grad_norm": 0.6462083871492283, "learning_rate": 1.925971285946398e-05, "loss": 0.4825, "step": 2518 }, { "epoch": 0.49664826498422715, "grad_norm": 0.659095376571992, "learning_rate": 1.9259127467826055e-05, "loss": 0.4919, "step": 2519 }, { "epoch": 0.4968454258675079, "grad_norm": 0.7047541361963703, "learning_rate": 1.925854185373014e-05, "loss": 0.4622, "step": 2520 }, { "epoch": 0.49704258675078866, "grad_norm": 0.6556353060348127, "learning_rate": 1.9257956017190297e-05, "loss": 0.4269, "step": 2521 }, { "epoch": 0.4972397476340694, "grad_norm": 1.249097804465773, "learning_rate": 1.9257369958220612e-05, "loss": 0.4644, "step": 2522 }, { "epoch": 0.4974369085173502, "grad_norm": 0.5862685793054593, "learning_rate": 1.9256783676835153e-05, "loss": 0.4157, "step": 2523 }, { "epoch": 0.49763406940063093, "grad_norm": 0.5843281300414458, "learning_rate": 1.9256197173048013e-05, "loss": 0.4691, "step": 2524 }, { "epoch": 0.4978312302839117, "grad_norm": 0.642053300811986, "learning_rate": 1.925561044687328e-05, "loss": 0.4671, "step": 2525 }, { "epoch": 0.49802839116719244, "grad_norm": 2.457307700484372, "learning_rate": 1.9255023498325055e-05, "loss": 0.4732, "step": 2526 }, { "epoch": 0.4982255520504732, "grad_norm": 1.0179133708857775, "learning_rate": 1.9254436327417436e-05, "loss": 0.4766, "step": 2527 }, { "epoch": 0.49842271293375395, "grad_norm": 0.5737583222016323, "learning_rate": 1.9253848934164533e-05, "loss": 0.4076, "step": 2528 }, { "epoch": 0.4986198738170347, "grad_norm": 1.222029502495448, "learning_rate": 1.9253261318580456e-05, "loss": 0.482, "step": 2529 }, { "epoch": 0.49881703470031546, "grad_norm": 0.7053057865997824, "learning_rate": 1.9252673480679328e-05, "loss": 0.4837, "step": 2530 }, { "epoch": 0.4990141955835962, "grad_norm": 0.6259696508511272, "learning_rate": 1.9252085420475263e-05, "loss": 0.4232, "step": 2531 }, { "epoch": 0.499211356466877, "grad_norm": 0.6853061232406765, "learning_rate": 1.92514971379824e-05, "loss": 0.443, "step": 2532 }, { "epoch": 0.49940851735015773, "grad_norm": 0.8584950321451298, "learning_rate": 1.9250908633214863e-05, "loss": 0.4483, "step": 2533 }, { "epoch": 0.4996056782334385, "grad_norm": 0.6391676971123401, "learning_rate": 1.92503199061868e-05, "loss": 0.4797, "step": 2534 }, { "epoch": 0.49980283911671924, "grad_norm": 0.638850981357959, "learning_rate": 1.924973095691235e-05, "loss": 0.4311, "step": 2535 }, { "epoch": 0.5, "grad_norm": 0.6527695472648593, "learning_rate": 1.9249141785405666e-05, "loss": 0.4149, "step": 2536 }, { "epoch": 0.5, "eval_loss": 0.45128634572029114, "eval_runtime": 344.9468, "eval_samples_per_second": 23.569, "eval_steps_per_second": 1.476, "step": 2536 }, { "epoch": 0.5001971608832808, "grad_norm": 0.6134815871569128, "learning_rate": 1.9248552391680902e-05, "loss": 0.4394, "step": 2537 }, { "epoch": 0.5003943217665615, "grad_norm": 0.7190198346074168, "learning_rate": 1.924796277575222e-05, "loss": 0.4758, "step": 2538 }, { "epoch": 0.5005914826498423, "grad_norm": 0.6176497275780272, "learning_rate": 1.9247372937633785e-05, "loss": 0.4417, "step": 2539 }, { "epoch": 0.500788643533123, "grad_norm": 0.6368942778751133, "learning_rate": 1.9246782877339767e-05, "loss": 0.4418, "step": 2540 }, { "epoch": 0.5009858044164038, "grad_norm": 0.6136453962550936, "learning_rate": 1.9246192594884344e-05, "loss": 0.4454, "step": 2541 }, { "epoch": 0.5011829652996845, "grad_norm": 0.6118109954623908, "learning_rate": 1.9245602090281698e-05, "loss": 0.4704, "step": 2542 }, { "epoch": 0.5013801261829653, "grad_norm": 0.5608720572816418, "learning_rate": 1.924501136354602e-05, "loss": 0.4331, "step": 2543 }, { "epoch": 0.501577287066246, "grad_norm": 0.818907256932753, "learning_rate": 1.924442041469149e-05, "loss": 0.4582, "step": 2544 }, { "epoch": 0.5017744479495269, "grad_norm": 0.5786413688970435, "learning_rate": 1.9243829243732324e-05, "loss": 0.4307, "step": 2545 }, { "epoch": 0.5019716088328076, "grad_norm": 0.7567124449798754, "learning_rate": 1.924323785068271e-05, "loss": 0.4212, "step": 2546 }, { "epoch": 0.5021687697160884, "grad_norm": 0.7087458808657718, "learning_rate": 1.9242646235556868e-05, "loss": 0.4734, "step": 2547 }, { "epoch": 0.5023659305993691, "grad_norm": 0.8128756129153467, "learning_rate": 1.9242054398369005e-05, "loss": 0.4908, "step": 2548 }, { "epoch": 0.5025630914826499, "grad_norm": 0.7036392962632404, "learning_rate": 1.9241462339133342e-05, "loss": 0.429, "step": 2549 }, { "epoch": 0.5027602523659306, "grad_norm": 2.7528973749927608, "learning_rate": 1.9240870057864106e-05, "loss": 0.4387, "step": 2550 }, { "epoch": 0.5029574132492114, "grad_norm": 0.6836200522610701, "learning_rate": 1.9240277554575523e-05, "loss": 0.434, "step": 2551 }, { "epoch": 0.5031545741324921, "grad_norm": 0.6838558197223942, "learning_rate": 1.923968482928183e-05, "loss": 0.4878, "step": 2552 }, { "epoch": 0.5033517350157729, "grad_norm": 0.6771965904434417, "learning_rate": 1.9239091881997274e-05, "loss": 0.4582, "step": 2553 }, { "epoch": 0.5035488958990536, "grad_norm": 0.9137036161581693, "learning_rate": 1.923849871273609e-05, "loss": 0.4471, "step": 2554 }, { "epoch": 0.5037460567823344, "grad_norm": 0.5838279107533122, "learning_rate": 1.923790532151254e-05, "loss": 0.4392, "step": 2555 }, { "epoch": 0.5039432176656151, "grad_norm": 0.751287528093079, "learning_rate": 1.9237311708340867e-05, "loss": 0.4608, "step": 2556 }, { "epoch": 0.5041403785488959, "grad_norm": 0.6438474116366795, "learning_rate": 1.9236717873235347e-05, "loss": 0.4483, "step": 2557 }, { "epoch": 0.5043375394321766, "grad_norm": 0.6691961023553488, "learning_rate": 1.923612381621024e-05, "loss": 0.4729, "step": 2558 }, { "epoch": 0.5045347003154574, "grad_norm": 0.8212757423894174, "learning_rate": 1.923552953727982e-05, "loss": 0.3985, "step": 2559 }, { "epoch": 0.5047318611987381, "grad_norm": 0.8451322507658569, "learning_rate": 1.923493503645837e-05, "loss": 0.4875, "step": 2560 }, { "epoch": 0.504929022082019, "grad_norm": 1.3003193055502538, "learning_rate": 1.9234340313760163e-05, "loss": 0.4569, "step": 2561 }, { "epoch": 0.5051261829652997, "grad_norm": 0.6906807412142325, "learning_rate": 1.9233745369199495e-05, "loss": 0.4659, "step": 2562 }, { "epoch": 0.5053233438485805, "grad_norm": 0.7693812343976348, "learning_rate": 1.923315020279066e-05, "loss": 0.4422, "step": 2563 }, { "epoch": 0.5055205047318612, "grad_norm": 1.128657931598075, "learning_rate": 1.9232554814547953e-05, "loss": 0.4615, "step": 2564 }, { "epoch": 0.505717665615142, "grad_norm": 1.1572958591539255, "learning_rate": 1.923195920448569e-05, "loss": 0.4783, "step": 2565 }, { "epoch": 0.5059148264984227, "grad_norm": 0.698446780450944, "learning_rate": 1.9231363372618165e-05, "loss": 0.4343, "step": 2566 }, { "epoch": 0.5061119873817035, "grad_norm": 0.5921597967019708, "learning_rate": 1.92307673189597e-05, "loss": 0.3959, "step": 2567 }, { "epoch": 0.5063091482649842, "grad_norm": 0.7489688974414478, "learning_rate": 1.923017104352462e-05, "loss": 0.4237, "step": 2568 }, { "epoch": 0.506506309148265, "grad_norm": 0.6464100318912962, "learning_rate": 1.9229574546327247e-05, "loss": 0.4587, "step": 2569 }, { "epoch": 0.5067034700315457, "grad_norm": 0.5894037282136724, "learning_rate": 1.9228977827381914e-05, "loss": 0.412, "step": 2570 }, { "epoch": 0.5069006309148265, "grad_norm": 0.5896415150715301, "learning_rate": 1.922838088670296e-05, "loss": 0.4415, "step": 2571 }, { "epoch": 0.5070977917981072, "grad_norm": 0.6093444816516489, "learning_rate": 1.9227783724304716e-05, "loss": 0.4246, "step": 2572 }, { "epoch": 0.507294952681388, "grad_norm": 0.6133472401577025, "learning_rate": 1.922718634020154e-05, "loss": 0.4265, "step": 2573 }, { "epoch": 0.5074921135646687, "grad_norm": 0.6834888429313623, "learning_rate": 1.922658873440778e-05, "loss": 0.4595, "step": 2574 }, { "epoch": 0.5076892744479495, "grad_norm": 0.6948580579520518, "learning_rate": 1.92259909069378e-05, "loss": 0.448, "step": 2575 }, { "epoch": 0.5078864353312302, "grad_norm": 0.6374166697259309, "learning_rate": 1.9225392857805955e-05, "loss": 0.436, "step": 2576 }, { "epoch": 0.508083596214511, "grad_norm": 0.6666234831625328, "learning_rate": 1.922479458702662e-05, "loss": 0.4592, "step": 2577 }, { "epoch": 0.5082807570977917, "grad_norm": 0.6537577476123761, "learning_rate": 1.9224196094614163e-05, "loss": 0.4444, "step": 2578 }, { "epoch": 0.5084779179810726, "grad_norm": 0.608893272413261, "learning_rate": 1.9223597380582967e-05, "loss": 0.4611, "step": 2579 }, { "epoch": 0.5086750788643533, "grad_norm": 0.6235476792381212, "learning_rate": 1.9222998444947417e-05, "loss": 0.4138, "step": 2580 }, { "epoch": 0.5088722397476341, "grad_norm": 0.6352774634847786, "learning_rate": 1.92223992877219e-05, "loss": 0.4596, "step": 2581 }, { "epoch": 0.5090694006309149, "grad_norm": 0.7096357346244977, "learning_rate": 1.922179990892082e-05, "loss": 0.4706, "step": 2582 }, { "epoch": 0.5092665615141956, "grad_norm": 0.9207989672506494, "learning_rate": 1.9221200308558566e-05, "loss": 0.4623, "step": 2583 }, { "epoch": 0.5094637223974764, "grad_norm": 0.6470580276117848, "learning_rate": 1.922060048664955e-05, "loss": 0.4691, "step": 2584 }, { "epoch": 0.5096608832807571, "grad_norm": 0.6435644649688526, "learning_rate": 1.9220000443208183e-05, "loss": 0.4633, "step": 2585 }, { "epoch": 0.5098580441640379, "grad_norm": 0.7356132379703066, "learning_rate": 1.9219400178248876e-05, "loss": 0.4805, "step": 2586 }, { "epoch": 0.5100552050473186, "grad_norm": 0.7125001237201177, "learning_rate": 1.9218799691786062e-05, "loss": 0.4589, "step": 2587 }, { "epoch": 0.5102523659305994, "grad_norm": 0.5882472375305573, "learning_rate": 1.9218198983834155e-05, "loss": 0.4088, "step": 2588 }, { "epoch": 0.5104495268138801, "grad_norm": 0.6522458410768266, "learning_rate": 1.9217598054407598e-05, "loss": 0.4515, "step": 2589 }, { "epoch": 0.5106466876971609, "grad_norm": 0.6181026041148553, "learning_rate": 1.9216996903520827e-05, "loss": 0.4571, "step": 2590 }, { "epoch": 0.5108438485804416, "grad_norm": 0.5899550767444578, "learning_rate": 1.9216395531188277e-05, "loss": 0.4303, "step": 2591 }, { "epoch": 0.5110410094637224, "grad_norm": 0.6129864766903566, "learning_rate": 1.9215793937424404e-05, "loss": 0.4711, "step": 2592 }, { "epoch": 0.5112381703470031, "grad_norm": 0.687929702723702, "learning_rate": 1.9215192122243663e-05, "loss": 0.4661, "step": 2593 }, { "epoch": 0.511435331230284, "grad_norm": 0.5550118700233378, "learning_rate": 1.921459008566051e-05, "loss": 0.4122, "step": 2594 }, { "epoch": 0.5116324921135647, "grad_norm": 0.6517123976025492, "learning_rate": 1.921398782768941e-05, "loss": 0.4537, "step": 2595 }, { "epoch": 0.5118296529968455, "grad_norm": 0.5998912924516978, "learning_rate": 1.9213385348344827e-05, "loss": 0.4539, "step": 2596 }, { "epoch": 0.5120268138801262, "grad_norm": 0.6170261164147766, "learning_rate": 1.9212782647641247e-05, "loss": 0.4556, "step": 2597 }, { "epoch": 0.512223974763407, "grad_norm": 0.6459407251957247, "learning_rate": 1.9212179725593144e-05, "loss": 0.4729, "step": 2598 }, { "epoch": 0.5124211356466877, "grad_norm": 0.6260843018867657, "learning_rate": 1.9211576582215e-05, "loss": 0.4851, "step": 2599 }, { "epoch": 0.5126182965299685, "grad_norm": 0.5489077348828413, "learning_rate": 1.921097321752132e-05, "loss": 0.4099, "step": 2600 }, { "epoch": 0.5128154574132492, "grad_norm": 0.903385996056016, "learning_rate": 1.9210369631526583e-05, "loss": 0.429, "step": 2601 }, { "epoch": 0.51301261829653, "grad_norm": 0.608980064766385, "learning_rate": 1.9209765824245302e-05, "loss": 0.4643, "step": 2602 }, { "epoch": 0.5132097791798107, "grad_norm": 0.5824584984075388, "learning_rate": 1.9209161795691975e-05, "loss": 0.4359, "step": 2603 }, { "epoch": 0.5134069400630915, "grad_norm": 62.5920871913074, "learning_rate": 1.9208557545881127e-05, "loss": 0.7084, "step": 2604 }, { "epoch": 0.5136041009463722, "grad_norm": 0.7237096419484113, "learning_rate": 1.9207953074827264e-05, "loss": 0.4803, "step": 2605 }, { "epoch": 0.513801261829653, "grad_norm": 0.6244628305928418, "learning_rate": 1.9207348382544914e-05, "loss": 0.4455, "step": 2606 }, { "epoch": 0.5139984227129337, "grad_norm": 0.6740117956920548, "learning_rate": 1.9206743469048606e-05, "loss": 0.4243, "step": 2607 }, { "epoch": 0.5141955835962145, "grad_norm": 0.64353274531661, "learning_rate": 1.920613833435287e-05, "loss": 0.4422, "step": 2608 }, { "epoch": 0.5143927444794952, "grad_norm": 0.6873646678800827, "learning_rate": 1.920553297847225e-05, "loss": 0.4708, "step": 2609 }, { "epoch": 0.514589905362776, "grad_norm": 0.8576563607865519, "learning_rate": 1.9204927401421284e-05, "loss": 0.4518, "step": 2610 }, { "epoch": 0.5147870662460567, "grad_norm": 0.6288653111671121, "learning_rate": 1.9204321603214523e-05, "loss": 0.4579, "step": 2611 }, { "epoch": 0.5149842271293376, "grad_norm": 0.631342041680086, "learning_rate": 1.9203715583866527e-05, "loss": 0.4919, "step": 2612 }, { "epoch": 0.5151813880126183, "grad_norm": 0.7478480009750041, "learning_rate": 1.920310934339185e-05, "loss": 0.4642, "step": 2613 }, { "epoch": 0.5153785488958991, "grad_norm": 0.6757289333762163, "learning_rate": 1.920250288180506e-05, "loss": 0.4699, "step": 2614 }, { "epoch": 0.5155757097791798, "grad_norm": 0.569219529443845, "learning_rate": 1.9201896199120728e-05, "loss": 0.4038, "step": 2615 }, { "epoch": 0.5157728706624606, "grad_norm": 0.6266639043108642, "learning_rate": 1.920128929535343e-05, "loss": 0.4584, "step": 2616 }, { "epoch": 0.5159700315457413, "grad_norm": 0.6019770065246319, "learning_rate": 1.9200682170517746e-05, "loss": 0.4545, "step": 2617 }, { "epoch": 0.5161671924290221, "grad_norm": 0.7396548559691547, "learning_rate": 1.9200074824628267e-05, "loss": 0.4819, "step": 2618 }, { "epoch": 0.5163643533123028, "grad_norm": 0.6032049311937472, "learning_rate": 1.9199467257699577e-05, "loss": 0.4672, "step": 2619 }, { "epoch": 0.5165615141955836, "grad_norm": 0.5534977232280806, "learning_rate": 1.919885946974628e-05, "loss": 0.433, "step": 2620 }, { "epoch": 0.5167586750788643, "grad_norm": 0.6139976333469656, "learning_rate": 1.9198251460782974e-05, "loss": 0.4354, "step": 2621 }, { "epoch": 0.5169558359621451, "grad_norm": 0.6302918986448092, "learning_rate": 1.9197643230824272e-05, "loss": 0.4114, "step": 2622 }, { "epoch": 0.5171529968454258, "grad_norm": 0.6836953420990394, "learning_rate": 1.9197034779884785e-05, "loss": 0.4611, "step": 2623 }, { "epoch": 0.5173501577287066, "grad_norm": 0.5979385220267467, "learning_rate": 1.919642610797913e-05, "loss": 0.4061, "step": 2624 }, { "epoch": 0.5175473186119873, "grad_norm": 0.657979498979339, "learning_rate": 1.9195817215121933e-05, "loss": 0.4931, "step": 2625 }, { "epoch": 0.5177444794952681, "grad_norm": 0.9799814092912918, "learning_rate": 1.9195208101327818e-05, "loss": 0.4581, "step": 2626 }, { "epoch": 0.517941640378549, "grad_norm": 0.6066811057663211, "learning_rate": 1.9194598766611426e-05, "loss": 0.4601, "step": 2627 }, { "epoch": 0.5181388012618297, "grad_norm": 0.6509679416434392, "learning_rate": 1.9193989210987396e-05, "loss": 0.503, "step": 2628 }, { "epoch": 0.5183359621451105, "grad_norm": 0.5884583173326658, "learning_rate": 1.919337943447037e-05, "loss": 0.4744, "step": 2629 }, { "epoch": 0.5185331230283912, "grad_norm": 0.6369404434163963, "learning_rate": 1.9192769437075e-05, "loss": 0.4354, "step": 2630 }, { "epoch": 0.518730283911672, "grad_norm": 0.6280177351268637, "learning_rate": 1.919215921881594e-05, "loss": 0.4608, "step": 2631 }, { "epoch": 0.5189274447949527, "grad_norm": 0.6573633987043811, "learning_rate": 1.9191548779707854e-05, "loss": 0.4649, "step": 2632 }, { "epoch": 0.5191246056782335, "grad_norm": 0.5972245759129801, "learning_rate": 1.9190938119765404e-05, "loss": 0.4536, "step": 2633 }, { "epoch": 0.5193217665615142, "grad_norm": 0.625177544249115, "learning_rate": 1.9190327239003267e-05, "loss": 0.4509, "step": 2634 }, { "epoch": 0.519518927444795, "grad_norm": 0.5920183204667294, "learning_rate": 1.9189716137436118e-05, "loss": 0.466, "step": 2635 }, { "epoch": 0.5197160883280757, "grad_norm": 0.5840691953513331, "learning_rate": 1.9189104815078633e-05, "loss": 0.4253, "step": 2636 }, { "epoch": 0.5199132492113565, "grad_norm": 0.5749628137522924, "learning_rate": 1.918849327194551e-05, "loss": 0.4263, "step": 2637 }, { "epoch": 0.5201104100946372, "grad_norm": 0.645892075328813, "learning_rate": 1.9187881508051433e-05, "loss": 0.4376, "step": 2638 }, { "epoch": 0.520307570977918, "grad_norm": 0.5985813732201994, "learning_rate": 1.9187269523411108e-05, "loss": 0.469, "step": 2639 }, { "epoch": 0.5205047318611987, "grad_norm": 0.5662446538186162, "learning_rate": 1.918665731803923e-05, "loss": 0.4278, "step": 2640 }, { "epoch": 0.5207018927444795, "grad_norm": 0.5774151154266736, "learning_rate": 1.9186044891950514e-05, "loss": 0.4541, "step": 2641 }, { "epoch": 0.5208990536277602, "grad_norm": 0.5628515665275783, "learning_rate": 1.9185432245159675e-05, "loss": 0.4533, "step": 2642 }, { "epoch": 0.521096214511041, "grad_norm": 0.602812437259254, "learning_rate": 1.9184819377681425e-05, "loss": 0.4411, "step": 2643 }, { "epoch": 0.5212933753943217, "grad_norm": 0.6047746007443482, "learning_rate": 1.9184206289530496e-05, "loss": 0.4664, "step": 2644 }, { "epoch": 0.5214905362776026, "grad_norm": 0.8256532576919925, "learning_rate": 1.918359298072161e-05, "loss": 0.4513, "step": 2645 }, { "epoch": 0.5216876971608833, "grad_norm": 0.5378632559684782, "learning_rate": 1.9182979451269513e-05, "loss": 0.4264, "step": 2646 }, { "epoch": 0.5218848580441641, "grad_norm": 0.6701591614996074, "learning_rate": 1.9182365701188933e-05, "loss": 0.4815, "step": 2647 }, { "epoch": 0.5220820189274448, "grad_norm": 0.6319925276235775, "learning_rate": 1.918175173049463e-05, "loss": 0.4799, "step": 2648 }, { "epoch": 0.5222791798107256, "grad_norm": 0.6247215611056411, "learning_rate": 1.9181137539201343e-05, "loss": 0.4698, "step": 2649 }, { "epoch": 0.5224763406940063, "grad_norm": 0.5842030623500385, "learning_rate": 1.9180523127323834e-05, "loss": 0.4394, "step": 2650 }, { "epoch": 0.5226735015772871, "grad_norm": 0.5932577854627189, "learning_rate": 1.9179908494876863e-05, "loss": 0.451, "step": 2651 }, { "epoch": 0.5228706624605678, "grad_norm": 0.6071689058545614, "learning_rate": 1.91792936418752e-05, "loss": 0.4458, "step": 2652 }, { "epoch": 0.5230678233438486, "grad_norm": 0.6417327540656751, "learning_rate": 1.917867856833361e-05, "loss": 0.4421, "step": 2653 }, { "epoch": 0.5232649842271293, "grad_norm": 8.906897332645489, "learning_rate": 1.9178063274266884e-05, "loss": 0.4716, "step": 2654 }, { "epoch": 0.5234621451104101, "grad_norm": 0.76731746274502, "learning_rate": 1.9177447759689792e-05, "loss": 0.4537, "step": 2655 }, { "epoch": 0.5236593059936908, "grad_norm": 0.6082849024874425, "learning_rate": 1.9176832024617125e-05, "loss": 0.4746, "step": 2656 }, { "epoch": 0.5238564668769716, "grad_norm": 0.6698492507500728, "learning_rate": 1.9176216069063683e-05, "loss": 0.4377, "step": 2657 }, { "epoch": 0.5240536277602523, "grad_norm": 0.6683170097007389, "learning_rate": 1.917559989304426e-05, "loss": 0.4536, "step": 2658 }, { "epoch": 0.5242507886435331, "grad_norm": 0.6479408068752674, "learning_rate": 1.9174983496573657e-05, "loss": 0.4526, "step": 2659 }, { "epoch": 0.5244479495268138, "grad_norm": 0.6865722121413244, "learning_rate": 1.917436687966669e-05, "loss": 0.4687, "step": 2660 }, { "epoch": 0.5246451104100947, "grad_norm": 0.6689601140224801, "learning_rate": 1.917375004233817e-05, "loss": 0.4841, "step": 2661 }, { "epoch": 0.5248422712933754, "grad_norm": 0.6723818049998349, "learning_rate": 1.9173132984602914e-05, "loss": 0.4405, "step": 2662 }, { "epoch": 0.5250394321766562, "grad_norm": 0.6763870303856786, "learning_rate": 1.9172515706475755e-05, "loss": 0.4439, "step": 2663 }, { "epoch": 0.5252365930599369, "grad_norm": 0.7423211243997336, "learning_rate": 1.9171898207971518e-05, "loss": 0.466, "step": 2664 }, { "epoch": 0.5254337539432177, "grad_norm": 0.5578326316635425, "learning_rate": 1.9171280489105043e-05, "loss": 0.4387, "step": 2665 }, { "epoch": 0.5256309148264984, "grad_norm": 0.6327176422438838, "learning_rate": 1.9170662549891162e-05, "loss": 0.4419, "step": 2666 }, { "epoch": 0.5258280757097792, "grad_norm": 0.6097938287122503, "learning_rate": 1.9170044390344737e-05, "loss": 0.4684, "step": 2667 }, { "epoch": 0.5260252365930599, "grad_norm": 0.6652570785788708, "learning_rate": 1.9169426010480604e-05, "loss": 0.4439, "step": 2668 }, { "epoch": 0.5262223974763407, "grad_norm": 0.5705839336000622, "learning_rate": 1.916880741031363e-05, "loss": 0.4416, "step": 2669 }, { "epoch": 0.5264195583596214, "grad_norm": 0.5966638281252562, "learning_rate": 1.9168188589858675e-05, "loss": 0.4328, "step": 2670 }, { "epoch": 0.5266167192429022, "grad_norm": 0.5661798360166143, "learning_rate": 1.9167569549130604e-05, "loss": 0.4222, "step": 2671 }, { "epoch": 0.526813880126183, "grad_norm": 0.6409908246303587, "learning_rate": 1.9166950288144296e-05, "loss": 0.4262, "step": 2672 }, { "epoch": 0.5270110410094637, "grad_norm": 0.5846011872700821, "learning_rate": 1.916633080691462e-05, "loss": 0.4606, "step": 2673 }, { "epoch": 0.5272082018927445, "grad_norm": 0.6281868356100663, "learning_rate": 1.9165711105456468e-05, "loss": 0.4683, "step": 2674 }, { "epoch": 0.5274053627760252, "grad_norm": 0.6643878477972568, "learning_rate": 1.9165091183784722e-05, "loss": 0.4666, "step": 2675 }, { "epoch": 0.527602523659306, "grad_norm": 0.877520567898525, "learning_rate": 1.9164471041914283e-05, "loss": 0.4992, "step": 2676 }, { "epoch": 0.5277996845425867, "grad_norm": 0.8982575542215294, "learning_rate": 1.9163850679860046e-05, "loss": 0.4169, "step": 2677 }, { "epoch": 0.5279968454258676, "grad_norm": 0.9377399621895196, "learning_rate": 1.9163230097636917e-05, "loss": 0.4429, "step": 2678 }, { "epoch": 0.5281940063091483, "grad_norm": 0.5944294971560273, "learning_rate": 1.9162609295259805e-05, "loss": 0.4963, "step": 2679 }, { "epoch": 0.5283911671924291, "grad_norm": 0.6623678780030434, "learning_rate": 1.9161988272743627e-05, "loss": 0.4514, "step": 2680 }, { "epoch": 0.5285883280757098, "grad_norm": 0.7305199400460632, "learning_rate": 1.9161367030103303e-05, "loss": 0.4526, "step": 2681 }, { "epoch": 0.5287854889589906, "grad_norm": 0.6299026603110343, "learning_rate": 1.9160745567353758e-05, "loss": 0.4495, "step": 2682 }, { "epoch": 0.5289826498422713, "grad_norm": 0.606070194816596, "learning_rate": 1.9160123884509923e-05, "loss": 0.4555, "step": 2683 }, { "epoch": 0.5291798107255521, "grad_norm": 0.5864541808065882, "learning_rate": 1.9159501981586738e-05, "loss": 0.4447, "step": 2684 }, { "epoch": 0.5293769716088328, "grad_norm": 0.5378633706513408, "learning_rate": 1.9158879858599138e-05, "loss": 0.4205, "step": 2685 }, { "epoch": 0.5295741324921136, "grad_norm": 0.5692796059391859, "learning_rate": 1.9158257515562075e-05, "loss": 0.446, "step": 2686 }, { "epoch": 0.5297712933753943, "grad_norm": 0.5718248315346496, "learning_rate": 1.91576349524905e-05, "loss": 0.4417, "step": 2687 }, { "epoch": 0.5299684542586751, "grad_norm": 0.5934922303137548, "learning_rate": 1.9157012169399372e-05, "loss": 0.4271, "step": 2688 }, { "epoch": 0.5301656151419558, "grad_norm": 0.7026701042054814, "learning_rate": 1.9156389166303652e-05, "loss": 0.4772, "step": 2689 }, { "epoch": 0.5303627760252366, "grad_norm": 0.5525996284668334, "learning_rate": 1.9155765943218304e-05, "loss": 0.3835, "step": 2690 }, { "epoch": 0.5305599369085173, "grad_norm": 0.6691635308990771, "learning_rate": 1.9155142500158312e-05, "loss": 0.4962, "step": 2691 }, { "epoch": 0.5307570977917981, "grad_norm": 0.6586289639914643, "learning_rate": 1.9154518837138644e-05, "loss": 0.4623, "step": 2692 }, { "epoch": 0.5309542586750788, "grad_norm": 0.5627097183725107, "learning_rate": 1.9153894954174294e-05, "loss": 0.4543, "step": 2693 }, { "epoch": 0.5311514195583596, "grad_norm": 0.5790687836358622, "learning_rate": 1.9153270851280245e-05, "loss": 0.4632, "step": 2694 }, { "epoch": 0.5313485804416404, "grad_norm": 0.6605341566789122, "learning_rate": 1.915264652847149e-05, "loss": 0.4662, "step": 2695 }, { "epoch": 0.5315457413249212, "grad_norm": 0.5991069128744081, "learning_rate": 1.9152021985763035e-05, "loss": 0.4425, "step": 2696 }, { "epoch": 0.5317429022082019, "grad_norm": 0.6215310692072236, "learning_rate": 1.9151397223169877e-05, "loss": 0.4386, "step": 2697 }, { "epoch": 0.5319400630914827, "grad_norm": 0.5638306576019859, "learning_rate": 1.9150772240707038e-05, "loss": 0.4245, "step": 2698 }, { "epoch": 0.5321372239747634, "grad_norm": 0.6748512477251645, "learning_rate": 1.915014703838952e-05, "loss": 0.4187, "step": 2699 }, { "epoch": 0.5323343848580442, "grad_norm": 0.6108680696319301, "learning_rate": 1.9149521616232354e-05, "loss": 0.4595, "step": 2700 }, { "epoch": 0.5325315457413249, "grad_norm": 0.596636107232528, "learning_rate": 1.9148895974250562e-05, "loss": 0.4858, "step": 2701 }, { "epoch": 0.5327287066246057, "grad_norm": 0.5630137990858612, "learning_rate": 1.9148270112459178e-05, "loss": 0.4191, "step": 2702 }, { "epoch": 0.5329258675078864, "grad_norm": 0.6635599997569216, "learning_rate": 1.9147644030873236e-05, "loss": 0.4665, "step": 2703 }, { "epoch": 0.5331230283911672, "grad_norm": 0.792415273908661, "learning_rate": 1.914701772950778e-05, "loss": 0.4355, "step": 2704 }, { "epoch": 0.5333201892744479, "grad_norm": 0.6016005658016478, "learning_rate": 1.9146391208377856e-05, "loss": 0.4361, "step": 2705 }, { "epoch": 0.5335173501577287, "grad_norm": 0.58541615627604, "learning_rate": 1.914576446749852e-05, "loss": 0.4288, "step": 2706 }, { "epoch": 0.5337145110410094, "grad_norm": 0.6092999107646474, "learning_rate": 1.9145137506884826e-05, "loss": 0.4174, "step": 2707 }, { "epoch": 0.5339116719242902, "grad_norm": 0.7227982055069082, "learning_rate": 1.914451032655184e-05, "loss": 0.4822, "step": 2708 }, { "epoch": 0.5341088328075709, "grad_norm": 0.6246387176004011, "learning_rate": 1.914388292651463e-05, "loss": 0.436, "step": 2709 }, { "epoch": 0.5343059936908517, "grad_norm": 0.573293012232519, "learning_rate": 1.9143255306788266e-05, "loss": 0.4834, "step": 2710 }, { "epoch": 0.5345031545741324, "grad_norm": 0.5767714766984778, "learning_rate": 1.9142627467387833e-05, "loss": 0.4374, "step": 2711 }, { "epoch": 0.5347003154574133, "grad_norm": 0.5672517687751784, "learning_rate": 1.9141999408328412e-05, "loss": 0.4461, "step": 2712 }, { "epoch": 0.534897476340694, "grad_norm": 0.5888251273732579, "learning_rate": 1.914137112962509e-05, "loss": 0.4775, "step": 2713 }, { "epoch": 0.5350946372239748, "grad_norm": 0.633920406740737, "learning_rate": 1.914074263129297e-05, "loss": 0.4626, "step": 2714 }, { "epoch": 0.5352917981072555, "grad_norm": 0.6205409845878437, "learning_rate": 1.9140113913347145e-05, "loss": 0.4504, "step": 2715 }, { "epoch": 0.5354889589905363, "grad_norm": 1.6126724762687141, "learning_rate": 1.9139484975802723e-05, "loss": 0.4977, "step": 2716 }, { "epoch": 0.535686119873817, "grad_norm": 0.6349889203833219, "learning_rate": 1.9138855818674814e-05, "loss": 0.4596, "step": 2717 }, { "epoch": 0.5358832807570978, "grad_norm": 0.5920992239234482, "learning_rate": 1.9138226441978533e-05, "loss": 0.4302, "step": 2718 }, { "epoch": 0.5360804416403786, "grad_norm": 0.5585845890938809, "learning_rate": 1.9137596845729005e-05, "loss": 0.4447, "step": 2719 }, { "epoch": 0.5362776025236593, "grad_norm": 0.6075692038210041, "learning_rate": 1.9136967029941354e-05, "loss": 0.4849, "step": 2720 }, { "epoch": 0.5364747634069401, "grad_norm": 0.6678891128497162, "learning_rate": 1.9136336994630712e-05, "loss": 0.4498, "step": 2721 }, { "epoch": 0.5366719242902208, "grad_norm": 0.6172599424463148, "learning_rate": 1.9135706739812217e-05, "loss": 0.4605, "step": 2722 }, { "epoch": 0.5368690851735016, "grad_norm": 0.5878777248662985, "learning_rate": 1.913507626550101e-05, "loss": 0.4588, "step": 2723 }, { "epoch": 0.5370662460567823, "grad_norm": 0.6514199512488131, "learning_rate": 1.9134445571712237e-05, "loss": 0.3952, "step": 2724 }, { "epoch": 0.5372634069400631, "grad_norm": 0.5691982494960713, "learning_rate": 1.9133814658461056e-05, "loss": 0.419, "step": 2725 }, { "epoch": 0.5374605678233438, "grad_norm": 0.6165356310225942, "learning_rate": 1.9133183525762622e-05, "loss": 0.4368, "step": 2726 }, { "epoch": 0.5376577287066246, "grad_norm": 0.6483155000970097, "learning_rate": 1.9132552173632097e-05, "loss": 0.4526, "step": 2727 }, { "epoch": 0.5378548895899053, "grad_norm": 0.6668794939528021, "learning_rate": 1.9131920602084656e-05, "loss": 0.4663, "step": 2728 }, { "epoch": 0.5380520504731862, "grad_norm": 0.7674580396358098, "learning_rate": 1.9131288811135465e-05, "loss": 0.4869, "step": 2729 }, { "epoch": 0.5382492113564669, "grad_norm": 0.7119703322980993, "learning_rate": 1.9130656800799706e-05, "loss": 0.4495, "step": 2730 }, { "epoch": 0.5384463722397477, "grad_norm": 0.5810364329746894, "learning_rate": 1.9130024571092565e-05, "loss": 0.4188, "step": 2731 }, { "epoch": 0.5386435331230284, "grad_norm": 0.74091178804136, "learning_rate": 1.9129392122029233e-05, "loss": 0.4444, "step": 2732 }, { "epoch": 0.5388406940063092, "grad_norm": 0.5752138416355487, "learning_rate": 1.9128759453624904e-05, "loss": 0.4434, "step": 2733 }, { "epoch": 0.5390378548895899, "grad_norm": 0.7080034594600536, "learning_rate": 1.9128126565894776e-05, "loss": 0.4436, "step": 2734 }, { "epoch": 0.5392350157728707, "grad_norm": 0.5927536597698786, "learning_rate": 1.9127493458854055e-05, "loss": 0.4587, "step": 2735 }, { "epoch": 0.5394321766561514, "grad_norm": 0.7051671166398329, "learning_rate": 1.9126860132517958e-05, "loss": 0.4431, "step": 2736 }, { "epoch": 0.5396293375394322, "grad_norm": 0.6126458882890922, "learning_rate": 1.9126226586901693e-05, "loss": 0.4926, "step": 2737 }, { "epoch": 0.5398264984227129, "grad_norm": 0.6919952144838898, "learning_rate": 1.9125592822020485e-05, "loss": 0.4734, "step": 2738 }, { "epoch": 0.5400236593059937, "grad_norm": 0.6350862584452707, "learning_rate": 1.912495883788956e-05, "loss": 0.4682, "step": 2739 }, { "epoch": 0.5402208201892744, "grad_norm": 0.653383083517542, "learning_rate": 1.9124324634524153e-05, "loss": 0.4427, "step": 2740 }, { "epoch": 0.5404179810725552, "grad_norm": 0.6323326038671259, "learning_rate": 1.91236902119395e-05, "loss": 0.4433, "step": 2741 }, { "epoch": 0.5406151419558359, "grad_norm": 0.7015096284263576, "learning_rate": 1.912305557015084e-05, "loss": 0.4368, "step": 2742 }, { "epoch": 0.5408123028391167, "grad_norm": 0.6210161210833985, "learning_rate": 1.9122420709173422e-05, "loss": 0.409, "step": 2743 }, { "epoch": 0.5410094637223974, "grad_norm": 0.7037911524098324, "learning_rate": 1.9121785629022502e-05, "loss": 0.4446, "step": 2744 }, { "epoch": 0.5412066246056783, "grad_norm": 0.6143364835057887, "learning_rate": 1.9121150329713334e-05, "loss": 0.4485, "step": 2745 }, { "epoch": 0.541403785488959, "grad_norm": 0.6298390892240697, "learning_rate": 1.9120514811261187e-05, "loss": 0.4014, "step": 2746 }, { "epoch": 0.5416009463722398, "grad_norm": 0.6010553806758819, "learning_rate": 1.9119879073681328e-05, "loss": 0.4275, "step": 2747 }, { "epoch": 0.5417981072555205, "grad_norm": 0.6014151169068335, "learning_rate": 1.9119243116989022e-05, "loss": 0.438, "step": 2748 }, { "epoch": 0.5419952681388013, "grad_norm": 0.743894988167943, "learning_rate": 1.9118606941199565e-05, "loss": 0.4777, "step": 2749 }, { "epoch": 0.542192429022082, "grad_norm": 0.612605392695115, "learning_rate": 1.911797054632823e-05, "loss": 0.4606, "step": 2750 }, { "epoch": 0.5423895899053628, "grad_norm": 0.6393149368458824, "learning_rate": 1.911733393239031e-05, "loss": 0.4622, "step": 2751 }, { "epoch": 0.5425867507886435, "grad_norm": 0.6406348967467802, "learning_rate": 1.9116697099401103e-05, "loss": 0.4575, "step": 2752 }, { "epoch": 0.5427839116719243, "grad_norm": 0.6210504950614151, "learning_rate": 1.9116060047375903e-05, "loss": 0.4361, "step": 2753 }, { "epoch": 0.542981072555205, "grad_norm": 0.5936925930435107, "learning_rate": 1.911542277633002e-05, "loss": 0.4278, "step": 2754 }, { "epoch": 0.5431782334384858, "grad_norm": 0.6521721561800861, "learning_rate": 1.9114785286278767e-05, "loss": 0.4693, "step": 2755 }, { "epoch": 0.5433753943217665, "grad_norm": 0.5816573139740299, "learning_rate": 1.9114147577237452e-05, "loss": 0.4478, "step": 2756 }, { "epoch": 0.5435725552050473, "grad_norm": 0.5927002843696301, "learning_rate": 1.9113509649221403e-05, "loss": 0.4289, "step": 2757 }, { "epoch": 0.543769716088328, "grad_norm": 0.6312074883108042, "learning_rate": 1.911287150224595e-05, "loss": 0.4581, "step": 2758 }, { "epoch": 0.5439668769716088, "grad_norm": 0.63059238176542, "learning_rate": 1.9112233136326416e-05, "loss": 0.4336, "step": 2759 }, { "epoch": 0.5441640378548895, "grad_norm": 0.722821516571449, "learning_rate": 1.9111594551478146e-05, "loss": 0.4523, "step": 2760 }, { "epoch": 0.5443611987381703, "grad_norm": 0.5865645060807954, "learning_rate": 1.9110955747716478e-05, "loss": 0.4407, "step": 2761 }, { "epoch": 0.544558359621451, "grad_norm": 0.6247985614675826, "learning_rate": 1.911031672505676e-05, "loss": 0.4229, "step": 2762 }, { "epoch": 0.5447555205047319, "grad_norm": 0.6542180266531745, "learning_rate": 1.9109677483514346e-05, "loss": 0.5063, "step": 2763 }, { "epoch": 0.5449526813880127, "grad_norm": 0.5911009525801598, "learning_rate": 1.91090380231046e-05, "loss": 0.4443, "step": 2764 }, { "epoch": 0.5451498422712934, "grad_norm": 0.8124320978143662, "learning_rate": 1.9108398343842873e-05, "loss": 0.4413, "step": 2765 }, { "epoch": 0.5453470031545742, "grad_norm": 0.5550449237988123, "learning_rate": 1.910775844574454e-05, "loss": 0.4284, "step": 2766 }, { "epoch": 0.5455441640378549, "grad_norm": 0.597109232711348, "learning_rate": 1.910711832882498e-05, "loss": 0.4775, "step": 2767 }, { "epoch": 0.5457413249211357, "grad_norm": 0.7408895073247752, "learning_rate": 1.910647799309957e-05, "loss": 0.4716, "step": 2768 }, { "epoch": 0.5459384858044164, "grad_norm": 0.5592810141688882, "learning_rate": 1.9105837438583693e-05, "loss": 0.4494, "step": 2769 }, { "epoch": 0.5461356466876972, "grad_norm": 0.6407004174091825, "learning_rate": 1.9105196665292735e-05, "loss": 0.4471, "step": 2770 }, { "epoch": 0.5463328075709779, "grad_norm": 0.5920197717658957, "learning_rate": 1.9104555673242092e-05, "loss": 0.4632, "step": 2771 }, { "epoch": 0.5465299684542587, "grad_norm": 0.5534555677305497, "learning_rate": 1.9103914462447172e-05, "loss": 0.4683, "step": 2772 }, { "epoch": 0.5467271293375394, "grad_norm": 0.6333249803496639, "learning_rate": 1.9103273032923378e-05, "loss": 0.4167, "step": 2773 }, { "epoch": 0.5469242902208202, "grad_norm": 0.5735766563287512, "learning_rate": 1.9102631384686116e-05, "loss": 0.4556, "step": 2774 }, { "epoch": 0.5471214511041009, "grad_norm": 0.6225909847389804, "learning_rate": 1.91019895177508e-05, "loss": 0.4532, "step": 2775 }, { "epoch": 0.5473186119873817, "grad_norm": 0.6265493049371124, "learning_rate": 1.910134743213286e-05, "loss": 0.4352, "step": 2776 }, { "epoch": 0.5475157728706624, "grad_norm": 0.7433505225427158, "learning_rate": 1.910070512784772e-05, "loss": 0.4435, "step": 2777 }, { "epoch": 0.5477129337539433, "grad_norm": 0.6861455293800391, "learning_rate": 1.910006260491081e-05, "loss": 0.4573, "step": 2778 }, { "epoch": 0.547910094637224, "grad_norm": 0.6005840206295373, "learning_rate": 1.9099419863337567e-05, "loss": 0.4565, "step": 2779 }, { "epoch": 0.5481072555205048, "grad_norm": 0.5980496434263233, "learning_rate": 1.909877690314343e-05, "loss": 0.4452, "step": 2780 }, { "epoch": 0.5483044164037855, "grad_norm": 0.6336383497213709, "learning_rate": 1.9098133724343853e-05, "loss": 0.4745, "step": 2781 }, { "epoch": 0.5485015772870663, "grad_norm": 0.600357744618508, "learning_rate": 1.9097490326954288e-05, "loss": 0.4485, "step": 2782 }, { "epoch": 0.548698738170347, "grad_norm": 0.6216658860334479, "learning_rate": 1.9096846710990192e-05, "loss": 0.4393, "step": 2783 }, { "epoch": 0.5488958990536278, "grad_norm": 0.5921780263629273, "learning_rate": 1.9096202876467028e-05, "loss": 0.4365, "step": 2784 }, { "epoch": 0.5490930599369085, "grad_norm": 0.5793518128764239, "learning_rate": 1.9095558823400266e-05, "loss": 0.458, "step": 2785 }, { "epoch": 0.5492902208201893, "grad_norm": 0.5941381807403091, "learning_rate": 1.9094914551805377e-05, "loss": 0.4269, "step": 2786 }, { "epoch": 0.54948738170347, "grad_norm": 0.5243117956195389, "learning_rate": 1.909427006169784e-05, "loss": 0.4263, "step": 2787 }, { "epoch": 0.5496845425867508, "grad_norm": 0.5990209951331408, "learning_rate": 1.9093625353093146e-05, "loss": 0.4689, "step": 2788 }, { "epoch": 0.5498817034700315, "grad_norm": 0.6974111022304779, "learning_rate": 1.9092980426006774e-05, "loss": 0.4537, "step": 2789 }, { "epoch": 0.5500788643533123, "grad_norm": 0.5355199493063655, "learning_rate": 1.909233528045423e-05, "loss": 0.4156, "step": 2790 }, { "epoch": 0.550276025236593, "grad_norm": 0.6415663325485629, "learning_rate": 1.9091689916451006e-05, "loss": 0.4639, "step": 2791 }, { "epoch": 0.5504731861198738, "grad_norm": 0.5466690904751329, "learning_rate": 1.909104433401261e-05, "loss": 0.4197, "step": 2792 }, { "epoch": 0.5506703470031545, "grad_norm": 0.5535455971550053, "learning_rate": 1.9090398533154552e-05, "loss": 0.4221, "step": 2793 }, { "epoch": 0.5508675078864353, "grad_norm": 0.6772223151490671, "learning_rate": 1.908975251389235e-05, "loss": 0.4584, "step": 2794 }, { "epoch": 0.551064668769716, "grad_norm": 0.639977189301138, "learning_rate": 1.9089106276241523e-05, "loss": 0.479, "step": 2795 }, { "epoch": 0.5512618296529969, "grad_norm": 0.5718559924246178, "learning_rate": 1.9088459820217602e-05, "loss": 0.428, "step": 2796 }, { "epoch": 0.5514589905362776, "grad_norm": 0.5777292639541047, "learning_rate": 1.908781314583611e-05, "loss": 0.4451, "step": 2797 }, { "epoch": 0.5516561514195584, "grad_norm": 0.5900189358667072, "learning_rate": 1.9087166253112594e-05, "loss": 0.4692, "step": 2798 }, { "epoch": 0.5518533123028391, "grad_norm": 0.6961573118708897, "learning_rate": 1.9086519142062587e-05, "loss": 0.4743, "step": 2799 }, { "epoch": 0.5520504731861199, "grad_norm": 0.5509038753437092, "learning_rate": 1.9085871812701642e-05, "loss": 0.4182, "step": 2800 }, { "epoch": 0.5522476340694006, "grad_norm": 0.5723685332377061, "learning_rate": 1.908522426504531e-05, "loss": 0.4357, "step": 2801 }, { "epoch": 0.5524447949526814, "grad_norm": 0.6424941228887284, "learning_rate": 1.9084576499109148e-05, "loss": 0.4767, "step": 2802 }, { "epoch": 0.5526419558359621, "grad_norm": 0.6223021214852441, "learning_rate": 1.908392851490872e-05, "loss": 0.459, "step": 2803 }, { "epoch": 0.5528391167192429, "grad_norm": 0.5876602293739934, "learning_rate": 1.9083280312459595e-05, "loss": 0.4468, "step": 2804 }, { "epoch": 0.5530362776025236, "grad_norm": 0.6188653567310646, "learning_rate": 1.9082631891777345e-05, "loss": 0.42, "step": 2805 }, { "epoch": 0.5532334384858044, "grad_norm": 0.6259526543976492, "learning_rate": 1.9081983252877548e-05, "loss": 0.4388, "step": 2806 }, { "epoch": 0.5534305993690851, "grad_norm": 0.5477244518993755, "learning_rate": 1.9081334395775788e-05, "loss": 0.4735, "step": 2807 }, { "epoch": 0.5536277602523659, "grad_norm": 0.6135819658127445, "learning_rate": 1.908068532048766e-05, "loss": 0.4716, "step": 2808 }, { "epoch": 0.5538249211356467, "grad_norm": 0.5562204082368883, "learning_rate": 1.9080036027028752e-05, "loss": 0.455, "step": 2809 }, { "epoch": 0.5540220820189274, "grad_norm": 0.5792847495400975, "learning_rate": 1.9079386515414667e-05, "loss": 0.421, "step": 2810 }, { "epoch": 0.5542192429022083, "grad_norm": 0.6003218566784827, "learning_rate": 1.9078736785661012e-05, "loss": 0.427, "step": 2811 }, { "epoch": 0.554416403785489, "grad_norm": 0.5977583508502151, "learning_rate": 1.9078086837783393e-05, "loss": 0.4535, "step": 2812 }, { "epoch": 0.5546135646687698, "grad_norm": 0.5699405638630519, "learning_rate": 1.9077436671797426e-05, "loss": 0.4305, "step": 2813 }, { "epoch": 0.5548107255520505, "grad_norm": 0.5800436763185806, "learning_rate": 1.9076786287718734e-05, "loss": 0.4306, "step": 2814 }, { "epoch": 0.5550078864353313, "grad_norm": 0.5825228330765003, "learning_rate": 1.9076135685562942e-05, "loss": 0.4059, "step": 2815 }, { "epoch": 0.555205047318612, "grad_norm": 0.5807662460549189, "learning_rate": 1.9075484865345678e-05, "loss": 0.4399, "step": 2816 }, { "epoch": 0.5554022082018928, "grad_norm": 0.6218425966437778, "learning_rate": 1.9074833827082586e-05, "loss": 0.4857, "step": 2817 }, { "epoch": 0.5555993690851735, "grad_norm": 0.5785450171816167, "learning_rate": 1.90741825707893e-05, "loss": 0.4473, "step": 2818 }, { "epoch": 0.5557965299684543, "grad_norm": 0.5632479711426183, "learning_rate": 1.9073531096481475e-05, "loss": 0.4494, "step": 2819 }, { "epoch": 0.555993690851735, "grad_norm": 0.6791829687064139, "learning_rate": 1.9072879404174755e-05, "loss": 0.4438, "step": 2820 }, { "epoch": 0.5561908517350158, "grad_norm": 0.6216151824829282, "learning_rate": 1.90722274938848e-05, "loss": 0.4486, "step": 2821 }, { "epoch": 0.5563880126182965, "grad_norm": 0.6115679004172528, "learning_rate": 1.9071575365627274e-05, "loss": 0.4846, "step": 2822 }, { "epoch": 0.5565851735015773, "grad_norm": 0.6026873566499071, "learning_rate": 1.9070923019417848e-05, "loss": 0.4395, "step": 2823 }, { "epoch": 0.556782334384858, "grad_norm": 0.6123086454851155, "learning_rate": 1.907027045527219e-05, "loss": 0.406, "step": 2824 }, { "epoch": 0.5569794952681388, "grad_norm": 0.5882773742606572, "learning_rate": 1.906961767320598e-05, "loss": 0.4679, "step": 2825 }, { "epoch": 0.5571766561514195, "grad_norm": 0.5795368478426368, "learning_rate": 1.90689646732349e-05, "loss": 0.4508, "step": 2826 }, { "epoch": 0.5573738170347003, "grad_norm": 0.6306886418135007, "learning_rate": 1.9068311455374638e-05, "loss": 0.4591, "step": 2827 }, { "epoch": 0.557570977917981, "grad_norm": 0.6081030199830474, "learning_rate": 1.9067658019640897e-05, "loss": 0.4718, "step": 2828 }, { "epoch": 0.5577681388012619, "grad_norm": 0.6178637399147585, "learning_rate": 1.9067004366049367e-05, "loss": 0.4833, "step": 2829 }, { "epoch": 0.5579652996845426, "grad_norm": 0.6385458957420991, "learning_rate": 1.9066350494615756e-05, "loss": 0.4273, "step": 2830 }, { "epoch": 0.5581624605678234, "grad_norm": 0.645226093197602, "learning_rate": 1.9065696405355774e-05, "loss": 0.4959, "step": 2831 }, { "epoch": 0.5583596214511041, "grad_norm": 0.6114367401980242, "learning_rate": 1.9065042098285132e-05, "loss": 0.48, "step": 2832 }, { "epoch": 0.5585567823343849, "grad_norm": 0.60055106272059, "learning_rate": 1.9064387573419555e-05, "loss": 0.4782, "step": 2833 }, { "epoch": 0.5587539432176656, "grad_norm": 0.6195745515122923, "learning_rate": 1.906373283077477e-05, "loss": 0.4494, "step": 2834 }, { "epoch": 0.5589511041009464, "grad_norm": 0.7119201520791544, "learning_rate": 1.9063077870366504e-05, "loss": 0.4394, "step": 2835 }, { "epoch": 0.5591482649842271, "grad_norm": 0.5747295277509834, "learning_rate": 1.906242269221049e-05, "loss": 0.4137, "step": 2836 }, { "epoch": 0.5593454258675079, "grad_norm": 0.5453408326530682, "learning_rate": 1.9061767296322477e-05, "loss": 0.415, "step": 2837 }, { "epoch": 0.5595425867507886, "grad_norm": 1.321620620951851, "learning_rate": 1.9061111682718204e-05, "loss": 0.4296, "step": 2838 }, { "epoch": 0.5597397476340694, "grad_norm": 0.562269189961482, "learning_rate": 1.9060455851413424e-05, "loss": 0.4354, "step": 2839 }, { "epoch": 0.5599369085173501, "grad_norm": 0.550600831319121, "learning_rate": 1.90597998024239e-05, "loss": 0.4179, "step": 2840 }, { "epoch": 0.5601340694006309, "grad_norm": 0.5594944562329867, "learning_rate": 1.905914353576539e-05, "loss": 0.4536, "step": 2841 }, { "epoch": 0.5603312302839116, "grad_norm": 0.5764617009479925, "learning_rate": 1.9058487051453662e-05, "loss": 0.4629, "step": 2842 }, { "epoch": 0.5605283911671924, "grad_norm": 11.946607473336949, "learning_rate": 1.9057830349504484e-05, "loss": 0.4617, "step": 2843 }, { "epoch": 0.5607255520504731, "grad_norm": 0.6562336888788557, "learning_rate": 1.9057173429933636e-05, "loss": 0.4496, "step": 2844 }, { "epoch": 0.560922712933754, "grad_norm": 0.5696593285659094, "learning_rate": 1.905651629275691e-05, "loss": 0.4374, "step": 2845 }, { "epoch": 0.5611198738170347, "grad_norm": 0.6107715466947922, "learning_rate": 1.9055858937990083e-05, "loss": 0.4478, "step": 2846 }, { "epoch": 0.5613170347003155, "grad_norm": 0.6442549288365613, "learning_rate": 1.905520136564895e-05, "loss": 0.4537, "step": 2847 }, { "epoch": 0.5615141955835962, "grad_norm": 0.5746797728643914, "learning_rate": 1.9054543575749317e-05, "loss": 0.4498, "step": 2848 }, { "epoch": 0.561711356466877, "grad_norm": 0.9157971817240291, "learning_rate": 1.905388556830698e-05, "loss": 0.4024, "step": 2849 }, { "epoch": 0.5619085173501577, "grad_norm": 0.5954555864900205, "learning_rate": 1.905322734333775e-05, "loss": 0.4712, "step": 2850 }, { "epoch": 0.5621056782334385, "grad_norm": 0.68954636579152, "learning_rate": 1.9052568900857443e-05, "loss": 0.4205, "step": 2851 }, { "epoch": 0.5623028391167192, "grad_norm": 0.7460974392901148, "learning_rate": 1.9051910240881883e-05, "loss": 0.4483, "step": 2852 }, { "epoch": 0.5625, "grad_norm": 0.5355564846309835, "learning_rate": 1.9051251363426883e-05, "loss": 0.4232, "step": 2853 }, { "epoch": 0.5626971608832808, "grad_norm": 0.7155811623930997, "learning_rate": 1.9050592268508284e-05, "loss": 0.4553, "step": 2854 }, { "epoch": 0.5628943217665615, "grad_norm": 0.6320781522351271, "learning_rate": 1.9049932956141917e-05, "loss": 0.4436, "step": 2855 }, { "epoch": 0.5630914826498423, "grad_norm": 0.6801664356997779, "learning_rate": 1.9049273426343622e-05, "loss": 0.4637, "step": 2856 }, { "epoch": 0.563288643533123, "grad_norm": 0.5803175479602963, "learning_rate": 1.9048613679129246e-05, "loss": 0.466, "step": 2857 }, { "epoch": 0.5634858044164038, "grad_norm": 0.6126039116402423, "learning_rate": 1.904795371451464e-05, "loss": 0.4587, "step": 2858 }, { "epoch": 0.5636829652996845, "grad_norm": 0.5827030340943455, "learning_rate": 1.904729353251566e-05, "loss": 0.4518, "step": 2859 }, { "epoch": 0.5638801261829653, "grad_norm": 0.601328312382053, "learning_rate": 1.9046633133148164e-05, "loss": 0.4675, "step": 2860 }, { "epoch": 0.564077287066246, "grad_norm": 0.5717955796305233, "learning_rate": 1.9045972516428026e-05, "loss": 0.4549, "step": 2861 }, { "epoch": 0.5642744479495269, "grad_norm": 0.5613732974457137, "learning_rate": 1.904531168237111e-05, "loss": 0.443, "step": 2862 }, { "epoch": 0.5644716088328076, "grad_norm": 0.5744128359613806, "learning_rate": 1.90446506309933e-05, "loss": 0.4439, "step": 2863 }, { "epoch": 0.5646687697160884, "grad_norm": 0.6119814238497652, "learning_rate": 1.9043989362310472e-05, "loss": 0.4707, "step": 2864 }, { "epoch": 0.5648659305993691, "grad_norm": 0.5567314391501524, "learning_rate": 1.9043327876338517e-05, "loss": 0.4271, "step": 2865 }, { "epoch": 0.5650630914826499, "grad_norm": 0.6354333087115706, "learning_rate": 1.904266617309333e-05, "loss": 0.4968, "step": 2866 }, { "epoch": 0.5652602523659306, "grad_norm": 0.5773958814672833, "learning_rate": 1.9042004252590804e-05, "loss": 0.407, "step": 2867 }, { "epoch": 0.5654574132492114, "grad_norm": 0.5615283708079536, "learning_rate": 1.9041342114846844e-05, "loss": 0.4448, "step": 2868 }, { "epoch": 0.5656545741324921, "grad_norm": 0.634498935968448, "learning_rate": 1.9040679759877358e-05, "loss": 0.4633, "step": 2869 }, { "epoch": 0.5658517350157729, "grad_norm": 0.6007126242862416, "learning_rate": 1.904001718769826e-05, "loss": 0.477, "step": 2870 }, { "epoch": 0.5660488958990536, "grad_norm": 0.5903936190376452, "learning_rate": 1.903935439832547e-05, "loss": 0.4441, "step": 2871 }, { "epoch": 0.5662460567823344, "grad_norm": 0.6404835660073024, "learning_rate": 1.9038691391774913e-05, "loss": 0.4424, "step": 2872 }, { "epoch": 0.5664432176656151, "grad_norm": 0.524427319947031, "learning_rate": 1.9038028168062517e-05, "loss": 0.4137, "step": 2873 }, { "epoch": 0.5666403785488959, "grad_norm": 0.7149877450692915, "learning_rate": 1.9037364727204216e-05, "loss": 0.4914, "step": 2874 }, { "epoch": 0.5668375394321766, "grad_norm": 0.5728331748504152, "learning_rate": 1.9036701069215947e-05, "loss": 0.4609, "step": 2875 }, { "epoch": 0.5670347003154574, "grad_norm": 0.6344827370627714, "learning_rate": 1.9036037194113656e-05, "loss": 0.4456, "step": 2876 }, { "epoch": 0.5672318611987381, "grad_norm": 0.606955120653267, "learning_rate": 1.90353731019133e-05, "loss": 0.4341, "step": 2877 }, { "epoch": 0.567429022082019, "grad_norm": 0.7183503039613219, "learning_rate": 1.9034708792630824e-05, "loss": 0.4661, "step": 2878 }, { "epoch": 0.5676261829652997, "grad_norm": 0.6264186805484021, "learning_rate": 1.9034044266282196e-05, "loss": 0.4745, "step": 2879 }, { "epoch": 0.5678233438485805, "grad_norm": 0.686093957072564, "learning_rate": 1.903337952288338e-05, "loss": 0.4532, "step": 2880 }, { "epoch": 0.5680205047318612, "grad_norm": 0.5783475117989667, "learning_rate": 1.9032714562450345e-05, "loss": 0.449, "step": 2881 }, { "epoch": 0.568217665615142, "grad_norm": 0.6403276197455571, "learning_rate": 1.903204938499907e-05, "loss": 0.4453, "step": 2882 }, { "epoch": 0.5684148264984227, "grad_norm": 0.561414469598965, "learning_rate": 1.9031383990545532e-05, "loss": 0.4499, "step": 2883 }, { "epoch": 0.5686119873817035, "grad_norm": 0.6500041432778779, "learning_rate": 1.9030718379105726e-05, "loss": 0.4347, "step": 2884 }, { "epoch": 0.5688091482649842, "grad_norm": 0.504940933975182, "learning_rate": 1.9030052550695636e-05, "loss": 0.3846, "step": 2885 }, { "epoch": 0.569006309148265, "grad_norm": 0.6281846779360019, "learning_rate": 1.902938650533126e-05, "loss": 0.4713, "step": 2886 }, { "epoch": 0.5692034700315457, "grad_norm": 0.5974696177502193, "learning_rate": 1.9028720243028604e-05, "loss": 0.4417, "step": 2887 }, { "epoch": 0.5694006309148265, "grad_norm": 0.5826680180672237, "learning_rate": 1.9028053763803673e-05, "loss": 0.4566, "step": 2888 }, { "epoch": 0.5695977917981072, "grad_norm": 0.5875996261062907, "learning_rate": 1.902738706767248e-05, "loss": 0.4656, "step": 2889 }, { "epoch": 0.569794952681388, "grad_norm": 0.5602438327843606, "learning_rate": 1.902672015465104e-05, "loss": 0.4268, "step": 2890 }, { "epoch": 0.5699921135646687, "grad_norm": 0.6034100519824507, "learning_rate": 1.9026053024755384e-05, "loss": 0.499, "step": 2891 }, { "epoch": 0.5701892744479495, "grad_norm": 0.5961260542439667, "learning_rate": 1.902538567800153e-05, "loss": 0.4556, "step": 2892 }, { "epoch": 0.5703864353312302, "grad_norm": 0.5847619792244266, "learning_rate": 1.902471811440552e-05, "loss": 0.444, "step": 2893 }, { "epoch": 0.570583596214511, "grad_norm": 0.5717255423270757, "learning_rate": 1.902405033398339e-05, "loss": 0.4604, "step": 2894 }, { "epoch": 0.5707807570977917, "grad_norm": 0.5775461408981852, "learning_rate": 1.9023382336751185e-05, "loss": 0.4372, "step": 2895 }, { "epoch": 0.5709779179810726, "grad_norm": 0.5830553771363007, "learning_rate": 1.902271412272495e-05, "loss": 0.4547, "step": 2896 }, { "epoch": 0.5711750788643533, "grad_norm": 0.6305168162558938, "learning_rate": 1.9022045691920742e-05, "loss": 0.4703, "step": 2897 }, { "epoch": 0.5713722397476341, "grad_norm": 0.5848064263309912, "learning_rate": 1.9021377044354624e-05, "loss": 0.4542, "step": 2898 }, { "epoch": 0.5715694006309149, "grad_norm": 0.6638321170730301, "learning_rate": 1.9020708180042654e-05, "loss": 0.4936, "step": 2899 }, { "epoch": 0.5717665615141956, "grad_norm": 0.637624069032922, "learning_rate": 1.902003909900091e-05, "loss": 0.4551, "step": 2900 }, { "epoch": 0.5719637223974764, "grad_norm": 0.623095105439941, "learning_rate": 1.9019369801245458e-05, "loss": 0.4491, "step": 2901 }, { "epoch": 0.5721608832807571, "grad_norm": 0.548146806710325, "learning_rate": 1.9018700286792388e-05, "loss": 0.4262, "step": 2902 }, { "epoch": 0.5723580441640379, "grad_norm": 0.6045037208451278, "learning_rate": 1.9018030555657776e-05, "loss": 0.4825, "step": 2903 }, { "epoch": 0.5725552050473186, "grad_norm": 0.5859988705690671, "learning_rate": 1.9017360607857724e-05, "loss": 0.4446, "step": 2904 }, { "epoch": 0.5727523659305994, "grad_norm": 0.565267471448854, "learning_rate": 1.9016690443408314e-05, "loss": 0.4103, "step": 2905 }, { "epoch": 0.5729495268138801, "grad_norm": 0.6135424852694339, "learning_rate": 1.901602006232566e-05, "loss": 0.4822, "step": 2906 }, { "epoch": 0.5731466876971609, "grad_norm": 0.5918245843382935, "learning_rate": 1.901534946462586e-05, "loss": 0.4231, "step": 2907 }, { "epoch": 0.5733438485804416, "grad_norm": 0.5728688344893811, "learning_rate": 1.9014678650325035e-05, "loss": 0.4464, "step": 2908 }, { "epoch": 0.5735410094637224, "grad_norm": 0.6157367306965482, "learning_rate": 1.901400761943929e-05, "loss": 0.4399, "step": 2909 }, { "epoch": 0.5737381703470031, "grad_norm": 0.8263123254378486, "learning_rate": 1.9013336371984756e-05, "loss": 0.4274, "step": 2910 }, { "epoch": 0.573935331230284, "grad_norm": 0.685238179627279, "learning_rate": 1.9012664907977557e-05, "loss": 0.4683, "step": 2911 }, { "epoch": 0.5741324921135647, "grad_norm": 0.5358383697418044, "learning_rate": 1.9011993227433826e-05, "loss": 0.3689, "step": 2912 }, { "epoch": 0.5743296529968455, "grad_norm": 0.6077623131242404, "learning_rate": 1.9011321330369696e-05, "loss": 0.4245, "step": 2913 }, { "epoch": 0.5745268138801262, "grad_norm": 0.5994256634418688, "learning_rate": 1.9010649216801316e-05, "loss": 0.4768, "step": 2914 }, { "epoch": 0.574723974763407, "grad_norm": 0.6159531989573963, "learning_rate": 1.9009976886744837e-05, "loss": 0.4501, "step": 2915 }, { "epoch": 0.5749211356466877, "grad_norm": 0.6096049034892379, "learning_rate": 1.9009304340216403e-05, "loss": 0.4464, "step": 2916 }, { "epoch": 0.5751182965299685, "grad_norm": 0.5603268106696802, "learning_rate": 1.900863157723218e-05, "loss": 0.458, "step": 2917 }, { "epoch": 0.5753154574132492, "grad_norm": 0.5558645790716764, "learning_rate": 1.9007958597808326e-05, "loss": 0.3959, "step": 2918 }, { "epoch": 0.57551261829653, "grad_norm": 0.6195875338599605, "learning_rate": 1.9007285401961016e-05, "loss": 0.5038, "step": 2919 }, { "epoch": 0.5757097791798107, "grad_norm": 0.5783958420849634, "learning_rate": 1.9006611989706417e-05, "loss": 0.4647, "step": 2920 }, { "epoch": 0.5759069400630915, "grad_norm": 0.5986709701715736, "learning_rate": 1.9005938361060714e-05, "loss": 0.457, "step": 2921 }, { "epoch": 0.5761041009463722, "grad_norm": 0.5992935813507028, "learning_rate": 1.900526451604009e-05, "loss": 0.4146, "step": 2922 }, { "epoch": 0.576301261829653, "grad_norm": 0.6114620175570423, "learning_rate": 1.900459045466073e-05, "loss": 0.4699, "step": 2923 }, { "epoch": 0.5764984227129337, "grad_norm": 0.5778870682581325, "learning_rate": 1.9003916176938837e-05, "loss": 0.4132, "step": 2924 }, { "epoch": 0.5766955835962145, "grad_norm": 0.556437544810432, "learning_rate": 1.9003241682890607e-05, "loss": 0.4196, "step": 2925 }, { "epoch": 0.5768927444794952, "grad_norm": 0.5654030288060881, "learning_rate": 1.9002566972532242e-05, "loss": 0.4016, "step": 2926 }, { "epoch": 0.577089905362776, "grad_norm": 0.5874171442829121, "learning_rate": 1.9001892045879963e-05, "loss": 0.4617, "step": 2927 }, { "epoch": 0.5772870662460567, "grad_norm": 0.5875515040090509, "learning_rate": 1.9001216902949974e-05, "loss": 0.4511, "step": 2928 }, { "epoch": 0.5774842271293376, "grad_norm": 0.5568685656549751, "learning_rate": 1.9000541543758497e-05, "loss": 0.4347, "step": 2929 }, { "epoch": 0.5776813880126183, "grad_norm": 0.5746594931865293, "learning_rate": 1.8999865968321765e-05, "loss": 0.4523, "step": 2930 }, { "epoch": 0.5778785488958991, "grad_norm": 0.5500934493880131, "learning_rate": 1.8999190176656004e-05, "loss": 0.401, "step": 2931 }, { "epoch": 0.5780757097791798, "grad_norm": 0.5988645511870025, "learning_rate": 1.8998514168777453e-05, "loss": 0.4418, "step": 2932 }, { "epoch": 0.5782728706624606, "grad_norm": 0.5787533040554713, "learning_rate": 1.8997837944702352e-05, "loss": 0.4552, "step": 2933 }, { "epoch": 0.5784700315457413, "grad_norm": 0.5924302662548973, "learning_rate": 1.899716150444695e-05, "loss": 0.423, "step": 2934 }, { "epoch": 0.5786671924290221, "grad_norm": 2.4787691128805935, "learning_rate": 1.8996484848027496e-05, "loss": 0.4051, "step": 2935 }, { "epoch": 0.5788643533123028, "grad_norm": 0.722111103146418, "learning_rate": 1.8995807975460246e-05, "loss": 0.4383, "step": 2936 }, { "epoch": 0.5790615141955836, "grad_norm": 0.7452661199567964, "learning_rate": 1.8995130886761468e-05, "loss": 0.4328, "step": 2937 }, { "epoch": 0.5792586750788643, "grad_norm": 0.6272537937748907, "learning_rate": 1.8994453581947428e-05, "loss": 0.4501, "step": 2938 }, { "epoch": 0.5794558359621451, "grad_norm": 0.5871428411077138, "learning_rate": 1.8993776061034394e-05, "loss": 0.4197, "step": 2939 }, { "epoch": 0.5796529968454258, "grad_norm": 0.6082870109603066, "learning_rate": 1.899309832403865e-05, "loss": 0.4397, "step": 2940 }, { "epoch": 0.5798501577287066, "grad_norm": 0.5698173415968192, "learning_rate": 1.8992420370976476e-05, "loss": 0.4297, "step": 2941 }, { "epoch": 0.5800473186119873, "grad_norm": 0.659454422363326, "learning_rate": 1.899174220186416e-05, "loss": 0.4565, "step": 2942 }, { "epoch": 0.5802444794952681, "grad_norm": 0.5921121311398183, "learning_rate": 1.8991063816717998e-05, "loss": 0.4357, "step": 2943 }, { "epoch": 0.580441640378549, "grad_norm": 0.6950468872106844, "learning_rate": 1.899038521555429e-05, "loss": 0.4087, "step": 2944 }, { "epoch": 0.5806388012618297, "grad_norm": 0.5936144532941147, "learning_rate": 1.8989706398389335e-05, "loss": 0.4237, "step": 2945 }, { "epoch": 0.5808359621451105, "grad_norm": 0.5868915401669754, "learning_rate": 1.8989027365239443e-05, "loss": 0.4006, "step": 2946 }, { "epoch": 0.5810331230283912, "grad_norm": 0.6069291322319756, "learning_rate": 1.8988348116120926e-05, "loss": 0.4487, "step": 2947 }, { "epoch": 0.581230283911672, "grad_norm": 0.5598237023370913, "learning_rate": 1.8987668651050117e-05, "loss": 0.4363, "step": 2948 }, { "epoch": 0.5814274447949527, "grad_norm": 0.6162522565496759, "learning_rate": 1.8986988970043324e-05, "loss": 0.4778, "step": 2949 }, { "epoch": 0.5816246056782335, "grad_norm": 0.5922390245389011, "learning_rate": 1.8986309073116883e-05, "loss": 0.4492, "step": 2950 }, { "epoch": 0.5818217665615142, "grad_norm": 0.5993508377014622, "learning_rate": 1.8985628960287134e-05, "loss": 0.4637, "step": 2951 }, { "epoch": 0.582018927444795, "grad_norm": 0.549571504491819, "learning_rate": 1.898494863157041e-05, "loss": 0.425, "step": 2952 }, { "epoch": 0.5822160883280757, "grad_norm": 0.5759972011389546, "learning_rate": 1.898426808698306e-05, "loss": 0.4408, "step": 2953 }, { "epoch": 0.5824132492113565, "grad_norm": 0.5482466547534278, "learning_rate": 1.8983587326541437e-05, "loss": 0.4289, "step": 2954 }, { "epoch": 0.5826104100946372, "grad_norm": 0.5913250211398492, "learning_rate": 1.898290635026189e-05, "loss": 0.4629, "step": 2955 }, { "epoch": 0.582807570977918, "grad_norm": 0.570705758804096, "learning_rate": 1.8982225158160788e-05, "loss": 0.4442, "step": 2956 }, { "epoch": 0.5830047318611987, "grad_norm": 0.5878782769052985, "learning_rate": 1.898154375025449e-05, "loss": 0.3961, "step": 2957 }, { "epoch": 0.5832018927444795, "grad_norm": 0.5735902991837972, "learning_rate": 1.8980862126559373e-05, "loss": 0.4702, "step": 2958 }, { "epoch": 0.5833990536277602, "grad_norm": 0.5657885791348485, "learning_rate": 1.898018028709181e-05, "loss": 0.455, "step": 2959 }, { "epoch": 0.583596214511041, "grad_norm": 1.3679534419258494, "learning_rate": 1.8979498231868183e-05, "loss": 0.4948, "step": 2960 }, { "epoch": 0.5837933753943217, "grad_norm": 0.6017442927840219, "learning_rate": 1.897881596090488e-05, "loss": 0.4728, "step": 2961 }, { "epoch": 0.5839905362776026, "grad_norm": 0.5555623988000693, "learning_rate": 1.8978133474218294e-05, "loss": 0.4455, "step": 2962 }, { "epoch": 0.5841876971608833, "grad_norm": 0.6041207462728285, "learning_rate": 1.897745077182482e-05, "loss": 0.4544, "step": 2963 }, { "epoch": 0.5843848580441641, "grad_norm": 0.6250067192179154, "learning_rate": 1.8976767853740866e-05, "loss": 0.4752, "step": 2964 }, { "epoch": 0.5845820189274448, "grad_norm": 0.571207168389721, "learning_rate": 1.897608471998283e-05, "loss": 0.4342, "step": 2965 }, { "epoch": 0.5847791798107256, "grad_norm": 2.8095095602359463, "learning_rate": 1.897540137056713e-05, "loss": 0.4852, "step": 2966 }, { "epoch": 0.5849763406940063, "grad_norm": 0.6427772218920856, "learning_rate": 1.897471780551019e-05, "loss": 0.4679, "step": 2967 }, { "epoch": 0.5851735015772871, "grad_norm": 0.5594426060918991, "learning_rate": 1.897403402482842e-05, "loss": 0.4083, "step": 2968 }, { "epoch": 0.5853706624605678, "grad_norm": 0.5801095527319007, "learning_rate": 1.897335002853826e-05, "loss": 0.423, "step": 2969 }, { "epoch": 0.5855678233438486, "grad_norm": 0.7313301838049326, "learning_rate": 1.897266581665614e-05, "loss": 0.4418, "step": 2970 }, { "epoch": 0.5857649842271293, "grad_norm": 0.615578548987388, "learning_rate": 1.8971981389198495e-05, "loss": 0.4555, "step": 2971 }, { "epoch": 0.5859621451104101, "grad_norm": 0.5588797678763053, "learning_rate": 1.8971296746181774e-05, "loss": 0.423, "step": 2972 }, { "epoch": 0.5861593059936908, "grad_norm": 0.6472108314860092, "learning_rate": 1.8970611887622425e-05, "loss": 0.4733, "step": 2973 }, { "epoch": 0.5863564668769716, "grad_norm": 0.6277578303497903, "learning_rate": 1.89699268135369e-05, "loss": 0.4319, "step": 2974 }, { "epoch": 0.5865536277602523, "grad_norm": 8.45953869638077, "learning_rate": 1.8969241523941662e-05, "loss": 0.4668, "step": 2975 }, { "epoch": 0.5867507886435331, "grad_norm": 0.751366864143633, "learning_rate": 1.896855601885317e-05, "loss": 0.4836, "step": 2976 }, { "epoch": 0.5869479495268138, "grad_norm": 0.5909034853303601, "learning_rate": 1.89678702982879e-05, "loss": 0.5005, "step": 2977 }, { "epoch": 0.5871451104100947, "grad_norm": 0.7792417477818725, "learning_rate": 1.8967184362262324e-05, "loss": 0.4447, "step": 2978 }, { "epoch": 0.5873422712933754, "grad_norm": 1.0145547450801644, "learning_rate": 1.896649821079292e-05, "loss": 0.4624, "step": 2979 }, { "epoch": 0.5875394321766562, "grad_norm": 0.5792099704275391, "learning_rate": 1.8965811843896178e-05, "loss": 0.4701, "step": 2980 }, { "epoch": 0.5877365930599369, "grad_norm": 0.6324701038532613, "learning_rate": 1.8965125261588586e-05, "loss": 0.4543, "step": 2981 }, { "epoch": 0.5879337539432177, "grad_norm": 0.6298268546935512, "learning_rate": 1.8964438463886638e-05, "loss": 0.4484, "step": 2982 }, { "epoch": 0.5881309148264984, "grad_norm": 0.6842407738734424, "learning_rate": 1.896375145080684e-05, "loss": 0.4355, "step": 2983 }, { "epoch": 0.5883280757097792, "grad_norm": 0.6522884562722095, "learning_rate": 1.8963064222365694e-05, "loss": 0.469, "step": 2984 }, { "epoch": 0.5885252365930599, "grad_norm": 0.6206326292297457, "learning_rate": 1.896237677857971e-05, "loss": 0.471, "step": 2985 }, { "epoch": 0.5887223974763407, "grad_norm": 0.5650099483761517, "learning_rate": 1.896168911946541e-05, "loss": 0.4313, "step": 2986 }, { "epoch": 0.5889195583596214, "grad_norm": 0.653001742089255, "learning_rate": 1.896100124503931e-05, "loss": 0.4548, "step": 2987 }, { "epoch": 0.5891167192429022, "grad_norm": 0.5751129122223437, "learning_rate": 1.896031315531794e-05, "loss": 0.4307, "step": 2988 }, { "epoch": 0.589313880126183, "grad_norm": 0.5731286729374794, "learning_rate": 1.895962485031783e-05, "loss": 0.429, "step": 2989 }, { "epoch": 0.5895110410094637, "grad_norm": 0.5812391295799345, "learning_rate": 1.8958936330055516e-05, "loss": 0.4326, "step": 2990 }, { "epoch": 0.5897082018927445, "grad_norm": 0.6341740958943612, "learning_rate": 1.8958247594547543e-05, "loss": 0.4421, "step": 2991 }, { "epoch": 0.5899053627760252, "grad_norm": 0.5855292136590585, "learning_rate": 1.895755864381046e-05, "loss": 0.4476, "step": 2992 }, { "epoch": 0.590102523659306, "grad_norm": 0.5897754099618571, "learning_rate": 1.8956869477860813e-05, "loss": 0.4347, "step": 2993 }, { "epoch": 0.5902996845425867, "grad_norm": 0.5651589224007416, "learning_rate": 1.8956180096715166e-05, "loss": 0.47, "step": 2994 }, { "epoch": 0.5904968454258676, "grad_norm": 0.5901739340372232, "learning_rate": 1.895549050039008e-05, "loss": 0.4463, "step": 2995 }, { "epoch": 0.5906940063091483, "grad_norm": 0.5918462673605263, "learning_rate": 1.8954800688902125e-05, "loss": 0.4616, "step": 2996 }, { "epoch": 0.5908911671924291, "grad_norm": 0.5857619648713869, "learning_rate": 1.8954110662267868e-05, "loss": 0.4655, "step": 2997 }, { "epoch": 0.5910883280757098, "grad_norm": 0.5749020201058925, "learning_rate": 1.8953420420503894e-05, "loss": 0.4538, "step": 2998 }, { "epoch": 0.5912854889589906, "grad_norm": 0.6364951608672293, "learning_rate": 1.8952729963626783e-05, "loss": 0.4516, "step": 2999 }, { "epoch": 0.5914826498422713, "grad_norm": 0.5455105233117238, "learning_rate": 1.8952039291653126e-05, "loss": 0.4129, "step": 3000 }, { "epoch": 0.5916798107255521, "grad_norm": 0.5688101335852396, "learning_rate": 1.8951348404599518e-05, "loss": 0.436, "step": 3001 }, { "epoch": 0.5918769716088328, "grad_norm": 0.5869599963349247, "learning_rate": 1.895065730248255e-05, "loss": 0.4625, "step": 3002 }, { "epoch": 0.5920741324921136, "grad_norm": 0.583433314102157, "learning_rate": 1.8949965985318835e-05, "loss": 0.4619, "step": 3003 }, { "epoch": 0.5922712933753943, "grad_norm": 1.8179141375028536, "learning_rate": 1.8949274453124985e-05, "loss": 0.4457, "step": 3004 }, { "epoch": 0.5924684542586751, "grad_norm": 1.0052816097153328, "learning_rate": 1.8948582705917605e-05, "loss": 0.3993, "step": 3005 }, { "epoch": 0.5926656151419558, "grad_norm": 0.6655550567863493, "learning_rate": 1.8947890743713316e-05, "loss": 0.4633, "step": 3006 }, { "epoch": 0.5928627760252366, "grad_norm": 0.6343745778626908, "learning_rate": 1.8947198566528752e-05, "loss": 0.4534, "step": 3007 }, { "epoch": 0.5930599369085173, "grad_norm": 0.8146895268571629, "learning_rate": 1.894650617438054e-05, "loss": 0.4622, "step": 3008 }, { "epoch": 0.5932570977917981, "grad_norm": 0.5919980337355466, "learning_rate": 1.8945813567285303e-05, "loss": 0.4444, "step": 3009 }, { "epoch": 0.5934542586750788, "grad_norm": 0.6233642830617198, "learning_rate": 1.8945120745259696e-05, "loss": 0.4585, "step": 3010 }, { "epoch": 0.5936514195583596, "grad_norm": 0.6560319675501647, "learning_rate": 1.894442770832036e-05, "loss": 0.4733, "step": 3011 }, { "epoch": 0.5938485804416404, "grad_norm": 0.6361924491098656, "learning_rate": 1.8943734456483944e-05, "loss": 0.463, "step": 3012 }, { "epoch": 0.5940457413249212, "grad_norm": 0.5872105410259241, "learning_rate": 1.8943040989767104e-05, "loss": 0.4613, "step": 3013 }, { "epoch": 0.5942429022082019, "grad_norm": 0.9249850168680832, "learning_rate": 1.8942347308186506e-05, "loss": 0.4223, "step": 3014 }, { "epoch": 0.5944400630914827, "grad_norm": 0.5669241268386221, "learning_rate": 1.8941653411758813e-05, "loss": 0.4231, "step": 3015 }, { "epoch": 0.5946372239747634, "grad_norm": 0.6196619172043841, "learning_rate": 1.894095930050069e-05, "loss": 0.4403, "step": 3016 }, { "epoch": 0.5948343848580442, "grad_norm": 0.563726313430164, "learning_rate": 1.8940264974428827e-05, "loss": 0.4407, "step": 3017 }, { "epoch": 0.5950315457413249, "grad_norm": 0.6200267950921632, "learning_rate": 1.8939570433559894e-05, "loss": 0.4129, "step": 3018 }, { "epoch": 0.5952287066246057, "grad_norm": 0.6093402086204679, "learning_rate": 1.8938875677910586e-05, "loss": 0.464, "step": 3019 }, { "epoch": 0.5954258675078864, "grad_norm": 0.5932927503972287, "learning_rate": 1.8938180707497588e-05, "loss": 0.4361, "step": 3020 }, { "epoch": 0.5956230283911672, "grad_norm": 0.57662299638576, "learning_rate": 1.89374855223376e-05, "loss": 0.4501, "step": 3021 }, { "epoch": 0.5958201892744479, "grad_norm": 0.5619926542960811, "learning_rate": 1.8936790122447327e-05, "loss": 0.4682, "step": 3022 }, { "epoch": 0.5960173501577287, "grad_norm": 0.7525793185982179, "learning_rate": 1.8936094507843476e-05, "loss": 0.4779, "step": 3023 }, { "epoch": 0.5962145110410094, "grad_norm": 0.5548877208195396, "learning_rate": 1.8935398678542752e-05, "loss": 0.4321, "step": 3024 }, { "epoch": 0.5964116719242902, "grad_norm": 0.5988617075988154, "learning_rate": 1.8934702634561887e-05, "loss": 0.4293, "step": 3025 }, { "epoch": 0.5966088328075709, "grad_norm": 0.6782737257196965, "learning_rate": 1.893400637591759e-05, "loss": 0.4032, "step": 3026 }, { "epoch": 0.5968059936908517, "grad_norm": 0.5513926310853049, "learning_rate": 1.8933309902626598e-05, "loss": 0.4134, "step": 3027 }, { "epoch": 0.5970031545741324, "grad_norm": 0.5598624884027122, "learning_rate": 1.893261321470564e-05, "loss": 0.4409, "step": 3028 }, { "epoch": 0.5972003154574133, "grad_norm": 0.5745829727781053, "learning_rate": 1.893191631217146e-05, "loss": 0.4699, "step": 3029 }, { "epoch": 0.597397476340694, "grad_norm": 0.6008234547988593, "learning_rate": 1.8931219195040796e-05, "loss": 0.4526, "step": 3030 }, { "epoch": 0.5975946372239748, "grad_norm": 0.9177487857817422, "learning_rate": 1.8930521863330395e-05, "loss": 0.4306, "step": 3031 }, { "epoch": 0.5977917981072555, "grad_norm": 0.5675597820246167, "learning_rate": 1.892982431705702e-05, "loss": 0.4458, "step": 3032 }, { "epoch": 0.5979889589905363, "grad_norm": 18.36759892361865, "learning_rate": 1.892912655623742e-05, "loss": 0.4854, "step": 3033 }, { "epoch": 0.598186119873817, "grad_norm": 0.9555573861343462, "learning_rate": 1.8928428580888365e-05, "loss": 0.4249, "step": 3034 }, { "epoch": 0.5983832807570978, "grad_norm": 0.6045541668448458, "learning_rate": 1.8927730391026625e-05, "loss": 0.4369, "step": 3035 }, { "epoch": 0.5985804416403786, "grad_norm": 0.5905804898359281, "learning_rate": 1.8927031986668973e-05, "loss": 0.4367, "step": 3036 }, { "epoch": 0.5987776025236593, "grad_norm": 0.6171855342108601, "learning_rate": 1.8926333367832188e-05, "loss": 0.4311, "step": 3037 }, { "epoch": 0.5989747634069401, "grad_norm": 0.561383493809823, "learning_rate": 1.8925634534533054e-05, "loss": 0.4225, "step": 3038 }, { "epoch": 0.5991719242902208, "grad_norm": 0.6623749634079279, "learning_rate": 1.8924935486788362e-05, "loss": 0.4703, "step": 3039 }, { "epoch": 0.5993690851735016, "grad_norm": 0.6116882687261561, "learning_rate": 1.892423622461491e-05, "loss": 0.4566, "step": 3040 }, { "epoch": 0.5995662460567823, "grad_norm": 0.5713571661306532, "learning_rate": 1.8923536748029495e-05, "loss": 0.4275, "step": 3041 }, { "epoch": 0.5997634069400631, "grad_norm": 0.6069279944169308, "learning_rate": 1.8922837057048925e-05, "loss": 0.4318, "step": 3042 }, { "epoch": 0.5999605678233438, "grad_norm": 0.5706152548682657, "learning_rate": 1.8922137151690005e-05, "loss": 0.4448, "step": 3043 }, { "epoch": 0.6001577287066246, "grad_norm": 0.6191149624010278, "learning_rate": 1.8921437031969557e-05, "loss": 0.4668, "step": 3044 }, { "epoch": 0.6003548895899053, "grad_norm": 0.537186705194169, "learning_rate": 1.8920736697904406e-05, "loss": 0.3969, "step": 3045 }, { "epoch": 0.6005520504731862, "grad_norm": 0.5823679438434386, "learning_rate": 1.8920036149511365e-05, "loss": 0.4349, "step": 3046 }, { "epoch": 0.6007492113564669, "grad_norm": 0.5764684381732735, "learning_rate": 1.8919335386807275e-05, "loss": 0.449, "step": 3047 }, { "epoch": 0.6009463722397477, "grad_norm": 0.5972458224831989, "learning_rate": 1.891863440980897e-05, "loss": 0.4386, "step": 3048 }, { "epoch": 0.6011435331230284, "grad_norm": 0.6843819029628029, "learning_rate": 1.8917933218533285e-05, "loss": 0.4148, "step": 3049 }, { "epoch": 0.6013406940063092, "grad_norm": 23.053542751701265, "learning_rate": 1.891723181299708e-05, "loss": 0.4442, "step": 3050 }, { "epoch": 0.6015378548895899, "grad_norm": 0.6757852367698026, "learning_rate": 1.8916530193217197e-05, "loss": 0.411, "step": 3051 }, { "epoch": 0.6017350157728707, "grad_norm": 0.6378151249084146, "learning_rate": 1.8915828359210494e-05, "loss": 0.4503, "step": 3052 }, { "epoch": 0.6019321766561514, "grad_norm": 0.8860348502775296, "learning_rate": 1.8915126310993838e-05, "loss": 0.4358, "step": 3053 }, { "epoch": 0.6021293375394322, "grad_norm": 0.6733989234081417, "learning_rate": 1.891442404858409e-05, "loss": 0.4989, "step": 3054 }, { "epoch": 0.6023264984227129, "grad_norm": 0.6616823955387168, "learning_rate": 1.891372157199813e-05, "loss": 0.4833, "step": 3055 }, { "epoch": 0.6025236593059937, "grad_norm": 0.613822177088368, "learning_rate": 1.8913018881252827e-05, "loss": 0.4309, "step": 3056 }, { "epoch": 0.6027208201892744, "grad_norm": 0.7450747392214685, "learning_rate": 1.891231597636507e-05, "loss": 0.4752, "step": 3057 }, { "epoch": 0.6029179810725552, "grad_norm": 0.5822085751848699, "learning_rate": 1.8911612857351743e-05, "loss": 0.4617, "step": 3058 }, { "epoch": 0.6031151419558359, "grad_norm": 0.5976169928739026, "learning_rate": 1.891090952422974e-05, "loss": 0.4175, "step": 3059 }, { "epoch": 0.6033123028391167, "grad_norm": 0.6141756747523544, "learning_rate": 1.891020597701596e-05, "loss": 0.4467, "step": 3060 }, { "epoch": 0.6035094637223974, "grad_norm": 0.7375579702582761, "learning_rate": 1.890950221572731e-05, "loss": 0.4993, "step": 3061 }, { "epoch": 0.6037066246056783, "grad_norm": 0.6447777519058864, "learning_rate": 1.8908798240380692e-05, "loss": 0.4402, "step": 3062 }, { "epoch": 0.603903785488959, "grad_norm": 0.572469473088046, "learning_rate": 1.890809405099302e-05, "loss": 0.4245, "step": 3063 }, { "epoch": 0.6041009463722398, "grad_norm": 6.873905360016103, "learning_rate": 1.8907389647581216e-05, "loss": 0.5418, "step": 3064 }, { "epoch": 0.6042981072555205, "grad_norm": 0.7088428326506333, "learning_rate": 1.89066850301622e-05, "loss": 0.4588, "step": 3065 }, { "epoch": 0.6044952681388013, "grad_norm": 0.6487251096155348, "learning_rate": 1.890598019875291e-05, "loss": 0.47, "step": 3066 }, { "epoch": 0.604692429022082, "grad_norm": 0.7195687916217166, "learning_rate": 1.8905275153370272e-05, "loss": 0.4444, "step": 3067 }, { "epoch": 0.6048895899053628, "grad_norm": 0.6405425829888592, "learning_rate": 1.890456989403122e-05, "loss": 0.4242, "step": 3068 }, { "epoch": 0.6050867507886435, "grad_norm": 1.1334558627993632, "learning_rate": 1.8903864420752712e-05, "loss": 0.497, "step": 3069 }, { "epoch": 0.6052839116719243, "grad_norm": 0.612356344424891, "learning_rate": 1.890315873355169e-05, "loss": 0.3887, "step": 3070 }, { "epoch": 0.605481072555205, "grad_norm": 0.666337306996091, "learning_rate": 1.890245283244511e-05, "loss": 0.4491, "step": 3071 }, { "epoch": 0.6056782334384858, "grad_norm": 0.6409164321823476, "learning_rate": 1.8901746717449932e-05, "loss": 0.4323, "step": 3072 }, { "epoch": 0.6058753943217665, "grad_norm": 0.6288119419802719, "learning_rate": 1.8901040388583117e-05, "loss": 0.4444, "step": 3073 }, { "epoch": 0.6060725552050473, "grad_norm": 0.6764142603017349, "learning_rate": 1.8900333845861643e-05, "loss": 0.4598, "step": 3074 }, { "epoch": 0.606269716088328, "grad_norm": 0.6209145176055464, "learning_rate": 1.889962708930248e-05, "loss": 0.4582, "step": 3075 }, { "epoch": 0.6064668769716088, "grad_norm": 0.6837083145298952, "learning_rate": 1.8898920118922607e-05, "loss": 0.4073, "step": 3076 }, { "epoch": 0.6066640378548895, "grad_norm": 0.6516951362240437, "learning_rate": 1.8898212934739012e-05, "loss": 0.4418, "step": 3077 }, { "epoch": 0.6068611987381703, "grad_norm": 0.6228422659793167, "learning_rate": 1.889750553676869e-05, "loss": 0.4025, "step": 3078 }, { "epoch": 0.607058359621451, "grad_norm": 0.9492283175824956, "learning_rate": 1.8896797925028626e-05, "loss": 0.4269, "step": 3079 }, { "epoch": 0.6072555205047319, "grad_norm": 0.6988466626741697, "learning_rate": 1.8896090099535834e-05, "loss": 0.4966, "step": 3080 }, { "epoch": 0.6074526813880127, "grad_norm": 1.6653405313238836, "learning_rate": 1.889538206030731e-05, "loss": 0.4327, "step": 3081 }, { "epoch": 0.6076498422712934, "grad_norm": 0.6950411466750152, "learning_rate": 1.8894673807360065e-05, "loss": 0.4513, "step": 3082 }, { "epoch": 0.6078470031545742, "grad_norm": 0.5702620043948126, "learning_rate": 1.8893965340711126e-05, "loss": 0.4481, "step": 3083 }, { "epoch": 0.6080441640378549, "grad_norm": 0.6716600497276086, "learning_rate": 1.8893256660377505e-05, "loss": 0.4765, "step": 3084 }, { "epoch": 0.6082413249211357, "grad_norm": 0.602345996688224, "learning_rate": 1.8892547766376228e-05, "loss": 0.4651, "step": 3085 }, { "epoch": 0.6084384858044164, "grad_norm": 0.5853869541989889, "learning_rate": 1.889183865872433e-05, "loss": 0.4199, "step": 3086 }, { "epoch": 0.6086356466876972, "grad_norm": 0.6334963552684939, "learning_rate": 1.889112933743885e-05, "loss": 0.4836, "step": 3087 }, { "epoch": 0.6088328075709779, "grad_norm": 0.6800088424135952, "learning_rate": 1.8890419802536826e-05, "loss": 0.4875, "step": 3088 }, { "epoch": 0.6090299684542587, "grad_norm": 0.6475039848768125, "learning_rate": 1.888971005403531e-05, "loss": 0.4679, "step": 3089 }, { "epoch": 0.6092271293375394, "grad_norm": 0.6133780661367402, "learning_rate": 1.8889000091951347e-05, "loss": 0.4805, "step": 3090 }, { "epoch": 0.6094242902208202, "grad_norm": 0.6087154224375381, "learning_rate": 1.8888289916302e-05, "loss": 0.4193, "step": 3091 }, { "epoch": 0.6096214511041009, "grad_norm": 0.6392859491176853, "learning_rate": 1.8887579527104332e-05, "loss": 0.4499, "step": 3092 }, { "epoch": 0.6098186119873817, "grad_norm": 0.561076270997986, "learning_rate": 1.8886868924375407e-05, "loss": 0.4607, "step": 3093 }, { "epoch": 0.6100157728706624, "grad_norm": 0.6104019667478391, "learning_rate": 1.8886158108132298e-05, "loss": 0.448, "step": 3094 }, { "epoch": 0.6102129337539433, "grad_norm": 0.5694020567275695, "learning_rate": 1.888544707839209e-05, "loss": 0.4318, "step": 3095 }, { "epoch": 0.610410094637224, "grad_norm": 0.6155013174910009, "learning_rate": 1.888473583517185e-05, "loss": 0.4129, "step": 3096 }, { "epoch": 0.6106072555205048, "grad_norm": 0.8830383227880512, "learning_rate": 1.8884024378488686e-05, "loss": 0.4668, "step": 3097 }, { "epoch": 0.6108044164037855, "grad_norm": 0.5411137975815842, "learning_rate": 1.888331270835968e-05, "loss": 0.4269, "step": 3098 }, { "epoch": 0.6110015772870663, "grad_norm": 0.8549193019773949, "learning_rate": 1.8882600824801932e-05, "loss": 0.4349, "step": 3099 }, { "epoch": 0.611198738170347, "grad_norm": 5.778893003349931, "learning_rate": 1.888188872783255e-05, "loss": 0.4817, "step": 3100 }, { "epoch": 0.6113958990536278, "grad_norm": 1.1949763025723064, "learning_rate": 1.888117641746863e-05, "loss": 0.4478, "step": 3101 }, { "epoch": 0.6115930599369085, "grad_norm": 0.6510700265765879, "learning_rate": 1.8880463893727297e-05, "loss": 0.4496, "step": 3102 }, { "epoch": 0.6117902208201893, "grad_norm": 0.5839708585126449, "learning_rate": 1.8879751156625673e-05, "loss": 0.4457, "step": 3103 }, { "epoch": 0.61198738170347, "grad_norm": 17.48414582449077, "learning_rate": 1.887903820618087e-05, "loss": 0.4768, "step": 3104 }, { "epoch": 0.6121845425867508, "grad_norm": 1.437092612882128, "learning_rate": 1.887832504241003e-05, "loss": 0.4374, "step": 3105 }, { "epoch": 0.6123817034700315, "grad_norm": 4.569861399699513, "learning_rate": 1.887761166533028e-05, "loss": 0.4938, "step": 3106 }, { "epoch": 0.6125788643533123, "grad_norm": 0.8415642887494247, "learning_rate": 1.8876898074958757e-05, "loss": 0.4235, "step": 3107 }, { "epoch": 0.612776025236593, "grad_norm": 0.7258259413681535, "learning_rate": 1.887618427131261e-05, "loss": 0.4572, "step": 3108 }, { "epoch": 0.6129731861198738, "grad_norm": 0.6290488996802963, "learning_rate": 1.887547025440899e-05, "loss": 0.425, "step": 3109 }, { "epoch": 0.6131703470031545, "grad_norm": 0.7318494963979986, "learning_rate": 1.8874756024265045e-05, "loss": 0.4631, "step": 3110 }, { "epoch": 0.6133675078864353, "grad_norm": 0.6443190758237272, "learning_rate": 1.8874041580897944e-05, "loss": 0.4751, "step": 3111 }, { "epoch": 0.613564668769716, "grad_norm": 0.6538478656310595, "learning_rate": 1.887332692432485e-05, "loss": 0.4124, "step": 3112 }, { "epoch": 0.6137618296529969, "grad_norm": 0.6822067929384322, "learning_rate": 1.8872612054562927e-05, "loss": 0.4744, "step": 3113 }, { "epoch": 0.6139589905362776, "grad_norm": 0.5861492452369412, "learning_rate": 1.8871896971629356e-05, "loss": 0.4494, "step": 3114 }, { "epoch": 0.6141561514195584, "grad_norm": 0.6402603357742715, "learning_rate": 1.8871181675541316e-05, "loss": 0.4203, "step": 3115 }, { "epoch": 0.6143533123028391, "grad_norm": 0.5974596546772335, "learning_rate": 1.8870466166315992e-05, "loss": 0.4682, "step": 3116 }, { "epoch": 0.6145504731861199, "grad_norm": 0.6645668240438647, "learning_rate": 1.8869750443970574e-05, "loss": 0.461, "step": 3117 }, { "epoch": 0.6147476340694006, "grad_norm": 0.5555905216873552, "learning_rate": 1.8869034508522255e-05, "loss": 0.4412, "step": 3118 }, { "epoch": 0.6149447949526814, "grad_norm": 0.6339546381509664, "learning_rate": 1.8868318359988247e-05, "loss": 0.4321, "step": 3119 }, { "epoch": 0.6151419558359621, "grad_norm": 0.5848309865797923, "learning_rate": 1.8867601998385746e-05, "loss": 0.4476, "step": 3120 }, { "epoch": 0.6153391167192429, "grad_norm": 0.6339063416959785, "learning_rate": 1.8866885423731965e-05, "loss": 0.4726, "step": 3121 }, { "epoch": 0.6155362776025236, "grad_norm": 0.6250487989240934, "learning_rate": 1.8866168636044123e-05, "loss": 0.4438, "step": 3122 }, { "epoch": 0.6157334384858044, "grad_norm": 0.5912498590576298, "learning_rate": 1.8865451635339437e-05, "loss": 0.4249, "step": 3123 }, { "epoch": 0.6159305993690851, "grad_norm": 0.9964033422451408, "learning_rate": 1.8864734421635138e-05, "loss": 0.4533, "step": 3124 }, { "epoch": 0.6161277602523659, "grad_norm": 0.6783125045850023, "learning_rate": 1.8864016994948456e-05, "loss": 0.4399, "step": 3125 }, { "epoch": 0.6163249211356467, "grad_norm": 0.5640318304653738, "learning_rate": 1.8863299355296626e-05, "loss": 0.417, "step": 3126 }, { "epoch": 0.6165220820189274, "grad_norm": 0.7270371763719108, "learning_rate": 1.8862581502696893e-05, "loss": 0.4581, "step": 3127 }, { "epoch": 0.6167192429022083, "grad_norm": 0.5953459293919341, "learning_rate": 1.8861863437166503e-05, "loss": 0.4629, "step": 3128 }, { "epoch": 0.616916403785489, "grad_norm": 0.7333601695695047, "learning_rate": 1.8861145158722703e-05, "loss": 0.4449, "step": 3129 }, { "epoch": 0.6171135646687698, "grad_norm": 0.6093823968435338, "learning_rate": 1.886042666738276e-05, "loss": 0.4538, "step": 3130 }, { "epoch": 0.6173107255520505, "grad_norm": 0.7379382803953564, "learning_rate": 1.885970796316393e-05, "loss": 0.422, "step": 3131 }, { "epoch": 0.6175078864353313, "grad_norm": 0.5910536949790164, "learning_rate": 1.885898904608348e-05, "loss": 0.4416, "step": 3132 }, { "epoch": 0.617705047318612, "grad_norm": 0.7181285290226457, "learning_rate": 1.8858269916158683e-05, "loss": 0.4505, "step": 3133 }, { "epoch": 0.6179022082018928, "grad_norm": 0.6153838686175938, "learning_rate": 1.885755057340682e-05, "loss": 0.462, "step": 3134 }, { "epoch": 0.6180993690851735, "grad_norm": 0.6682026146168444, "learning_rate": 1.8856831017845172e-05, "loss": 0.4394, "step": 3135 }, { "epoch": 0.6182965299684543, "grad_norm": 0.6870178773294343, "learning_rate": 1.885611124949102e-05, "loss": 0.4573, "step": 3136 }, { "epoch": 0.618493690851735, "grad_norm": 0.734375693308938, "learning_rate": 1.8855391268361672e-05, "loss": 0.4893, "step": 3137 }, { "epoch": 0.6186908517350158, "grad_norm": 0.6503117986853993, "learning_rate": 1.8854671074474415e-05, "loss": 0.4491, "step": 3138 }, { "epoch": 0.6188880126182965, "grad_norm": 0.6902398712205959, "learning_rate": 1.8853950667846552e-05, "loss": 0.4619, "step": 3139 }, { "epoch": 0.6190851735015773, "grad_norm": 0.5584656484231819, "learning_rate": 1.8853230048495397e-05, "loss": 0.4073, "step": 3140 }, { "epoch": 0.619282334384858, "grad_norm": 0.6705513309518663, "learning_rate": 1.885250921643826e-05, "loss": 0.4545, "step": 3141 }, { "epoch": 0.6194794952681388, "grad_norm": 0.5384726667184218, "learning_rate": 1.885178817169246e-05, "loss": 0.3684, "step": 3142 }, { "epoch": 0.6196766561514195, "grad_norm": 0.706927829405689, "learning_rate": 1.885106691427532e-05, "loss": 0.4447, "step": 3143 }, { "epoch": 0.6198738170347003, "grad_norm": 0.553276508635115, "learning_rate": 1.885034544420417e-05, "loss": 0.4144, "step": 3144 }, { "epoch": 0.620070977917981, "grad_norm": 0.6314672976429081, "learning_rate": 1.8849623761496344e-05, "loss": 0.4518, "step": 3145 }, { "epoch": 0.6202681388012619, "grad_norm": 0.5953908729469908, "learning_rate": 1.884890186616918e-05, "loss": 0.4872, "step": 3146 }, { "epoch": 0.6204652996845426, "grad_norm": 0.6073305974604772, "learning_rate": 1.884817975824002e-05, "loss": 0.441, "step": 3147 }, { "epoch": 0.6206624605678234, "grad_norm": 0.6526389975490265, "learning_rate": 1.884745743772622e-05, "loss": 0.435, "step": 3148 }, { "epoch": 0.6208596214511041, "grad_norm": 0.6677687240737739, "learning_rate": 1.884673490464513e-05, "loss": 0.4322, "step": 3149 }, { "epoch": 0.6210567823343849, "grad_norm": 0.5838277331167949, "learning_rate": 1.884601215901411e-05, "loss": 0.4541, "step": 3150 }, { "epoch": 0.6212539432176656, "grad_norm": 0.5916911486373814, "learning_rate": 1.8845289200850523e-05, "loss": 0.4284, "step": 3151 }, { "epoch": 0.6214511041009464, "grad_norm": 0.5790112173160624, "learning_rate": 1.8844566030171737e-05, "loss": 0.4481, "step": 3152 }, { "epoch": 0.6216482649842271, "grad_norm": 1.1931766299083246, "learning_rate": 1.8843842646995135e-05, "loss": 0.4262, "step": 3153 }, { "epoch": 0.6218454258675079, "grad_norm": 0.6719849832897626, "learning_rate": 1.884311905133809e-05, "loss": 0.4364, "step": 3154 }, { "epoch": 0.6220425867507886, "grad_norm": 0.6366996603140915, "learning_rate": 1.8842395243217986e-05, "loss": 0.452, "step": 3155 }, { "epoch": 0.6222397476340694, "grad_norm": 0.6564134246631125, "learning_rate": 1.884167122265222e-05, "loss": 0.4725, "step": 3156 }, { "epoch": 0.6224369085173501, "grad_norm": 0.6718054415711968, "learning_rate": 1.8840946989658175e-05, "loss": 0.4179, "step": 3157 }, { "epoch": 0.6226340694006309, "grad_norm": 0.6323002874688755, "learning_rate": 1.8840222544253265e-05, "loss": 0.4829, "step": 3158 }, { "epoch": 0.6228312302839116, "grad_norm": 0.5594006708403959, "learning_rate": 1.883949788645489e-05, "loss": 0.4349, "step": 3159 }, { "epoch": 0.6230283911671924, "grad_norm": 0.6793405116890823, "learning_rate": 1.8838773016280457e-05, "loss": 0.4814, "step": 3160 }, { "epoch": 0.6232255520504731, "grad_norm": 0.5889819240809804, "learning_rate": 1.8838047933747386e-05, "loss": 0.477, "step": 3161 }, { "epoch": 0.623422712933754, "grad_norm": 0.6015700310464448, "learning_rate": 1.8837322638873093e-05, "loss": 0.4701, "step": 3162 }, { "epoch": 0.6236198738170347, "grad_norm": 0.537156319132712, "learning_rate": 1.883659713167501e-05, "loss": 0.3994, "step": 3163 }, { "epoch": 0.6238170347003155, "grad_norm": 0.6309490765989089, "learning_rate": 1.8835871412170563e-05, "loss": 0.4657, "step": 3164 }, { "epoch": 0.6240141955835962, "grad_norm": 0.5932465191692712, "learning_rate": 1.8835145480377194e-05, "loss": 0.4406, "step": 3165 }, { "epoch": 0.624211356466877, "grad_norm": 0.5843771388751763, "learning_rate": 1.8834419336312334e-05, "loss": 0.4644, "step": 3166 }, { "epoch": 0.6244085173501577, "grad_norm": 0.5350744617276989, "learning_rate": 1.8833692979993437e-05, "loss": 0.4334, "step": 3167 }, { "epoch": 0.6246056782334385, "grad_norm": 0.6032969293393299, "learning_rate": 1.8832966411437958e-05, "loss": 0.4516, "step": 3168 }, { "epoch": 0.6248028391167192, "grad_norm": 0.5402329623043484, "learning_rate": 1.883223963066334e-05, "loss": 0.4669, "step": 3169 }, { "epoch": 0.625, "grad_norm": 0.6420727536456771, "learning_rate": 1.8831512637687054e-05, "loss": 0.4933, "step": 3170 }, { "epoch": 0.6251971608832808, "grad_norm": 0.6837145058399116, "learning_rate": 1.8830785432526568e-05, "loss": 0.4316, "step": 3171 }, { "epoch": 0.6253943217665615, "grad_norm": 0.5669487608780952, "learning_rate": 1.883005801519935e-05, "loss": 0.4435, "step": 3172 }, { "epoch": 0.6255914826498423, "grad_norm": 0.5500128986494645, "learning_rate": 1.8829330385722875e-05, "loss": 0.4352, "step": 3173 }, { "epoch": 0.625788643533123, "grad_norm": 0.659708848221679, "learning_rate": 1.882860254411463e-05, "loss": 0.4302, "step": 3174 }, { "epoch": 0.6259858044164038, "grad_norm": 0.5634015211579477, "learning_rate": 1.8827874490392095e-05, "loss": 0.4344, "step": 3175 }, { "epoch": 0.6261829652996845, "grad_norm": 0.5511225422280275, "learning_rate": 1.882714622457277e-05, "loss": 0.4493, "step": 3176 }, { "epoch": 0.6263801261829653, "grad_norm": 0.5535762556867952, "learning_rate": 1.882641774667415e-05, "loss": 0.4786, "step": 3177 }, { "epoch": 0.626577287066246, "grad_norm": 0.7296756084321033, "learning_rate": 1.8825689056713733e-05, "loss": 0.4107, "step": 3178 }, { "epoch": 0.6267744479495269, "grad_norm": 0.5691759534943049, "learning_rate": 1.8824960154709027e-05, "loss": 0.4937, "step": 3179 }, { "epoch": 0.6269716088328076, "grad_norm": 0.5643398502645974, "learning_rate": 1.882423104067755e-05, "loss": 0.4371, "step": 3180 }, { "epoch": 0.6271687697160884, "grad_norm": 0.5414110934753018, "learning_rate": 1.8823501714636815e-05, "loss": 0.4372, "step": 3181 }, { "epoch": 0.6273659305993691, "grad_norm": 0.5716798244654584, "learning_rate": 1.8822772176604346e-05, "loss": 0.4386, "step": 3182 }, { "epoch": 0.6275630914826499, "grad_norm": 0.5594732072778538, "learning_rate": 1.882204242659767e-05, "loss": 0.4415, "step": 3183 }, { "epoch": 0.6277602523659306, "grad_norm": 0.5968240035493283, "learning_rate": 1.8821312464634318e-05, "loss": 0.4651, "step": 3184 }, { "epoch": 0.6279574132492114, "grad_norm": 0.5050689025421782, "learning_rate": 1.8820582290731836e-05, "loss": 0.3813, "step": 3185 }, { "epoch": 0.6281545741324921, "grad_norm": 0.571720125704504, "learning_rate": 1.8819851904907756e-05, "loss": 0.4117, "step": 3186 }, { "epoch": 0.6283517350157729, "grad_norm": 0.6099795491237106, "learning_rate": 1.8819121307179634e-05, "loss": 0.4739, "step": 3187 }, { "epoch": 0.6285488958990536, "grad_norm": 0.5516740130300875, "learning_rate": 1.881839049756502e-05, "loss": 0.4132, "step": 3188 }, { "epoch": 0.6287460567823344, "grad_norm": 0.6078983072011943, "learning_rate": 1.8817659476081474e-05, "loss": 0.4612, "step": 3189 }, { "epoch": 0.6289432176656151, "grad_norm": 0.5561987130798062, "learning_rate": 1.8816928242746554e-05, "loss": 0.4411, "step": 3190 }, { "epoch": 0.6291403785488959, "grad_norm": 0.5499939289696452, "learning_rate": 1.8816196797577838e-05, "loss": 0.4462, "step": 3191 }, { "epoch": 0.6293375394321766, "grad_norm": 0.565448729109912, "learning_rate": 1.881546514059289e-05, "loss": 0.4426, "step": 3192 }, { "epoch": 0.6295347003154574, "grad_norm": 0.5466556054488702, "learning_rate": 1.8814733271809296e-05, "loss": 0.4633, "step": 3193 }, { "epoch": 0.6297318611987381, "grad_norm": 0.6054836321256308, "learning_rate": 1.8814001191244636e-05, "loss": 0.4592, "step": 3194 }, { "epoch": 0.629929022082019, "grad_norm": 0.6252405670685508, "learning_rate": 1.8813268898916498e-05, "loss": 0.4588, "step": 3195 }, { "epoch": 0.6301261829652997, "grad_norm": 0.5210693946541807, "learning_rate": 1.881253639484248e-05, "loss": 0.4461, "step": 3196 }, { "epoch": 0.6303233438485805, "grad_norm": 0.5953514887490388, "learning_rate": 1.8811803679040178e-05, "loss": 0.4862, "step": 3197 }, { "epoch": 0.6305205047318612, "grad_norm": 0.6102071586071421, "learning_rate": 1.8811070751527196e-05, "loss": 0.4214, "step": 3198 }, { "epoch": 0.630717665615142, "grad_norm": 0.6100495554396542, "learning_rate": 1.8810337612321144e-05, "loss": 0.4648, "step": 3199 }, { "epoch": 0.6309148264984227, "grad_norm": 0.5197551849910016, "learning_rate": 1.8809604261439634e-05, "loss": 0.4382, "step": 3200 }, { "epoch": 0.6311119873817035, "grad_norm": 0.5490144964409054, "learning_rate": 1.8808870698900288e-05, "loss": 0.4477, "step": 3201 }, { "epoch": 0.6313091482649842, "grad_norm": 0.5628554485158465, "learning_rate": 1.880813692472073e-05, "loss": 0.4605, "step": 3202 }, { "epoch": 0.631506309148265, "grad_norm": 0.8650405899982267, "learning_rate": 1.8807402938918588e-05, "loss": 0.4288, "step": 3203 }, { "epoch": 0.6317034700315457, "grad_norm": 0.5546723666140645, "learning_rate": 1.88066687415115e-05, "loss": 0.4412, "step": 3204 }, { "epoch": 0.6319006309148265, "grad_norm": 0.5808634529336613, "learning_rate": 1.8805934332517104e-05, "loss": 0.4733, "step": 3205 }, { "epoch": 0.6320977917981072, "grad_norm": 0.6305165897108991, "learning_rate": 1.880519971195304e-05, "loss": 0.4469, "step": 3206 }, { "epoch": 0.632294952681388, "grad_norm": 0.5672855153253277, "learning_rate": 1.880446487983697e-05, "loss": 0.4699, "step": 3207 }, { "epoch": 0.6324921135646687, "grad_norm": 0.5959371940317938, "learning_rate": 1.880372983618653e-05, "loss": 0.481, "step": 3208 }, { "epoch": 0.6326892744479495, "grad_norm": 0.5264066379704362, "learning_rate": 1.88029945810194e-05, "loss": 0.4093, "step": 3209 }, { "epoch": 0.6328864353312302, "grad_norm": 0.5705739208816702, "learning_rate": 1.880225911435323e-05, "loss": 0.4517, "step": 3210 }, { "epoch": 0.633083596214511, "grad_norm": 0.6114799226361008, "learning_rate": 1.88015234362057e-05, "loss": 0.4527, "step": 3211 }, { "epoch": 0.6332807570977917, "grad_norm": 0.5837271501297863, "learning_rate": 1.8800787546594487e-05, "loss": 0.4596, "step": 3212 }, { "epoch": 0.6334779179810726, "grad_norm": 0.5577201101577096, "learning_rate": 1.8800051445537256e-05, "loss": 0.4577, "step": 3213 }, { "epoch": 0.6336750788643533, "grad_norm": 0.5736984526638703, "learning_rate": 1.8799315133051707e-05, "loss": 0.4451, "step": 3214 }, { "epoch": 0.6338722397476341, "grad_norm": 0.7258407481830798, "learning_rate": 1.8798578609155528e-05, "loss": 0.4459, "step": 3215 }, { "epoch": 0.6340694006309149, "grad_norm": 0.5742874979476474, "learning_rate": 1.8797841873866406e-05, "loss": 0.4688, "step": 3216 }, { "epoch": 0.6342665615141956, "grad_norm": 0.5334223008485237, "learning_rate": 1.8797104927202055e-05, "loss": 0.4496, "step": 3217 }, { "epoch": 0.6344637223974764, "grad_norm": 0.5706148443360702, "learning_rate": 1.879636776918017e-05, "loss": 0.4617, "step": 3218 }, { "epoch": 0.6346608832807571, "grad_norm": 0.8529042316450798, "learning_rate": 1.8795630399818466e-05, "loss": 0.4704, "step": 3219 }, { "epoch": 0.6348580441640379, "grad_norm": 0.5624250580860771, "learning_rate": 1.8794892819134657e-05, "loss": 0.4725, "step": 3220 }, { "epoch": 0.6350552050473186, "grad_norm": 0.5838011246869812, "learning_rate": 1.8794155027146468e-05, "loss": 0.4641, "step": 3221 }, { "epoch": 0.6352523659305994, "grad_norm": 0.564379408051252, "learning_rate": 1.879341702387162e-05, "loss": 0.4536, "step": 3222 }, { "epoch": 0.6354495268138801, "grad_norm": 0.5730447220643163, "learning_rate": 1.8792678809327852e-05, "loss": 0.4228, "step": 3223 }, { "epoch": 0.6356466876971609, "grad_norm": 0.9816181582279039, "learning_rate": 1.879194038353289e-05, "loss": 0.4869, "step": 3224 }, { "epoch": 0.6358438485804416, "grad_norm": 0.5470063459656487, "learning_rate": 1.8791201746504485e-05, "loss": 0.4252, "step": 3225 }, { "epoch": 0.6360410094637224, "grad_norm": 0.5839122977250125, "learning_rate": 1.8790462898260373e-05, "loss": 0.3985, "step": 3226 }, { "epoch": 0.6362381703470031, "grad_norm": 0.601019249469374, "learning_rate": 1.8789723838818314e-05, "loss": 0.4639, "step": 3227 }, { "epoch": 0.636435331230284, "grad_norm": 0.6426956793821633, "learning_rate": 1.878898456819606e-05, "loss": 0.4722, "step": 3228 }, { "epoch": 0.6366324921135647, "grad_norm": 0.5652057471720214, "learning_rate": 1.878824508641137e-05, "loss": 0.4385, "step": 3229 }, { "epoch": 0.6368296529968455, "grad_norm": 0.634391516253938, "learning_rate": 1.8787505393482023e-05, "loss": 0.4861, "step": 3230 }, { "epoch": 0.6370268138801262, "grad_norm": 0.5276142848529989, "learning_rate": 1.8786765489425776e-05, "loss": 0.4167, "step": 3231 }, { "epoch": 0.637223974763407, "grad_norm": 0.642092919755005, "learning_rate": 1.8786025374260418e-05, "loss": 0.4447, "step": 3232 }, { "epoch": 0.6374211356466877, "grad_norm": 0.6622984389500397, "learning_rate": 1.8785285048003722e-05, "loss": 0.4509, "step": 3233 }, { "epoch": 0.6376182965299685, "grad_norm": 0.6059786233435729, "learning_rate": 1.8784544510673477e-05, "loss": 0.4328, "step": 3234 }, { "epoch": 0.6378154574132492, "grad_norm": 0.5608277860083667, "learning_rate": 1.8783803762287477e-05, "loss": 0.4403, "step": 3235 }, { "epoch": 0.63801261829653, "grad_norm": 0.5705812031919998, "learning_rate": 1.8783062802863516e-05, "loss": 0.447, "step": 3236 }, { "epoch": 0.6382097791798107, "grad_norm": 0.6099064166899052, "learning_rate": 1.8782321632419402e-05, "loss": 0.4635, "step": 3237 }, { "epoch": 0.6384069400630915, "grad_norm": 0.5789456791219298, "learning_rate": 1.8781580250972933e-05, "loss": 0.453, "step": 3238 }, { "epoch": 0.6386041009463722, "grad_norm": 0.5435966426313446, "learning_rate": 1.8780838658541932e-05, "loss": 0.4088, "step": 3239 }, { "epoch": 0.638801261829653, "grad_norm": 0.5637574474073005, "learning_rate": 1.878009685514421e-05, "loss": 0.4516, "step": 3240 }, { "epoch": 0.6389984227129337, "grad_norm": 0.6583384784739884, "learning_rate": 1.8779354840797588e-05, "loss": 0.454, "step": 3241 }, { "epoch": 0.6391955835962145, "grad_norm": 0.5554247237363399, "learning_rate": 1.87786126155199e-05, "loss": 0.4567, "step": 3242 }, { "epoch": 0.6393927444794952, "grad_norm": 0.5657140987582041, "learning_rate": 1.877787017932897e-05, "loss": 0.4716, "step": 3243 }, { "epoch": 0.639589905362776, "grad_norm": 0.5498999624708573, "learning_rate": 1.8777127532242643e-05, "loss": 0.4527, "step": 3244 }, { "epoch": 0.6397870662460567, "grad_norm": 0.597593319418814, "learning_rate": 1.8776384674278756e-05, "loss": 0.4409, "step": 3245 }, { "epoch": 0.6399842271293376, "grad_norm": 0.5970410292124881, "learning_rate": 1.8775641605455162e-05, "loss": 0.4716, "step": 3246 }, { "epoch": 0.6401813880126183, "grad_norm": 0.5712017944124336, "learning_rate": 1.877489832578971e-05, "loss": 0.4408, "step": 3247 }, { "epoch": 0.6403785488958991, "grad_norm": 0.5264381336103525, "learning_rate": 1.877415483530026e-05, "loss": 0.4419, "step": 3248 }, { "epoch": 0.6405757097791798, "grad_norm": 0.6131589523064833, "learning_rate": 1.8773411134004677e-05, "loss": 0.4544, "step": 3249 }, { "epoch": 0.6407728706624606, "grad_norm": 0.5718419218962923, "learning_rate": 1.8772667221920823e-05, "loss": 0.4296, "step": 3250 }, { "epoch": 0.6409700315457413, "grad_norm": 0.578624341262867, "learning_rate": 1.8771923099066573e-05, "loss": 0.4605, "step": 3251 }, { "epoch": 0.6411671924290221, "grad_norm": 0.5787190935133748, "learning_rate": 1.877117876545981e-05, "loss": 0.4434, "step": 3252 }, { "epoch": 0.6413643533123028, "grad_norm": 0.5634817233410009, "learning_rate": 1.877043422111841e-05, "loss": 0.4614, "step": 3253 }, { "epoch": 0.6415615141955836, "grad_norm": 0.5510386384849804, "learning_rate": 1.876968946606027e-05, "loss": 0.4477, "step": 3254 }, { "epoch": 0.6417586750788643, "grad_norm": 0.6031650127924183, "learning_rate": 1.8768944500303276e-05, "loss": 0.4528, "step": 3255 }, { "epoch": 0.6419558359621451, "grad_norm": 0.563947528442472, "learning_rate": 1.876819932386533e-05, "loss": 0.4464, "step": 3256 }, { "epoch": 0.6421529968454258, "grad_norm": 0.562104581809266, "learning_rate": 1.8767453936764332e-05, "loss": 0.4448, "step": 3257 }, { "epoch": 0.6423501577287066, "grad_norm": 0.5711382767124764, "learning_rate": 1.876670833901819e-05, "loss": 0.432, "step": 3258 }, { "epoch": 0.6425473186119873, "grad_norm": 0.5697395824924175, "learning_rate": 1.8765962530644826e-05, "loss": 0.446, "step": 3259 }, { "epoch": 0.6427444794952681, "grad_norm": 0.5940770780237805, "learning_rate": 1.8765216511662153e-05, "loss": 0.4551, "step": 3260 }, { "epoch": 0.642941640378549, "grad_norm": 0.9903145042557886, "learning_rate": 1.876447028208809e-05, "loss": 0.4633, "step": 3261 }, { "epoch": 0.6431388012618297, "grad_norm": 0.5566812407384747, "learning_rate": 1.8763723841940576e-05, "loss": 0.4058, "step": 3262 }, { "epoch": 0.6433359621451105, "grad_norm": 0.6250449584024177, "learning_rate": 1.8762977191237536e-05, "loss": 0.4706, "step": 3263 }, { "epoch": 0.6435331230283912, "grad_norm": 2.210243090857098, "learning_rate": 1.876223032999691e-05, "loss": 0.4406, "step": 3264 }, { "epoch": 0.643730283911672, "grad_norm": 0.6481804636618524, "learning_rate": 1.876148325823665e-05, "loss": 0.432, "step": 3265 }, { "epoch": 0.6439274447949527, "grad_norm": 0.6303860129263393, "learning_rate": 1.8760735975974693e-05, "loss": 0.4137, "step": 3266 }, { "epoch": 0.6441246056782335, "grad_norm": 0.6383795817729836, "learning_rate": 1.8759988483229e-05, "loss": 0.4554, "step": 3267 }, { "epoch": 0.6443217665615142, "grad_norm": 0.6875260817255804, "learning_rate": 1.8759240780017534e-05, "loss": 0.443, "step": 3268 }, { "epoch": 0.644518927444795, "grad_norm": 0.6662104585984299, "learning_rate": 1.875849286635825e-05, "loss": 0.4445, "step": 3269 }, { "epoch": 0.6447160883280757, "grad_norm": 0.6171930410732172, "learning_rate": 1.8757744742269123e-05, "loss": 0.4374, "step": 3270 }, { "epoch": 0.6449132492113565, "grad_norm": 0.7933975678175971, "learning_rate": 1.8756996407768128e-05, "loss": 0.4766, "step": 3271 }, { "epoch": 0.6451104100946372, "grad_norm": 1.0409394957016507, "learning_rate": 1.875624786287324e-05, "loss": 0.4418, "step": 3272 }, { "epoch": 0.645307570977918, "grad_norm": 0.5947622881166746, "learning_rate": 1.875549910760245e-05, "loss": 0.4575, "step": 3273 }, { "epoch": 0.6455047318611987, "grad_norm": 0.5670316521357006, "learning_rate": 1.875475014197374e-05, "loss": 0.4488, "step": 3274 }, { "epoch": 0.6457018927444795, "grad_norm": 0.6158749237920658, "learning_rate": 1.8754000966005105e-05, "loss": 0.4173, "step": 3275 }, { "epoch": 0.6458990536277602, "grad_norm": 0.6765411423930585, "learning_rate": 1.8753251579714548e-05, "loss": 0.4131, "step": 3276 }, { "epoch": 0.646096214511041, "grad_norm": 0.5338915194924335, "learning_rate": 1.8752501983120076e-05, "loss": 0.4262, "step": 3277 }, { "epoch": 0.6462933753943217, "grad_norm": 0.566003677186254, "learning_rate": 1.8751752176239693e-05, "loss": 0.4208, "step": 3278 }, { "epoch": 0.6464905362776026, "grad_norm": 0.6018668670309205, "learning_rate": 1.8751002159091415e-05, "loss": 0.4591, "step": 3279 }, { "epoch": 0.6466876971608833, "grad_norm": 0.6534805827884597, "learning_rate": 1.8750251931693265e-05, "loss": 0.4538, "step": 3280 }, { "epoch": 0.6468848580441641, "grad_norm": 0.5692551760672482, "learning_rate": 1.8749501494063266e-05, "loss": 0.4171, "step": 3281 }, { "epoch": 0.6470820189274448, "grad_norm": 0.6447690381178922, "learning_rate": 1.874875084621945e-05, "loss": 0.4436, "step": 3282 }, { "epoch": 0.6472791798107256, "grad_norm": 0.531474927660677, "learning_rate": 1.8747999988179846e-05, "loss": 0.4287, "step": 3283 }, { "epoch": 0.6474763406940063, "grad_norm": 0.5470438621167268, "learning_rate": 1.8747248919962498e-05, "loss": 0.4417, "step": 3284 }, { "epoch": 0.6476735015772871, "grad_norm": 0.5741916924212119, "learning_rate": 1.874649764158545e-05, "loss": 0.425, "step": 3285 }, { "epoch": 0.6478706624605678, "grad_norm": 0.5348263622053581, "learning_rate": 1.8745746153066756e-05, "loss": 0.4169, "step": 3286 }, { "epoch": 0.6480678233438486, "grad_norm": 0.5607951226714928, "learning_rate": 1.8744994454424463e-05, "loss": 0.4375, "step": 3287 }, { "epoch": 0.6482649842271293, "grad_norm": 0.6004093976459436, "learning_rate": 1.874424254567664e-05, "loss": 0.4482, "step": 3288 }, { "epoch": 0.6484621451104101, "grad_norm": 0.5668829924890922, "learning_rate": 1.8743490426841346e-05, "loss": 0.4462, "step": 3289 }, { "epoch": 0.6486593059936908, "grad_norm": 0.7152077392581754, "learning_rate": 1.8742738097936653e-05, "loss": 0.4899, "step": 3290 }, { "epoch": 0.6488564668769716, "grad_norm": 0.5722777962307597, "learning_rate": 1.874198555898064e-05, "loss": 0.4618, "step": 3291 }, { "epoch": 0.6490536277602523, "grad_norm": 0.6284802948397332, "learning_rate": 1.874123280999138e-05, "loss": 0.415, "step": 3292 }, { "epoch": 0.6492507886435331, "grad_norm": 0.6162363187667032, "learning_rate": 1.8740479850986962e-05, "loss": 0.4557, "step": 3293 }, { "epoch": 0.6494479495268138, "grad_norm": 0.616624529665098, "learning_rate": 1.8739726681985478e-05, "loss": 0.4604, "step": 3294 }, { "epoch": 0.6496451104100947, "grad_norm": 0.5855562263616807, "learning_rate": 1.8738973303005024e-05, "loss": 0.4637, "step": 3295 }, { "epoch": 0.6498422712933754, "grad_norm": 0.6041841740545615, "learning_rate": 1.87382197140637e-05, "loss": 0.4623, "step": 3296 }, { "epoch": 0.6500394321766562, "grad_norm": 4.533496170121018, "learning_rate": 1.873746591517961e-05, "loss": 0.4471, "step": 3297 }, { "epoch": 0.6502365930599369, "grad_norm": 0.6624890927859284, "learning_rate": 1.873671190637086e-05, "loss": 0.4267, "step": 3298 }, { "epoch": 0.6504337539432177, "grad_norm": 0.5905974277110145, "learning_rate": 1.8735957687655577e-05, "loss": 0.4125, "step": 3299 }, { "epoch": 0.6506309148264984, "grad_norm": 0.6041600898310406, "learning_rate": 1.8735203259051872e-05, "loss": 0.454, "step": 3300 }, { "epoch": 0.6508280757097792, "grad_norm": 0.6390231671362254, "learning_rate": 1.8734448620577875e-05, "loss": 0.4887, "step": 3301 }, { "epoch": 0.6510252365930599, "grad_norm": 0.6117066782843804, "learning_rate": 1.8733693772251716e-05, "loss": 0.4556, "step": 3302 }, { "epoch": 0.6512223974763407, "grad_norm": 0.5711747386901173, "learning_rate": 1.873293871409153e-05, "loss": 0.4204, "step": 3303 }, { "epoch": 0.6514195583596214, "grad_norm": 0.6143464512727876, "learning_rate": 1.8732183446115462e-05, "loss": 0.4434, "step": 3304 }, { "epoch": 0.6516167192429022, "grad_norm": 0.5417403420673808, "learning_rate": 1.8731427968341654e-05, "loss": 0.4246, "step": 3305 }, { "epoch": 0.651813880126183, "grad_norm": 0.6890669641904964, "learning_rate": 1.8730672280788254e-05, "loss": 0.4989, "step": 3306 }, { "epoch": 0.6520110410094637, "grad_norm": 0.5499879128399455, "learning_rate": 1.8729916383473427e-05, "loss": 0.4529, "step": 3307 }, { "epoch": 0.6522082018927445, "grad_norm": 0.5830578121646517, "learning_rate": 1.8729160276415325e-05, "loss": 0.4023, "step": 3308 }, { "epoch": 0.6524053627760252, "grad_norm": 0.5667181804512541, "learning_rate": 1.872840395963212e-05, "loss": 0.4491, "step": 3309 }, { "epoch": 0.652602523659306, "grad_norm": 0.5766356519567429, "learning_rate": 1.872764743314198e-05, "loss": 0.4666, "step": 3310 }, { "epoch": 0.6527996845425867, "grad_norm": 0.5432561051881504, "learning_rate": 1.872689069696308e-05, "loss": 0.4311, "step": 3311 }, { "epoch": 0.6529968454258676, "grad_norm": 0.5811877061331849, "learning_rate": 1.8726133751113605e-05, "loss": 0.4351, "step": 3312 }, { "epoch": 0.6531940063091483, "grad_norm": 0.5756974097066245, "learning_rate": 1.872537659561174e-05, "loss": 0.4561, "step": 3313 }, { "epoch": 0.6533911671924291, "grad_norm": 0.5313182062056155, "learning_rate": 1.8724619230475675e-05, "loss": 0.3902, "step": 3314 }, { "epoch": 0.6535883280757098, "grad_norm": 0.5497309033214044, "learning_rate": 1.872386165572361e-05, "loss": 0.4263, "step": 3315 }, { "epoch": 0.6537854889589906, "grad_norm": 0.5466469389230997, "learning_rate": 1.872310387137374e-05, "loss": 0.4224, "step": 3316 }, { "epoch": 0.6539826498422713, "grad_norm": 2.202702375918148, "learning_rate": 1.872234587744427e-05, "loss": 0.4472, "step": 3317 }, { "epoch": 0.6541798107255521, "grad_norm": 0.5869148492580374, "learning_rate": 1.8721587673953425e-05, "loss": 0.4671, "step": 3318 }, { "epoch": 0.6543769716088328, "grad_norm": 0.5427996690221996, "learning_rate": 1.8720829260919407e-05, "loss": 0.4289, "step": 3319 }, { "epoch": 0.6545741324921136, "grad_norm": 0.5656017555414672, "learning_rate": 1.8720070638360447e-05, "loss": 0.4294, "step": 3320 }, { "epoch": 0.6547712933753943, "grad_norm": 4.358651338020947, "learning_rate": 1.8719311806294768e-05, "loss": 0.5382, "step": 3321 }, { "epoch": 0.6549684542586751, "grad_norm": 0.6865845679707644, "learning_rate": 1.87185527647406e-05, "loss": 0.4885, "step": 3322 }, { "epoch": 0.6551656151419558, "grad_norm": 1.9321821824405176, "learning_rate": 1.871779351371618e-05, "loss": 0.4279, "step": 3323 }, { "epoch": 0.6553627760252366, "grad_norm": 0.6207436028024012, "learning_rate": 1.8717034053239748e-05, "loss": 0.4415, "step": 3324 }, { "epoch": 0.6555599369085173, "grad_norm": 0.7726441299810708, "learning_rate": 1.8716274383329556e-05, "loss": 0.4166, "step": 3325 }, { "epoch": 0.6557570977917981, "grad_norm": 1.3763107640662533, "learning_rate": 1.8715514504003854e-05, "loss": 0.4449, "step": 3326 }, { "epoch": 0.6559542586750788, "grad_norm": 0.6496821405637063, "learning_rate": 1.8714754415280894e-05, "loss": 0.451, "step": 3327 }, { "epoch": 0.6561514195583596, "grad_norm": 0.6130036203951224, "learning_rate": 1.8713994117178945e-05, "loss": 0.4593, "step": 3328 }, { "epoch": 0.6563485804416404, "grad_norm": 0.5725117299905743, "learning_rate": 1.8713233609716266e-05, "loss": 0.4278, "step": 3329 }, { "epoch": 0.6565457413249212, "grad_norm": 0.5445827811729219, "learning_rate": 1.8712472892911132e-05, "loss": 0.4101, "step": 3330 }, { "epoch": 0.6567429022082019, "grad_norm": 0.7500084982788475, "learning_rate": 1.8711711966781826e-05, "loss": 0.4248, "step": 3331 }, { "epoch": 0.6569400630914827, "grad_norm": 0.6621959429007946, "learning_rate": 1.8710950831346623e-05, "loss": 0.4908, "step": 3332 }, { "epoch": 0.6571372239747634, "grad_norm": 2.2676905068146054, "learning_rate": 1.871018948662381e-05, "loss": 0.498, "step": 3333 }, { "epoch": 0.6573343848580442, "grad_norm": 0.7294312287786445, "learning_rate": 1.870942793263168e-05, "loss": 0.4693, "step": 3334 }, { "epoch": 0.6575315457413249, "grad_norm": 0.5923749861108738, "learning_rate": 1.870866616938853e-05, "loss": 0.4634, "step": 3335 }, { "epoch": 0.6577287066246057, "grad_norm": 0.6623239066356625, "learning_rate": 1.870790419691266e-05, "loss": 0.4434, "step": 3336 }, { "epoch": 0.6579258675078864, "grad_norm": 0.6507535977225328, "learning_rate": 1.8707142015222386e-05, "loss": 0.4619, "step": 3337 }, { "epoch": 0.6581230283911672, "grad_norm": 3.387004003722433, "learning_rate": 1.870637962433601e-05, "loss": 0.5185, "step": 3338 }, { "epoch": 0.6583201892744479, "grad_norm": 0.7986315924274782, "learning_rate": 1.870561702427185e-05, "loss": 0.441, "step": 3339 }, { "epoch": 0.6585173501577287, "grad_norm": 0.5803415335213525, "learning_rate": 1.870485421504823e-05, "loss": 0.4293, "step": 3340 }, { "epoch": 0.6587145110410094, "grad_norm": 0.7703993364474427, "learning_rate": 1.870409119668348e-05, "loss": 0.4929, "step": 3341 }, { "epoch": 0.6589116719242902, "grad_norm": 0.6513075944835972, "learning_rate": 1.870332796919593e-05, "loss": 0.4607, "step": 3342 }, { "epoch": 0.6591088328075709, "grad_norm": 0.6750849955698458, "learning_rate": 1.8702564532603917e-05, "loss": 0.439, "step": 3343 }, { "epoch": 0.6593059936908517, "grad_norm": 0.6375890019239625, "learning_rate": 1.8701800886925784e-05, "loss": 0.4726, "step": 3344 }, { "epoch": 0.6595031545741324, "grad_norm": 0.65991248253205, "learning_rate": 1.8701037032179873e-05, "loss": 0.4631, "step": 3345 }, { "epoch": 0.6597003154574133, "grad_norm": 0.8742592115103047, "learning_rate": 1.870027296838454e-05, "loss": 0.4502, "step": 3346 }, { "epoch": 0.659897476340694, "grad_norm": 0.6581694810065215, "learning_rate": 1.8699508695558145e-05, "loss": 0.4371, "step": 3347 }, { "epoch": 0.6600946372239748, "grad_norm": 0.7348305014231767, "learning_rate": 1.869874421371905e-05, "loss": 0.4287, "step": 3348 }, { "epoch": 0.6602917981072555, "grad_norm": 0.6216191228474292, "learning_rate": 1.8697979522885617e-05, "loss": 0.441, "step": 3349 }, { "epoch": 0.6604889589905363, "grad_norm": 0.7451526119076238, "learning_rate": 1.8697214623076222e-05, "loss": 0.4419, "step": 3350 }, { "epoch": 0.660686119873817, "grad_norm": 0.5518756188697328, "learning_rate": 1.8696449514309244e-05, "loss": 0.4176, "step": 3351 }, { "epoch": 0.6608832807570978, "grad_norm": 0.6734594739824642, "learning_rate": 1.869568419660306e-05, "loss": 0.4669, "step": 3352 }, { "epoch": 0.6610804416403786, "grad_norm": 0.679456190018415, "learning_rate": 1.8694918669976063e-05, "loss": 0.456, "step": 3353 }, { "epoch": 0.6612776025236593, "grad_norm": 0.5841633102935858, "learning_rate": 1.8694152934446642e-05, "loss": 0.4252, "step": 3354 }, { "epoch": 0.6614747634069401, "grad_norm": 0.5737135644711173, "learning_rate": 1.8693386990033194e-05, "loss": 0.4152, "step": 3355 }, { "epoch": 0.6616719242902208, "grad_norm": 0.6231408225081773, "learning_rate": 1.8692620836754124e-05, "loss": 0.4393, "step": 3356 }, { "epoch": 0.6618690851735016, "grad_norm": 0.7428753795745229, "learning_rate": 1.8691854474627838e-05, "loss": 0.4596, "step": 3357 }, { "epoch": 0.6620662460567823, "grad_norm": 0.6536356718789794, "learning_rate": 1.8691087903672752e-05, "loss": 0.4548, "step": 3358 }, { "epoch": 0.6622634069400631, "grad_norm": 0.6121171634606317, "learning_rate": 1.8690321123907277e-05, "loss": 0.4633, "step": 3359 }, { "epoch": 0.6624605678233438, "grad_norm": 0.5626255419013269, "learning_rate": 1.868955413534984e-05, "loss": 0.443, "step": 3360 }, { "epoch": 0.6626577287066246, "grad_norm": 0.5728350774998016, "learning_rate": 1.8688786938018866e-05, "loss": 0.4273, "step": 3361 }, { "epoch": 0.6628548895899053, "grad_norm": 0.5765941930596808, "learning_rate": 1.8688019531932788e-05, "loss": 0.4511, "step": 3362 }, { "epoch": 0.6630520504731862, "grad_norm": 0.5622738400789903, "learning_rate": 1.8687251917110045e-05, "loss": 0.4643, "step": 3363 }, { "epoch": 0.6632492113564669, "grad_norm": 0.5415771750284268, "learning_rate": 1.8686484093569078e-05, "loss": 0.4228, "step": 3364 }, { "epoch": 0.6634463722397477, "grad_norm": 0.5508756543268856, "learning_rate": 1.868571606132834e-05, "loss": 0.452, "step": 3365 }, { "epoch": 0.6636435331230284, "grad_norm": 0.6135272218599022, "learning_rate": 1.8684947820406273e-05, "loss": 0.4667, "step": 3366 }, { "epoch": 0.6638406940063092, "grad_norm": 0.5960455949438687, "learning_rate": 1.8684179370821343e-05, "loss": 0.4686, "step": 3367 }, { "epoch": 0.6640378548895899, "grad_norm": 0.5480090387207861, "learning_rate": 1.8683410712592015e-05, "loss": 0.448, "step": 3368 }, { "epoch": 0.6642350157728707, "grad_norm": 0.6217390232273009, "learning_rate": 1.8682641845736748e-05, "loss": 0.4106, "step": 3369 }, { "epoch": 0.6644321766561514, "grad_norm": 0.5922247859816271, "learning_rate": 1.8681872770274013e-05, "loss": 0.4577, "step": 3370 }, { "epoch": 0.6646293375394322, "grad_norm": 0.556590095986522, "learning_rate": 1.86811034862223e-05, "loss": 0.4541, "step": 3371 }, { "epoch": 0.6648264984227129, "grad_norm": 0.5422277567079224, "learning_rate": 1.8680333993600084e-05, "loss": 0.4413, "step": 3372 }, { "epoch": 0.6650236593059937, "grad_norm": 0.609846737333361, "learning_rate": 1.867956429242585e-05, "loss": 0.4439, "step": 3373 }, { "epoch": 0.6652208201892744, "grad_norm": 0.5366562298747772, "learning_rate": 1.86787943827181e-05, "loss": 0.4227, "step": 3374 }, { "epoch": 0.6654179810725552, "grad_norm": 0.59012017930504, "learning_rate": 1.8678024264495323e-05, "loss": 0.4351, "step": 3375 }, { "epoch": 0.6656151419558359, "grad_norm": 0.5771194135297009, "learning_rate": 1.8677253937776024e-05, "loss": 0.4206, "step": 3376 }, { "epoch": 0.6658123028391167, "grad_norm": 0.5678453290335862, "learning_rate": 1.8676483402578714e-05, "loss": 0.458, "step": 3377 }, { "epoch": 0.6660094637223974, "grad_norm": 0.5848984447281752, "learning_rate": 1.86757126589219e-05, "loss": 0.4294, "step": 3378 }, { "epoch": 0.6662066246056783, "grad_norm": 0.5612739049068476, "learning_rate": 1.8674941706824104e-05, "loss": 0.4335, "step": 3379 }, { "epoch": 0.666403785488959, "grad_norm": 0.5554127683707469, "learning_rate": 1.8674170546303846e-05, "loss": 0.4216, "step": 3380 }, { "epoch": 0.6666009463722398, "grad_norm": 0.5356096683041639, "learning_rate": 1.8673399177379657e-05, "loss": 0.417, "step": 3381 }, { "epoch": 0.6667981072555205, "grad_norm": 0.5413485188215648, "learning_rate": 1.8672627600070068e-05, "loss": 0.4244, "step": 3382 }, { "epoch": 0.6669952681388013, "grad_norm": 0.5305208256677184, "learning_rate": 1.8671855814393617e-05, "loss": 0.4153, "step": 3383 }, { "epoch": 0.667192429022082, "grad_norm": 0.5903750497070852, "learning_rate": 1.8671083820368846e-05, "loss": 0.4633, "step": 3384 }, { "epoch": 0.6673895899053628, "grad_norm": 1.389773255933898, "learning_rate": 1.8670311618014307e-05, "loss": 0.4518, "step": 3385 }, { "epoch": 0.6675867507886435, "grad_norm": 0.6734664022072558, "learning_rate": 1.8669539207348544e-05, "loss": 0.4303, "step": 3386 }, { "epoch": 0.6677839116719243, "grad_norm": 0.5925325374467875, "learning_rate": 1.8668766588390122e-05, "loss": 0.4222, "step": 3387 }, { "epoch": 0.667981072555205, "grad_norm": 0.5505241807065062, "learning_rate": 1.8667993761157602e-05, "loss": 0.4407, "step": 3388 }, { "epoch": 0.6681782334384858, "grad_norm": 0.7188824261126605, "learning_rate": 1.866722072566955e-05, "loss": 0.4378, "step": 3389 }, { "epoch": 0.6683753943217665, "grad_norm": 1.6862636115429694, "learning_rate": 1.8666447481944542e-05, "loss": 0.4472, "step": 3390 }, { "epoch": 0.6685725552050473, "grad_norm": 0.535071580188235, "learning_rate": 1.8665674030001154e-05, "loss": 0.4324, "step": 3391 }, { "epoch": 0.668769716088328, "grad_norm": 0.5720087515109528, "learning_rate": 1.866490036985797e-05, "loss": 0.4285, "step": 3392 }, { "epoch": 0.6689668769716088, "grad_norm": 0.5753023247784349, "learning_rate": 1.8664126501533576e-05, "loss": 0.471, "step": 3393 }, { "epoch": 0.6691640378548895, "grad_norm": 0.5438723970201407, "learning_rate": 1.8663352425046564e-05, "loss": 0.4349, "step": 3394 }, { "epoch": 0.6693611987381703, "grad_norm": 0.565118624393037, "learning_rate": 1.8662578140415535e-05, "loss": 0.4534, "step": 3395 }, { "epoch": 0.669558359621451, "grad_norm": 0.6635843691593936, "learning_rate": 1.866180364765909e-05, "loss": 0.4541, "step": 3396 }, { "epoch": 0.6697555205047319, "grad_norm": 0.5553454488274417, "learning_rate": 1.8661028946795837e-05, "loss": 0.4164, "step": 3397 }, { "epoch": 0.6699526813880127, "grad_norm": 2.036398178901347, "learning_rate": 1.866025403784439e-05, "loss": 0.4708, "step": 3398 }, { "epoch": 0.6701498422712934, "grad_norm": 0.5825487984269258, "learning_rate": 1.8659478920823364e-05, "loss": 0.4196, "step": 3399 }, { "epoch": 0.6703470031545742, "grad_norm": 0.5882347623158389, "learning_rate": 1.865870359575138e-05, "loss": 0.4376, "step": 3400 }, { "epoch": 0.6705441640378549, "grad_norm": 0.5537130860684163, "learning_rate": 1.8657928062647075e-05, "loss": 0.4292, "step": 3401 }, { "epoch": 0.6707413249211357, "grad_norm": 0.6278956340355503, "learning_rate": 1.8657152321529075e-05, "loss": 0.4216, "step": 3402 }, { "epoch": 0.6709384858044164, "grad_norm": 0.5849954721222042, "learning_rate": 1.8656376372416017e-05, "loss": 0.472, "step": 3403 }, { "epoch": 0.6711356466876972, "grad_norm": 0.6501311313277848, "learning_rate": 1.8655600215326547e-05, "loss": 0.4157, "step": 3404 }, { "epoch": 0.6713328075709779, "grad_norm": 0.60533862716353, "learning_rate": 1.8654823850279312e-05, "loss": 0.4249, "step": 3405 }, { "epoch": 0.6715299684542587, "grad_norm": 0.7045177259010618, "learning_rate": 1.8654047277292962e-05, "loss": 0.5035, "step": 3406 }, { "epoch": 0.6717271293375394, "grad_norm": 0.5709556862264039, "learning_rate": 1.8653270496386163e-05, "loss": 0.4189, "step": 3407 }, { "epoch": 0.6719242902208202, "grad_norm": 0.657277026760384, "learning_rate": 1.8652493507577564e-05, "loss": 0.4936, "step": 3408 }, { "epoch": 0.6721214511041009, "grad_norm": 0.8211869369723545, "learning_rate": 1.8651716310885845e-05, "loss": 0.4697, "step": 3409 }, { "epoch": 0.6723186119873817, "grad_norm": 0.5634640318066427, "learning_rate": 1.8650938906329674e-05, "loss": 0.4266, "step": 3410 }, { "epoch": 0.6725157728706624, "grad_norm": 0.6140117782031614, "learning_rate": 1.865016129392773e-05, "loss": 0.4469, "step": 3411 }, { "epoch": 0.6727129337539433, "grad_norm": 0.6385762621009046, "learning_rate": 1.864938347369869e-05, "loss": 0.471, "step": 3412 }, { "epoch": 0.672910094637224, "grad_norm": 0.5758795206307361, "learning_rate": 1.8648605445661256e-05, "loss": 0.4321, "step": 3413 }, { "epoch": 0.6731072555205048, "grad_norm": 0.633725193056609, "learning_rate": 1.8647827209834105e-05, "loss": 0.469, "step": 3414 }, { "epoch": 0.6733044164037855, "grad_norm": 0.5649636256512869, "learning_rate": 1.864704876623594e-05, "loss": 0.3962, "step": 3415 }, { "epoch": 0.6735015772870663, "grad_norm": 0.5541617307127727, "learning_rate": 1.8646270114885467e-05, "loss": 0.4245, "step": 3416 }, { "epoch": 0.673698738170347, "grad_norm": 0.5576784865143702, "learning_rate": 1.864549125580139e-05, "loss": 0.4479, "step": 3417 }, { "epoch": 0.6738958990536278, "grad_norm": 0.6606800054432356, "learning_rate": 1.8644712189002426e-05, "loss": 0.4209, "step": 3418 }, { "epoch": 0.6740930599369085, "grad_norm": 0.5996093302287164, "learning_rate": 1.864393291450729e-05, "loss": 0.4806, "step": 3419 }, { "epoch": 0.6742902208201893, "grad_norm": 0.5796582195346354, "learning_rate": 1.8643153432334703e-05, "loss": 0.4231, "step": 3420 }, { "epoch": 0.67448738170347, "grad_norm": 0.6804211689949103, "learning_rate": 1.8642373742503395e-05, "loss": 0.469, "step": 3421 }, { "epoch": 0.6746845425867508, "grad_norm": 0.8018195345405356, "learning_rate": 1.8641593845032098e-05, "loss": 0.4612, "step": 3422 }, { "epoch": 0.6748817034700315, "grad_norm": 0.5430532048956593, "learning_rate": 1.864081373993955e-05, "loss": 0.447, "step": 3423 }, { "epoch": 0.6750788643533123, "grad_norm": 0.5081290249970105, "learning_rate": 1.8640033427244497e-05, "loss": 0.3789, "step": 3424 }, { "epoch": 0.675276025236593, "grad_norm": 0.6128416516260621, "learning_rate": 1.863925290696568e-05, "loss": 0.4795, "step": 3425 }, { "epoch": 0.6754731861198738, "grad_norm": 0.5433092528759821, "learning_rate": 1.8638472179121855e-05, "loss": 0.4399, "step": 3426 }, { "epoch": 0.6756703470031545, "grad_norm": 6.721875316952158, "learning_rate": 1.863769124373178e-05, "loss": 0.4689, "step": 3427 }, { "epoch": 0.6758675078864353, "grad_norm": 0.7065414307044805, "learning_rate": 1.8636910100814216e-05, "loss": 0.457, "step": 3428 }, { "epoch": 0.676064668769716, "grad_norm": 0.5525113473136398, "learning_rate": 1.863612875038793e-05, "loss": 0.4101, "step": 3429 }, { "epoch": 0.6762618296529969, "grad_norm": 0.6783813336542089, "learning_rate": 1.86353471924717e-05, "loss": 0.4869, "step": 3430 }, { "epoch": 0.6764589905362776, "grad_norm": 0.7824903261317586, "learning_rate": 1.8634565427084295e-05, "loss": 0.4584, "step": 3431 }, { "epoch": 0.6766561514195584, "grad_norm": 0.6900514518883072, "learning_rate": 1.8633783454244506e-05, "loss": 0.4408, "step": 3432 }, { "epoch": 0.6768533123028391, "grad_norm": 0.6615900688611679, "learning_rate": 1.8633001273971115e-05, "loss": 0.4558, "step": 3433 }, { "epoch": 0.6770504731861199, "grad_norm": 0.5567854903968781, "learning_rate": 1.863221888628292e-05, "loss": 0.4031, "step": 3434 }, { "epoch": 0.6772476340694006, "grad_norm": 0.6417523757757003, "learning_rate": 1.8631436291198707e-05, "loss": 0.4672, "step": 3435 }, { "epoch": 0.6774447949526814, "grad_norm": 0.6024057489717937, "learning_rate": 1.863065348873729e-05, "loss": 0.4381, "step": 3436 }, { "epoch": 0.6776419558359621, "grad_norm": 0.6645203432793743, "learning_rate": 1.8629870478917477e-05, "loss": 0.4424, "step": 3437 }, { "epoch": 0.6778391167192429, "grad_norm": 0.5526525607402512, "learning_rate": 1.8629087261758072e-05, "loss": 0.4154, "step": 3438 }, { "epoch": 0.6780362776025236, "grad_norm": 0.5995871293018885, "learning_rate": 1.8628303837277893e-05, "loss": 0.4304, "step": 3439 }, { "epoch": 0.6782334384858044, "grad_norm": 0.5743129816438846, "learning_rate": 1.8627520205495772e-05, "loss": 0.448, "step": 3440 }, { "epoch": 0.6784305993690851, "grad_norm": 0.5829067346816689, "learning_rate": 1.862673636643053e-05, "loss": 0.4211, "step": 3441 }, { "epoch": 0.6786277602523659, "grad_norm": 0.6220895944597309, "learning_rate": 1.8625952320100998e-05, "loss": 0.4241, "step": 3442 }, { "epoch": 0.6788249211356467, "grad_norm": 0.8700470116975232, "learning_rate": 1.8625168066526017e-05, "loss": 0.4635, "step": 3443 }, { "epoch": 0.6790220820189274, "grad_norm": 0.572249467365147, "learning_rate": 1.8624383605724422e-05, "loss": 0.4684, "step": 3444 }, { "epoch": 0.6792192429022083, "grad_norm": 0.5329766253780658, "learning_rate": 1.8623598937715072e-05, "loss": 0.4132, "step": 3445 }, { "epoch": 0.679416403785489, "grad_norm": 0.5378353899707224, "learning_rate": 1.8622814062516807e-05, "loss": 0.4514, "step": 3446 }, { "epoch": 0.6796135646687698, "grad_norm": 0.6043076943443687, "learning_rate": 1.8622028980148494e-05, "loss": 0.4195, "step": 3447 }, { "epoch": 0.6798107255520505, "grad_norm": 0.5415001575820032, "learning_rate": 1.8621243690628993e-05, "loss": 0.4388, "step": 3448 }, { "epoch": 0.6800078864353313, "grad_norm": 0.5715084310965, "learning_rate": 1.8620458193977166e-05, "loss": 0.4377, "step": 3449 }, { "epoch": 0.680205047318612, "grad_norm": 0.5477132137010473, "learning_rate": 1.861967249021189e-05, "loss": 0.4166, "step": 3450 }, { "epoch": 0.6804022082018928, "grad_norm": 0.513438305978723, "learning_rate": 1.861888657935204e-05, "loss": 0.4098, "step": 3451 }, { "epoch": 0.6805993690851735, "grad_norm": 0.7909891794930992, "learning_rate": 1.8618100461416503e-05, "loss": 0.4311, "step": 3452 }, { "epoch": 0.6807965299684543, "grad_norm": 0.5263405808562206, "learning_rate": 1.8617314136424157e-05, "loss": 0.4381, "step": 3453 }, { "epoch": 0.680993690851735, "grad_norm": 0.5858050604116924, "learning_rate": 1.8616527604393903e-05, "loss": 0.4383, "step": 3454 }, { "epoch": 0.6811908517350158, "grad_norm": 0.56256951114871, "learning_rate": 1.8615740865344632e-05, "loss": 0.4517, "step": 3455 }, { "epoch": 0.6813880126182965, "grad_norm": 1.956983000612891, "learning_rate": 1.861495391929525e-05, "loss": 0.4863, "step": 3456 }, { "epoch": 0.6815851735015773, "grad_norm": 224.166458708411, "learning_rate": 1.8614166766264662e-05, "loss": 0.8088, "step": 3457 }, { "epoch": 0.681782334384858, "grad_norm": 0.7066319163990039, "learning_rate": 1.8613379406271784e-05, "loss": 0.4568, "step": 3458 }, { "epoch": 0.6819794952681388, "grad_norm": 0.530318401808118, "learning_rate": 1.8612591839335526e-05, "loss": 0.4104, "step": 3459 }, { "epoch": 0.6821766561514195, "grad_norm": 0.6416231249123863, "learning_rate": 1.861180406547481e-05, "loss": 0.4504, "step": 3460 }, { "epoch": 0.6823738170347003, "grad_norm": 0.6016468271692968, "learning_rate": 1.8611016084708572e-05, "loss": 0.4866, "step": 3461 }, { "epoch": 0.682570977917981, "grad_norm": 0.5665263512082256, "learning_rate": 1.8610227897055736e-05, "loss": 0.4272, "step": 3462 }, { "epoch": 0.6827681388012619, "grad_norm": 0.5951978340725875, "learning_rate": 1.8609439502535244e-05, "loss": 0.47, "step": 3463 }, { "epoch": 0.6829652996845426, "grad_norm": 0.5145073667769414, "learning_rate": 1.8608650901166034e-05, "loss": 0.3968, "step": 3464 }, { "epoch": 0.6831624605678234, "grad_norm": 0.6596358236905893, "learning_rate": 1.8607862092967048e-05, "loss": 0.4825, "step": 3465 }, { "epoch": 0.6833596214511041, "grad_norm": 0.5633284833079664, "learning_rate": 1.8607073077957246e-05, "loss": 0.4295, "step": 3466 }, { "epoch": 0.6835567823343849, "grad_norm": 0.5986090013281019, "learning_rate": 1.8606283856155585e-05, "loss": 0.4758, "step": 3467 }, { "epoch": 0.6837539432176656, "grad_norm": 0.6051379255945182, "learning_rate": 1.8605494427581022e-05, "loss": 0.4411, "step": 3468 }, { "epoch": 0.6839511041009464, "grad_norm": 0.5119522262994246, "learning_rate": 1.8604704792252524e-05, "loss": 0.4063, "step": 3469 }, { "epoch": 0.6841482649842271, "grad_norm": 0.5838237384793697, "learning_rate": 1.8603914950189063e-05, "loss": 0.4083, "step": 3470 }, { "epoch": 0.6843454258675079, "grad_norm": 0.5495244130738517, "learning_rate": 1.860312490140962e-05, "loss": 0.4398, "step": 3471 }, { "epoch": 0.6845425867507886, "grad_norm": 0.8373973638550604, "learning_rate": 1.860233464593317e-05, "loss": 0.4274, "step": 3472 }, { "epoch": 0.6847397476340694, "grad_norm": 0.6095069405419896, "learning_rate": 1.8601544183778707e-05, "loss": 0.4282, "step": 3473 }, { "epoch": 0.6849369085173501, "grad_norm": 0.6313696871900962, "learning_rate": 1.8600753514965215e-05, "loss": 0.4644, "step": 3474 }, { "epoch": 0.6851340694006309, "grad_norm": 0.6331745807987341, "learning_rate": 1.8599962639511692e-05, "loss": 0.4227, "step": 3475 }, { "epoch": 0.6853312302839116, "grad_norm": 0.6271712528145608, "learning_rate": 1.8599171557437147e-05, "loss": 0.4182, "step": 3476 }, { "epoch": 0.6855283911671924, "grad_norm": 0.6822672026358032, "learning_rate": 1.8598380268760573e-05, "loss": 0.4609, "step": 3477 }, { "epoch": 0.6857255520504731, "grad_norm": 0.6394977318604866, "learning_rate": 1.8597588773500997e-05, "loss": 0.4657, "step": 3478 }, { "epoch": 0.685922712933754, "grad_norm": 0.5473583517133394, "learning_rate": 1.8596797071677422e-05, "loss": 0.4207, "step": 3479 }, { "epoch": 0.6861198738170347, "grad_norm": 0.5332831738588625, "learning_rate": 1.8596005163308874e-05, "loss": 0.393, "step": 3480 }, { "epoch": 0.6863170347003155, "grad_norm": 0.5850491469644991, "learning_rate": 1.859521304841438e-05, "loss": 0.4544, "step": 3481 }, { "epoch": 0.6865141955835962, "grad_norm": 0.5298743941736833, "learning_rate": 1.859442072701297e-05, "loss": 0.4558, "step": 3482 }, { "epoch": 0.686711356466877, "grad_norm": 0.5914438587794221, "learning_rate": 1.8593628199123684e-05, "loss": 0.4696, "step": 3483 }, { "epoch": 0.6869085173501577, "grad_norm": 0.5849436915983998, "learning_rate": 1.8592835464765557e-05, "loss": 0.468, "step": 3484 }, { "epoch": 0.6871056782334385, "grad_norm": 0.5517879662711394, "learning_rate": 1.859204252395764e-05, "loss": 0.4404, "step": 3485 }, { "epoch": 0.6873028391167192, "grad_norm": 0.5432311116041397, "learning_rate": 1.8591249376718984e-05, "loss": 0.4182, "step": 3486 }, { "epoch": 0.6875, "grad_norm": 0.5778253507921007, "learning_rate": 1.859045602306864e-05, "loss": 0.4714, "step": 3487 }, { "epoch": 0.6876971608832808, "grad_norm": 0.5626709550212079, "learning_rate": 1.8589662463025674e-05, "loss": 0.4214, "step": 3488 }, { "epoch": 0.6878943217665615, "grad_norm": 0.5828536367056488, "learning_rate": 1.858886869660915e-05, "loss": 0.4687, "step": 3489 }, { "epoch": 0.6880914826498423, "grad_norm": 0.5546642607606428, "learning_rate": 1.8588074723838136e-05, "loss": 0.4452, "step": 3490 }, { "epoch": 0.688288643533123, "grad_norm": 0.5748243298511659, "learning_rate": 1.8587280544731712e-05, "loss": 0.4597, "step": 3491 }, { "epoch": 0.6884858044164038, "grad_norm": 0.5614432197404846, "learning_rate": 1.858648615930896e-05, "loss": 0.4094, "step": 3492 }, { "epoch": 0.6886829652996845, "grad_norm": 0.5310867099166175, "learning_rate": 1.8585691567588964e-05, "loss": 0.4, "step": 3493 }, { "epoch": 0.6888801261829653, "grad_norm": 0.5487952682416996, "learning_rate": 1.858489676959081e-05, "loss": 0.4263, "step": 3494 }, { "epoch": 0.689077287066246, "grad_norm": 0.6769572158367133, "learning_rate": 1.85841017653336e-05, "loss": 0.4654, "step": 3495 }, { "epoch": 0.6892744479495269, "grad_norm": 0.6393020125226531, "learning_rate": 1.8583306554836432e-05, "loss": 0.4583, "step": 3496 }, { "epoch": 0.6894716088328076, "grad_norm": 0.559712814410196, "learning_rate": 1.8582511138118413e-05, "loss": 0.4298, "step": 3497 }, { "epoch": 0.6896687697160884, "grad_norm": 0.602123934295947, "learning_rate": 1.8581715515198652e-05, "loss": 0.4675, "step": 3498 }, { "epoch": 0.6898659305993691, "grad_norm": 0.5733778203272168, "learning_rate": 1.8580919686096263e-05, "loss": 0.4297, "step": 3499 }, { "epoch": 0.6900630914826499, "grad_norm": 0.6829776698050027, "learning_rate": 1.858012365083037e-05, "loss": 0.4478, "step": 3500 }, { "epoch": 0.6902602523659306, "grad_norm": 0.9792198683444031, "learning_rate": 1.8579327409420094e-05, "loss": 0.4772, "step": 3501 }, { "epoch": 0.6904574132492114, "grad_norm": 0.594469172261072, "learning_rate": 1.8578530961884574e-05, "loss": 0.4306, "step": 3502 }, { "epoch": 0.6906545741324921, "grad_norm": 0.6517540399351949, "learning_rate": 1.8577734308242936e-05, "loss": 0.4324, "step": 3503 }, { "epoch": 0.6908517350157729, "grad_norm": 0.5804034407531462, "learning_rate": 1.8576937448514323e-05, "loss": 0.4582, "step": 3504 }, { "epoch": 0.6910488958990536, "grad_norm": 0.5754381054150248, "learning_rate": 1.857614038271788e-05, "loss": 0.4442, "step": 3505 }, { "epoch": 0.6912460567823344, "grad_norm": 0.5394282365890705, "learning_rate": 1.857534311087276e-05, "loss": 0.3904, "step": 3506 }, { "epoch": 0.6914432176656151, "grad_norm": 0.5714584536292786, "learning_rate": 1.8574545632998116e-05, "loss": 0.438, "step": 3507 }, { "epoch": 0.6916403785488959, "grad_norm": 0.6005244403686105, "learning_rate": 1.857374794911311e-05, "loss": 0.4698, "step": 3508 }, { "epoch": 0.6918375394321766, "grad_norm": 0.5475318354156559, "learning_rate": 1.85729500592369e-05, "loss": 0.4425, "step": 3509 }, { "epoch": 0.6920347003154574, "grad_norm": 0.5867289944866695, "learning_rate": 1.857215196338866e-05, "loss": 0.4601, "step": 3510 }, { "epoch": 0.6922318611987381, "grad_norm": 0.5465094788446564, "learning_rate": 1.8571353661587573e-05, "loss": 0.4496, "step": 3511 }, { "epoch": 0.692429022082019, "grad_norm": 0.574286072227518, "learning_rate": 1.8570555153852806e-05, "loss": 0.4449, "step": 3512 }, { "epoch": 0.6926261829652997, "grad_norm": 0.6073127421651645, "learning_rate": 1.8569756440203554e-05, "loss": 0.449, "step": 3513 }, { "epoch": 0.6928233438485805, "grad_norm": 0.5771548485295239, "learning_rate": 1.8568957520659e-05, "loss": 0.4555, "step": 3514 }, { "epoch": 0.6930205047318612, "grad_norm": 0.5445589778122638, "learning_rate": 1.856815839523834e-05, "loss": 0.4319, "step": 3515 }, { "epoch": 0.693217665615142, "grad_norm": 0.5629986928595075, "learning_rate": 1.8567359063960778e-05, "loss": 0.439, "step": 3516 }, { "epoch": 0.6934148264984227, "grad_norm": 0.5596461637975891, "learning_rate": 1.8566559526845512e-05, "loss": 0.4341, "step": 3517 }, { "epoch": 0.6936119873817035, "grad_norm": 0.5375000041425667, "learning_rate": 1.8565759783911756e-05, "loss": 0.3897, "step": 3518 }, { "epoch": 0.6938091482649842, "grad_norm": 0.503604882436781, "learning_rate": 1.8564959835178725e-05, "loss": 0.3891, "step": 3519 }, { "epoch": 0.694006309148265, "grad_norm": 0.6060651930247586, "learning_rate": 1.8564159680665633e-05, "loss": 0.4572, "step": 3520 }, { "epoch": 0.6942034700315457, "grad_norm": 0.5897040744548951, "learning_rate": 1.856335932039171e-05, "loss": 0.4341, "step": 3521 }, { "epoch": 0.6944006309148265, "grad_norm": 0.5726098247448242, "learning_rate": 1.8562558754376182e-05, "loss": 0.4337, "step": 3522 }, { "epoch": 0.6945977917981072, "grad_norm": 0.5241962440051866, "learning_rate": 1.8561757982638285e-05, "loss": 0.4222, "step": 3523 }, { "epoch": 0.694794952681388, "grad_norm": 0.5767874135195625, "learning_rate": 1.856095700519726e-05, "loss": 0.4669, "step": 3524 }, { "epoch": 0.6949921135646687, "grad_norm": 0.605586730484475, "learning_rate": 1.856015582207235e-05, "loss": 0.4965, "step": 3525 }, { "epoch": 0.6951892744479495, "grad_norm": 0.5293811874847645, "learning_rate": 1.8559354433282795e-05, "loss": 0.4187, "step": 3526 }, { "epoch": 0.6953864353312302, "grad_norm": 0.557624349231964, "learning_rate": 1.8558552838847862e-05, "loss": 0.4413, "step": 3527 }, { "epoch": 0.695583596214511, "grad_norm": 0.5724753571082727, "learning_rate": 1.8557751038786807e-05, "loss": 0.4325, "step": 3528 }, { "epoch": 0.6957807570977917, "grad_norm": 0.5642589878108021, "learning_rate": 1.8556949033118886e-05, "loss": 0.4346, "step": 3529 }, { "epoch": 0.6959779179810726, "grad_norm": 0.587492971862397, "learning_rate": 1.855614682186338e-05, "loss": 0.4709, "step": 3530 }, { "epoch": 0.6961750788643533, "grad_norm": 0.5465280182977807, "learning_rate": 1.8555344405039553e-05, "loss": 0.4184, "step": 3531 }, { "epoch": 0.6963722397476341, "grad_norm": 0.6480278193209652, "learning_rate": 1.8554541782666685e-05, "loss": 0.4649, "step": 3532 }, { "epoch": 0.6965694006309149, "grad_norm": 0.5135839915061162, "learning_rate": 1.8553738954764068e-05, "loss": 0.4147, "step": 3533 }, { "epoch": 0.6967665615141956, "grad_norm": 0.5495932553673605, "learning_rate": 1.855293592135098e-05, "loss": 0.3947, "step": 3534 }, { "epoch": 0.6969637223974764, "grad_norm": 0.5759718073274976, "learning_rate": 1.8552132682446716e-05, "loss": 0.4613, "step": 3535 }, { "epoch": 0.6971608832807571, "grad_norm": 0.5693917333392535, "learning_rate": 1.8551329238070583e-05, "loss": 0.4661, "step": 3536 }, { "epoch": 0.6973580441640379, "grad_norm": 0.633091177062187, "learning_rate": 1.8550525588241878e-05, "loss": 0.4114, "step": 3537 }, { "epoch": 0.6975552050473186, "grad_norm": 0.5719956129908553, "learning_rate": 1.8549721732979904e-05, "loss": 0.4638, "step": 3538 }, { "epoch": 0.6977523659305994, "grad_norm": 0.5734766418241823, "learning_rate": 1.8548917672303987e-05, "loss": 0.4279, "step": 3539 }, { "epoch": 0.6979495268138801, "grad_norm": 0.5817240921924031, "learning_rate": 1.8548113406233436e-05, "loss": 0.4595, "step": 3540 }, { "epoch": 0.6981466876971609, "grad_norm": 0.5498165766780686, "learning_rate": 1.8547308934787576e-05, "loss": 0.4183, "step": 3541 }, { "epoch": 0.6983438485804416, "grad_norm": 0.5771901083506239, "learning_rate": 1.8546504257985738e-05, "loss": 0.4399, "step": 3542 }, { "epoch": 0.6985410094637224, "grad_norm": 0.8880467198374812, "learning_rate": 1.8545699375847247e-05, "loss": 0.4673, "step": 3543 }, { "epoch": 0.6987381703470031, "grad_norm": 0.8215695607693159, "learning_rate": 1.8544894288391452e-05, "loss": 0.3944, "step": 3544 }, { "epoch": 0.698935331230284, "grad_norm": 2.190816475198652, "learning_rate": 1.8544088995637693e-05, "loss": 0.43, "step": 3545 }, { "epoch": 0.6991324921135647, "grad_norm": 0.6619407960935211, "learning_rate": 1.854328349760531e-05, "loss": 0.474, "step": 3546 }, { "epoch": 0.6993296529968455, "grad_norm": 0.7707630061873859, "learning_rate": 1.8542477794313662e-05, "loss": 0.4369, "step": 3547 }, { "epoch": 0.6995268138801262, "grad_norm": 0.6668874413715213, "learning_rate": 1.8541671885782106e-05, "loss": 0.4226, "step": 3548 }, { "epoch": 0.699723974763407, "grad_norm": 0.5933363551194153, "learning_rate": 1.8540865772030004e-05, "loss": 0.44, "step": 3549 }, { "epoch": 0.6999211356466877, "grad_norm": 0.5411012362808466, "learning_rate": 1.8540059453076728e-05, "loss": 0.4505, "step": 3550 }, { "epoch": 0.7001182965299685, "grad_norm": 2.092070387748023, "learning_rate": 1.853925292894164e-05, "loss": 0.4816, "step": 3551 }, { "epoch": 0.7003154574132492, "grad_norm": 0.6015879767455612, "learning_rate": 1.853844619964413e-05, "loss": 0.4578, "step": 3552 }, { "epoch": 0.70051261829653, "grad_norm": 0.6227707549366479, "learning_rate": 1.853763926520357e-05, "loss": 0.4621, "step": 3553 }, { "epoch": 0.7007097791798107, "grad_norm": 0.6158286256739309, "learning_rate": 1.8536832125639353e-05, "loss": 0.4615, "step": 3554 }, { "epoch": 0.7009069400630915, "grad_norm": 4.099666789727112, "learning_rate": 1.8536024780970868e-05, "loss": 0.4471, "step": 3555 }, { "epoch": 0.7011041009463722, "grad_norm": 1.3865107372029537, "learning_rate": 1.8535217231217512e-05, "loss": 0.4835, "step": 3556 }, { "epoch": 0.701301261829653, "grad_norm": 0.6437956304730564, "learning_rate": 1.8534409476398693e-05, "loss": 0.4976, "step": 3557 }, { "epoch": 0.7014984227129337, "grad_norm": 0.809900272846465, "learning_rate": 1.853360151653381e-05, "loss": 0.4337, "step": 3558 }, { "epoch": 0.7016955835962145, "grad_norm": 0.5732397543697593, "learning_rate": 1.8532793351642283e-05, "loss": 0.447, "step": 3559 }, { "epoch": 0.7018927444794952, "grad_norm": 0.6180992287814596, "learning_rate": 1.853198498174352e-05, "loss": 0.4483, "step": 3560 }, { "epoch": 0.702089905362776, "grad_norm": 0.5836355816115922, "learning_rate": 1.853117640685695e-05, "loss": 0.4698, "step": 3561 }, { "epoch": 0.7022870662460567, "grad_norm": 0.6266470149737501, "learning_rate": 1.853036762700199e-05, "loss": 0.4526, "step": 3562 }, { "epoch": 0.7024842271293376, "grad_norm": 0.5545308522500857, "learning_rate": 1.8529558642198085e-05, "loss": 0.4275, "step": 3563 }, { "epoch": 0.7026813880126183, "grad_norm": 0.5997846025265139, "learning_rate": 1.8528749452464667e-05, "loss": 0.4495, "step": 3564 }, { "epoch": 0.7028785488958991, "grad_norm": 0.5932966298805183, "learning_rate": 1.8527940057821168e-05, "loss": 0.4404, "step": 3565 }, { "epoch": 0.7030757097791798, "grad_norm": 0.6264233832414875, "learning_rate": 1.8527130458287047e-05, "loss": 0.4668, "step": 3566 }, { "epoch": 0.7032728706624606, "grad_norm": 0.5422421411631603, "learning_rate": 1.8526320653881745e-05, "loss": 0.4352, "step": 3567 }, { "epoch": 0.7034700315457413, "grad_norm": 0.5335967174546794, "learning_rate": 1.8525510644624726e-05, "loss": 0.4091, "step": 3568 }, { "epoch": 0.7036671924290221, "grad_norm": 0.8352069980523148, "learning_rate": 1.852470043053545e-05, "loss": 0.4723, "step": 3569 }, { "epoch": 0.7038643533123028, "grad_norm": 2.2486627770555585, "learning_rate": 1.8523890011633377e-05, "loss": 0.4209, "step": 3570 }, { "epoch": 0.7040615141955836, "grad_norm": 0.5577369897883039, "learning_rate": 1.8523079387937984e-05, "loss": 0.4209, "step": 3571 }, { "epoch": 0.7042586750788643, "grad_norm": 0.7637425020315376, "learning_rate": 1.8522268559468744e-05, "loss": 0.44, "step": 3572 }, { "epoch": 0.7044558359621451, "grad_norm": 0.6449349203733895, "learning_rate": 1.8521457526245142e-05, "loss": 0.4603, "step": 3573 }, { "epoch": 0.7046529968454258, "grad_norm": 0.5791189282772926, "learning_rate": 1.852064628828666e-05, "loss": 0.4327, "step": 3574 }, { "epoch": 0.7048501577287066, "grad_norm": 0.7439087018417354, "learning_rate": 1.851983484561279e-05, "loss": 0.4688, "step": 3575 }, { "epoch": 0.7050473186119873, "grad_norm": 0.5678987372658572, "learning_rate": 1.8519023198243023e-05, "loss": 0.4532, "step": 3576 }, { "epoch": 0.7052444794952681, "grad_norm": 0.6893907579878671, "learning_rate": 1.8518211346196865e-05, "loss": 0.4482, "step": 3577 }, { "epoch": 0.705441640378549, "grad_norm": 0.6063321929451956, "learning_rate": 1.851739928949382e-05, "loss": 0.452, "step": 3578 }, { "epoch": 0.7056388012618297, "grad_norm": 0.5937895590659021, "learning_rate": 1.85165870281534e-05, "loss": 0.4067, "step": 3579 }, { "epoch": 0.7058359621451105, "grad_norm": 0.571875582830355, "learning_rate": 1.8515774562195115e-05, "loss": 0.4296, "step": 3580 }, { "epoch": 0.7060331230283912, "grad_norm": 0.6350546066367247, "learning_rate": 1.851496189163849e-05, "loss": 0.4802, "step": 3581 }, { "epoch": 0.706230283911672, "grad_norm": 0.5728556146008228, "learning_rate": 1.8514149016503048e-05, "loss": 0.396, "step": 3582 }, { "epoch": 0.7064274447949527, "grad_norm": 0.5607106332513473, "learning_rate": 1.851333593680832e-05, "loss": 0.395, "step": 3583 }, { "epoch": 0.7066246056782335, "grad_norm": 0.6303565279440619, "learning_rate": 1.851252265257384e-05, "loss": 0.4606, "step": 3584 }, { "epoch": 0.7068217665615142, "grad_norm": 0.5656202726110657, "learning_rate": 1.8511709163819146e-05, "loss": 0.4347, "step": 3585 }, { "epoch": 0.707018927444795, "grad_norm": 0.5691199982028473, "learning_rate": 1.851089547056379e-05, "loss": 0.4799, "step": 3586 }, { "epoch": 0.7072160883280757, "grad_norm": 0.5132663071610697, "learning_rate": 1.851008157282731e-05, "loss": 0.4254, "step": 3587 }, { "epoch": 0.7074132492113565, "grad_norm": 0.599385661554487, "learning_rate": 1.8509267470629275e-05, "loss": 0.4807, "step": 3588 }, { "epoch": 0.7076104100946372, "grad_norm": 0.5953814395962257, "learning_rate": 1.850845316398923e-05, "loss": 0.4638, "step": 3589 }, { "epoch": 0.707807570977918, "grad_norm": 0.5296035745748813, "learning_rate": 1.8507638652926748e-05, "loss": 0.4341, "step": 3590 }, { "epoch": 0.7080047318611987, "grad_norm": 0.5836957939032046, "learning_rate": 1.85068239374614e-05, "loss": 0.4664, "step": 3591 }, { "epoch": 0.7082018927444795, "grad_norm": 0.5383659896543327, "learning_rate": 1.8506009017612752e-05, "loss": 0.4342, "step": 3592 }, { "epoch": 0.7083990536277602, "grad_norm": 0.5802394458133452, "learning_rate": 1.850519389340039e-05, "loss": 0.4459, "step": 3593 }, { "epoch": 0.708596214511041, "grad_norm": 0.6351661320355233, "learning_rate": 1.850437856484389e-05, "loss": 0.4401, "step": 3594 }, { "epoch": 0.7087933753943217, "grad_norm": 0.5835902309373062, "learning_rate": 1.850356303196285e-05, "loss": 0.4412, "step": 3595 }, { "epoch": 0.7089905362776026, "grad_norm": 0.5574546489985274, "learning_rate": 1.850274729477686e-05, "loss": 0.4558, "step": 3596 }, { "epoch": 0.7091876971608833, "grad_norm": 0.6989042029450458, "learning_rate": 1.850193135330552e-05, "loss": 0.4467, "step": 3597 }, { "epoch": 0.7093848580441641, "grad_norm": 0.5679323561115319, "learning_rate": 1.850111520756843e-05, "loss": 0.4235, "step": 3598 }, { "epoch": 0.7095820189274448, "grad_norm": 2.385763474041582, "learning_rate": 1.8500298857585207e-05, "loss": 0.442, "step": 3599 }, { "epoch": 0.7097791798107256, "grad_norm": 0.6566291998053769, "learning_rate": 1.8499482303375454e-05, "loss": 0.4537, "step": 3600 }, { "epoch": 0.7099763406940063, "grad_norm": 0.5737937851393634, "learning_rate": 1.8498665544958793e-05, "loss": 0.4083, "step": 3601 }, { "epoch": 0.7101735015772871, "grad_norm": 0.5528769295761875, "learning_rate": 1.8497848582354852e-05, "loss": 0.4165, "step": 3602 }, { "epoch": 0.7103706624605678, "grad_norm": 0.5798728107229193, "learning_rate": 1.8497031415583252e-05, "loss": 0.4289, "step": 3603 }, { "epoch": 0.7105678233438486, "grad_norm": 0.5389476339142518, "learning_rate": 1.8496214044663633e-05, "loss": 0.4062, "step": 3604 }, { "epoch": 0.7107649842271293, "grad_norm": 0.6195250954116948, "learning_rate": 1.8495396469615627e-05, "loss": 0.4685, "step": 3605 }, { "epoch": 0.7109621451104101, "grad_norm": 0.6193545738894738, "learning_rate": 1.849457869045888e-05, "loss": 0.4704, "step": 3606 }, { "epoch": 0.7111593059936908, "grad_norm": 0.5963823991477826, "learning_rate": 1.849376070721304e-05, "loss": 0.4376, "step": 3607 }, { "epoch": 0.7113564668769716, "grad_norm": 0.5649286189482374, "learning_rate": 1.849294251989776e-05, "loss": 0.442, "step": 3608 }, { "epoch": 0.7115536277602523, "grad_norm": 0.5655868715091625, "learning_rate": 1.8492124128532697e-05, "loss": 0.4472, "step": 3609 }, { "epoch": 0.7117507886435331, "grad_norm": 0.5624574778497905, "learning_rate": 1.849130553313751e-05, "loss": 0.4198, "step": 3610 }, { "epoch": 0.7119479495268138, "grad_norm": 0.5918835988067689, "learning_rate": 1.849048673373187e-05, "loss": 0.4553, "step": 3611 }, { "epoch": 0.7121451104100947, "grad_norm": 0.5450609283163641, "learning_rate": 1.848966773033545e-05, "loss": 0.408, "step": 3612 }, { "epoch": 0.7123422712933754, "grad_norm": 0.6211662601865566, "learning_rate": 1.8488848522967926e-05, "loss": 0.4925, "step": 3613 }, { "epoch": 0.7125394321766562, "grad_norm": 0.9407135144432643, "learning_rate": 1.848802911164898e-05, "loss": 0.4563, "step": 3614 }, { "epoch": 0.7127365930599369, "grad_norm": 0.6364553089373497, "learning_rate": 1.8487209496398298e-05, "loss": 0.4556, "step": 3615 }, { "epoch": 0.7129337539432177, "grad_norm": 0.6193764318415926, "learning_rate": 1.8486389677235577e-05, "loss": 0.5091, "step": 3616 }, { "epoch": 0.7131309148264984, "grad_norm": 0.5810125836188512, "learning_rate": 1.8485569654180506e-05, "loss": 0.4331, "step": 3617 }, { "epoch": 0.7133280757097792, "grad_norm": 0.5647496664846747, "learning_rate": 1.8484749427252794e-05, "loss": 0.4669, "step": 3618 }, { "epoch": 0.7135252365930599, "grad_norm": 0.5966913216409047, "learning_rate": 1.848392899647214e-05, "loss": 0.4058, "step": 3619 }, { "epoch": 0.7137223974763407, "grad_norm": 0.5710136188088236, "learning_rate": 1.8483108361858263e-05, "loss": 0.4354, "step": 3620 }, { "epoch": 0.7139195583596214, "grad_norm": 0.5532446856241241, "learning_rate": 1.8482287523430876e-05, "loss": 0.4198, "step": 3621 }, { "epoch": 0.7141167192429022, "grad_norm": 0.6722276257279677, "learning_rate": 1.8481466481209696e-05, "loss": 0.46, "step": 3622 }, { "epoch": 0.714313880126183, "grad_norm": 0.5639143923716924, "learning_rate": 1.848064523521446e-05, "loss": 0.47, "step": 3623 }, { "epoch": 0.7145110410094637, "grad_norm": 0.651640797548847, "learning_rate": 1.847982378546489e-05, "loss": 0.4575, "step": 3624 }, { "epoch": 0.7147082018927445, "grad_norm": 0.6678536066940324, "learning_rate": 1.8479002131980726e-05, "loss": 0.4547, "step": 3625 }, { "epoch": 0.7149053627760252, "grad_norm": 0.5640299919371896, "learning_rate": 1.8478180274781707e-05, "loss": 0.4302, "step": 3626 }, { "epoch": 0.715102523659306, "grad_norm": 0.5625943114415008, "learning_rate": 1.8477358213887578e-05, "loss": 0.4295, "step": 3627 }, { "epoch": 0.7152996845425867, "grad_norm": 0.8717418579189087, "learning_rate": 1.8476535949318092e-05, "loss": 0.4288, "step": 3628 }, { "epoch": 0.7154968454258676, "grad_norm": 0.7111204636633626, "learning_rate": 1.8475713481093005e-05, "loss": 0.4779, "step": 3629 }, { "epoch": 0.7156940063091483, "grad_norm": 0.5934289467421391, "learning_rate": 1.8474890809232073e-05, "loss": 0.4132, "step": 3630 }, { "epoch": 0.7158911671924291, "grad_norm": 0.5316183763624316, "learning_rate": 1.8474067933755067e-05, "loss": 0.4343, "step": 3631 }, { "epoch": 0.7160883280757098, "grad_norm": 0.6347964274352421, "learning_rate": 1.8473244854681755e-05, "loss": 0.4702, "step": 3632 }, { "epoch": 0.7162854889589906, "grad_norm": 0.5307412228255478, "learning_rate": 1.847242157203191e-05, "loss": 0.4272, "step": 3633 }, { "epoch": 0.7164826498422713, "grad_norm": 0.6030911118878556, "learning_rate": 1.8471598085825318e-05, "loss": 0.4644, "step": 3634 }, { "epoch": 0.7166798107255521, "grad_norm": 0.5631771711931902, "learning_rate": 1.8470774396081756e-05, "loss": 0.445, "step": 3635 }, { "epoch": 0.7168769716088328, "grad_norm": 0.528315324776306, "learning_rate": 1.846995050282102e-05, "loss": 0.413, "step": 3636 }, { "epoch": 0.7170741324921136, "grad_norm": 0.542371055858913, "learning_rate": 1.84691264060629e-05, "loss": 0.4628, "step": 3637 }, { "epoch": 0.7172712933753943, "grad_norm": 0.5752445019945468, "learning_rate": 1.8468302105827195e-05, "loss": 0.4715, "step": 3638 }, { "epoch": 0.7174684542586751, "grad_norm": 0.5556213004935148, "learning_rate": 1.8467477602133716e-05, "loss": 0.451, "step": 3639 }, { "epoch": 0.7176656151419558, "grad_norm": 0.5529878854385556, "learning_rate": 1.8466652895002272e-05, "loss": 0.4484, "step": 3640 }, { "epoch": 0.7178627760252366, "grad_norm": 0.5491695139904219, "learning_rate": 1.846582798445267e-05, "loss": 0.4218, "step": 3641 }, { "epoch": 0.7180599369085173, "grad_norm": 0.5467401669297904, "learning_rate": 1.8465002870504734e-05, "loss": 0.4229, "step": 3642 }, { "epoch": 0.7182570977917981, "grad_norm": 0.557372492282451, "learning_rate": 1.8464177553178287e-05, "loss": 0.426, "step": 3643 }, { "epoch": 0.7184542586750788, "grad_norm": 0.5362074017531132, "learning_rate": 1.8463352032493162e-05, "loss": 0.41, "step": 3644 }, { "epoch": 0.7186514195583596, "grad_norm": 0.5685836955448099, "learning_rate": 1.8462526308469182e-05, "loss": 0.4272, "step": 3645 }, { "epoch": 0.7188485804416404, "grad_norm": 0.5370351970866275, "learning_rate": 1.8461700381126198e-05, "loss": 0.4278, "step": 3646 }, { "epoch": 0.7190457413249212, "grad_norm": 0.5227153310031122, "learning_rate": 1.8460874250484045e-05, "loss": 0.4478, "step": 3647 }, { "epoch": 0.7192429022082019, "grad_norm": 0.5582355226303339, "learning_rate": 1.8460047916562573e-05, "loss": 0.412, "step": 3648 }, { "epoch": 0.7194400630914827, "grad_norm": 0.9636937931489509, "learning_rate": 1.845922137938164e-05, "loss": 0.4217, "step": 3649 }, { "epoch": 0.7196372239747634, "grad_norm": 0.5507780948989435, "learning_rate": 1.8458394638961102e-05, "loss": 0.46, "step": 3650 }, { "epoch": 0.7198343848580442, "grad_norm": 0.5726098792524265, "learning_rate": 1.845756769532082e-05, "loss": 0.4523, "step": 3651 }, { "epoch": 0.7200315457413249, "grad_norm": 0.5717699113943913, "learning_rate": 1.8456740548480666e-05, "loss": 0.4571, "step": 3652 }, { "epoch": 0.7202287066246057, "grad_norm": 0.6191561110356281, "learning_rate": 1.8455913198460503e-05, "loss": 0.4511, "step": 3653 }, { "epoch": 0.7204258675078864, "grad_norm": 0.707869839857266, "learning_rate": 1.845508564528022e-05, "loss": 0.4854, "step": 3654 }, { "epoch": 0.7206230283911672, "grad_norm": 0.5271413420379327, "learning_rate": 1.8454257888959695e-05, "loss": 0.4362, "step": 3655 }, { "epoch": 0.7208201892744479, "grad_norm": 0.6354322809953766, "learning_rate": 1.845342992951882e-05, "loss": 0.4481, "step": 3656 }, { "epoch": 0.7210173501577287, "grad_norm": 0.5688074081822942, "learning_rate": 1.845260176697748e-05, "loss": 0.4423, "step": 3657 }, { "epoch": 0.7212145110410094, "grad_norm": 0.5351090443779083, "learning_rate": 1.8451773401355576e-05, "loss": 0.4057, "step": 3658 }, { "epoch": 0.7214116719242902, "grad_norm": 0.550968760823806, "learning_rate": 1.845094483267301e-05, "loss": 0.4556, "step": 3659 }, { "epoch": 0.7216088328075709, "grad_norm": 0.5487359171002028, "learning_rate": 1.845011606094969e-05, "loss": 0.4499, "step": 3660 }, { "epoch": 0.7218059936908517, "grad_norm": 0.544581400232813, "learning_rate": 1.8449287086205525e-05, "loss": 0.4242, "step": 3661 }, { "epoch": 0.7220031545741324, "grad_norm": 0.5414669158249051, "learning_rate": 1.8448457908460434e-05, "loss": 0.4179, "step": 3662 }, { "epoch": 0.7222003154574133, "grad_norm": 0.5796223289254395, "learning_rate": 1.844762852773434e-05, "loss": 0.4388, "step": 3663 }, { "epoch": 0.722397476340694, "grad_norm": 0.5501910206556003, "learning_rate": 1.8446798944047163e-05, "loss": 0.4614, "step": 3664 }, { "epoch": 0.7225946372239748, "grad_norm": 0.7117890177693054, "learning_rate": 1.8445969157418845e-05, "loss": 0.4557, "step": 3665 }, { "epoch": 0.7227917981072555, "grad_norm": 0.5801289426969807, "learning_rate": 1.844513916786931e-05, "loss": 0.4469, "step": 3666 }, { "epoch": 0.7229889589905363, "grad_norm": 0.565154672928485, "learning_rate": 1.844430897541851e-05, "loss": 0.411, "step": 3667 }, { "epoch": 0.723186119873817, "grad_norm": 0.5512031287037732, "learning_rate": 1.8443478580086388e-05, "loss": 0.4382, "step": 3668 }, { "epoch": 0.7233832807570978, "grad_norm": 0.5610735735424272, "learning_rate": 1.844264798189289e-05, "loss": 0.4713, "step": 3669 }, { "epoch": 0.7235804416403786, "grad_norm": 0.5706453195319626, "learning_rate": 1.8441817180857977e-05, "loss": 0.4797, "step": 3670 }, { "epoch": 0.7237776025236593, "grad_norm": 0.5321887393174666, "learning_rate": 1.844098617700161e-05, "loss": 0.4554, "step": 3671 }, { "epoch": 0.7239747634069401, "grad_norm": 0.54415284289807, "learning_rate": 1.8440154970343747e-05, "loss": 0.4344, "step": 3672 }, { "epoch": 0.7241719242902208, "grad_norm": 0.5250957449330941, "learning_rate": 1.843932356090437e-05, "loss": 0.4252, "step": 3673 }, { "epoch": 0.7243690851735016, "grad_norm": 0.5266852693718858, "learning_rate": 1.8438491948703445e-05, "loss": 0.4257, "step": 3674 }, { "epoch": 0.7245662460567823, "grad_norm": 0.5325304829293485, "learning_rate": 1.8437660133760955e-05, "loss": 0.4373, "step": 3675 }, { "epoch": 0.7247634069400631, "grad_norm": 0.5793560541922426, "learning_rate": 1.8436828116096886e-05, "loss": 0.4349, "step": 3676 }, { "epoch": 0.7249605678233438, "grad_norm": 0.560121961862576, "learning_rate": 1.843599589573123e-05, "loss": 0.4494, "step": 3677 }, { "epoch": 0.7251577287066246, "grad_norm": 0.5249351610445621, "learning_rate": 1.843516347268398e-05, "loss": 0.4246, "step": 3678 }, { "epoch": 0.7253548895899053, "grad_norm": 0.5412985163854938, "learning_rate": 1.8434330846975128e-05, "loss": 0.4273, "step": 3679 }, { "epoch": 0.7255520504731862, "grad_norm": 0.5315287597533882, "learning_rate": 1.843349801862469e-05, "loss": 0.4564, "step": 3680 }, { "epoch": 0.7257492113564669, "grad_norm": 0.5682525884568123, "learning_rate": 1.843266498765267e-05, "loss": 0.4446, "step": 3681 }, { "epoch": 0.7259463722397477, "grad_norm": 0.5374471401172799, "learning_rate": 1.8431831754079084e-05, "loss": 0.4586, "step": 3682 }, { "epoch": 0.7261435331230284, "grad_norm": 0.5687422399150629, "learning_rate": 1.843099831792395e-05, "loss": 0.4518, "step": 3683 }, { "epoch": 0.7263406940063092, "grad_norm": 0.5448483856895682, "learning_rate": 1.843016467920729e-05, "loss": 0.4522, "step": 3684 }, { "epoch": 0.7265378548895899, "grad_norm": 0.5303752136858124, "learning_rate": 1.8429330837949134e-05, "loss": 0.4303, "step": 3685 }, { "epoch": 0.7267350157728707, "grad_norm": 0.5441603901459253, "learning_rate": 1.842849679416952e-05, "loss": 0.4265, "step": 3686 }, { "epoch": 0.7269321766561514, "grad_norm": 0.5565608021825121, "learning_rate": 1.842766254788848e-05, "loss": 0.4704, "step": 3687 }, { "epoch": 0.7271293375394322, "grad_norm": 0.5392512013397398, "learning_rate": 1.8426828099126058e-05, "loss": 0.4282, "step": 3688 }, { "epoch": 0.7273264984227129, "grad_norm": 0.5228171340025033, "learning_rate": 1.8425993447902312e-05, "loss": 0.4262, "step": 3689 }, { "epoch": 0.7275236593059937, "grad_norm": 0.5057395283789563, "learning_rate": 1.8425158594237285e-05, "loss": 0.3781, "step": 3690 }, { "epoch": 0.7277208201892744, "grad_norm": 0.5637585116791193, "learning_rate": 1.8424323538151038e-05, "loss": 0.4578, "step": 3691 }, { "epoch": 0.7279179810725552, "grad_norm": 0.5770703957710751, "learning_rate": 1.842348827966363e-05, "loss": 0.4474, "step": 3692 }, { "epoch": 0.7281151419558359, "grad_norm": 0.5757553217171197, "learning_rate": 1.8422652818795136e-05, "loss": 0.4266, "step": 3693 }, { "epoch": 0.7283123028391167, "grad_norm": 0.5212848130468406, "learning_rate": 1.8421817155565627e-05, "loss": 0.4297, "step": 3694 }, { "epoch": 0.7285094637223974, "grad_norm": 0.6230547006563527, "learning_rate": 1.8420981289995174e-05, "loss": 0.5156, "step": 3695 }, { "epoch": 0.7287066246056783, "grad_norm": 0.5232291475507522, "learning_rate": 1.842014522210387e-05, "loss": 0.4114, "step": 3696 }, { "epoch": 0.728903785488959, "grad_norm": 0.5347786514736478, "learning_rate": 1.841930895191179e-05, "loss": 0.4235, "step": 3697 }, { "epoch": 0.7291009463722398, "grad_norm": 0.5462129983268524, "learning_rate": 1.841847247943904e-05, "loss": 0.4421, "step": 3698 }, { "epoch": 0.7292981072555205, "grad_norm": 0.49073103119405975, "learning_rate": 1.84176358047057e-05, "loss": 0.3842, "step": 3699 }, { "epoch": 0.7294952681388013, "grad_norm": 0.6150333675068365, "learning_rate": 1.8416798927731888e-05, "loss": 0.4886, "step": 3700 }, { "epoch": 0.729692429022082, "grad_norm": 0.5361700966964398, "learning_rate": 1.8415961848537702e-05, "loss": 0.4386, "step": 3701 }, { "epoch": 0.7298895899053628, "grad_norm": 0.5347576067338814, "learning_rate": 1.8415124567143258e-05, "loss": 0.4184, "step": 3702 }, { "epoch": 0.7300867507886435, "grad_norm": 0.5621215334987752, "learning_rate": 1.8414287083568666e-05, "loss": 0.4261, "step": 3703 }, { "epoch": 0.7302839116719243, "grad_norm": 0.5700646061060279, "learning_rate": 1.841344939783405e-05, "loss": 0.4179, "step": 3704 }, { "epoch": 0.730481072555205, "grad_norm": 0.518131991780773, "learning_rate": 1.841261150995954e-05, "loss": 0.4339, "step": 3705 }, { "epoch": 0.7306782334384858, "grad_norm": 0.5459761294030725, "learning_rate": 1.8411773419965263e-05, "loss": 0.397, "step": 3706 }, { "epoch": 0.7308753943217665, "grad_norm": 0.9314288519022254, "learning_rate": 1.8410935127871356e-05, "loss": 0.442, "step": 3707 }, { "epoch": 0.7310725552050473, "grad_norm": 0.5636871116155396, "learning_rate": 1.8410096633697956e-05, "loss": 0.4173, "step": 3708 }, { "epoch": 0.731269716088328, "grad_norm": 0.5193160515915649, "learning_rate": 1.8409257937465216e-05, "loss": 0.3913, "step": 3709 }, { "epoch": 0.7314668769716088, "grad_norm": 0.712727581510582, "learning_rate": 1.840841903919328e-05, "loss": 0.4502, "step": 3710 }, { "epoch": 0.7316640378548895, "grad_norm": 0.5769818802867137, "learning_rate": 1.8407579938902302e-05, "loss": 0.4527, "step": 3711 }, { "epoch": 0.7318611987381703, "grad_norm": 0.5392705119979897, "learning_rate": 1.8406740636612447e-05, "loss": 0.4538, "step": 3712 }, { "epoch": 0.732058359621451, "grad_norm": 0.6098765553285055, "learning_rate": 1.8405901132343882e-05, "loss": 0.4413, "step": 3713 }, { "epoch": 0.7322555205047319, "grad_norm": 0.5771216137328318, "learning_rate": 1.840506142611677e-05, "loss": 0.4454, "step": 3714 }, { "epoch": 0.7324526813880127, "grad_norm": 0.62152619615878, "learning_rate": 1.840422151795129e-05, "loss": 0.4713, "step": 3715 }, { "epoch": 0.7326498422712934, "grad_norm": 0.5895477541474831, "learning_rate": 1.840338140786762e-05, "loss": 0.4415, "step": 3716 }, { "epoch": 0.7328470031545742, "grad_norm": 0.7408210800785281, "learning_rate": 1.8402541095885943e-05, "loss": 0.4733, "step": 3717 }, { "epoch": 0.7330441640378549, "grad_norm": 0.56864244318536, "learning_rate": 1.8401700582026452e-05, "loss": 0.4047, "step": 3718 }, { "epoch": 0.7332413249211357, "grad_norm": 0.6695180859601412, "learning_rate": 1.8400859866309337e-05, "loss": 0.4215, "step": 3719 }, { "epoch": 0.7334384858044164, "grad_norm": 0.6556762484776179, "learning_rate": 1.84000189487548e-05, "loss": 0.5069, "step": 3720 }, { "epoch": 0.7336356466876972, "grad_norm": 0.5610750056727031, "learning_rate": 1.8399177829383043e-05, "loss": 0.4505, "step": 3721 }, { "epoch": 0.7338328075709779, "grad_norm": 0.5779111361169296, "learning_rate": 1.839833650821427e-05, "loss": 0.4109, "step": 3722 }, { "epoch": 0.7340299684542587, "grad_norm": 0.5512782925585601, "learning_rate": 1.8397494985268705e-05, "loss": 0.4559, "step": 3723 }, { "epoch": 0.7342271293375394, "grad_norm": 0.5691768441468529, "learning_rate": 1.839665326056656e-05, "loss": 0.4112, "step": 3724 }, { "epoch": 0.7344242902208202, "grad_norm": 0.5624881189319345, "learning_rate": 1.8395811334128058e-05, "loss": 0.4293, "step": 3725 }, { "epoch": 0.7346214511041009, "grad_norm": 0.7923653826550998, "learning_rate": 1.8394969205973426e-05, "loss": 0.418, "step": 3726 }, { "epoch": 0.7348186119873817, "grad_norm": 0.5815459567626416, "learning_rate": 1.8394126876122896e-05, "loss": 0.4526, "step": 3727 }, { "epoch": 0.7350157728706624, "grad_norm": 0.6090000924351008, "learning_rate": 1.8393284344596715e-05, "loss": 0.4544, "step": 3728 }, { "epoch": 0.7352129337539433, "grad_norm": 0.5232417470669563, "learning_rate": 1.8392441611415113e-05, "loss": 0.4496, "step": 3729 }, { "epoch": 0.735410094637224, "grad_norm": 0.7399663560330513, "learning_rate": 1.8391598676598344e-05, "loss": 0.4242, "step": 3730 }, { "epoch": 0.7356072555205048, "grad_norm": 0.5615281033101944, "learning_rate": 1.8390755540166663e-05, "loss": 0.4376, "step": 3731 }, { "epoch": 0.7358044164037855, "grad_norm": 0.599463559470186, "learning_rate": 1.8389912202140318e-05, "loss": 0.446, "step": 3732 }, { "epoch": 0.7360015772870663, "grad_norm": 0.5377177978774068, "learning_rate": 1.838906866253958e-05, "loss": 0.4516, "step": 3733 }, { "epoch": 0.736198738170347, "grad_norm": 0.5681460967594567, "learning_rate": 1.8388224921384707e-05, "loss": 0.4066, "step": 3734 }, { "epoch": 0.7363958990536278, "grad_norm": 0.5141694762603598, "learning_rate": 1.8387380978695977e-05, "loss": 0.4322, "step": 3735 }, { "epoch": 0.7365930599369085, "grad_norm": 0.8238886062154376, "learning_rate": 1.8386536834493667e-05, "loss": 0.4507, "step": 3736 }, { "epoch": 0.7367902208201893, "grad_norm": 0.535199442346653, "learning_rate": 1.8385692488798056e-05, "loss": 0.4553, "step": 3737 }, { "epoch": 0.73698738170347, "grad_norm": 0.6483426681293745, "learning_rate": 1.8384847941629423e-05, "loss": 0.4634, "step": 3738 }, { "epoch": 0.7371845425867508, "grad_norm": 0.5764953582261776, "learning_rate": 1.8384003193008072e-05, "loss": 0.442, "step": 3739 }, { "epoch": 0.7373817034700315, "grad_norm": 0.5572119301548107, "learning_rate": 1.8383158242954296e-05, "loss": 0.4508, "step": 3740 }, { "epoch": 0.7375788643533123, "grad_norm": 0.5681528660063264, "learning_rate": 1.8382313091488385e-05, "loss": 0.4532, "step": 3741 }, { "epoch": 0.737776025236593, "grad_norm": 0.5891890950746687, "learning_rate": 1.8381467738630656e-05, "loss": 0.4669, "step": 3742 }, { "epoch": 0.7379731861198738, "grad_norm": 0.5049888382066734, "learning_rate": 1.8380622184401416e-05, "loss": 0.3829, "step": 3743 }, { "epoch": 0.7381703470031545, "grad_norm": 0.563208762050558, "learning_rate": 1.8379776428820974e-05, "loss": 0.4047, "step": 3744 }, { "epoch": 0.7383675078864353, "grad_norm": 0.5779457902661825, "learning_rate": 1.8378930471909658e-05, "loss": 0.4206, "step": 3745 }, { "epoch": 0.738564668769716, "grad_norm": 0.7774905699447151, "learning_rate": 1.837808431368779e-05, "loss": 0.4779, "step": 3746 }, { "epoch": 0.7387618296529969, "grad_norm": 0.5629107990200406, "learning_rate": 1.83772379541757e-05, "loss": 0.3925, "step": 3747 }, { "epoch": 0.7389589905362776, "grad_norm": 0.5970620392228986, "learning_rate": 1.837639139339372e-05, "loss": 0.4488, "step": 3748 }, { "epoch": 0.7391561514195584, "grad_norm": 0.5557125511475749, "learning_rate": 1.8375544631362195e-05, "loss": 0.452, "step": 3749 }, { "epoch": 0.7393533123028391, "grad_norm": 0.5872831543728793, "learning_rate": 1.8374697668101463e-05, "loss": 0.4594, "step": 3750 }, { "epoch": 0.7395504731861199, "grad_norm": 0.5286161720159008, "learning_rate": 1.8373850503631872e-05, "loss": 0.4322, "step": 3751 }, { "epoch": 0.7397476340694006, "grad_norm": 0.560014306046355, "learning_rate": 1.8373003137973783e-05, "loss": 0.4316, "step": 3752 }, { "epoch": 0.7399447949526814, "grad_norm": 0.5456631340877649, "learning_rate": 1.8372155571147554e-05, "loss": 0.4331, "step": 3753 }, { "epoch": 0.7401419558359621, "grad_norm": 0.5985522047951393, "learning_rate": 1.837130780317354e-05, "loss": 0.4628, "step": 3754 }, { "epoch": 0.7403391167192429, "grad_norm": 0.5672181573442395, "learning_rate": 1.8370459834072118e-05, "loss": 0.4454, "step": 3755 }, { "epoch": 0.7405362776025236, "grad_norm": 0.5635085742932182, "learning_rate": 1.8369611663863656e-05, "loss": 0.4223, "step": 3756 }, { "epoch": 0.7407334384858044, "grad_norm": 0.5873022605290665, "learning_rate": 1.8368763292568532e-05, "loss": 0.4478, "step": 3757 }, { "epoch": 0.7409305993690851, "grad_norm": 0.5346924744494808, "learning_rate": 1.8367914720207137e-05, "loss": 0.3994, "step": 3758 }, { "epoch": 0.7411277602523659, "grad_norm": 0.5791612680965824, "learning_rate": 1.8367065946799845e-05, "loss": 0.417, "step": 3759 }, { "epoch": 0.7413249211356467, "grad_norm": 0.5887526324679428, "learning_rate": 1.8366216972367058e-05, "loss": 0.4324, "step": 3760 }, { "epoch": 0.7415220820189274, "grad_norm": 0.631982631731162, "learning_rate": 1.836536779692917e-05, "loss": 0.4398, "step": 3761 }, { "epoch": 0.7417192429022083, "grad_norm": 0.5498266504156066, "learning_rate": 1.836451842050659e-05, "loss": 0.444, "step": 3762 }, { "epoch": 0.741916403785489, "grad_norm": 0.5248298760030489, "learning_rate": 1.8363668843119713e-05, "loss": 0.4397, "step": 3763 }, { "epoch": 0.7421135646687698, "grad_norm": 0.5574383621046671, "learning_rate": 1.8362819064788956e-05, "loss": 0.4321, "step": 3764 }, { "epoch": 0.7423107255520505, "grad_norm": 0.5509414305533834, "learning_rate": 1.8361969085534742e-05, "loss": 0.4449, "step": 3765 }, { "epoch": 0.7425078864353313, "grad_norm": 0.6218658695377531, "learning_rate": 1.8361118905377483e-05, "loss": 0.4698, "step": 3766 }, { "epoch": 0.742705047318612, "grad_norm": 0.6025705112487485, "learning_rate": 1.8360268524337606e-05, "loss": 0.4425, "step": 3767 }, { "epoch": 0.7429022082018928, "grad_norm": 0.5354499032310642, "learning_rate": 1.835941794243555e-05, "loss": 0.4212, "step": 3768 }, { "epoch": 0.7430993690851735, "grad_norm": 0.6198349365571671, "learning_rate": 1.8358567159691745e-05, "loss": 0.4173, "step": 3769 }, { "epoch": 0.7432965299684543, "grad_norm": 0.5633007693206912, "learning_rate": 1.8357716176126633e-05, "loss": 0.4407, "step": 3770 }, { "epoch": 0.743493690851735, "grad_norm": 0.6033616935263962, "learning_rate": 1.8356864991760658e-05, "loss": 0.4832, "step": 3771 }, { "epoch": 0.7436908517350158, "grad_norm": 0.5318673355760555, "learning_rate": 1.8356013606614277e-05, "loss": 0.4512, "step": 3772 }, { "epoch": 0.7438880126182965, "grad_norm": 0.9499333131372167, "learning_rate": 1.8355162020707932e-05, "loss": 0.4394, "step": 3773 }, { "epoch": 0.7440851735015773, "grad_norm": 0.5559579298686984, "learning_rate": 1.8354310234062097e-05, "loss": 0.4233, "step": 3774 }, { "epoch": 0.744282334384858, "grad_norm": 0.6229971755454574, "learning_rate": 1.835345824669723e-05, "loss": 0.44, "step": 3775 }, { "epoch": 0.7444794952681388, "grad_norm": 0.5923143947918056, "learning_rate": 1.83526060586338e-05, "loss": 0.4606, "step": 3776 }, { "epoch": 0.7446766561514195, "grad_norm": 0.5501204352607113, "learning_rate": 1.8351753669892284e-05, "loss": 0.4291, "step": 3777 }, { "epoch": 0.7448738170347003, "grad_norm": 0.6932256286950385, "learning_rate": 1.8350901080493158e-05, "loss": 0.461, "step": 3778 }, { "epoch": 0.745070977917981, "grad_norm": 0.6073398760332815, "learning_rate": 1.8350048290456912e-05, "loss": 0.4141, "step": 3779 }, { "epoch": 0.7452681388012619, "grad_norm": 0.5251923915616548, "learning_rate": 1.834919529980403e-05, "loss": 0.4481, "step": 3780 }, { "epoch": 0.7454652996845426, "grad_norm": 0.5442185285627984, "learning_rate": 1.8348342108555007e-05, "loss": 0.4026, "step": 3781 }, { "epoch": 0.7456624605678234, "grad_norm": 0.5693867434023069, "learning_rate": 1.8347488716730343e-05, "loss": 0.4311, "step": 3782 }, { "epoch": 0.7458596214511041, "grad_norm": 0.5938661250513164, "learning_rate": 1.834663512435054e-05, "loss": 0.4613, "step": 3783 }, { "epoch": 0.7460567823343849, "grad_norm": 0.5455676983214097, "learning_rate": 1.8345781331436106e-05, "loss": 0.4195, "step": 3784 }, { "epoch": 0.7462539432176656, "grad_norm": 0.5860282571566524, "learning_rate": 1.8344927338007554e-05, "loss": 0.4211, "step": 3785 }, { "epoch": 0.7464511041009464, "grad_norm": 0.5333294020645926, "learning_rate": 1.8344073144085406e-05, "loss": 0.4371, "step": 3786 }, { "epoch": 0.7466482649842271, "grad_norm": 0.5706728165649482, "learning_rate": 1.834321874969018e-05, "loss": 0.4389, "step": 3787 }, { "epoch": 0.7468454258675079, "grad_norm": 0.7074581360169672, "learning_rate": 1.8342364154842404e-05, "loss": 0.4937, "step": 3788 }, { "epoch": 0.7470425867507886, "grad_norm": 0.5778783270484186, "learning_rate": 1.8341509359562608e-05, "loss": 0.4377, "step": 3789 }, { "epoch": 0.7472397476340694, "grad_norm": 0.5270159539912247, "learning_rate": 1.8340654363871334e-05, "loss": 0.4117, "step": 3790 }, { "epoch": 0.7474369085173501, "grad_norm": 0.5223441256989405, "learning_rate": 1.8339799167789127e-05, "loss": 0.4471, "step": 3791 }, { "epoch": 0.7476340694006309, "grad_norm": 0.5781995373444279, "learning_rate": 1.8338943771336522e-05, "loss": 0.4378, "step": 3792 }, { "epoch": 0.7478312302839116, "grad_norm": 0.5634947765385621, "learning_rate": 1.8338088174534083e-05, "loss": 0.4454, "step": 3793 }, { "epoch": 0.7480283911671924, "grad_norm": 0.5059725150862037, "learning_rate": 1.833723237740236e-05, "loss": 0.4245, "step": 3794 }, { "epoch": 0.7482255520504731, "grad_norm": 0.5326325739466916, "learning_rate": 1.833637637996191e-05, "loss": 0.4425, "step": 3795 }, { "epoch": 0.748422712933754, "grad_norm": 0.5574154238044264, "learning_rate": 1.833552018223331e-05, "loss": 0.4492, "step": 3796 }, { "epoch": 0.7486198738170347, "grad_norm": 0.5402149077761322, "learning_rate": 1.8334663784237124e-05, "loss": 0.4006, "step": 3797 }, { "epoch": 0.7488170347003155, "grad_norm": 0.5791969213862481, "learning_rate": 1.8333807185993927e-05, "loss": 0.4434, "step": 3798 }, { "epoch": 0.7490141955835962, "grad_norm": 0.6018305196834619, "learning_rate": 1.8332950387524304e-05, "loss": 0.4588, "step": 3799 }, { "epoch": 0.749211356466877, "grad_norm": 0.6572227006369701, "learning_rate": 1.8332093388848836e-05, "loss": 0.446, "step": 3800 }, { "epoch": 0.7494085173501577, "grad_norm": 0.5684378740198406, "learning_rate": 1.8331236189988115e-05, "loss": 0.4071, "step": 3801 }, { "epoch": 0.7496056782334385, "grad_norm": 0.5338624607390893, "learning_rate": 1.8330378790962734e-05, "loss": 0.4635, "step": 3802 }, { "epoch": 0.7498028391167192, "grad_norm": 0.5649660572252639, "learning_rate": 1.8329521191793293e-05, "loss": 0.4277, "step": 3803 }, { "epoch": 0.75, "grad_norm": 0.5526706058734381, "learning_rate": 1.83286633925004e-05, "loss": 0.4404, "step": 3804 }, { "epoch": 0.75, "eval_loss": 0.44013890624046326, "eval_runtime": 343.9735, "eval_samples_per_second": 23.636, "eval_steps_per_second": 1.48, "step": 3804 }, { "epoch": 0.7501971608832808, "grad_norm": 0.6372546717506914, "learning_rate": 1.8327805393104658e-05, "loss": 0.4548, "step": 3805 }, { "epoch": 0.7503943217665615, "grad_norm": 0.5913842512169649, "learning_rate": 1.832694719362669e-05, "loss": 0.4518, "step": 3806 }, { "epoch": 0.7505914826498423, "grad_norm": 0.5826300542247199, "learning_rate": 1.8326088794087108e-05, "loss": 0.4818, "step": 3807 }, { "epoch": 0.750788643533123, "grad_norm": 0.5907815365065618, "learning_rate": 1.8325230194506538e-05, "loss": 0.4365, "step": 3808 }, { "epoch": 0.7509858044164038, "grad_norm": 0.5496356683694023, "learning_rate": 1.8324371394905606e-05, "loss": 0.4574, "step": 3809 }, { "epoch": 0.7511829652996845, "grad_norm": 0.5711613351725243, "learning_rate": 1.832351239530495e-05, "loss": 0.4584, "step": 3810 }, { "epoch": 0.7513801261829653, "grad_norm": 0.9200633659180221, "learning_rate": 1.8322653195725206e-05, "loss": 0.4781, "step": 3811 }, { "epoch": 0.751577287066246, "grad_norm": 0.6033375447437546, "learning_rate": 1.832179379618702e-05, "loss": 0.4412, "step": 3812 }, { "epoch": 0.7517744479495269, "grad_norm": 0.5171494431027918, "learning_rate": 1.832093419671103e-05, "loss": 0.4153, "step": 3813 }, { "epoch": 0.7519716088328076, "grad_norm": 0.7257919932365838, "learning_rate": 1.83200743973179e-05, "loss": 0.4529, "step": 3814 }, { "epoch": 0.7521687697160884, "grad_norm": 0.6164155494736133, "learning_rate": 1.831921439802828e-05, "loss": 0.4239, "step": 3815 }, { "epoch": 0.7523659305993691, "grad_norm": 0.5742726759356669, "learning_rate": 1.8318354198862836e-05, "loss": 0.4595, "step": 3816 }, { "epoch": 0.7525630914826499, "grad_norm": 0.5364358313603149, "learning_rate": 1.831749379984223e-05, "loss": 0.3946, "step": 3817 }, { "epoch": 0.7527602523659306, "grad_norm": 0.5931144322460239, "learning_rate": 1.8316633200987143e-05, "loss": 0.4723, "step": 3818 }, { "epoch": 0.7529574132492114, "grad_norm": 0.6631211961273565, "learning_rate": 1.8315772402318243e-05, "loss": 0.5013, "step": 3819 }, { "epoch": 0.7531545741324921, "grad_norm": 0.555817939093465, "learning_rate": 1.8314911403856212e-05, "loss": 0.4234, "step": 3820 }, { "epoch": 0.7533517350157729, "grad_norm": 0.5699005102970771, "learning_rate": 1.8314050205621742e-05, "loss": 0.4629, "step": 3821 }, { "epoch": 0.7535488958990536, "grad_norm": 0.5456220237338646, "learning_rate": 1.831318880763552e-05, "loss": 0.4227, "step": 3822 }, { "epoch": 0.7537460567823344, "grad_norm": 0.564506446004603, "learning_rate": 1.8312327209918242e-05, "loss": 0.4478, "step": 3823 }, { "epoch": 0.7539432176656151, "grad_norm": 0.6007442236000919, "learning_rate": 1.831146541249061e-05, "loss": 0.4404, "step": 3824 }, { "epoch": 0.7541403785488959, "grad_norm": 0.5756483770605955, "learning_rate": 1.8310603415373328e-05, "loss": 0.436, "step": 3825 }, { "epoch": 0.7543375394321766, "grad_norm": 0.5730763258268717, "learning_rate": 1.8309741218587102e-05, "loss": 0.4509, "step": 3826 }, { "epoch": 0.7545347003154574, "grad_norm": 0.573198374198784, "learning_rate": 1.8308878822152655e-05, "loss": 0.437, "step": 3827 }, { "epoch": 0.7547318611987381, "grad_norm": 0.5616109549442615, "learning_rate": 1.8308016226090704e-05, "loss": 0.4439, "step": 3828 }, { "epoch": 0.754929022082019, "grad_norm": 0.5895791098794968, "learning_rate": 1.8307153430421972e-05, "loss": 0.4744, "step": 3829 }, { "epoch": 0.7551261829652997, "grad_norm": 0.5565201237486335, "learning_rate": 1.830629043516719e-05, "loss": 0.4657, "step": 3830 }, { "epoch": 0.7553233438485805, "grad_norm": 0.5061359477774192, "learning_rate": 1.830542724034709e-05, "loss": 0.4204, "step": 3831 }, { "epoch": 0.7555205047318612, "grad_norm": 0.5660890953859956, "learning_rate": 1.8304563845982413e-05, "loss": 0.4396, "step": 3832 }, { "epoch": 0.755717665615142, "grad_norm": 0.5503977078210044, "learning_rate": 1.83037002520939e-05, "loss": 0.4393, "step": 3833 }, { "epoch": 0.7559148264984227, "grad_norm": 0.6550111174917882, "learning_rate": 1.8302836458702302e-05, "loss": 0.4523, "step": 3834 }, { "epoch": 0.7561119873817035, "grad_norm": 2.619858501191366, "learning_rate": 1.8301972465828373e-05, "loss": 0.469, "step": 3835 }, { "epoch": 0.7563091482649842, "grad_norm": 0.6235970138061254, "learning_rate": 1.830110827349287e-05, "loss": 0.4333, "step": 3836 }, { "epoch": 0.756506309148265, "grad_norm": 0.532577589765572, "learning_rate": 1.8300243881716553e-05, "loss": 0.4646, "step": 3837 }, { "epoch": 0.7567034700315457, "grad_norm": 0.6198914137370217, "learning_rate": 1.8299379290520197e-05, "loss": 0.4148, "step": 3838 }, { "epoch": 0.7569006309148265, "grad_norm": 0.5916239893137991, "learning_rate": 1.8298514499924567e-05, "loss": 0.4566, "step": 3839 }, { "epoch": 0.7570977917981072, "grad_norm": 0.5602675170057476, "learning_rate": 1.8297649509950446e-05, "loss": 0.4242, "step": 3840 }, { "epoch": 0.757294952681388, "grad_norm": 0.493735580049777, "learning_rate": 1.829678432061861e-05, "loss": 0.3774, "step": 3841 }, { "epoch": 0.7574921135646687, "grad_norm": 0.5400394381153409, "learning_rate": 1.829591893194985e-05, "loss": 0.4206, "step": 3842 }, { "epoch": 0.7576892744479495, "grad_norm": 0.5643283428425839, "learning_rate": 1.829505334396496e-05, "loss": 0.4571, "step": 3843 }, { "epoch": 0.7578864353312302, "grad_norm": 0.5377612017847214, "learning_rate": 1.8294187556684733e-05, "loss": 0.4133, "step": 3844 }, { "epoch": 0.758083596214511, "grad_norm": 0.5497122296347786, "learning_rate": 1.829332157012997e-05, "loss": 0.4332, "step": 3845 }, { "epoch": 0.7582807570977917, "grad_norm": 0.5849200039580923, "learning_rate": 1.8292455384321476e-05, "loss": 0.4284, "step": 3846 }, { "epoch": 0.7584779179810726, "grad_norm": 0.5543841134301215, "learning_rate": 1.8291588999280065e-05, "loss": 0.4532, "step": 3847 }, { "epoch": 0.7586750788643533, "grad_norm": 0.5615983003588527, "learning_rate": 1.8290722415026548e-05, "loss": 0.4302, "step": 3848 }, { "epoch": 0.7588722397476341, "grad_norm": 0.551746761958415, "learning_rate": 1.828985563158175e-05, "loss": 0.3985, "step": 3849 }, { "epoch": 0.7590694006309149, "grad_norm": 0.6412908877953788, "learning_rate": 1.8288988648966498e-05, "loss": 0.415, "step": 3850 }, { "epoch": 0.7592665615141956, "grad_norm": 0.5308071077261949, "learning_rate": 1.8288121467201615e-05, "loss": 0.4216, "step": 3851 }, { "epoch": 0.7594637223974764, "grad_norm": 0.5623371567129707, "learning_rate": 1.8287254086307942e-05, "loss": 0.4205, "step": 3852 }, { "epoch": 0.7596608832807571, "grad_norm": 0.6609929780062805, "learning_rate": 1.8286386506306314e-05, "loss": 0.4483, "step": 3853 }, { "epoch": 0.7598580441640379, "grad_norm": 0.5571621496040055, "learning_rate": 1.8285518727217578e-05, "loss": 0.3971, "step": 3854 }, { "epoch": 0.7600552050473186, "grad_norm": 0.6151096972545813, "learning_rate": 1.8284650749062583e-05, "loss": 0.4393, "step": 3855 }, { "epoch": 0.7602523659305994, "grad_norm": 0.565063463574547, "learning_rate": 1.8283782571862182e-05, "loss": 0.4521, "step": 3856 }, { "epoch": 0.7604495268138801, "grad_norm": 0.5410714930908408, "learning_rate": 1.828291419563723e-05, "loss": 0.4147, "step": 3857 }, { "epoch": 0.7606466876971609, "grad_norm": 0.5396095207165513, "learning_rate": 1.8282045620408596e-05, "loss": 0.4591, "step": 3858 }, { "epoch": 0.7608438485804416, "grad_norm": 0.6397377487608418, "learning_rate": 1.828117684619715e-05, "loss": 0.4952, "step": 3859 }, { "epoch": 0.7610410094637224, "grad_norm": 0.5360037148692628, "learning_rate": 1.8280307873023758e-05, "loss": 0.4169, "step": 3860 }, { "epoch": 0.7612381703470031, "grad_norm": 0.5634653164334658, "learning_rate": 1.8279438700909305e-05, "loss": 0.4209, "step": 3861 }, { "epoch": 0.761435331230284, "grad_norm": 0.5642735061715564, "learning_rate": 1.8278569329874667e-05, "loss": 0.4156, "step": 3862 }, { "epoch": 0.7616324921135647, "grad_norm": 0.5346697867413164, "learning_rate": 1.8277699759940732e-05, "loss": 0.4157, "step": 3863 }, { "epoch": 0.7618296529968455, "grad_norm": 0.5458496115616259, "learning_rate": 1.8276829991128397e-05, "loss": 0.4412, "step": 3864 }, { "epoch": 0.7620268138801262, "grad_norm": 0.542780014402022, "learning_rate": 1.8275960023458554e-05, "loss": 0.4051, "step": 3865 }, { "epoch": 0.762223974763407, "grad_norm": 0.5179693655492266, "learning_rate": 1.827508985695211e-05, "loss": 0.4127, "step": 3866 }, { "epoch": 0.7624211356466877, "grad_norm": 0.5553660779995085, "learning_rate": 1.8274219491629965e-05, "loss": 0.4401, "step": 3867 }, { "epoch": 0.7626182965299685, "grad_norm": 0.5712424475051656, "learning_rate": 1.827334892751304e-05, "loss": 0.4267, "step": 3868 }, { "epoch": 0.7628154574132492, "grad_norm": 0.5521939908028256, "learning_rate": 1.8272478164622237e-05, "loss": 0.4525, "step": 3869 }, { "epoch": 0.76301261829653, "grad_norm": 0.6059666034435798, "learning_rate": 1.827160720297849e-05, "loss": 0.4352, "step": 3870 }, { "epoch": 0.7632097791798107, "grad_norm": 0.5568390943861937, "learning_rate": 1.827073604260271e-05, "loss": 0.4435, "step": 3871 }, { "epoch": 0.7634069400630915, "grad_norm": 0.6154734211328192, "learning_rate": 1.8269864683515847e-05, "loss": 0.478, "step": 3872 }, { "epoch": 0.7636041009463722, "grad_norm": 0.6336972990527457, "learning_rate": 1.8268993125738817e-05, "loss": 0.488, "step": 3873 }, { "epoch": 0.763801261829653, "grad_norm": 0.5158784852798877, "learning_rate": 1.826812136929257e-05, "loss": 0.3997, "step": 3874 }, { "epoch": 0.7639984227129337, "grad_norm": 0.5545942044525611, "learning_rate": 1.8267249414198055e-05, "loss": 0.4042, "step": 3875 }, { "epoch": 0.7641955835962145, "grad_norm": 0.7691672609377771, "learning_rate": 1.8266377260476206e-05, "loss": 0.4804, "step": 3876 }, { "epoch": 0.7643927444794952, "grad_norm": 0.607087584955189, "learning_rate": 1.826550490814799e-05, "loss": 0.4558, "step": 3877 }, { "epoch": 0.764589905362776, "grad_norm": 0.62388863749188, "learning_rate": 1.8264632357234366e-05, "loss": 0.4604, "step": 3878 }, { "epoch": 0.7647870662460567, "grad_norm": 0.5639061589375046, "learning_rate": 1.826375960775629e-05, "loss": 0.4197, "step": 3879 }, { "epoch": 0.7649842271293376, "grad_norm": 0.575114173679805, "learning_rate": 1.8262886659734738e-05, "loss": 0.466, "step": 3880 }, { "epoch": 0.7651813880126183, "grad_norm": 1.0686366669808436, "learning_rate": 1.8262013513190677e-05, "loss": 0.4063, "step": 3881 }, { "epoch": 0.7653785488958991, "grad_norm": 0.6064570679491103, "learning_rate": 1.826114016814509e-05, "loss": 0.3918, "step": 3882 }, { "epoch": 0.7655757097791798, "grad_norm": 0.5647896972824821, "learning_rate": 1.8260266624618957e-05, "loss": 0.4329, "step": 3883 }, { "epoch": 0.7657728706624606, "grad_norm": 0.6946858259139369, "learning_rate": 1.8259392882633266e-05, "loss": 0.4704, "step": 3884 }, { "epoch": 0.7659700315457413, "grad_norm": 0.5720833541883139, "learning_rate": 1.825851894220901e-05, "loss": 0.4309, "step": 3885 }, { "epoch": 0.7661671924290221, "grad_norm": 0.5967274277243448, "learning_rate": 1.8257644803367186e-05, "loss": 0.4341, "step": 3886 }, { "epoch": 0.7663643533123028, "grad_norm": 0.5985333908745908, "learning_rate": 1.8256770466128793e-05, "loss": 0.4523, "step": 3887 }, { "epoch": 0.7665615141955836, "grad_norm": 0.5560392931956263, "learning_rate": 1.8255895930514843e-05, "loss": 0.4228, "step": 3888 }, { "epoch": 0.7667586750788643, "grad_norm": 0.5832994995378911, "learning_rate": 1.8255021196546346e-05, "loss": 0.4299, "step": 3889 }, { "epoch": 0.7669558359621451, "grad_norm": 1.4002961334330222, "learning_rate": 1.8254146264244316e-05, "loss": 0.4941, "step": 3890 }, { "epoch": 0.7671529968454258, "grad_norm": 0.5534367003995347, "learning_rate": 1.8253271133629775e-05, "loss": 0.4185, "step": 3891 }, { "epoch": 0.7673501577287066, "grad_norm": 0.5935042775545306, "learning_rate": 1.8252395804723744e-05, "loss": 0.4807, "step": 3892 }, { "epoch": 0.7675473186119873, "grad_norm": 0.8877026134448658, "learning_rate": 1.8251520277547267e-05, "loss": 0.4268, "step": 3893 }, { "epoch": 0.7677444794952681, "grad_norm": 1.2517353246924268, "learning_rate": 1.8250644552121362e-05, "loss": 0.4519, "step": 3894 }, { "epoch": 0.767941640378549, "grad_norm": 0.5929121870930241, "learning_rate": 1.8249768628467085e-05, "loss": 0.4413, "step": 3895 }, { "epoch": 0.7681388012618297, "grad_norm": 0.5782475207334447, "learning_rate": 1.8248892506605468e-05, "loss": 0.4157, "step": 3896 }, { "epoch": 0.7683359621451105, "grad_norm": 0.6117468432248259, "learning_rate": 1.8248016186557566e-05, "loss": 0.4333, "step": 3897 }, { "epoch": 0.7685331230283912, "grad_norm": 0.5927409035058885, "learning_rate": 1.8247139668344432e-05, "loss": 0.4869, "step": 3898 }, { "epoch": 0.768730283911672, "grad_norm": 1.158455927574009, "learning_rate": 1.824626295198713e-05, "loss": 0.4575, "step": 3899 }, { "epoch": 0.7689274447949527, "grad_norm": 0.8392945391726799, "learning_rate": 1.824538603750672e-05, "loss": 0.4607, "step": 3900 }, { "epoch": 0.7691246056782335, "grad_norm": 0.5998059892020197, "learning_rate": 1.824450892492427e-05, "loss": 0.4594, "step": 3901 }, { "epoch": 0.7693217665615142, "grad_norm": 0.5695316389310182, "learning_rate": 1.824363161426085e-05, "loss": 0.4175, "step": 3902 }, { "epoch": 0.769518927444795, "grad_norm": 0.6240626330754476, "learning_rate": 1.8242754105537542e-05, "loss": 0.454, "step": 3903 }, { "epoch": 0.7697160883280757, "grad_norm": 0.5912514018969862, "learning_rate": 1.8241876398775434e-05, "loss": 0.4311, "step": 3904 }, { "epoch": 0.7699132492113565, "grad_norm": 0.6624090571377835, "learning_rate": 1.8240998493995607e-05, "loss": 0.4719, "step": 3905 }, { "epoch": 0.7701104100946372, "grad_norm": 0.9203541875889368, "learning_rate": 1.8240120391219148e-05, "loss": 0.4803, "step": 3906 }, { "epoch": 0.770307570977918, "grad_norm": 0.5914744368657348, "learning_rate": 1.823924209046717e-05, "loss": 0.418, "step": 3907 }, { "epoch": 0.7705047318611987, "grad_norm": 0.6070617757379179, "learning_rate": 1.8238363591760758e-05, "loss": 0.4429, "step": 3908 }, { "epoch": 0.7707018927444795, "grad_norm": 0.6325247392458869, "learning_rate": 1.8237484895121033e-05, "loss": 0.4688, "step": 3909 }, { "epoch": 0.7708990536277602, "grad_norm": 0.573852165719456, "learning_rate": 1.8236606000569095e-05, "loss": 0.4126, "step": 3910 }, { "epoch": 0.771096214511041, "grad_norm": 0.5567521119492211, "learning_rate": 1.823572690812607e-05, "loss": 0.4415, "step": 3911 }, { "epoch": 0.7712933753943217, "grad_norm": 0.6224892204399546, "learning_rate": 1.8234847617813067e-05, "loss": 0.4473, "step": 3912 }, { "epoch": 0.7714905362776026, "grad_norm": 0.6241807022753024, "learning_rate": 1.823396812965122e-05, "loss": 0.4632, "step": 3913 }, { "epoch": 0.7716876971608833, "grad_norm": 0.564677599383395, "learning_rate": 1.8233088443661665e-05, "loss": 0.4242, "step": 3914 }, { "epoch": 0.7718848580441641, "grad_norm": 0.5882925220136831, "learning_rate": 1.8232208559865522e-05, "loss": 0.431, "step": 3915 }, { "epoch": 0.7720820189274448, "grad_norm": 0.574227496801443, "learning_rate": 1.823132847828394e-05, "loss": 0.451, "step": 3916 }, { "epoch": 0.7722791798107256, "grad_norm": 0.5222564423269557, "learning_rate": 1.8230448198938067e-05, "loss": 0.4296, "step": 3917 }, { "epoch": 0.7724763406940063, "grad_norm": 0.57083859662196, "learning_rate": 1.8229567721849046e-05, "loss": 0.4257, "step": 3918 }, { "epoch": 0.7726735015772871, "grad_norm": 1.6592668807628246, "learning_rate": 1.822868704703803e-05, "loss": 0.4921, "step": 3919 }, { "epoch": 0.7728706624605678, "grad_norm": 0.6157770095064646, "learning_rate": 1.8227806174526187e-05, "loss": 0.4314, "step": 3920 }, { "epoch": 0.7730678233438486, "grad_norm": 0.5852063956961934, "learning_rate": 1.822692510433467e-05, "loss": 0.4654, "step": 3921 }, { "epoch": 0.7732649842271293, "grad_norm": 0.5866034997283565, "learning_rate": 1.8226043836484655e-05, "loss": 0.4346, "step": 3922 }, { "epoch": 0.7734621451104101, "grad_norm": 0.5510812356437924, "learning_rate": 1.8225162370997313e-05, "loss": 0.4801, "step": 3923 }, { "epoch": 0.7736593059936908, "grad_norm": 0.5472377435499002, "learning_rate": 1.822428070789382e-05, "loss": 0.4502, "step": 3924 }, { "epoch": 0.7738564668769716, "grad_norm": 0.5620895652186938, "learning_rate": 1.8223398847195358e-05, "loss": 0.4428, "step": 3925 }, { "epoch": 0.7740536277602523, "grad_norm": 0.5573421453852319, "learning_rate": 1.822251678892312e-05, "loss": 0.4397, "step": 3926 }, { "epoch": 0.7742507886435331, "grad_norm": 0.5756487306350889, "learning_rate": 1.822163453309829e-05, "loss": 0.4255, "step": 3927 }, { "epoch": 0.7744479495268138, "grad_norm": 0.5374818660181303, "learning_rate": 1.8220752079742072e-05, "loss": 0.4208, "step": 3928 }, { "epoch": 0.7746451104100947, "grad_norm": 0.5294345547402046, "learning_rate": 1.8219869428875668e-05, "loss": 0.4217, "step": 3929 }, { "epoch": 0.7748422712933754, "grad_norm": 0.56730405884325, "learning_rate": 1.8218986580520276e-05, "loss": 0.4768, "step": 3930 }, { "epoch": 0.7750394321766562, "grad_norm": 0.5604501496661701, "learning_rate": 1.8218103534697116e-05, "loss": 0.4596, "step": 3931 }, { "epoch": 0.7752365930599369, "grad_norm": 0.5579910747422517, "learning_rate": 1.8217220291427398e-05, "loss": 0.4527, "step": 3932 }, { "epoch": 0.7754337539432177, "grad_norm": 0.5270550522697447, "learning_rate": 1.821633685073235e-05, "loss": 0.4559, "step": 3933 }, { "epoch": 0.7756309148264984, "grad_norm": 0.5923687114112185, "learning_rate": 1.8215453212633188e-05, "loss": 0.4548, "step": 3934 }, { "epoch": 0.7758280757097792, "grad_norm": 0.6303703716242776, "learning_rate": 1.821456937715115e-05, "loss": 0.5182, "step": 3935 }, { "epoch": 0.7760252365930599, "grad_norm": 0.5753784721230842, "learning_rate": 1.8213685344307465e-05, "loss": 0.4475, "step": 3936 }, { "epoch": 0.7762223974763407, "grad_norm": 0.7373579959797398, "learning_rate": 1.8212801114123377e-05, "loss": 0.4316, "step": 3937 }, { "epoch": 0.7764195583596214, "grad_norm": 0.5762319703064297, "learning_rate": 1.8211916686620128e-05, "loss": 0.4546, "step": 3938 }, { "epoch": 0.7766167192429022, "grad_norm": 0.5920585040131835, "learning_rate": 1.8211032061818968e-05, "loss": 0.4471, "step": 3939 }, { "epoch": 0.776813880126183, "grad_norm": 0.6803659907474547, "learning_rate": 1.8210147239741148e-05, "loss": 0.4755, "step": 3940 }, { "epoch": 0.7770110410094637, "grad_norm": 0.5509653813630704, "learning_rate": 1.8209262220407932e-05, "loss": 0.4586, "step": 3941 }, { "epoch": 0.7772082018927445, "grad_norm": 0.5581868920790423, "learning_rate": 1.820837700384058e-05, "loss": 0.4219, "step": 3942 }, { "epoch": 0.7774053627760252, "grad_norm": 0.5592867285417933, "learning_rate": 1.8207491590060356e-05, "loss": 0.4193, "step": 3943 }, { "epoch": 0.777602523659306, "grad_norm": 0.6501883656371863, "learning_rate": 1.8206605979088545e-05, "loss": 0.4511, "step": 3944 }, { "epoch": 0.7777996845425867, "grad_norm": 0.5572979380305522, "learning_rate": 1.820572017094641e-05, "loss": 0.4403, "step": 3945 }, { "epoch": 0.7779968454258676, "grad_norm": 0.6146644203648315, "learning_rate": 1.8204834165655242e-05, "loss": 0.4852, "step": 3946 }, { "epoch": 0.7781940063091483, "grad_norm": 0.5413773415764146, "learning_rate": 1.8203947963236322e-05, "loss": 0.4437, "step": 3947 }, { "epoch": 0.7783911671924291, "grad_norm": 0.5605314804048697, "learning_rate": 1.8203061563710952e-05, "loss": 0.4294, "step": 3948 }, { "epoch": 0.7785883280757098, "grad_norm": 0.5316072340317646, "learning_rate": 1.820217496710042e-05, "loss": 0.4243, "step": 3949 }, { "epoch": 0.7787854889589906, "grad_norm": 0.6492198435447817, "learning_rate": 1.8201288173426027e-05, "loss": 0.4659, "step": 3950 }, { "epoch": 0.7789826498422713, "grad_norm": 0.5713530844391045, "learning_rate": 1.820040118270908e-05, "loss": 0.4579, "step": 3951 }, { "epoch": 0.7791798107255521, "grad_norm": 0.5363420111437506, "learning_rate": 1.8199513994970893e-05, "loss": 0.425, "step": 3952 }, { "epoch": 0.7793769716088328, "grad_norm": 0.5542539770275866, "learning_rate": 1.819862661023278e-05, "loss": 0.4234, "step": 3953 }, { "epoch": 0.7795741324921136, "grad_norm": 0.5285872082909737, "learning_rate": 1.8197739028516062e-05, "loss": 0.4238, "step": 3954 }, { "epoch": 0.7797712933753943, "grad_norm": 0.5456326663863709, "learning_rate": 1.819685124984206e-05, "loss": 0.4301, "step": 3955 }, { "epoch": 0.7799684542586751, "grad_norm": 0.5664354597499723, "learning_rate": 1.8195963274232106e-05, "loss": 0.4555, "step": 3956 }, { "epoch": 0.7801656151419558, "grad_norm": 0.5315491155568633, "learning_rate": 1.8195075101707535e-05, "loss": 0.4272, "step": 3957 }, { "epoch": 0.7803627760252366, "grad_norm": 0.8663503651895043, "learning_rate": 1.819418673228968e-05, "loss": 0.4816, "step": 3958 }, { "epoch": 0.7805599369085173, "grad_norm": 0.5526623269158064, "learning_rate": 1.8193298165999896e-05, "loss": 0.462, "step": 3959 }, { "epoch": 0.7807570977917981, "grad_norm": 0.5973908735760364, "learning_rate": 1.8192409402859526e-05, "loss": 0.4452, "step": 3960 }, { "epoch": 0.7809542586750788, "grad_norm": 0.5521161874463199, "learning_rate": 1.819152044288992e-05, "loss": 0.4203, "step": 3961 }, { "epoch": 0.7811514195583596, "grad_norm": 0.6040478645836748, "learning_rate": 1.819063128611244e-05, "loss": 0.4598, "step": 3962 }, { "epoch": 0.7813485804416404, "grad_norm": 0.5454361035093315, "learning_rate": 1.8189741932548447e-05, "loss": 0.426, "step": 3963 }, { "epoch": 0.7815457413249212, "grad_norm": 0.6321003848230068, "learning_rate": 1.8188852382219308e-05, "loss": 0.4685, "step": 3964 }, { "epoch": 0.7817429022082019, "grad_norm": 0.6137933316061781, "learning_rate": 1.8187962635146397e-05, "loss": 0.4376, "step": 3965 }, { "epoch": 0.7819400630914827, "grad_norm": 0.5396185891653529, "learning_rate": 1.8187072691351088e-05, "loss": 0.421, "step": 3966 }, { "epoch": 0.7821372239747634, "grad_norm": 0.6005283575463105, "learning_rate": 1.8186182550854768e-05, "loss": 0.4782, "step": 3967 }, { "epoch": 0.7823343848580442, "grad_norm": 0.516827483855773, "learning_rate": 1.8185292213678818e-05, "loss": 0.4034, "step": 3968 }, { "epoch": 0.7825315457413249, "grad_norm": 0.5753697155306382, "learning_rate": 1.818440167984463e-05, "loss": 0.4585, "step": 3969 }, { "epoch": 0.7827287066246057, "grad_norm": 0.5542615113432319, "learning_rate": 1.8183510949373603e-05, "loss": 0.4504, "step": 3970 }, { "epoch": 0.7829258675078864, "grad_norm": 0.5333879318693766, "learning_rate": 1.8182620022287133e-05, "loss": 0.4213, "step": 3971 }, { "epoch": 0.7831230283911672, "grad_norm": 0.6072385415952638, "learning_rate": 1.8181728898606628e-05, "loss": 0.4395, "step": 3972 }, { "epoch": 0.7833201892744479, "grad_norm": 0.543662857256321, "learning_rate": 1.81808375783535e-05, "loss": 0.4385, "step": 3973 }, { "epoch": 0.7835173501577287, "grad_norm": 5.542609848424303, "learning_rate": 1.817994606154916e-05, "loss": 0.5795, "step": 3974 }, { "epoch": 0.7837145110410094, "grad_norm": 0.5948140810991429, "learning_rate": 1.8179054348215025e-05, "loss": 0.4474, "step": 3975 }, { "epoch": 0.7839116719242902, "grad_norm": 0.519423366632737, "learning_rate": 1.8178162438372528e-05, "loss": 0.4145, "step": 3976 }, { "epoch": 0.7841088328075709, "grad_norm": 0.5902188567847949, "learning_rate": 1.817727033204309e-05, "loss": 0.4315, "step": 3977 }, { "epoch": 0.7843059936908517, "grad_norm": 0.5948977039074232, "learning_rate": 1.8176378029248147e-05, "loss": 0.458, "step": 3978 }, { "epoch": 0.7845031545741324, "grad_norm": 0.5611639183104089, "learning_rate": 1.8175485530009137e-05, "loss": 0.4531, "step": 3979 }, { "epoch": 0.7847003154574133, "grad_norm": 1.0876807024291277, "learning_rate": 1.8174592834347503e-05, "loss": 0.4904, "step": 3980 }, { "epoch": 0.784897476340694, "grad_norm": 0.5358410389890251, "learning_rate": 1.8173699942284695e-05, "loss": 0.4228, "step": 3981 }, { "epoch": 0.7850946372239748, "grad_norm": 0.5943472824942584, "learning_rate": 1.8172806853842163e-05, "loss": 0.4777, "step": 3982 }, { "epoch": 0.7852917981072555, "grad_norm": 0.5829558673091376, "learning_rate": 1.8171913569041362e-05, "loss": 0.4223, "step": 3983 }, { "epoch": 0.7854889589905363, "grad_norm": 0.5306668965862121, "learning_rate": 1.8171020087903762e-05, "loss": 0.4089, "step": 3984 }, { "epoch": 0.785686119873817, "grad_norm": 0.5451241305750418, "learning_rate": 1.8170126410450823e-05, "loss": 0.4361, "step": 3985 }, { "epoch": 0.7858832807570978, "grad_norm": 0.5469353788262228, "learning_rate": 1.8169232536704012e-05, "loss": 0.4297, "step": 3986 }, { "epoch": 0.7860804416403786, "grad_norm": 0.7105261028282002, "learning_rate": 1.8168338466684817e-05, "loss": 0.4311, "step": 3987 }, { "epoch": 0.7862776025236593, "grad_norm": 0.5989580178979008, "learning_rate": 1.816744420041471e-05, "loss": 0.468, "step": 3988 }, { "epoch": 0.7864747634069401, "grad_norm": 0.545707298003204, "learning_rate": 1.816654973791518e-05, "loss": 0.4527, "step": 3989 }, { "epoch": 0.7866719242902208, "grad_norm": 0.6273199078420786, "learning_rate": 1.8165655079207716e-05, "loss": 0.449, "step": 3990 }, { "epoch": 0.7868690851735016, "grad_norm": 0.5611837255166214, "learning_rate": 1.816476022431381e-05, "loss": 0.4656, "step": 3991 }, { "epoch": 0.7870662460567823, "grad_norm": 0.5518444727001759, "learning_rate": 1.816386517325497e-05, "loss": 0.4284, "step": 3992 }, { "epoch": 0.7872634069400631, "grad_norm": 0.5892275816171331, "learning_rate": 1.816296992605269e-05, "loss": 0.4539, "step": 3993 }, { "epoch": 0.7874605678233438, "grad_norm": 0.5254195560267715, "learning_rate": 1.8162074482728487e-05, "loss": 0.4565, "step": 3994 }, { "epoch": 0.7876577287066246, "grad_norm": 0.5448949680828074, "learning_rate": 1.816117884330387e-05, "loss": 0.4178, "step": 3995 }, { "epoch": 0.7878548895899053, "grad_norm": 0.5746576844950827, "learning_rate": 1.816028300780036e-05, "loss": 0.4735, "step": 3996 }, { "epoch": 0.7880520504731862, "grad_norm": 0.537017218341343, "learning_rate": 1.8159386976239478e-05, "loss": 0.4047, "step": 3997 }, { "epoch": 0.7882492113564669, "grad_norm": 0.6354220762186988, "learning_rate": 1.815849074864275e-05, "loss": 0.4155, "step": 3998 }, { "epoch": 0.7884463722397477, "grad_norm": 0.578467197061079, "learning_rate": 1.8157594325031716e-05, "loss": 0.4183, "step": 3999 }, { "epoch": 0.7886435331230284, "grad_norm": 0.6475745820310453, "learning_rate": 1.8156697705427907e-05, "loss": 0.4209, "step": 4000 }, { "epoch": 0.7888406940063092, "grad_norm": 0.6157428470516835, "learning_rate": 1.815580088985287e-05, "loss": 0.4531, "step": 4001 }, { "epoch": 0.7890378548895899, "grad_norm": 0.5998799236699729, "learning_rate": 1.815490387832815e-05, "loss": 0.4751, "step": 4002 }, { "epoch": 0.7892350157728707, "grad_norm": 0.5551797172113444, "learning_rate": 1.8154006670875294e-05, "loss": 0.444, "step": 4003 }, { "epoch": 0.7894321766561514, "grad_norm": 0.7557929675891345, "learning_rate": 1.815310926751586e-05, "loss": 0.433, "step": 4004 }, { "epoch": 0.7896293375394322, "grad_norm": 3.8174090731821937, "learning_rate": 1.8152211668271413e-05, "loss": 0.428, "step": 4005 }, { "epoch": 0.7898264984227129, "grad_norm": 0.7034780817377414, "learning_rate": 1.8151313873163513e-05, "loss": 0.417, "step": 4006 }, { "epoch": 0.7900236593059937, "grad_norm": 0.5926550848858589, "learning_rate": 1.8150415882213735e-05, "loss": 0.4756, "step": 4007 }, { "epoch": 0.7902208201892744, "grad_norm": 0.6888172885215963, "learning_rate": 1.814951769544365e-05, "loss": 0.4438, "step": 4008 }, { "epoch": 0.7904179810725552, "grad_norm": 0.5290234268356881, "learning_rate": 1.8148619312874844e-05, "loss": 0.4186, "step": 4009 }, { "epoch": 0.7906151419558359, "grad_norm": 0.7662700497693185, "learning_rate": 1.8147720734528893e-05, "loss": 0.4871, "step": 4010 }, { "epoch": 0.7908123028391167, "grad_norm": 0.5957613132464488, "learning_rate": 1.814682196042739e-05, "loss": 0.4572, "step": 4011 }, { "epoch": 0.7910094637223974, "grad_norm": 0.6866807817706564, "learning_rate": 1.8145922990591932e-05, "loss": 0.4296, "step": 4012 }, { "epoch": 0.7912066246056783, "grad_norm": 0.5397278268058946, "learning_rate": 1.8145023825044114e-05, "loss": 0.4247, "step": 4013 }, { "epoch": 0.791403785488959, "grad_norm": 0.7343980490854036, "learning_rate": 1.8144124463805535e-05, "loss": 0.4587, "step": 4014 }, { "epoch": 0.7916009463722398, "grad_norm": 0.5376919265100979, "learning_rate": 1.8143224906897812e-05, "loss": 0.4111, "step": 4015 }, { "epoch": 0.7917981072555205, "grad_norm": 0.633488416178425, "learning_rate": 1.814232515434255e-05, "loss": 0.4592, "step": 4016 }, { "epoch": 0.7919952681388013, "grad_norm": 0.5283179674486278, "learning_rate": 1.814142520616137e-05, "loss": 0.4288, "step": 4017 }, { "epoch": 0.792192429022082, "grad_norm": 0.5780943767126454, "learning_rate": 1.8140525062375894e-05, "loss": 0.4239, "step": 4018 }, { "epoch": 0.7923895899053628, "grad_norm": 0.5489901267012213, "learning_rate": 1.8139624723007748e-05, "loss": 0.423, "step": 4019 }, { "epoch": 0.7925867507886435, "grad_norm": 0.5878569214560349, "learning_rate": 1.813872418807856e-05, "loss": 0.4265, "step": 4020 }, { "epoch": 0.7927839116719243, "grad_norm": 0.5737545069149199, "learning_rate": 1.813782345760997e-05, "loss": 0.4491, "step": 4021 }, { "epoch": 0.792981072555205, "grad_norm": 0.823990029369412, "learning_rate": 1.813692253162362e-05, "loss": 0.4082, "step": 4022 }, { "epoch": 0.7931782334384858, "grad_norm": 0.5299434682153719, "learning_rate": 1.8136021410141154e-05, "loss": 0.3884, "step": 4023 }, { "epoch": 0.7933753943217665, "grad_norm": 0.551899978074941, "learning_rate": 1.813512009318422e-05, "loss": 0.4369, "step": 4024 }, { "epoch": 0.7935725552050473, "grad_norm": 8.176889365950668, "learning_rate": 1.8134218580774475e-05, "loss": 0.4485, "step": 4025 }, { "epoch": 0.793769716088328, "grad_norm": 0.6125163347743271, "learning_rate": 1.813331687293358e-05, "loss": 0.4474, "step": 4026 }, { "epoch": 0.7939668769716088, "grad_norm": 0.5484917124775043, "learning_rate": 1.8132414969683197e-05, "loss": 0.4205, "step": 4027 }, { "epoch": 0.7941640378548895, "grad_norm": 0.557875142995521, "learning_rate": 1.8131512871044993e-05, "loss": 0.4225, "step": 4028 }, { "epoch": 0.7943611987381703, "grad_norm": 0.637672714031013, "learning_rate": 1.8130610577040646e-05, "loss": 0.483, "step": 4029 }, { "epoch": 0.794558359621451, "grad_norm": 0.5142583048499065, "learning_rate": 1.812970808769183e-05, "loss": 0.42, "step": 4030 }, { "epoch": 0.7947555205047319, "grad_norm": 0.5565242219925195, "learning_rate": 1.812880540302023e-05, "loss": 0.4521, "step": 4031 }, { "epoch": 0.7949526813880127, "grad_norm": 0.5468093785090244, "learning_rate": 1.812790252304754e-05, "loss": 0.4285, "step": 4032 }, { "epoch": 0.7951498422712934, "grad_norm": 0.5792091705278589, "learning_rate": 1.8126999447795438e-05, "loss": 0.4666, "step": 4033 }, { "epoch": 0.7953470031545742, "grad_norm": 4.072216890062226, "learning_rate": 1.8126096177285637e-05, "loss": 0.5029, "step": 4034 }, { "epoch": 0.7955441640378549, "grad_norm": 0.6296907138779285, "learning_rate": 1.8125192711539828e-05, "loss": 0.4361, "step": 4035 }, { "epoch": 0.7957413249211357, "grad_norm": 0.5788781690916837, "learning_rate": 1.812428905057972e-05, "loss": 0.4399, "step": 4036 }, { "epoch": 0.7959384858044164, "grad_norm": 0.5884757591246823, "learning_rate": 1.8123385194427027e-05, "loss": 0.4888, "step": 4037 }, { "epoch": 0.7961356466876972, "grad_norm": 0.6261353284400933, "learning_rate": 1.8122481143103465e-05, "loss": 0.4474, "step": 4038 }, { "epoch": 0.7963328075709779, "grad_norm": 0.5682139648963489, "learning_rate": 1.812157689663075e-05, "loss": 0.4466, "step": 4039 }, { "epoch": 0.7965299684542587, "grad_norm": 0.6006306057576911, "learning_rate": 1.8120672455030606e-05, "loss": 0.4287, "step": 4040 }, { "epoch": 0.7967271293375394, "grad_norm": 0.6096153492041305, "learning_rate": 1.8119767818324773e-05, "loss": 0.4854, "step": 4041 }, { "epoch": 0.7969242902208202, "grad_norm": 0.5816536621919691, "learning_rate": 1.8118862986534974e-05, "loss": 0.4314, "step": 4042 }, { "epoch": 0.7971214511041009, "grad_norm": 0.5915477679203885, "learning_rate": 1.811795795968296e-05, "loss": 0.4392, "step": 4043 }, { "epoch": 0.7973186119873817, "grad_norm": 0.5550175414784126, "learning_rate": 1.8117052737790463e-05, "loss": 0.3972, "step": 4044 }, { "epoch": 0.7975157728706624, "grad_norm": 0.5778936646533668, "learning_rate": 1.8116147320879238e-05, "loss": 0.4382, "step": 4045 }, { "epoch": 0.7977129337539433, "grad_norm": 0.6157010473714177, "learning_rate": 1.811524170897104e-05, "loss": 0.4333, "step": 4046 }, { "epoch": 0.797910094637224, "grad_norm": 0.5637162571951514, "learning_rate": 1.8114335902087625e-05, "loss": 0.4542, "step": 4047 }, { "epoch": 0.7981072555205048, "grad_norm": 0.584581727444574, "learning_rate": 1.811342990025075e-05, "loss": 0.454, "step": 4048 }, { "epoch": 0.7983044164037855, "grad_norm": 0.5543640727335405, "learning_rate": 1.8112523703482194e-05, "loss": 0.4145, "step": 4049 }, { "epoch": 0.7985015772870663, "grad_norm": 0.6099912064912423, "learning_rate": 1.8111617311803722e-05, "loss": 0.4478, "step": 4050 }, { "epoch": 0.798698738170347, "grad_norm": 0.5700094595349666, "learning_rate": 1.8110710725237114e-05, "loss": 0.4459, "step": 4051 }, { "epoch": 0.7988958990536278, "grad_norm": 0.5495028696341402, "learning_rate": 1.8109803943804146e-05, "loss": 0.424, "step": 4052 }, { "epoch": 0.7990930599369085, "grad_norm": 0.5306200168382031, "learning_rate": 1.8108896967526607e-05, "loss": 0.4018, "step": 4053 }, { "epoch": 0.7992902208201893, "grad_norm": 1.038998012553428, "learning_rate": 1.810798979642629e-05, "loss": 0.4464, "step": 4054 }, { "epoch": 0.79948738170347, "grad_norm": 1.0234857955785743, "learning_rate": 1.8107082430524986e-05, "loss": 0.4581, "step": 4055 }, { "epoch": 0.7996845425867508, "grad_norm": 0.5159790767288523, "learning_rate": 1.81061748698445e-05, "loss": 0.433, "step": 4056 }, { "epoch": 0.7998817034700315, "grad_norm": 1.0841772174572797, "learning_rate": 1.8105267114406633e-05, "loss": 0.4212, "step": 4057 }, { "epoch": 0.8000788643533123, "grad_norm": 0.5786065173723118, "learning_rate": 1.81043591642332e-05, "loss": 0.4607, "step": 4058 }, { "epoch": 0.800276025236593, "grad_norm": 0.68262028393148, "learning_rate": 1.810345101934601e-05, "loss": 0.41, "step": 4059 }, { "epoch": 0.8004731861198738, "grad_norm": 1.292583207003667, "learning_rate": 1.8102542679766884e-05, "loss": 0.4715, "step": 4060 }, { "epoch": 0.8006703470031545, "grad_norm": 0.5803586064757899, "learning_rate": 1.8101634145517644e-05, "loss": 0.4231, "step": 4061 }, { "epoch": 0.8008675078864353, "grad_norm": 0.6051334610135585, "learning_rate": 1.810072541662012e-05, "loss": 0.5071, "step": 4062 }, { "epoch": 0.801064668769716, "grad_norm": 0.6153429571062644, "learning_rate": 1.8099816493096144e-05, "loss": 0.4541, "step": 4063 }, { "epoch": 0.8012618296529969, "grad_norm": 0.6245215575502755, "learning_rate": 1.8098907374967557e-05, "loss": 0.4805, "step": 4064 }, { "epoch": 0.8014589905362776, "grad_norm": 1.371389249743891, "learning_rate": 1.8097998062256193e-05, "loss": 0.4497, "step": 4065 }, { "epoch": 0.8016561514195584, "grad_norm": 0.5974203617721795, "learning_rate": 1.8097088554983906e-05, "loss": 0.4546, "step": 4066 }, { "epoch": 0.8018533123028391, "grad_norm": 0.8325017489031189, "learning_rate": 1.8096178853172548e-05, "loss": 0.4492, "step": 4067 }, { "epoch": 0.8020504731861199, "grad_norm": 0.5910351760004731, "learning_rate": 1.809526895684397e-05, "loss": 0.4848, "step": 4068 }, { "epoch": 0.8022476340694006, "grad_norm": 0.5900803892149564, "learning_rate": 1.809435886602004e-05, "loss": 0.4601, "step": 4069 }, { "epoch": 0.8024447949526814, "grad_norm": 0.8379969783401379, "learning_rate": 1.8093448580722617e-05, "loss": 0.4094, "step": 4070 }, { "epoch": 0.8026419558359621, "grad_norm": 0.6961897937500606, "learning_rate": 1.809253810097358e-05, "loss": 0.4621, "step": 4071 }, { "epoch": 0.8028391167192429, "grad_norm": 0.5613024287580417, "learning_rate": 1.809162742679479e-05, "loss": 0.4727, "step": 4072 }, { "epoch": 0.8030362776025236, "grad_norm": 0.5597933707348918, "learning_rate": 1.8090716558208136e-05, "loss": 0.4608, "step": 4073 }, { "epoch": 0.8032334384858044, "grad_norm": 0.5506039857706837, "learning_rate": 1.8089805495235507e-05, "loss": 0.4177, "step": 4074 }, { "epoch": 0.8034305993690851, "grad_norm": 0.6412143999415043, "learning_rate": 1.808889423789878e-05, "loss": 0.45, "step": 4075 }, { "epoch": 0.8036277602523659, "grad_norm": 0.5355910563904579, "learning_rate": 1.808798278621986e-05, "loss": 0.4033, "step": 4076 }, { "epoch": 0.8038249211356467, "grad_norm": 1.393193920661567, "learning_rate": 1.808707114022064e-05, "loss": 0.4729, "step": 4077 }, { "epoch": 0.8040220820189274, "grad_norm": 0.5347566428548295, "learning_rate": 1.808615929992302e-05, "loss": 0.4392, "step": 4078 }, { "epoch": 0.8042192429022083, "grad_norm": 0.5701756563883921, "learning_rate": 1.8085247265348913e-05, "loss": 0.4447, "step": 4079 }, { "epoch": 0.804416403785489, "grad_norm": 0.5336017684518319, "learning_rate": 1.808433503652023e-05, "loss": 0.4357, "step": 4080 }, { "epoch": 0.8046135646687698, "grad_norm": 0.5128271293806619, "learning_rate": 1.8083422613458886e-05, "loss": 0.4149, "step": 4081 }, { "epoch": 0.8048107255520505, "grad_norm": 0.5813112643112214, "learning_rate": 1.8082509996186802e-05, "loss": 0.475, "step": 4082 }, { "epoch": 0.8050078864353313, "grad_norm": 0.5645562044886906, "learning_rate": 1.808159718472591e-05, "loss": 0.449, "step": 4083 }, { "epoch": 0.805205047318612, "grad_norm": 0.5209108203099843, "learning_rate": 1.8080684179098135e-05, "loss": 0.4282, "step": 4084 }, { "epoch": 0.8054022082018928, "grad_norm": 0.5442950978703488, "learning_rate": 1.807977097932542e-05, "loss": 0.4519, "step": 4085 }, { "epoch": 0.8055993690851735, "grad_norm": 0.5397055857931383, "learning_rate": 1.8078857585429698e-05, "loss": 0.4182, "step": 4086 }, { "epoch": 0.8057965299684543, "grad_norm": 0.5457157266372396, "learning_rate": 1.8077943997432913e-05, "loss": 0.4589, "step": 4087 }, { "epoch": 0.805993690851735, "grad_norm": 0.5415377375158944, "learning_rate": 1.8077030215357024e-05, "loss": 0.4621, "step": 4088 }, { "epoch": 0.8061908517350158, "grad_norm": 0.5301693008219718, "learning_rate": 1.8076116239223976e-05, "loss": 0.4085, "step": 4089 }, { "epoch": 0.8063880126182965, "grad_norm": 0.5719849084421536, "learning_rate": 1.807520206905573e-05, "loss": 0.4478, "step": 4090 }, { "epoch": 0.8065851735015773, "grad_norm": 0.5068601560398842, "learning_rate": 1.8074287704874258e-05, "loss": 0.4119, "step": 4091 }, { "epoch": 0.806782334384858, "grad_norm": 0.5667924640136568, "learning_rate": 1.8073373146701517e-05, "loss": 0.4064, "step": 4092 }, { "epoch": 0.8069794952681388, "grad_norm": 0.5134527641968423, "learning_rate": 1.8072458394559485e-05, "loss": 0.3986, "step": 4093 }, { "epoch": 0.8071766561514195, "grad_norm": 0.6144715703002955, "learning_rate": 1.807154344847014e-05, "loss": 0.4866, "step": 4094 }, { "epoch": 0.8073738170347003, "grad_norm": 0.6012333197786712, "learning_rate": 1.8070628308455463e-05, "loss": 0.4539, "step": 4095 }, { "epoch": 0.807570977917981, "grad_norm": 0.5267747372001423, "learning_rate": 1.8069712974537444e-05, "loss": 0.4468, "step": 4096 }, { "epoch": 0.8077681388012619, "grad_norm": 0.5566674138070858, "learning_rate": 1.8068797446738072e-05, "loss": 0.4231, "step": 4097 }, { "epoch": 0.8079652996845426, "grad_norm": 0.5254269426936413, "learning_rate": 1.806788172507934e-05, "loss": 0.4015, "step": 4098 }, { "epoch": 0.8081624605678234, "grad_norm": 0.5478509815630053, "learning_rate": 1.8066965809583255e-05, "loss": 0.4472, "step": 4099 }, { "epoch": 0.8083596214511041, "grad_norm": 0.5715957253192107, "learning_rate": 1.8066049700271818e-05, "loss": 0.4465, "step": 4100 }, { "epoch": 0.8085567823343849, "grad_norm": 0.5584176748983026, "learning_rate": 1.8065133397167045e-05, "loss": 0.4425, "step": 4101 }, { "epoch": 0.8087539432176656, "grad_norm": 0.5738027949569142, "learning_rate": 1.8064216900290943e-05, "loss": 0.4376, "step": 4102 }, { "epoch": 0.8089511041009464, "grad_norm": 0.5582915978440195, "learning_rate": 1.806330020966554e-05, "loss": 0.4309, "step": 4103 }, { "epoch": 0.8091482649842271, "grad_norm": 0.6063188844006441, "learning_rate": 1.8062383325312855e-05, "loss": 0.4636, "step": 4104 }, { "epoch": 0.8093454258675079, "grad_norm": 0.5657855330946964, "learning_rate": 1.8061466247254914e-05, "loss": 0.4632, "step": 4105 }, { "epoch": 0.8095425867507886, "grad_norm": 0.5670328332375311, "learning_rate": 1.806054897551376e-05, "loss": 0.439, "step": 4106 }, { "epoch": 0.8097397476340694, "grad_norm": 0.6265658420538915, "learning_rate": 1.8059631510111424e-05, "loss": 0.4424, "step": 4107 }, { "epoch": 0.8099369085173501, "grad_norm": 0.7579410291352082, "learning_rate": 1.805871385106995e-05, "loss": 0.4335, "step": 4108 }, { "epoch": 0.8101340694006309, "grad_norm": 0.5063248042221858, "learning_rate": 1.8057795998411384e-05, "loss": 0.4214, "step": 4109 }, { "epoch": 0.8103312302839116, "grad_norm": 0.5236497593759417, "learning_rate": 1.8056877952157786e-05, "loss": 0.43, "step": 4110 }, { "epoch": 0.8105283911671924, "grad_norm": 0.6034930272092489, "learning_rate": 1.80559597123312e-05, "loss": 0.4211, "step": 4111 }, { "epoch": 0.8107255520504731, "grad_norm": 0.5882438633902384, "learning_rate": 1.80550412789537e-05, "loss": 0.443, "step": 4112 }, { "epoch": 0.810922712933754, "grad_norm": 0.5366584995675621, "learning_rate": 1.8054122652047342e-05, "loss": 0.4112, "step": 4113 }, { "epoch": 0.8111198738170347, "grad_norm": 0.6250197267195037, "learning_rate": 1.8053203831634207e-05, "loss": 0.4361, "step": 4114 }, { "epoch": 0.8113170347003155, "grad_norm": 0.5654169621581985, "learning_rate": 1.805228481773636e-05, "loss": 0.4699, "step": 4115 }, { "epoch": 0.8115141955835962, "grad_norm": 0.5753974888094099, "learning_rate": 1.8051365610375884e-05, "loss": 0.4418, "step": 4116 }, { "epoch": 0.811711356466877, "grad_norm": 0.5264404059073737, "learning_rate": 1.8050446209574872e-05, "loss": 0.4286, "step": 4117 }, { "epoch": 0.8119085173501577, "grad_norm": 0.6190117803945342, "learning_rate": 1.8049526615355404e-05, "loss": 0.4704, "step": 4118 }, { "epoch": 0.8121056782334385, "grad_norm": 0.5747549365566348, "learning_rate": 1.8048606827739578e-05, "loss": 0.4404, "step": 4119 }, { "epoch": 0.8123028391167192, "grad_norm": 0.6355096601857466, "learning_rate": 1.8047686846749488e-05, "loss": 0.453, "step": 4120 }, { "epoch": 0.8125, "grad_norm": 0.5199655678601448, "learning_rate": 1.8046766672407244e-05, "loss": 0.4341, "step": 4121 }, { "epoch": 0.8126971608832808, "grad_norm": 0.6314812726234823, "learning_rate": 1.8045846304734948e-05, "loss": 0.4633, "step": 4122 }, { "epoch": 0.8128943217665615, "grad_norm": 0.5432564307359536, "learning_rate": 1.8044925743754717e-05, "loss": 0.4788, "step": 4123 }, { "epoch": 0.8130914826498423, "grad_norm": 0.7570361059774975, "learning_rate": 1.8044004989488662e-05, "loss": 0.4695, "step": 4124 }, { "epoch": 0.813288643533123, "grad_norm": 0.5383797237758886, "learning_rate": 1.8043084041958915e-05, "loss": 0.4453, "step": 4125 }, { "epoch": 0.8134858044164038, "grad_norm": 0.647994374550417, "learning_rate": 1.8042162901187596e-05, "loss": 0.4632, "step": 4126 }, { "epoch": 0.8136829652996845, "grad_norm": 0.5849265654509385, "learning_rate": 1.8041241567196834e-05, "loss": 0.4533, "step": 4127 }, { "epoch": 0.8138801261829653, "grad_norm": 0.5450500959334965, "learning_rate": 1.804032004000877e-05, "loss": 0.4477, "step": 4128 }, { "epoch": 0.814077287066246, "grad_norm": 0.5608733688063781, "learning_rate": 1.803939831964554e-05, "loss": 0.4586, "step": 4129 }, { "epoch": 0.8142744479495269, "grad_norm": 0.4822206632784062, "learning_rate": 1.8038476406129294e-05, "loss": 0.4067, "step": 4130 }, { "epoch": 0.8144716088328076, "grad_norm": 0.5171796277326578, "learning_rate": 1.803755429948218e-05, "loss": 0.4101, "step": 4131 }, { "epoch": 0.8146687697160884, "grad_norm": 0.5373199928293783, "learning_rate": 1.8036631999726348e-05, "loss": 0.4328, "step": 4132 }, { "epoch": 0.8148659305993691, "grad_norm": 0.5649352218109933, "learning_rate": 1.8035709506883962e-05, "loss": 0.4896, "step": 4133 }, { "epoch": 0.8150630914826499, "grad_norm": 0.5095327224371953, "learning_rate": 1.8034786820977184e-05, "loss": 0.4626, "step": 4134 }, { "epoch": 0.8152602523659306, "grad_norm": 0.5296243774919772, "learning_rate": 1.8033863942028183e-05, "loss": 0.4478, "step": 4135 }, { "epoch": 0.8154574132492114, "grad_norm": 0.5532856082609808, "learning_rate": 1.803294087005913e-05, "loss": 0.4353, "step": 4136 }, { "epoch": 0.8156545741324921, "grad_norm": 0.5241033299839589, "learning_rate": 1.8032017605092202e-05, "loss": 0.4305, "step": 4137 }, { "epoch": 0.8158517350157729, "grad_norm": 0.47170306660651057, "learning_rate": 1.8031094147149587e-05, "loss": 0.3626, "step": 4138 }, { "epoch": 0.8160488958990536, "grad_norm": 0.5677681072099129, "learning_rate": 1.8030170496253463e-05, "loss": 0.4935, "step": 4139 }, { "epoch": 0.8162460567823344, "grad_norm": 0.5340874190338516, "learning_rate": 1.802924665242603e-05, "loss": 0.42, "step": 4140 }, { "epoch": 0.8164432176656151, "grad_norm": 0.5247377226187014, "learning_rate": 1.8028322615689477e-05, "loss": 0.4289, "step": 4141 }, { "epoch": 0.8166403785488959, "grad_norm": 0.5542330994880117, "learning_rate": 1.802739838606601e-05, "loss": 0.4464, "step": 4142 }, { "epoch": 0.8168375394321766, "grad_norm": 19.89904924544576, "learning_rate": 1.8026473963577834e-05, "loss": 0.5853, "step": 4143 }, { "epoch": 0.8170347003154574, "grad_norm": 0.6171631795186983, "learning_rate": 1.8025549348247154e-05, "loss": 0.4306, "step": 4144 }, { "epoch": 0.8172318611987381, "grad_norm": 0.5504405635665522, "learning_rate": 1.802462454009619e-05, "loss": 0.4298, "step": 4145 }, { "epoch": 0.817429022082019, "grad_norm": 0.5886602077518848, "learning_rate": 1.802369953914716e-05, "loss": 0.4274, "step": 4146 }, { "epoch": 0.8176261829652997, "grad_norm": 0.5150160069331389, "learning_rate": 1.8022774345422284e-05, "loss": 0.4456, "step": 4147 }, { "epoch": 0.8178233438485805, "grad_norm": 0.5736174914873833, "learning_rate": 1.8021848958943796e-05, "loss": 0.4318, "step": 4148 }, { "epoch": 0.8180205047318612, "grad_norm": 0.553943761555841, "learning_rate": 1.8020923379733925e-05, "loss": 0.4331, "step": 4149 }, { "epoch": 0.818217665615142, "grad_norm": 0.5424039413522739, "learning_rate": 1.801999760781491e-05, "loss": 0.4062, "step": 4150 }, { "epoch": 0.8184148264984227, "grad_norm": 0.6049951163196499, "learning_rate": 1.8019071643208996e-05, "loss": 0.4153, "step": 4151 }, { "epoch": 0.8186119873817035, "grad_norm": 0.5680607234809361, "learning_rate": 1.8018145485938427e-05, "loss": 0.4338, "step": 4152 }, { "epoch": 0.8188091482649842, "grad_norm": 0.5291584704930397, "learning_rate": 1.8017219136025458e-05, "loss": 0.4267, "step": 4153 }, { "epoch": 0.819006309148265, "grad_norm": 0.5806259018554695, "learning_rate": 1.801629259349234e-05, "loss": 0.4584, "step": 4154 }, { "epoch": 0.8192034700315457, "grad_norm": 0.5109979743220913, "learning_rate": 1.801536585836134e-05, "loss": 0.4448, "step": 4155 }, { "epoch": 0.8194006309148265, "grad_norm": 1.0089832585060787, "learning_rate": 1.801443893065472e-05, "loss": 0.4481, "step": 4156 }, { "epoch": 0.8195977917981072, "grad_norm": 0.5400204179891461, "learning_rate": 1.8013511810394747e-05, "loss": 0.4355, "step": 4157 }, { "epoch": 0.819794952681388, "grad_norm": 0.5585252526726473, "learning_rate": 1.80125844976037e-05, "loss": 0.4572, "step": 4158 }, { "epoch": 0.8199921135646687, "grad_norm": 0.5965272393146293, "learning_rate": 1.8011656992303863e-05, "loss": 0.4494, "step": 4159 }, { "epoch": 0.8201892744479495, "grad_norm": 0.5918746014179956, "learning_rate": 1.801072929451751e-05, "loss": 0.4272, "step": 4160 }, { "epoch": 0.8203864353312302, "grad_norm": 0.5389833452459519, "learning_rate": 1.8009801404266936e-05, "loss": 0.4041, "step": 4161 }, { "epoch": 0.820583596214511, "grad_norm": 2.6854862231259338, "learning_rate": 1.8008873321574435e-05, "loss": 0.4462, "step": 4162 }, { "epoch": 0.8207807570977917, "grad_norm": 12.845011498981133, "learning_rate": 1.8007945046462302e-05, "loss": 0.4828, "step": 4163 }, { "epoch": 0.8209779179810726, "grad_norm": 0.6285393491640829, "learning_rate": 1.800701657895284e-05, "loss": 0.4869, "step": 4164 }, { "epoch": 0.8211750788643533, "grad_norm": 0.5463921951908919, "learning_rate": 1.8006087919068354e-05, "loss": 0.4517, "step": 4165 }, { "epoch": 0.8213722397476341, "grad_norm": 3.3042430055230434, "learning_rate": 1.800515906683116e-05, "loss": 0.4767, "step": 4166 }, { "epoch": 0.8215694006309149, "grad_norm": 0.608965250290735, "learning_rate": 1.8004230022263575e-05, "loss": 0.4708, "step": 4167 }, { "epoch": 0.8217665615141956, "grad_norm": 0.5644431847609628, "learning_rate": 1.800330078538792e-05, "loss": 0.4087, "step": 4168 }, { "epoch": 0.8219637223974764, "grad_norm": 0.6120359597106482, "learning_rate": 1.8002371356226512e-05, "loss": 0.4715, "step": 4169 }, { "epoch": 0.8221608832807571, "grad_norm": 0.5666338187192854, "learning_rate": 1.800144173480169e-05, "loss": 0.463, "step": 4170 }, { "epoch": 0.8223580441640379, "grad_norm": 0.5992673957167014, "learning_rate": 1.800051192113579e-05, "loss": 0.4724, "step": 4171 }, { "epoch": 0.8225552050473186, "grad_norm": 0.5713699259593359, "learning_rate": 1.799958191525115e-05, "loss": 0.4568, "step": 4172 }, { "epoch": 0.8227523659305994, "grad_norm": 0.5796266634572839, "learning_rate": 1.7998651717170105e-05, "loss": 0.4501, "step": 4173 }, { "epoch": 0.8229495268138801, "grad_norm": 0.5794145486533863, "learning_rate": 1.7997721326915015e-05, "loss": 0.4894, "step": 4174 }, { "epoch": 0.8231466876971609, "grad_norm": 0.548917204300831, "learning_rate": 1.799679074450823e-05, "loss": 0.4151, "step": 4175 }, { "epoch": 0.8233438485804416, "grad_norm": 0.6880509796679153, "learning_rate": 1.7995859969972108e-05, "loss": 0.4362, "step": 4176 }, { "epoch": 0.8235410094637224, "grad_norm": 0.6333808028462822, "learning_rate": 1.7994929003329008e-05, "loss": 0.4802, "step": 4177 }, { "epoch": 0.8237381703470031, "grad_norm": 0.5717804452138856, "learning_rate": 1.7993997844601305e-05, "loss": 0.4161, "step": 4178 }, { "epoch": 0.823935331230284, "grad_norm": 0.5904214094197301, "learning_rate": 1.799306649381136e-05, "loss": 0.4527, "step": 4179 }, { "epoch": 0.8241324921135647, "grad_norm": 0.5224269790433933, "learning_rate": 1.7992134950981562e-05, "loss": 0.3997, "step": 4180 }, { "epoch": 0.8243296529968455, "grad_norm": 0.5926917719981647, "learning_rate": 1.7991203216134283e-05, "loss": 0.436, "step": 4181 }, { "epoch": 0.8245268138801262, "grad_norm": 0.5735078186499827, "learning_rate": 1.7990271289291913e-05, "loss": 0.4367, "step": 4182 }, { "epoch": 0.824723974763407, "grad_norm": 0.6677743637655965, "learning_rate": 1.798933917047684e-05, "loss": 0.4423, "step": 4183 }, { "epoch": 0.8249211356466877, "grad_norm": 0.582150975318319, "learning_rate": 1.7988406859711457e-05, "loss": 0.4656, "step": 4184 }, { "epoch": 0.8251182965299685, "grad_norm": 0.4907564880534393, "learning_rate": 1.7987474357018172e-05, "loss": 0.3808, "step": 4185 }, { "epoch": 0.8253154574132492, "grad_norm": 0.6380319217129097, "learning_rate": 1.7986541662419376e-05, "loss": 0.422, "step": 4186 }, { "epoch": 0.82551261829653, "grad_norm": 0.5678615187977981, "learning_rate": 1.7985608775937492e-05, "loss": 0.4317, "step": 4187 }, { "epoch": 0.8257097791798107, "grad_norm": 0.5265688118305606, "learning_rate": 1.798467569759492e-05, "loss": 0.4309, "step": 4188 }, { "epoch": 0.8259069400630915, "grad_norm": 0.5925439253755009, "learning_rate": 1.798374242741409e-05, "loss": 0.4434, "step": 4189 }, { "epoch": 0.8261041009463722, "grad_norm": 0.6569515898788928, "learning_rate": 1.7982808965417415e-05, "loss": 0.4008, "step": 4190 }, { "epoch": 0.826301261829653, "grad_norm": 0.5541366851451021, "learning_rate": 1.7981875311627327e-05, "loss": 0.4168, "step": 4191 }, { "epoch": 0.8264984227129337, "grad_norm": 0.5419308496525422, "learning_rate": 1.7980941466066254e-05, "loss": 0.4263, "step": 4192 }, { "epoch": 0.8266955835962145, "grad_norm": 0.5627041984651608, "learning_rate": 1.798000742875664e-05, "loss": 0.4471, "step": 4193 }, { "epoch": 0.8268927444794952, "grad_norm": 0.5277704275115969, "learning_rate": 1.797907319972092e-05, "loss": 0.4245, "step": 4194 }, { "epoch": 0.827089905362776, "grad_norm": 0.5673323456563254, "learning_rate": 1.797813877898154e-05, "loss": 0.437, "step": 4195 }, { "epoch": 0.8272870662460567, "grad_norm": 0.580531434044333, "learning_rate": 1.7977204166560954e-05, "loss": 0.4844, "step": 4196 }, { "epoch": 0.8274842271293376, "grad_norm": 0.5301677248107065, "learning_rate": 1.797626936248161e-05, "loss": 0.4146, "step": 4197 }, { "epoch": 0.8276813880126183, "grad_norm": 0.6170836844395501, "learning_rate": 1.7975334366765974e-05, "loss": 0.4555, "step": 4198 }, { "epoch": 0.8278785488958991, "grad_norm": 0.5481999082788359, "learning_rate": 1.7974399179436502e-05, "loss": 0.467, "step": 4199 }, { "epoch": 0.8280757097791798, "grad_norm": 0.5408148308068799, "learning_rate": 1.7973463800515675e-05, "loss": 0.4672, "step": 4200 }, { "epoch": 0.8282728706624606, "grad_norm": 0.5652366078680668, "learning_rate": 1.7972528230025954e-05, "loss": 0.4412, "step": 4201 }, { "epoch": 0.8284700315457413, "grad_norm": 0.538008456834178, "learning_rate": 1.7971592467989824e-05, "loss": 0.4455, "step": 4202 }, { "epoch": 0.8286671924290221, "grad_norm": 0.5497144473137757, "learning_rate": 1.7970656514429767e-05, "loss": 0.4495, "step": 4203 }, { "epoch": 0.8288643533123028, "grad_norm": 0.5400822082606279, "learning_rate": 1.7969720369368266e-05, "loss": 0.4171, "step": 4204 }, { "epoch": 0.8290615141955836, "grad_norm": 0.5032870883447115, "learning_rate": 1.796878403282782e-05, "loss": 0.4039, "step": 4205 }, { "epoch": 0.8292586750788643, "grad_norm": 0.5077806097030966, "learning_rate": 1.7967847504830914e-05, "loss": 0.4433, "step": 4206 }, { "epoch": 0.8294558359621451, "grad_norm": 0.5809809540553955, "learning_rate": 1.7966910785400058e-05, "loss": 0.4481, "step": 4207 }, { "epoch": 0.8296529968454258, "grad_norm": 0.5469499961190881, "learning_rate": 1.7965973874557754e-05, "loss": 0.4356, "step": 4208 }, { "epoch": 0.8298501577287066, "grad_norm": 0.5124543710594913, "learning_rate": 1.7965036772326515e-05, "loss": 0.4394, "step": 4209 }, { "epoch": 0.8300473186119873, "grad_norm": 0.5569661416418024, "learning_rate": 1.796409947872885e-05, "loss": 0.4131, "step": 4210 }, { "epoch": 0.8302444794952681, "grad_norm": 0.566607051094716, "learning_rate": 1.7963161993787285e-05, "loss": 0.4291, "step": 4211 }, { "epoch": 0.830441640378549, "grad_norm": 0.6317395544141171, "learning_rate": 1.796222431752434e-05, "loss": 0.4528, "step": 4212 }, { "epoch": 0.8306388012618297, "grad_norm": 0.5260509022156253, "learning_rate": 1.796128644996254e-05, "loss": 0.4194, "step": 4213 }, { "epoch": 0.8308359621451105, "grad_norm": 0.8039173856781239, "learning_rate": 1.7960348391124422e-05, "loss": 0.4602, "step": 4214 }, { "epoch": 0.8310331230283912, "grad_norm": 0.5673951125473756, "learning_rate": 1.7959410141032524e-05, "loss": 0.4874, "step": 4215 }, { "epoch": 0.831230283911672, "grad_norm": 0.5472093603546104, "learning_rate": 1.795847169970939e-05, "loss": 0.4226, "step": 4216 }, { "epoch": 0.8314274447949527, "grad_norm": 0.6600760076559387, "learning_rate": 1.7957533067177565e-05, "loss": 0.4507, "step": 4217 }, { "epoch": 0.8316246056782335, "grad_norm": 0.5420048851350387, "learning_rate": 1.7956594243459597e-05, "loss": 0.4597, "step": 4218 }, { "epoch": 0.8318217665615142, "grad_norm": 0.5846319849001685, "learning_rate": 1.7955655228578046e-05, "loss": 0.4637, "step": 4219 }, { "epoch": 0.832018927444795, "grad_norm": 0.5925117015754054, "learning_rate": 1.7954716022555474e-05, "loss": 0.475, "step": 4220 }, { "epoch": 0.8322160883280757, "grad_norm": 0.5371836503383141, "learning_rate": 1.795377662541444e-05, "loss": 0.4217, "step": 4221 }, { "epoch": 0.8324132492113565, "grad_norm": 0.7483433485104296, "learning_rate": 1.795283703717752e-05, "loss": 0.4527, "step": 4222 }, { "epoch": 0.8326104100946372, "grad_norm": 0.5384653534370835, "learning_rate": 1.7951897257867284e-05, "loss": 0.4125, "step": 4223 }, { "epoch": 0.832807570977918, "grad_norm": 0.5538901981181154, "learning_rate": 1.7950957287506313e-05, "loss": 0.437, "step": 4224 }, { "epoch": 0.8330047318611987, "grad_norm": 0.5448553268692006, "learning_rate": 1.795001712611719e-05, "loss": 0.432, "step": 4225 }, { "epoch": 0.8332018927444795, "grad_norm": 0.5608394906395617, "learning_rate": 1.7949076773722505e-05, "loss": 0.4812, "step": 4226 }, { "epoch": 0.8333990536277602, "grad_norm": 0.48592649216287015, "learning_rate": 1.7948136230344847e-05, "loss": 0.4042, "step": 4227 }, { "epoch": 0.833596214511041, "grad_norm": 0.5149456873877976, "learning_rate": 1.7947195496006817e-05, "loss": 0.4183, "step": 4228 }, { "epoch": 0.8337933753943217, "grad_norm": 0.5308679728526855, "learning_rate": 1.7946254570731015e-05, "loss": 0.4564, "step": 4229 }, { "epoch": 0.8339905362776026, "grad_norm": 0.5705369452847299, "learning_rate": 1.7945313454540046e-05, "loss": 0.4626, "step": 4230 }, { "epoch": 0.8341876971608833, "grad_norm": 0.5379448499030216, "learning_rate": 1.7944372147456527e-05, "loss": 0.449, "step": 4231 }, { "epoch": 0.8343848580441641, "grad_norm": 0.536663991436941, "learning_rate": 1.7943430649503065e-05, "loss": 0.4614, "step": 4232 }, { "epoch": 0.8345820189274448, "grad_norm": 0.5322133371923756, "learning_rate": 1.794248896070229e-05, "loss": 0.4251, "step": 4233 }, { "epoch": 0.8347791798107256, "grad_norm": 0.5399982863538748, "learning_rate": 1.7941547081076818e-05, "loss": 0.4575, "step": 4234 }, { "epoch": 0.8349763406940063, "grad_norm": 0.5375534373449318, "learning_rate": 1.7940605010649284e-05, "loss": 0.4295, "step": 4235 }, { "epoch": 0.8351735015772871, "grad_norm": 0.5478250406848311, "learning_rate": 1.7939662749442317e-05, "loss": 0.4352, "step": 4236 }, { "epoch": 0.8353706624605678, "grad_norm": 0.5493522873611169, "learning_rate": 1.7938720297478564e-05, "loss": 0.4487, "step": 4237 }, { "epoch": 0.8355678233438486, "grad_norm": 0.5461493071646538, "learning_rate": 1.7937777654780656e-05, "loss": 0.4621, "step": 4238 }, { "epoch": 0.8357649842271293, "grad_norm": 0.5405378451575877, "learning_rate": 1.793683482137125e-05, "loss": 0.4167, "step": 4239 }, { "epoch": 0.8359621451104101, "grad_norm": 0.5514595660368122, "learning_rate": 1.7935891797272998e-05, "loss": 0.4397, "step": 4240 }, { "epoch": 0.8361593059936908, "grad_norm": 0.5700483701942513, "learning_rate": 1.7934948582508554e-05, "loss": 0.4287, "step": 4241 }, { "epoch": 0.8363564668769716, "grad_norm": 34.56133146719008, "learning_rate": 1.793400517710058e-05, "loss": 0.4441, "step": 4242 }, { "epoch": 0.8365536277602523, "grad_norm": 24.176632613926778, "learning_rate": 1.7933061581071743e-05, "loss": 0.4912, "step": 4243 }, { "epoch": 0.8367507886435331, "grad_norm": 0.6360329434335535, "learning_rate": 1.793211779444471e-05, "loss": 0.4639, "step": 4244 }, { "epoch": 0.8369479495268138, "grad_norm": 0.6094525873812249, "learning_rate": 1.7931173817242163e-05, "loss": 0.4648, "step": 4245 }, { "epoch": 0.8371451104100947, "grad_norm": 0.6835903177747272, "learning_rate": 1.7930229649486777e-05, "loss": 0.4328, "step": 4246 }, { "epoch": 0.8373422712933754, "grad_norm": 0.5554084747146647, "learning_rate": 1.7929285291201237e-05, "loss": 0.4385, "step": 4247 }, { "epoch": 0.8375394321766562, "grad_norm": 0.5699771546905784, "learning_rate": 1.7928340742408236e-05, "loss": 0.4457, "step": 4248 }, { "epoch": 0.8377365930599369, "grad_norm": 0.6851558987403872, "learning_rate": 1.792739600313046e-05, "loss": 0.4285, "step": 4249 }, { "epoch": 0.8379337539432177, "grad_norm": 0.5710755557207288, "learning_rate": 1.7926451073390612e-05, "loss": 0.4773, "step": 4250 }, { "epoch": 0.8381309148264984, "grad_norm": 0.6090925217214507, "learning_rate": 1.7925505953211394e-05, "loss": 0.4597, "step": 4251 }, { "epoch": 0.8383280757097792, "grad_norm": 0.5459422698014952, "learning_rate": 1.792456064261551e-05, "loss": 0.4478, "step": 4252 }, { "epoch": 0.8385252365930599, "grad_norm": 0.5617534088004623, "learning_rate": 1.7923615141625677e-05, "loss": 0.4396, "step": 4253 }, { "epoch": 0.8387223974763407, "grad_norm": 0.6076401507399524, "learning_rate": 1.792266945026461e-05, "loss": 0.4723, "step": 4254 }, { "epoch": 0.8389195583596214, "grad_norm": 0.5960645754305288, "learning_rate": 1.792172356855503e-05, "loss": 0.4686, "step": 4255 }, { "epoch": 0.8391167192429022, "grad_norm": 0.5490789054602314, "learning_rate": 1.7920777496519665e-05, "loss": 0.4362, "step": 4256 }, { "epoch": 0.839313880126183, "grad_norm": 0.5627710935612827, "learning_rate": 1.7919831234181234e-05, "loss": 0.4299, "step": 4257 }, { "epoch": 0.8395110410094637, "grad_norm": 0.5861974351027728, "learning_rate": 1.7918884781562486e-05, "loss": 0.444, "step": 4258 }, { "epoch": 0.8397082018927445, "grad_norm": 0.5865858099699106, "learning_rate": 1.7917938138686152e-05, "loss": 0.4527, "step": 4259 }, { "epoch": 0.8399053627760252, "grad_norm": 0.6002524896560638, "learning_rate": 1.791699130557498e-05, "loss": 0.4515, "step": 4260 }, { "epoch": 0.840102523659306, "grad_norm": 1.3685110134791254, "learning_rate": 1.7916044282251713e-05, "loss": 0.3769, "step": 4261 }, { "epoch": 0.8402996845425867, "grad_norm": 0.5968798080121339, "learning_rate": 1.7915097068739108e-05, "loss": 0.443, "step": 4262 }, { "epoch": 0.8404968454258676, "grad_norm": 0.916862721842038, "learning_rate": 1.7914149665059922e-05, "loss": 0.4274, "step": 4263 }, { "epoch": 0.8406940063091483, "grad_norm": 0.5597685865087868, "learning_rate": 1.791320207123692e-05, "loss": 0.4155, "step": 4264 }, { "epoch": 0.8408911671924291, "grad_norm": 0.7515342823351008, "learning_rate": 1.7912254287292863e-05, "loss": 0.4606, "step": 4265 }, { "epoch": 0.8410883280757098, "grad_norm": 0.5225927715886619, "learning_rate": 1.7911306313250523e-05, "loss": 0.4025, "step": 4266 }, { "epoch": 0.8412854889589906, "grad_norm": 0.5557086212596624, "learning_rate": 1.7910358149132682e-05, "loss": 0.4528, "step": 4267 }, { "epoch": 0.8414826498422713, "grad_norm": 0.5735148560669096, "learning_rate": 1.7909409794962115e-05, "loss": 0.4348, "step": 4268 }, { "epoch": 0.8416798107255521, "grad_norm": 0.5535257331063189, "learning_rate": 1.790846125076161e-05, "loss": 0.4406, "step": 4269 }, { "epoch": 0.8418769716088328, "grad_norm": 0.6394946265678507, "learning_rate": 1.790751251655395e-05, "loss": 0.4316, "step": 4270 }, { "epoch": 0.8420741324921136, "grad_norm": 0.5519239974498845, "learning_rate": 1.7906563592361935e-05, "loss": 0.4457, "step": 4271 }, { "epoch": 0.8422712933753943, "grad_norm": 0.5760757282643882, "learning_rate": 1.7905614478208363e-05, "loss": 0.4369, "step": 4272 }, { "epoch": 0.8424684542586751, "grad_norm": 0.5792308751257407, "learning_rate": 1.7904665174116038e-05, "loss": 0.4549, "step": 4273 }, { "epoch": 0.8426656151419558, "grad_norm": 0.5683056364517756, "learning_rate": 1.790371568010777e-05, "loss": 0.4554, "step": 4274 }, { "epoch": 0.8428627760252366, "grad_norm": 0.5615230001406172, "learning_rate": 1.7902765996206364e-05, "loss": 0.4459, "step": 4275 }, { "epoch": 0.8430599369085173, "grad_norm": 0.5564542215994093, "learning_rate": 1.790181612243464e-05, "loss": 0.45, "step": 4276 }, { "epoch": 0.8432570977917981, "grad_norm": 0.5401445915473098, "learning_rate": 1.7900866058815424e-05, "loss": 0.4439, "step": 4277 }, { "epoch": 0.8434542586750788, "grad_norm": 0.5386599600784427, "learning_rate": 1.7899915805371536e-05, "loss": 0.4422, "step": 4278 }, { "epoch": 0.8436514195583596, "grad_norm": 0.5120099294044634, "learning_rate": 1.789896536212581e-05, "loss": 0.4206, "step": 4279 }, { "epoch": 0.8438485804416404, "grad_norm": 0.5832867234998125, "learning_rate": 1.7898014729101077e-05, "loss": 0.4727, "step": 4280 }, { "epoch": 0.8440457413249212, "grad_norm": 0.5192015921845785, "learning_rate": 1.7897063906320182e-05, "loss": 0.4239, "step": 4281 }, { "epoch": 0.8442429022082019, "grad_norm": 0.5700985443501763, "learning_rate": 1.7896112893805967e-05, "loss": 0.4813, "step": 4282 }, { "epoch": 0.8444400630914827, "grad_norm": 0.5274314100347487, "learning_rate": 1.789516169158128e-05, "loss": 0.4251, "step": 4283 }, { "epoch": 0.8446372239747634, "grad_norm": 0.5003375182641053, "learning_rate": 1.7894210299668977e-05, "loss": 0.3926, "step": 4284 }, { "epoch": 0.8448343848580442, "grad_norm": 0.5327663026050636, "learning_rate": 1.7893258718091916e-05, "loss": 0.4379, "step": 4285 }, { "epoch": 0.8450315457413249, "grad_norm": 0.5526141806841286, "learning_rate": 1.7892306946872952e-05, "loss": 0.4765, "step": 4286 }, { "epoch": 0.8452287066246057, "grad_norm": 0.5253955764429079, "learning_rate": 1.7891354986034964e-05, "loss": 0.4083, "step": 4287 }, { "epoch": 0.8454258675078864, "grad_norm": 0.9128365999475107, "learning_rate": 1.7890402835600814e-05, "loss": 0.4293, "step": 4288 }, { "epoch": 0.8456230283911672, "grad_norm": 0.50508451474393, "learning_rate": 1.7889450495593386e-05, "loss": 0.4281, "step": 4289 }, { "epoch": 0.8458201892744479, "grad_norm": 0.5667695632508356, "learning_rate": 1.7888497966035552e-05, "loss": 0.4551, "step": 4290 }, { "epoch": 0.8460173501577287, "grad_norm": 0.5140164942376172, "learning_rate": 1.7887545246950204e-05, "loss": 0.4283, "step": 4291 }, { "epoch": 0.8462145110410094, "grad_norm": 0.5547368089832118, "learning_rate": 1.7886592338360227e-05, "loss": 0.4204, "step": 4292 }, { "epoch": 0.8464116719242902, "grad_norm": 0.5107722071951123, "learning_rate": 1.7885639240288523e-05, "loss": 0.4269, "step": 4293 }, { "epoch": 0.8466088328075709, "grad_norm": 0.5099322891614341, "learning_rate": 1.788468595275798e-05, "loss": 0.3775, "step": 4294 }, { "epoch": 0.8468059936908517, "grad_norm": 0.5424856816385994, "learning_rate": 1.7883732475791512e-05, "loss": 0.4256, "step": 4295 }, { "epoch": 0.8470031545741324, "grad_norm": 0.5241853108328052, "learning_rate": 1.7882778809412024e-05, "loss": 0.4292, "step": 4296 }, { "epoch": 0.8472003154574133, "grad_norm": 0.542588881145842, "learning_rate": 1.7881824953642423e-05, "loss": 0.4074, "step": 4297 }, { "epoch": 0.847397476340694, "grad_norm": 0.7626469717720238, "learning_rate": 1.788087090850563e-05, "loss": 0.4365, "step": 4298 }, { "epoch": 0.8475946372239748, "grad_norm": 0.6333662401521095, "learning_rate": 1.787991667402457e-05, "loss": 0.4631, "step": 4299 }, { "epoch": 0.8477917981072555, "grad_norm": 0.5258355942220182, "learning_rate": 1.787896225022216e-05, "loss": 0.4423, "step": 4300 }, { "epoch": 0.8479889589905363, "grad_norm": 0.572162435469079, "learning_rate": 1.7878007637121344e-05, "loss": 0.4635, "step": 4301 }, { "epoch": 0.848186119873817, "grad_norm": 0.6051771551039156, "learning_rate": 1.7877052834745048e-05, "loss": 0.5096, "step": 4302 }, { "epoch": 0.8483832807570978, "grad_norm": 0.5953955107671394, "learning_rate": 1.7876097843116214e-05, "loss": 0.4335, "step": 4303 }, { "epoch": 0.8485804416403786, "grad_norm": 0.5157443177704121, "learning_rate": 1.7875142662257788e-05, "loss": 0.4226, "step": 4304 }, { "epoch": 0.8487776025236593, "grad_norm": 0.5783953306524755, "learning_rate": 1.7874187292192716e-05, "loss": 0.4105, "step": 4305 }, { "epoch": 0.8489747634069401, "grad_norm": 3.512906038519687, "learning_rate": 1.7873231732943954e-05, "loss": 0.4794, "step": 4306 }, { "epoch": 0.8491719242902208, "grad_norm": 0.6607264573754198, "learning_rate": 1.787227598453446e-05, "loss": 0.4226, "step": 4307 }, { "epoch": 0.8493690851735016, "grad_norm": 0.5184733373485347, "learning_rate": 1.7871320046987195e-05, "loss": 0.4468, "step": 4308 }, { "epoch": 0.8495662460567823, "grad_norm": 0.6207995208356605, "learning_rate": 1.7870363920325126e-05, "loss": 0.4391, "step": 4309 }, { "epoch": 0.8497634069400631, "grad_norm": 0.9804052510102346, "learning_rate": 1.7869407604571228e-05, "loss": 0.4196, "step": 4310 }, { "epoch": 0.8499605678233438, "grad_norm": 0.5996292455245761, "learning_rate": 1.7868451099748473e-05, "loss": 0.4297, "step": 4311 }, { "epoch": 0.8501577287066246, "grad_norm": 0.5472017368916122, "learning_rate": 1.7867494405879847e-05, "loss": 0.4032, "step": 4312 }, { "epoch": 0.8503548895899053, "grad_norm": 0.7587762660324456, "learning_rate": 1.786653752298833e-05, "loss": 0.4218, "step": 4313 }, { "epoch": 0.8505520504731862, "grad_norm": 0.5433647751228663, "learning_rate": 1.7865580451096912e-05, "loss": 0.4439, "step": 4314 }, { "epoch": 0.8507492113564669, "grad_norm": 0.5467356597687231, "learning_rate": 1.7864623190228592e-05, "loss": 0.4679, "step": 4315 }, { "epoch": 0.8509463722397477, "grad_norm": 0.5811637139524398, "learning_rate": 1.7863665740406367e-05, "loss": 0.4204, "step": 4316 }, { "epoch": 0.8511435331230284, "grad_norm": 0.5301517872977239, "learning_rate": 1.786270810165324e-05, "loss": 0.4555, "step": 4317 }, { "epoch": 0.8513406940063092, "grad_norm": 0.5854692352465581, "learning_rate": 1.7861750273992216e-05, "loss": 0.4597, "step": 4318 }, { "epoch": 0.8515378548895899, "grad_norm": 0.5663117363358948, "learning_rate": 1.7860792257446315e-05, "loss": 0.4437, "step": 4319 }, { "epoch": 0.8517350157728707, "grad_norm": 0.5681785500111551, "learning_rate": 1.785983405203855e-05, "loss": 0.4747, "step": 4320 }, { "epoch": 0.8519321766561514, "grad_norm": 0.6863444614061993, "learning_rate": 1.7858875657791937e-05, "loss": 0.4423, "step": 4321 }, { "epoch": 0.8521293375394322, "grad_norm": 0.8893916870868374, "learning_rate": 1.7857917074729513e-05, "loss": 0.4413, "step": 4322 }, { "epoch": 0.8523264984227129, "grad_norm": 0.5878530748957967, "learning_rate": 1.78569583028743e-05, "loss": 0.4737, "step": 4323 }, { "epoch": 0.8525236593059937, "grad_norm": 0.5738260255184614, "learning_rate": 1.7855999342249338e-05, "loss": 0.4325, "step": 4324 }, { "epoch": 0.8527208201892744, "grad_norm": 0.5660103845510339, "learning_rate": 1.7855040192877666e-05, "loss": 0.4242, "step": 4325 }, { "epoch": 0.8529179810725552, "grad_norm": 0.5644476271832222, "learning_rate": 1.7854080854782324e-05, "loss": 0.4465, "step": 4326 }, { "epoch": 0.8531151419558359, "grad_norm": 0.5507139670335401, "learning_rate": 1.7853121327986368e-05, "loss": 0.452, "step": 4327 }, { "epoch": 0.8533123028391167, "grad_norm": 0.5645022256444481, "learning_rate": 1.785216161251285e-05, "loss": 0.4225, "step": 4328 }, { "epoch": 0.8535094637223974, "grad_norm": 2.5107638932499734, "learning_rate": 1.7851201708384823e-05, "loss": 0.466, "step": 4329 }, { "epoch": 0.8537066246056783, "grad_norm": 0.6115304333104712, "learning_rate": 1.785024161562535e-05, "loss": 0.4656, "step": 4330 }, { "epoch": 0.853903785488959, "grad_norm": 0.5583226024777117, "learning_rate": 1.7849281334257504e-05, "loss": 0.4285, "step": 4331 }, { "epoch": 0.8541009463722398, "grad_norm": 0.5123230160651777, "learning_rate": 1.784832086430435e-05, "loss": 0.3872, "step": 4332 }, { "epoch": 0.8542981072555205, "grad_norm": 0.6949006788224281, "learning_rate": 1.784736020578897e-05, "loss": 0.4535, "step": 4333 }, { "epoch": 0.8544952681388013, "grad_norm": 1.427386384185727, "learning_rate": 1.784639935873444e-05, "loss": 0.4704, "step": 4334 }, { "epoch": 0.854692429022082, "grad_norm": 1.106108233363933, "learning_rate": 1.784543832316385e-05, "loss": 0.4372, "step": 4335 }, { "epoch": 0.8548895899053628, "grad_norm": 0.5901770882082108, "learning_rate": 1.7844477099100282e-05, "loss": 0.4699, "step": 4336 }, { "epoch": 0.8550867507886435, "grad_norm": 0.544525043882823, "learning_rate": 1.784351568656684e-05, "loss": 0.4027, "step": 4337 }, { "epoch": 0.8552839116719243, "grad_norm": 0.5934533266780927, "learning_rate": 1.7842554085586613e-05, "loss": 0.4838, "step": 4338 }, { "epoch": 0.855481072555205, "grad_norm": 0.5596803431341593, "learning_rate": 1.7841592296182705e-05, "loss": 0.4621, "step": 4339 }, { "epoch": 0.8556782334384858, "grad_norm": 0.5394622987888912, "learning_rate": 1.7840630318378233e-05, "loss": 0.397, "step": 4340 }, { "epoch": 0.8558753943217665, "grad_norm": 0.5455394824500505, "learning_rate": 1.78396681521963e-05, "loss": 0.4519, "step": 4341 }, { "epoch": 0.8560725552050473, "grad_norm": 0.5203051216706605, "learning_rate": 1.7838705797660033e-05, "loss": 0.4413, "step": 4342 }, { "epoch": 0.856269716088328, "grad_norm": 0.5539835315564265, "learning_rate": 1.783774325479254e-05, "loss": 0.4285, "step": 4343 }, { "epoch": 0.8564668769716088, "grad_norm": 0.546518036363135, "learning_rate": 1.7836780523616957e-05, "loss": 0.4444, "step": 4344 }, { "epoch": 0.8566640378548895, "grad_norm": 0.5381032645999315, "learning_rate": 1.7835817604156407e-05, "loss": 0.4187, "step": 4345 }, { "epoch": 0.8568611987381703, "grad_norm": 0.6118133777534611, "learning_rate": 1.7834854496434032e-05, "loss": 0.4176, "step": 4346 }, { "epoch": 0.857058359621451, "grad_norm": 0.6427676574290028, "learning_rate": 1.7833891200472967e-05, "loss": 0.5017, "step": 4347 }, { "epoch": 0.8572555205047319, "grad_norm": 0.5031944020117598, "learning_rate": 1.7832927716296357e-05, "loss": 0.3831, "step": 4348 }, { "epoch": 0.8574526813880127, "grad_norm": 0.5370545377340153, "learning_rate": 1.7831964043927355e-05, "loss": 0.4165, "step": 4349 }, { "epoch": 0.8576498422712934, "grad_norm": 0.5878309901877112, "learning_rate": 1.7831000183389107e-05, "loss": 0.4848, "step": 4350 }, { "epoch": 0.8578470031545742, "grad_norm": 0.5608856336311973, "learning_rate": 1.783003613470477e-05, "loss": 0.4418, "step": 4351 }, { "epoch": 0.8580441640378549, "grad_norm": 0.5690843999597316, "learning_rate": 1.7829071897897515e-05, "loss": 0.4331, "step": 4352 }, { "epoch": 0.8582413249211357, "grad_norm": 0.5542069316816697, "learning_rate": 1.7828107472990498e-05, "loss": 0.4454, "step": 4353 }, { "epoch": 0.8584384858044164, "grad_norm": 0.5267592858399887, "learning_rate": 1.78271428600069e-05, "loss": 0.4271, "step": 4354 }, { "epoch": 0.8586356466876972, "grad_norm": 0.5686441971248088, "learning_rate": 1.7826178058969884e-05, "loss": 0.4757, "step": 4355 }, { "epoch": 0.8588328075709779, "grad_norm": 0.5227257642875571, "learning_rate": 1.7825213069902646e-05, "loss": 0.4574, "step": 4356 }, { "epoch": 0.8590299684542587, "grad_norm": 0.5300979103639493, "learning_rate": 1.782424789282836e-05, "loss": 0.4289, "step": 4357 }, { "epoch": 0.8592271293375394, "grad_norm": 0.5849541511404455, "learning_rate": 1.7823282527770214e-05, "loss": 0.4209, "step": 4358 }, { "epoch": 0.8594242902208202, "grad_norm": 0.48637468314860755, "learning_rate": 1.782231697475141e-05, "loss": 0.4429, "step": 4359 }, { "epoch": 0.8596214511041009, "grad_norm": 0.5419865357900749, "learning_rate": 1.7821351233795135e-05, "loss": 0.4511, "step": 4360 }, { "epoch": 0.8598186119873817, "grad_norm": 1.4858320044760873, "learning_rate": 1.7820385304924602e-05, "loss": 0.4831, "step": 4361 }, { "epoch": 0.8600157728706624, "grad_norm": 0.5226021841678236, "learning_rate": 1.7819419188163015e-05, "loss": 0.4268, "step": 4362 }, { "epoch": 0.8602129337539433, "grad_norm": 0.5958597819761025, "learning_rate": 1.7818452883533587e-05, "loss": 0.4404, "step": 4363 }, { "epoch": 0.860410094637224, "grad_norm": 0.5282803263222494, "learning_rate": 1.781748639105953e-05, "loss": 0.4351, "step": 4364 }, { "epoch": 0.8606072555205048, "grad_norm": 0.5691278859949257, "learning_rate": 1.7816519710764065e-05, "loss": 0.4288, "step": 4365 }, { "epoch": 0.8608044164037855, "grad_norm": 0.8711980455369287, "learning_rate": 1.7815552842670424e-05, "loss": 0.4175, "step": 4366 }, { "epoch": 0.8610015772870663, "grad_norm": 0.48580191160854, "learning_rate": 1.7814585786801826e-05, "loss": 0.4037, "step": 4367 }, { "epoch": 0.861198738170347, "grad_norm": 0.5079270568376504, "learning_rate": 1.7813618543181515e-05, "loss": 0.4153, "step": 4368 }, { "epoch": 0.8613958990536278, "grad_norm": 0.5060650682920903, "learning_rate": 1.781265111183273e-05, "loss": 0.4223, "step": 4369 }, { "epoch": 0.8615930599369085, "grad_norm": 0.5229358894402564, "learning_rate": 1.7811683492778704e-05, "loss": 0.4267, "step": 4370 }, { "epoch": 0.8617902208201893, "grad_norm": 0.5485854868559356, "learning_rate": 1.7810715686042694e-05, "loss": 0.4611, "step": 4371 }, { "epoch": 0.86198738170347, "grad_norm": 0.7504801541100838, "learning_rate": 1.7809747691647947e-05, "loss": 0.4411, "step": 4372 }, { "epoch": 0.8621845425867508, "grad_norm": 0.5652888768887478, "learning_rate": 1.7808779509617726e-05, "loss": 0.4544, "step": 4373 }, { "epoch": 0.8623817034700315, "grad_norm": 0.5229601149145549, "learning_rate": 1.7807811139975287e-05, "loss": 0.432, "step": 4374 }, { "epoch": 0.8625788643533123, "grad_norm": 0.5693809303118241, "learning_rate": 1.78068425827439e-05, "loss": 0.4531, "step": 4375 }, { "epoch": 0.862776025236593, "grad_norm": 0.5324905030720758, "learning_rate": 1.7805873837946833e-05, "loss": 0.4503, "step": 4376 }, { "epoch": 0.8629731861198738, "grad_norm": 0.5981637783741762, "learning_rate": 1.780490490560736e-05, "loss": 0.4517, "step": 4377 }, { "epoch": 0.8631703470031545, "grad_norm": 0.5398817339769384, "learning_rate": 1.7803935785748758e-05, "loss": 0.4098, "step": 4378 }, { "epoch": 0.8633675078864353, "grad_norm": 0.6324773282334193, "learning_rate": 1.7802966478394318e-05, "loss": 0.4814, "step": 4379 }, { "epoch": 0.863564668769716, "grad_norm": 0.5574386589246891, "learning_rate": 1.7801996983567325e-05, "loss": 0.446, "step": 4380 }, { "epoch": 0.8637618296529969, "grad_norm": 0.5038265095672974, "learning_rate": 1.780102730129107e-05, "loss": 0.4219, "step": 4381 }, { "epoch": 0.8639589905362776, "grad_norm": 0.5471006623991824, "learning_rate": 1.7800057431588852e-05, "loss": 0.4301, "step": 4382 }, { "epoch": 0.8641561514195584, "grad_norm": 0.562249614894374, "learning_rate": 1.7799087374483974e-05, "loss": 0.4629, "step": 4383 }, { "epoch": 0.8643533123028391, "grad_norm": 0.5815465050068923, "learning_rate": 1.7798117129999738e-05, "loss": 0.4361, "step": 4384 }, { "epoch": 0.8645504731861199, "grad_norm": 0.5771439496581714, "learning_rate": 1.779714669815946e-05, "loss": 0.4527, "step": 4385 }, { "epoch": 0.8647476340694006, "grad_norm": 0.5527264398916031, "learning_rate": 1.7796176078986458e-05, "loss": 0.4552, "step": 4386 }, { "epoch": 0.8649447949526814, "grad_norm": 0.564880759780719, "learning_rate": 1.7795205272504044e-05, "loss": 0.4528, "step": 4387 }, { "epoch": 0.8651419558359621, "grad_norm": 0.5384822591936045, "learning_rate": 1.7794234278735544e-05, "loss": 0.4476, "step": 4388 }, { "epoch": 0.8653391167192429, "grad_norm": 1.03147273350115, "learning_rate": 1.779326309770429e-05, "loss": 0.427, "step": 4389 }, { "epoch": 0.8655362776025236, "grad_norm": 0.5033337533381047, "learning_rate": 1.7792291729433615e-05, "loss": 0.422, "step": 4390 }, { "epoch": 0.8657334384858044, "grad_norm": 0.8142231011342511, "learning_rate": 1.7791320173946857e-05, "loss": 0.4298, "step": 4391 }, { "epoch": 0.8659305993690851, "grad_norm": 0.5307387314489819, "learning_rate": 1.7790348431267353e-05, "loss": 0.4549, "step": 4392 }, { "epoch": 0.8661277602523659, "grad_norm": 0.5806932313493961, "learning_rate": 1.7789376501418457e-05, "loss": 0.4667, "step": 4393 }, { "epoch": 0.8663249211356467, "grad_norm": 0.516262259612544, "learning_rate": 1.778840438442352e-05, "loss": 0.4056, "step": 4394 }, { "epoch": 0.8665220820189274, "grad_norm": 0.5715000920752781, "learning_rate": 1.7787432080305895e-05, "loss": 0.4327, "step": 4395 }, { "epoch": 0.8667192429022083, "grad_norm": 0.549118836810879, "learning_rate": 1.7786459589088942e-05, "loss": 0.4196, "step": 4396 }, { "epoch": 0.866916403785489, "grad_norm": 0.5185987495530604, "learning_rate": 1.778548691079603e-05, "loss": 0.3998, "step": 4397 }, { "epoch": 0.8671135646687698, "grad_norm": 0.594091330595555, "learning_rate": 1.7784514045450518e-05, "loss": 0.4366, "step": 4398 }, { "epoch": 0.8673107255520505, "grad_norm": 0.5512970340203216, "learning_rate": 1.7783540993075793e-05, "loss": 0.4249, "step": 4399 }, { "epoch": 0.8675078864353313, "grad_norm": 0.6495070525009865, "learning_rate": 1.7782567753695227e-05, "loss": 0.4518, "step": 4400 }, { "epoch": 0.867705047318612, "grad_norm": 0.5475014771103609, "learning_rate": 1.7781594327332203e-05, "loss": 0.4358, "step": 4401 }, { "epoch": 0.8679022082018928, "grad_norm": 0.5446824793376154, "learning_rate": 1.7780620714010108e-05, "loss": 0.4317, "step": 4402 }, { "epoch": 0.8680993690851735, "grad_norm": 0.5672459127571157, "learning_rate": 1.7779646913752334e-05, "loss": 0.4306, "step": 4403 }, { "epoch": 0.8682965299684543, "grad_norm": 0.5159309255044556, "learning_rate": 1.7778672926582277e-05, "loss": 0.4356, "step": 4404 }, { "epoch": 0.868493690851735, "grad_norm": 0.5706253535740993, "learning_rate": 1.777769875252334e-05, "loss": 0.4624, "step": 4405 }, { "epoch": 0.8686908517350158, "grad_norm": 0.5468735157960866, "learning_rate": 1.7776724391598928e-05, "loss": 0.4204, "step": 4406 }, { "epoch": 0.8688880126182965, "grad_norm": 0.6041214464176105, "learning_rate": 1.7775749843832454e-05, "loss": 0.4493, "step": 4407 }, { "epoch": 0.8690851735015773, "grad_norm": 0.5209820736307841, "learning_rate": 1.777477510924732e-05, "loss": 0.4042, "step": 4408 }, { "epoch": 0.869282334384858, "grad_norm": 0.5604674535426473, "learning_rate": 1.777380018786696e-05, "loss": 0.4276, "step": 4409 }, { "epoch": 0.8694794952681388, "grad_norm": 0.5079309439621922, "learning_rate": 1.7772825079714788e-05, "loss": 0.4476, "step": 4410 }, { "epoch": 0.8696766561514195, "grad_norm": 0.6476704699217136, "learning_rate": 1.7771849784814232e-05, "loss": 0.4578, "step": 4411 }, { "epoch": 0.8698738170347003, "grad_norm": 0.5924379738519122, "learning_rate": 1.7770874303188727e-05, "loss": 0.4311, "step": 4412 }, { "epoch": 0.870070977917981, "grad_norm": 0.524148406557284, "learning_rate": 1.776989863486171e-05, "loss": 0.389, "step": 4413 }, { "epoch": 0.8702681388012619, "grad_norm": 0.5327802595222653, "learning_rate": 1.776892277985662e-05, "loss": 0.4413, "step": 4414 }, { "epoch": 0.8704652996845426, "grad_norm": 0.62581900914729, "learning_rate": 1.7767946738196903e-05, "loss": 0.4319, "step": 4415 }, { "epoch": 0.8706624605678234, "grad_norm": 0.55431472515896, "learning_rate": 1.7766970509906014e-05, "loss": 0.4486, "step": 4416 }, { "epoch": 0.8708596214511041, "grad_norm": 0.5876652626007826, "learning_rate": 1.77659940950074e-05, "loss": 0.4344, "step": 4417 }, { "epoch": 0.8710567823343849, "grad_norm": 0.5551588366167218, "learning_rate": 1.7765017493524526e-05, "loss": 0.4568, "step": 4418 }, { "epoch": 0.8712539432176656, "grad_norm": 0.5251199442258818, "learning_rate": 1.776404070548085e-05, "loss": 0.444, "step": 4419 }, { "epoch": 0.8714511041009464, "grad_norm": 0.5098007039746633, "learning_rate": 1.7763063730899846e-05, "loss": 0.4078, "step": 4420 }, { "epoch": 0.8716482649842271, "grad_norm": 0.5270323086561973, "learning_rate": 1.776208656980499e-05, "loss": 0.4585, "step": 4421 }, { "epoch": 0.8718454258675079, "grad_norm": 0.575464987069899, "learning_rate": 1.7761109222219747e-05, "loss": 0.4532, "step": 4422 }, { "epoch": 0.8720425867507886, "grad_norm": 0.5312918974599236, "learning_rate": 1.7760131688167606e-05, "loss": 0.4247, "step": 4423 }, { "epoch": 0.8722397476340694, "grad_norm": 0.5487949802158028, "learning_rate": 1.775915396767205e-05, "loss": 0.4636, "step": 4424 }, { "epoch": 0.8724369085173501, "grad_norm": 0.6044316638894782, "learning_rate": 1.7758176060756572e-05, "loss": 0.4606, "step": 4425 }, { "epoch": 0.8726340694006309, "grad_norm": 0.5816796983719151, "learning_rate": 1.775719796744467e-05, "loss": 0.4585, "step": 4426 }, { "epoch": 0.8728312302839116, "grad_norm": 0.5731233568405086, "learning_rate": 1.775621968775984e-05, "loss": 0.4263, "step": 4427 }, { "epoch": 0.8730283911671924, "grad_norm": 0.596194849375308, "learning_rate": 1.7755241221725583e-05, "loss": 0.4631, "step": 4428 }, { "epoch": 0.8732255520504731, "grad_norm": 0.5635206736546373, "learning_rate": 1.7754262569365413e-05, "loss": 0.4385, "step": 4429 }, { "epoch": 0.873422712933754, "grad_norm": 0.5479175042231295, "learning_rate": 1.7753283730702837e-05, "loss": 0.431, "step": 4430 }, { "epoch": 0.8736198738170347, "grad_norm": 0.5279639808442017, "learning_rate": 1.7752304705761377e-05, "loss": 0.4326, "step": 4431 }, { "epoch": 0.8738170347003155, "grad_norm": 0.5194599602636989, "learning_rate": 1.7751325494564556e-05, "loss": 0.4215, "step": 4432 }, { "epoch": 0.8740141955835962, "grad_norm": 0.5615746974328736, "learning_rate": 1.7750346097135896e-05, "loss": 0.4653, "step": 4433 }, { "epoch": 0.874211356466877, "grad_norm": 0.5428620740818375, "learning_rate": 1.774936651349893e-05, "loss": 0.4014, "step": 4434 }, { "epoch": 0.8744085173501577, "grad_norm": 0.4968476884769345, "learning_rate": 1.774838674367719e-05, "loss": 0.4188, "step": 4435 }, { "epoch": 0.8746056782334385, "grad_norm": 0.5130271919066233, "learning_rate": 1.7747406787694222e-05, "loss": 0.4324, "step": 4436 }, { "epoch": 0.8748028391167192, "grad_norm": 0.7695088300581262, "learning_rate": 1.774642664557357e-05, "loss": 0.4302, "step": 4437 }, { "epoch": 0.875, "grad_norm": 0.5482718548743002, "learning_rate": 1.7745446317338773e-05, "loss": 0.4285, "step": 4438 }, { "epoch": 0.8751971608832808, "grad_norm": 0.5602436646595499, "learning_rate": 1.7744465803013394e-05, "loss": 0.4519, "step": 4439 }, { "epoch": 0.8753943217665615, "grad_norm": 0.6144891088977394, "learning_rate": 1.774348510262099e-05, "loss": 0.4275, "step": 4440 }, { "epoch": 0.8755914826498423, "grad_norm": 0.56029801618344, "learning_rate": 1.774250421618511e-05, "loss": 0.3993, "step": 4441 }, { "epoch": 0.875788643533123, "grad_norm": 0.5547908674306443, "learning_rate": 1.7741523143729344e-05, "loss": 0.4449, "step": 4442 }, { "epoch": 0.8759858044164038, "grad_norm": 0.5349957311996212, "learning_rate": 1.7740541885277243e-05, "loss": 0.4173, "step": 4443 }, { "epoch": 0.8761829652996845, "grad_norm": 0.5813115159456966, "learning_rate": 1.773956044085239e-05, "loss": 0.4598, "step": 4444 }, { "epoch": 0.8763801261829653, "grad_norm": 0.5607295785609846, "learning_rate": 1.773857881047837e-05, "loss": 0.4563, "step": 4445 }, { "epoch": 0.876577287066246, "grad_norm": 0.535106190739021, "learning_rate": 1.773759699417876e-05, "loss": 0.4743, "step": 4446 }, { "epoch": 0.8767744479495269, "grad_norm": 0.5125347386594389, "learning_rate": 1.773661499197715e-05, "loss": 0.4162, "step": 4447 }, { "epoch": 0.8769716088328076, "grad_norm": 0.49637878407034497, "learning_rate": 1.7735632803897135e-05, "loss": 0.4293, "step": 4448 }, { "epoch": 0.8771687697160884, "grad_norm": 0.5116773726384485, "learning_rate": 1.773465042996231e-05, "loss": 0.449, "step": 4449 }, { "epoch": 0.8773659305993691, "grad_norm": 0.5442212905587093, "learning_rate": 1.7733667870196282e-05, "loss": 0.433, "step": 4450 }, { "epoch": 0.8775630914826499, "grad_norm": 0.5755384800113009, "learning_rate": 1.7732685124622656e-05, "loss": 0.4539, "step": 4451 }, { "epoch": 0.8777602523659306, "grad_norm": 0.5372496581928871, "learning_rate": 1.773170219326504e-05, "loss": 0.436, "step": 4452 }, { "epoch": 0.8779574132492114, "grad_norm": 0.5189362257725963, "learning_rate": 1.7730719076147057e-05, "loss": 0.4433, "step": 4453 }, { "epoch": 0.8781545741324921, "grad_norm": 0.548560113980181, "learning_rate": 1.7729735773292322e-05, "loss": 0.4596, "step": 4454 }, { "epoch": 0.8783517350157729, "grad_norm": 0.5494525538977173, "learning_rate": 1.7728752284724454e-05, "loss": 0.4452, "step": 4455 }, { "epoch": 0.8785488958990536, "grad_norm": 0.5269179148603672, "learning_rate": 1.7727768610467097e-05, "loss": 0.4617, "step": 4456 }, { "epoch": 0.8787460567823344, "grad_norm": 0.6517832670327971, "learning_rate": 1.7726784750543867e-05, "loss": 0.4377, "step": 4457 }, { "epoch": 0.8789432176656151, "grad_norm": 0.5812600325676386, "learning_rate": 1.7725800704978416e-05, "loss": 0.4803, "step": 4458 }, { "epoch": 0.8791403785488959, "grad_norm": 0.5376422098313386, "learning_rate": 1.772481647379438e-05, "loss": 0.4467, "step": 4459 }, { "epoch": 0.8793375394321766, "grad_norm": 0.5404291511379831, "learning_rate": 1.7723832057015413e-05, "loss": 0.4662, "step": 4460 }, { "epoch": 0.8795347003154574, "grad_norm": 0.6308719185145436, "learning_rate": 1.7722847454665156e-05, "loss": 0.456, "step": 4461 }, { "epoch": 0.8797318611987381, "grad_norm": 0.5257173961466224, "learning_rate": 1.772186266676727e-05, "loss": 0.4427, "step": 4462 }, { "epoch": 0.879929022082019, "grad_norm": 0.6053907583720101, "learning_rate": 1.7720877693345414e-05, "loss": 0.4045, "step": 4463 }, { "epoch": 0.8801261829652997, "grad_norm": 0.6644310590594147, "learning_rate": 1.7719892534423255e-05, "loss": 0.4652, "step": 4464 }, { "epoch": 0.8803233438485805, "grad_norm": 0.5704624224624693, "learning_rate": 1.7718907190024462e-05, "loss": 0.4775, "step": 4465 }, { "epoch": 0.8805205047318612, "grad_norm": 0.5410836564451232, "learning_rate": 1.7717921660172708e-05, "loss": 0.4499, "step": 4466 }, { "epoch": 0.880717665615142, "grad_norm": 0.5228027698546309, "learning_rate": 1.771693594489167e-05, "loss": 0.4313, "step": 4467 }, { "epoch": 0.8809148264984227, "grad_norm": 0.5265433161743185, "learning_rate": 1.771595004420503e-05, "loss": 0.404, "step": 4468 }, { "epoch": 0.8811119873817035, "grad_norm": 0.5812483523416486, "learning_rate": 1.771496395813648e-05, "loss": 0.4359, "step": 4469 }, { "epoch": 0.8813091482649842, "grad_norm": 0.5446967314373959, "learning_rate": 1.7713977686709706e-05, "loss": 0.4476, "step": 4470 }, { "epoch": 0.881506309148265, "grad_norm": 0.5031631759781324, "learning_rate": 1.7712991229948405e-05, "loss": 0.3942, "step": 4471 }, { "epoch": 0.8817034700315457, "grad_norm": 0.5115052245818232, "learning_rate": 1.7712004587876278e-05, "loss": 0.4236, "step": 4472 }, { "epoch": 0.8819006309148265, "grad_norm": 0.5212693828848795, "learning_rate": 1.7711017760517033e-05, "loss": 0.4075, "step": 4473 }, { "epoch": 0.8820977917981072, "grad_norm": 0.5593095083849272, "learning_rate": 1.7710030747894375e-05, "loss": 0.4372, "step": 4474 }, { "epoch": 0.882294952681388, "grad_norm": 0.5103075891986539, "learning_rate": 1.770904355003202e-05, "loss": 0.424, "step": 4475 }, { "epoch": 0.8824921135646687, "grad_norm": 0.5391202190499896, "learning_rate": 1.7708056166953684e-05, "loss": 0.4861, "step": 4476 }, { "epoch": 0.8826892744479495, "grad_norm": 0.5138735176424362, "learning_rate": 1.7707068598683095e-05, "loss": 0.4461, "step": 4477 }, { "epoch": 0.8828864353312302, "grad_norm": 0.5261969184630167, "learning_rate": 1.7706080845243975e-05, "loss": 0.4413, "step": 4478 }, { "epoch": 0.883083596214511, "grad_norm": 0.5319173341858092, "learning_rate": 1.7705092906660054e-05, "loss": 0.4574, "step": 4479 }, { "epoch": 0.8832807570977917, "grad_norm": 0.5410201601688506, "learning_rate": 1.7704104782955074e-05, "loss": 0.4396, "step": 4480 }, { "epoch": 0.8834779179810726, "grad_norm": 0.5346889247807762, "learning_rate": 1.770311647415277e-05, "loss": 0.4465, "step": 4481 }, { "epoch": 0.8836750788643533, "grad_norm": 0.5216713453775305, "learning_rate": 1.7702127980276893e-05, "loss": 0.439, "step": 4482 }, { "epoch": 0.8838722397476341, "grad_norm": 0.5794595199940287, "learning_rate": 1.7701139301351187e-05, "loss": 0.4277, "step": 4483 }, { "epoch": 0.8840694006309149, "grad_norm": 0.5173700020686466, "learning_rate": 1.7700150437399405e-05, "loss": 0.4404, "step": 4484 }, { "epoch": 0.8842665615141956, "grad_norm": 0.7338285688890706, "learning_rate": 1.7699161388445313e-05, "loss": 0.4936, "step": 4485 }, { "epoch": 0.8844637223974764, "grad_norm": 0.49830492751234945, "learning_rate": 1.7698172154512666e-05, "loss": 0.4316, "step": 4486 }, { "epoch": 0.8846608832807571, "grad_norm": 0.5574600419347723, "learning_rate": 1.7697182735625233e-05, "loss": 0.448, "step": 4487 }, { "epoch": 0.8848580441640379, "grad_norm": 0.5158020909340794, "learning_rate": 1.7696193131806786e-05, "loss": 0.4157, "step": 4488 }, { "epoch": 0.8850552050473186, "grad_norm": 0.5156523644377932, "learning_rate": 1.76952033430811e-05, "loss": 0.4277, "step": 4489 }, { "epoch": 0.8852523659305994, "grad_norm": 0.4951200681741904, "learning_rate": 1.769421336947196e-05, "loss": 0.41, "step": 4490 }, { "epoch": 0.8854495268138801, "grad_norm": 0.5210924234785088, "learning_rate": 1.769322321100315e-05, "loss": 0.4321, "step": 4491 }, { "epoch": 0.8856466876971609, "grad_norm": 0.5546404645348809, "learning_rate": 1.769223286769845e-05, "loss": 0.46, "step": 4492 }, { "epoch": 0.8858438485804416, "grad_norm": 0.5116058758291511, "learning_rate": 1.7691242339581664e-05, "loss": 0.4255, "step": 4493 }, { "epoch": 0.8860410094637224, "grad_norm": 0.5259991490113677, "learning_rate": 1.769025162667659e-05, "loss": 0.4601, "step": 4494 }, { "epoch": 0.8862381703470031, "grad_norm": 0.6096603798487029, "learning_rate": 1.7689260729007025e-05, "loss": 0.4446, "step": 4495 }, { "epoch": 0.886435331230284, "grad_norm": 0.5502457929004166, "learning_rate": 1.768826964659678e-05, "loss": 0.4437, "step": 4496 }, { "epoch": 0.8866324921135647, "grad_norm": 0.4955351951661073, "learning_rate": 1.7687278379469665e-05, "loss": 0.4367, "step": 4497 }, { "epoch": 0.8868296529968455, "grad_norm": 0.48794238779718313, "learning_rate": 1.7686286927649493e-05, "loss": 0.3898, "step": 4498 }, { "epoch": 0.8870268138801262, "grad_norm": 0.5403525717354225, "learning_rate": 1.768529529116009e-05, "loss": 0.4431, "step": 4499 }, { "epoch": 0.887223974763407, "grad_norm": 0.5376973399491151, "learning_rate": 1.768430347002528e-05, "loss": 0.444, "step": 4500 }, { "epoch": 0.8874211356466877, "grad_norm": 0.5272976538103022, "learning_rate": 1.768331146426889e-05, "loss": 0.4236, "step": 4501 }, { "epoch": 0.8876182965299685, "grad_norm": 0.5344980125641151, "learning_rate": 1.7682319273914755e-05, "loss": 0.4426, "step": 4502 }, { "epoch": 0.8878154574132492, "grad_norm": 0.49953537905007334, "learning_rate": 1.7681326898986713e-05, "loss": 0.4209, "step": 4503 }, { "epoch": 0.88801261829653, "grad_norm": 0.5371135502177788, "learning_rate": 1.7680334339508604e-05, "loss": 0.4334, "step": 4504 }, { "epoch": 0.8882097791798107, "grad_norm": 0.5315742083699853, "learning_rate": 1.767934159550428e-05, "loss": 0.4259, "step": 4505 }, { "epoch": 0.8884069400630915, "grad_norm": 0.5965034470720473, "learning_rate": 1.767834866699759e-05, "loss": 0.4426, "step": 4506 }, { "epoch": 0.8886041009463722, "grad_norm": 0.5836671704413614, "learning_rate": 1.767735555401239e-05, "loss": 0.4287, "step": 4507 }, { "epoch": 0.888801261829653, "grad_norm": 0.5625765431157921, "learning_rate": 1.767636225657254e-05, "loss": 0.4459, "step": 4508 }, { "epoch": 0.8889984227129337, "grad_norm": 0.7370740412563955, "learning_rate": 1.7675368774701906e-05, "loss": 0.426, "step": 4509 }, { "epoch": 0.8891955835962145, "grad_norm": 0.5477180104542236, "learning_rate": 1.7674375108424354e-05, "loss": 0.4451, "step": 4510 }, { "epoch": 0.8893927444794952, "grad_norm": 0.5330930261842161, "learning_rate": 1.767338125776376e-05, "loss": 0.3926, "step": 4511 }, { "epoch": 0.889589905362776, "grad_norm": 0.5310418495745474, "learning_rate": 1.7672387222744e-05, "loss": 0.4434, "step": 4512 }, { "epoch": 0.8897870662460567, "grad_norm": 0.5341421770335281, "learning_rate": 1.7671393003388964e-05, "loss": 0.4476, "step": 4513 }, { "epoch": 0.8899842271293376, "grad_norm": 0.50982098147757, "learning_rate": 1.7670398599722533e-05, "loss": 0.4392, "step": 4514 }, { "epoch": 0.8901813880126183, "grad_norm": 0.6050964967523266, "learning_rate": 1.7669404011768596e-05, "loss": 0.466, "step": 4515 }, { "epoch": 0.8903785488958991, "grad_norm": 0.5082170155704829, "learning_rate": 1.766840923955105e-05, "loss": 0.3896, "step": 4516 }, { "epoch": 0.8905757097791798, "grad_norm": 0.542583849270664, "learning_rate": 1.76674142830938e-05, "loss": 0.4499, "step": 4517 }, { "epoch": 0.8907728706624606, "grad_norm": 0.549649515755227, "learning_rate": 1.7666419142420746e-05, "loss": 0.4205, "step": 4518 }, { "epoch": 0.8909700315457413, "grad_norm": 0.5575951769960029, "learning_rate": 1.76654238175558e-05, "loss": 0.4465, "step": 4519 }, { "epoch": 0.8911671924290221, "grad_norm": 0.5453782064529624, "learning_rate": 1.766442830852287e-05, "loss": 0.4769, "step": 4520 }, { "epoch": 0.8913643533123028, "grad_norm": 0.5101131437540427, "learning_rate": 1.766343261534588e-05, "loss": 0.4017, "step": 4521 }, { "epoch": 0.8915615141955836, "grad_norm": 0.6136158485523702, "learning_rate": 1.766243673804875e-05, "loss": 0.4098, "step": 4522 }, { "epoch": 0.8917586750788643, "grad_norm": 0.5292957632786092, "learning_rate": 1.7661440676655407e-05, "loss": 0.4223, "step": 4523 }, { "epoch": 0.8919558359621451, "grad_norm": 0.5540759577823543, "learning_rate": 1.766044443118978e-05, "loss": 0.4593, "step": 4524 }, { "epoch": 0.8921529968454258, "grad_norm": 0.5392743345482487, "learning_rate": 1.765944800167581e-05, "loss": 0.4235, "step": 4525 }, { "epoch": 0.8923501577287066, "grad_norm": 0.6346400393371832, "learning_rate": 1.7658451388137432e-05, "loss": 0.4231, "step": 4526 }, { "epoch": 0.8925473186119873, "grad_norm": 0.5201806359082028, "learning_rate": 1.7657454590598594e-05, "loss": 0.4375, "step": 4527 }, { "epoch": 0.8927444794952681, "grad_norm": 0.5175630311254862, "learning_rate": 1.765645760908324e-05, "loss": 0.4408, "step": 4528 }, { "epoch": 0.892941640378549, "grad_norm": 0.5592078904199146, "learning_rate": 1.7655460443615327e-05, "loss": 0.4714, "step": 4529 }, { "epoch": 0.8931388012618297, "grad_norm": 0.595148154874751, "learning_rate": 1.7654463094218813e-05, "loss": 0.4265, "step": 4530 }, { "epoch": 0.8933359621451105, "grad_norm": 0.5367589779182992, "learning_rate": 1.7653465560917656e-05, "loss": 0.4664, "step": 4531 }, { "epoch": 0.8935331230283912, "grad_norm": 0.5404275654069878, "learning_rate": 1.7652467843735828e-05, "loss": 0.4304, "step": 4532 }, { "epoch": 0.893730283911672, "grad_norm": 0.5426667005563901, "learning_rate": 1.7651469942697296e-05, "loss": 0.4187, "step": 4533 }, { "epoch": 0.8939274447949527, "grad_norm": 0.5355183086191119, "learning_rate": 1.7650471857826038e-05, "loss": 0.42, "step": 4534 }, { "epoch": 0.8941246056782335, "grad_norm": 0.5434987852881384, "learning_rate": 1.7649473589146032e-05, "loss": 0.4129, "step": 4535 }, { "epoch": 0.8943217665615142, "grad_norm": 0.49571415053957635, "learning_rate": 1.7648475136681265e-05, "loss": 0.4332, "step": 4536 }, { "epoch": 0.894518927444795, "grad_norm": 0.545085011797932, "learning_rate": 1.7647476500455723e-05, "loss": 0.4722, "step": 4537 }, { "epoch": 0.8947160883280757, "grad_norm": 0.5458717696446517, "learning_rate": 1.76464776804934e-05, "loss": 0.4437, "step": 4538 }, { "epoch": 0.8949132492113565, "grad_norm": 0.5198965233821335, "learning_rate": 1.764547867681829e-05, "loss": 0.4335, "step": 4539 }, { "epoch": 0.8951104100946372, "grad_norm": 0.5098927604754595, "learning_rate": 1.76444794894544e-05, "loss": 0.4281, "step": 4540 }, { "epoch": 0.895307570977918, "grad_norm": 0.5476572140939739, "learning_rate": 1.7643480118425733e-05, "loss": 0.4575, "step": 4541 }, { "epoch": 0.8955047318611987, "grad_norm": 0.5506996953424919, "learning_rate": 1.7642480563756305e-05, "loss": 0.4308, "step": 4542 }, { "epoch": 0.8957018927444795, "grad_norm": 0.5418834197677749, "learning_rate": 1.7641480825470123e-05, "loss": 0.3992, "step": 4543 }, { "epoch": 0.8958990536277602, "grad_norm": 0.5330741568842563, "learning_rate": 1.764048090359121e-05, "loss": 0.4314, "step": 4544 }, { "epoch": 0.896096214511041, "grad_norm": 0.5364576977743398, "learning_rate": 1.7639480798143593e-05, "loss": 0.4311, "step": 4545 }, { "epoch": 0.8962933753943217, "grad_norm": 0.5244339795995854, "learning_rate": 1.7638480509151297e-05, "loss": 0.4576, "step": 4546 }, { "epoch": 0.8964905362776026, "grad_norm": 0.563203046041122, "learning_rate": 1.7637480036638356e-05, "loss": 0.4828, "step": 4547 }, { "epoch": 0.8966876971608833, "grad_norm": 0.5070783658444626, "learning_rate": 1.7636479380628806e-05, "loss": 0.4226, "step": 4548 }, { "epoch": 0.8968848580441641, "grad_norm": 0.5595950137316498, "learning_rate": 1.7635478541146687e-05, "loss": 0.459, "step": 4549 }, { "epoch": 0.8970820189274448, "grad_norm": 0.5564160601048573, "learning_rate": 1.763447751821605e-05, "loss": 0.4069, "step": 4550 }, { "epoch": 0.8972791798107256, "grad_norm": 0.5397157852142757, "learning_rate": 1.763347631186094e-05, "loss": 0.4336, "step": 4551 }, { "epoch": 0.8974763406940063, "grad_norm": 0.5028947071543393, "learning_rate": 1.7632474922105416e-05, "loss": 0.4111, "step": 4552 }, { "epoch": 0.8976735015772871, "grad_norm": 0.6653704389356556, "learning_rate": 1.7631473348973537e-05, "loss": 0.4817, "step": 4553 }, { "epoch": 0.8978706624605678, "grad_norm": 0.6954841719181198, "learning_rate": 1.7630471592489366e-05, "loss": 0.4422, "step": 4554 }, { "epoch": 0.8980678233438486, "grad_norm": 0.5904197333916205, "learning_rate": 1.7629469652676965e-05, "loss": 0.4538, "step": 4555 }, { "epoch": 0.8982649842271293, "grad_norm": 1.260867911052856, "learning_rate": 1.7628467529560417e-05, "loss": 0.4514, "step": 4556 }, { "epoch": 0.8984621451104101, "grad_norm": 1.0289033380566197, "learning_rate": 1.762746522316379e-05, "loss": 0.4128, "step": 4557 }, { "epoch": 0.8986593059936908, "grad_norm": 0.5244592891802363, "learning_rate": 1.762646273351117e-05, "loss": 0.4338, "step": 4558 }, { "epoch": 0.8988564668769716, "grad_norm": 0.5044945072717523, "learning_rate": 1.7625460060626644e-05, "loss": 0.4117, "step": 4559 }, { "epoch": 0.8990536277602523, "grad_norm": 0.7264810983583718, "learning_rate": 1.7624457204534292e-05, "loss": 0.4583, "step": 4560 }, { "epoch": 0.8992507886435331, "grad_norm": 0.5588409715862622, "learning_rate": 1.762345416525822e-05, "loss": 0.4361, "step": 4561 }, { "epoch": 0.8994479495268138, "grad_norm": 0.4900029305965899, "learning_rate": 1.7622450942822524e-05, "loss": 0.4037, "step": 4562 }, { "epoch": 0.8996451104100947, "grad_norm": 0.6753821625040557, "learning_rate": 1.7621447537251307e-05, "loss": 0.4286, "step": 4563 }, { "epoch": 0.8998422712933754, "grad_norm": 0.5928729012070617, "learning_rate": 1.762044394856867e-05, "loss": 0.4391, "step": 4564 }, { "epoch": 0.9000394321766562, "grad_norm": 0.5403084220268555, "learning_rate": 1.7619440176798733e-05, "loss": 0.4305, "step": 4565 }, { "epoch": 0.9002365930599369, "grad_norm": 0.5023737487603344, "learning_rate": 1.761843622196561e-05, "loss": 0.432, "step": 4566 }, { "epoch": 0.9004337539432177, "grad_norm": 0.8214257640360723, "learning_rate": 1.7617432084093424e-05, "loss": 0.4732, "step": 4567 }, { "epoch": 0.9006309148264984, "grad_norm": 0.6157040111093859, "learning_rate": 1.7616427763206294e-05, "loss": 0.4106, "step": 4568 }, { "epoch": 0.9008280757097792, "grad_norm": 0.6386520150246892, "learning_rate": 1.7615423259328356e-05, "loss": 0.4317, "step": 4569 }, { "epoch": 0.9010252365930599, "grad_norm": 0.5233550893573012, "learning_rate": 1.761441857248374e-05, "loss": 0.428, "step": 4570 }, { "epoch": 0.9012223974763407, "grad_norm": 0.5816027891775581, "learning_rate": 1.7613413702696584e-05, "loss": 0.4624, "step": 4571 }, { "epoch": 0.9014195583596214, "grad_norm": 0.5226310374487503, "learning_rate": 1.7612408649991037e-05, "loss": 0.4252, "step": 4572 }, { "epoch": 0.9016167192429022, "grad_norm": 0.7039706825623913, "learning_rate": 1.761140341439124e-05, "loss": 0.4465, "step": 4573 }, { "epoch": 0.901813880126183, "grad_norm": 0.6335577419820779, "learning_rate": 1.7610397995921348e-05, "loss": 0.4193, "step": 4574 }, { "epoch": 0.9020110410094637, "grad_norm": 0.5647597040461998, "learning_rate": 1.760939239460551e-05, "loss": 0.4354, "step": 4575 }, { "epoch": 0.9022082018927445, "grad_norm": 0.5628152685174825, "learning_rate": 1.7608386610467898e-05, "loss": 0.4757, "step": 4576 }, { "epoch": 0.9024053627760252, "grad_norm": 3.2929032363930917, "learning_rate": 1.7607380643532667e-05, "loss": 0.4282, "step": 4577 }, { "epoch": 0.902602523659306, "grad_norm": 0.7504633569369734, "learning_rate": 1.7606374493823993e-05, "loss": 0.4495, "step": 4578 }, { "epoch": 0.9027996845425867, "grad_norm": 0.5656614412611999, "learning_rate": 1.7605368161366043e-05, "loss": 0.4242, "step": 4579 }, { "epoch": 0.9029968454258676, "grad_norm": 0.5437572557240907, "learning_rate": 1.7604361646183004e-05, "loss": 0.4365, "step": 4580 }, { "epoch": 0.9031940063091483, "grad_norm": 0.5704886873684303, "learning_rate": 1.760335494829905e-05, "loss": 0.4378, "step": 4581 }, { "epoch": 0.9033911671924291, "grad_norm": 0.5670944223065693, "learning_rate": 1.7602348067738367e-05, "loss": 0.4222, "step": 4582 }, { "epoch": 0.9035883280757098, "grad_norm": 0.571465040387628, "learning_rate": 1.760134100452515e-05, "loss": 0.4459, "step": 4583 }, { "epoch": 0.9037854889589906, "grad_norm": 0.5411774398722019, "learning_rate": 1.7600333758683598e-05, "loss": 0.4373, "step": 4584 }, { "epoch": 0.9039826498422713, "grad_norm": 0.5164990496084579, "learning_rate": 1.7599326330237906e-05, "loss": 0.3954, "step": 4585 }, { "epoch": 0.9041798107255521, "grad_norm": 0.5954434300630094, "learning_rate": 1.7598318719212274e-05, "loss": 0.4541, "step": 4586 }, { "epoch": 0.9043769716088328, "grad_norm": 0.5629588725994905, "learning_rate": 1.7597310925630922e-05, "loss": 0.4696, "step": 4587 }, { "epoch": 0.9045741324921136, "grad_norm": 1.0533510364008871, "learning_rate": 1.7596302949518054e-05, "loss": 0.4087, "step": 4588 }, { "epoch": 0.9047712933753943, "grad_norm": 0.5848783608010563, "learning_rate": 1.759529479089789e-05, "loss": 0.4562, "step": 4589 }, { "epoch": 0.9049684542586751, "grad_norm": 0.5359425519217931, "learning_rate": 1.7594286449794655e-05, "loss": 0.4139, "step": 4590 }, { "epoch": 0.9051656151419558, "grad_norm": 0.5766734785111527, "learning_rate": 1.759327792623257e-05, "loss": 0.4232, "step": 4591 }, { "epoch": 0.9053627760252366, "grad_norm": 0.5120340477249785, "learning_rate": 1.759226922023587e-05, "loss": 0.4266, "step": 4592 }, { "epoch": 0.9055599369085173, "grad_norm": 0.5398891426274068, "learning_rate": 1.7591260331828785e-05, "loss": 0.4179, "step": 4593 }, { "epoch": 0.9057570977917981, "grad_norm": 0.5184539964235813, "learning_rate": 1.759025126103556e-05, "loss": 0.4125, "step": 4594 }, { "epoch": 0.9059542586750788, "grad_norm": 0.6029467871180186, "learning_rate": 1.7589242007880435e-05, "loss": 0.4991, "step": 4595 }, { "epoch": 0.9061514195583596, "grad_norm": 0.5757959829050494, "learning_rate": 1.7588232572387657e-05, "loss": 0.4294, "step": 4596 }, { "epoch": 0.9063485804416404, "grad_norm": 0.5612525679026711, "learning_rate": 1.7587222954581483e-05, "loss": 0.4779, "step": 4597 }, { "epoch": 0.9065457413249212, "grad_norm": 0.5047317331482842, "learning_rate": 1.758621315448617e-05, "loss": 0.3938, "step": 4598 }, { "epoch": 0.9067429022082019, "grad_norm": 0.5667514613061159, "learning_rate": 1.7585203172125972e-05, "loss": 0.4599, "step": 4599 }, { "epoch": 0.9069400630914827, "grad_norm": 0.5293002056349737, "learning_rate": 1.758419300752516e-05, "loss": 0.4219, "step": 4600 }, { "epoch": 0.9071372239747634, "grad_norm": 0.5080066799539775, "learning_rate": 1.758318266070801e-05, "loss": 0.3933, "step": 4601 }, { "epoch": 0.9073343848580442, "grad_norm": 0.5576817270172586, "learning_rate": 1.758217213169878e-05, "loss": 0.4389, "step": 4602 }, { "epoch": 0.9075315457413249, "grad_norm": 0.5264486271047948, "learning_rate": 1.7581161420521765e-05, "loss": 0.4522, "step": 4603 }, { "epoch": 0.9077287066246057, "grad_norm": 0.5894471822557419, "learning_rate": 1.758015052720124e-05, "loss": 0.4453, "step": 4604 }, { "epoch": 0.9079258675078864, "grad_norm": 0.5534225411257014, "learning_rate": 1.7579139451761495e-05, "loss": 0.4532, "step": 4605 }, { "epoch": 0.9081230283911672, "grad_norm": 0.5369235116741078, "learning_rate": 1.7578128194226823e-05, "loss": 0.4251, "step": 4606 }, { "epoch": 0.9083201892744479, "grad_norm": 0.4830071277784729, "learning_rate": 1.7577116754621512e-05, "loss": 0.4189, "step": 4607 }, { "epoch": 0.9085173501577287, "grad_norm": 0.7383285750265494, "learning_rate": 1.7576105132969874e-05, "loss": 0.4759, "step": 4608 }, { "epoch": 0.9087145110410094, "grad_norm": 0.5395821153111766, "learning_rate": 1.757509332929621e-05, "loss": 0.4595, "step": 4609 }, { "epoch": 0.9089116719242902, "grad_norm": 0.5971105354676437, "learning_rate": 1.7574081343624827e-05, "loss": 0.4501, "step": 4610 }, { "epoch": 0.9091088328075709, "grad_norm": 0.5322422724423118, "learning_rate": 1.757306917598004e-05, "loss": 0.4432, "step": 4611 }, { "epoch": 0.9093059936908517, "grad_norm": 0.6382374433003123, "learning_rate": 1.7572056826386167e-05, "loss": 0.4918, "step": 4612 }, { "epoch": 0.9095031545741324, "grad_norm": 1.0874753951271274, "learning_rate": 1.7571044294867533e-05, "loss": 0.4519, "step": 4613 }, { "epoch": 0.9097003154574133, "grad_norm": 1.0844704783301853, "learning_rate": 1.757003158144846e-05, "loss": 0.4352, "step": 4614 }, { "epoch": 0.909897476340694, "grad_norm": 0.522978838373464, "learning_rate": 1.7569018686153286e-05, "loss": 0.4233, "step": 4615 }, { "epoch": 0.9100946372239748, "grad_norm": 0.5933248972486103, "learning_rate": 1.756800560900634e-05, "loss": 0.4419, "step": 4616 }, { "epoch": 0.9102917981072555, "grad_norm": 1.216489202161031, "learning_rate": 1.7566992350031965e-05, "loss": 0.4741, "step": 4617 }, { "epoch": 0.9104889589905363, "grad_norm": 0.5819684520560912, "learning_rate": 1.7565978909254508e-05, "loss": 0.4278, "step": 4618 }, { "epoch": 0.910686119873817, "grad_norm": 0.580755399529423, "learning_rate": 1.756496528669831e-05, "loss": 0.4708, "step": 4619 }, { "epoch": 0.9108832807570978, "grad_norm": 0.5363608983821153, "learning_rate": 1.7563951482387733e-05, "loss": 0.4008, "step": 4620 }, { "epoch": 0.9110804416403786, "grad_norm": 0.5949632674163584, "learning_rate": 1.7562937496347126e-05, "loss": 0.4663, "step": 4621 }, { "epoch": 0.9112776025236593, "grad_norm": 0.7340546915226606, "learning_rate": 1.756192332860086e-05, "loss": 0.4453, "step": 4622 }, { "epoch": 0.9114747634069401, "grad_norm": 0.5539458382888915, "learning_rate": 1.7560908979173294e-05, "loss": 0.4457, "step": 4623 }, { "epoch": 0.9116719242902208, "grad_norm": 0.5614114939363166, "learning_rate": 1.7559894448088802e-05, "loss": 0.4023, "step": 4624 }, { "epoch": 0.9118690851735016, "grad_norm": 0.5710583292679061, "learning_rate": 1.7558879735371753e-05, "loss": 0.4688, "step": 4625 }, { "epoch": 0.9120662460567823, "grad_norm": 0.5550240437903982, "learning_rate": 1.755786484104654e-05, "loss": 0.3906, "step": 4626 }, { "epoch": 0.9122634069400631, "grad_norm": 0.4837275585777602, "learning_rate": 1.755684976513753e-05, "loss": 0.3919, "step": 4627 }, { "epoch": 0.9124605678233438, "grad_norm": 0.5844614371573156, "learning_rate": 1.7555834507669124e-05, "loss": 0.4086, "step": 4628 }, { "epoch": 0.9126577287066246, "grad_norm": 0.5180930399677969, "learning_rate": 1.7554819068665707e-05, "loss": 0.4039, "step": 4629 }, { "epoch": 0.9128548895899053, "grad_norm": 0.5819268556226904, "learning_rate": 1.7553803448151678e-05, "loss": 0.4102, "step": 4630 }, { "epoch": 0.9130520504731862, "grad_norm": 0.5158313726358502, "learning_rate": 1.755278764615144e-05, "loss": 0.4266, "step": 4631 }, { "epoch": 0.9132492113564669, "grad_norm": 0.5930149183798764, "learning_rate": 1.7551771662689393e-05, "loss": 0.4468, "step": 4632 }, { "epoch": 0.9134463722397477, "grad_norm": 0.5517958419973356, "learning_rate": 1.7550755497789955e-05, "loss": 0.4381, "step": 4633 }, { "epoch": 0.9136435331230284, "grad_norm": 0.6790482699026756, "learning_rate": 1.754973915147753e-05, "loss": 0.4301, "step": 4634 }, { "epoch": 0.9138406940063092, "grad_norm": 0.5451328047356113, "learning_rate": 1.7548722623776547e-05, "loss": 0.455, "step": 4635 }, { "epoch": 0.9140378548895899, "grad_norm": 0.5153803270688035, "learning_rate": 1.754770591471142e-05, "loss": 0.4178, "step": 4636 }, { "epoch": 0.9142350157728707, "grad_norm": 0.5355618034950106, "learning_rate": 1.7546689024306585e-05, "loss": 0.4328, "step": 4637 }, { "epoch": 0.9144321766561514, "grad_norm": 0.5793723146542517, "learning_rate": 1.7545671952586464e-05, "loss": 0.4498, "step": 4638 }, { "epoch": 0.9146293375394322, "grad_norm": 0.49690302528171043, "learning_rate": 1.75446546995755e-05, "loss": 0.4184, "step": 4639 }, { "epoch": 0.9148264984227129, "grad_norm": 0.5093059262044622, "learning_rate": 1.7543637265298136e-05, "loss": 0.3955, "step": 4640 }, { "epoch": 0.9150236593059937, "grad_norm": 0.5592098323156065, "learning_rate": 1.7542619649778804e-05, "loss": 0.4376, "step": 4641 }, { "epoch": 0.9152208201892744, "grad_norm": 0.5525412638322774, "learning_rate": 1.7541601853041963e-05, "loss": 0.4554, "step": 4642 }, { "epoch": 0.9154179810725552, "grad_norm": 0.5262471397287505, "learning_rate": 1.7540583875112065e-05, "loss": 0.4337, "step": 4643 }, { "epoch": 0.9156151419558359, "grad_norm": 0.5235757919665939, "learning_rate": 1.753956571601357e-05, "loss": 0.4335, "step": 4644 }, { "epoch": 0.9158123028391167, "grad_norm": 0.5494919507343168, "learning_rate": 1.7538547375770934e-05, "loss": 0.4198, "step": 4645 }, { "epoch": 0.9160094637223974, "grad_norm": 0.5155811541332505, "learning_rate": 1.7537528854408625e-05, "loss": 0.4006, "step": 4646 }, { "epoch": 0.9162066246056783, "grad_norm": 0.619279355812408, "learning_rate": 1.753651015195112e-05, "loss": 0.4524, "step": 4647 }, { "epoch": 0.916403785488959, "grad_norm": 0.5234709511842155, "learning_rate": 1.7535491268422885e-05, "loss": 0.4365, "step": 4648 }, { "epoch": 0.9166009463722398, "grad_norm": 0.5760967298774721, "learning_rate": 1.7534472203848402e-05, "loss": 0.421, "step": 4649 }, { "epoch": 0.9167981072555205, "grad_norm": 0.5744984868953806, "learning_rate": 1.7533452958252164e-05, "loss": 0.4393, "step": 4650 }, { "epoch": 0.9169952681388013, "grad_norm": 0.5325399090464799, "learning_rate": 1.7532433531658646e-05, "loss": 0.4407, "step": 4651 }, { "epoch": 0.917192429022082, "grad_norm": 0.5340699591886052, "learning_rate": 1.7531413924092347e-05, "loss": 0.4295, "step": 4652 }, { "epoch": 0.9173895899053628, "grad_norm": 0.5494041102557055, "learning_rate": 1.7530394135577768e-05, "loss": 0.4489, "step": 4653 }, { "epoch": 0.9175867507886435, "grad_norm": 0.5182867692860177, "learning_rate": 1.75293741661394e-05, "loss": 0.4204, "step": 4654 }, { "epoch": 0.9177839116719243, "grad_norm": 0.5728288091111668, "learning_rate": 1.752835401580176e-05, "loss": 0.4238, "step": 4655 }, { "epoch": 0.917981072555205, "grad_norm": 0.49589426702980033, "learning_rate": 1.752733368458935e-05, "loss": 0.4539, "step": 4656 }, { "epoch": 0.9181782334384858, "grad_norm": 0.48458956489019156, "learning_rate": 1.7526313172526687e-05, "loss": 0.3936, "step": 4657 }, { "epoch": 0.9183753943217665, "grad_norm": 0.5274678434361841, "learning_rate": 1.7525292479638286e-05, "loss": 0.4484, "step": 4658 }, { "epoch": 0.9185725552050473, "grad_norm": 0.5211344047356368, "learning_rate": 1.7524271605948676e-05, "loss": 0.4461, "step": 4659 }, { "epoch": 0.918769716088328, "grad_norm": 0.511296727944466, "learning_rate": 1.752325055148238e-05, "loss": 0.4185, "step": 4660 }, { "epoch": 0.9189668769716088, "grad_norm": 0.5647095561867232, "learning_rate": 1.752222931626393e-05, "loss": 0.4473, "step": 4661 }, { "epoch": 0.9191640378548895, "grad_norm": 0.513631356257729, "learning_rate": 1.7521207900317866e-05, "loss": 0.4403, "step": 4662 }, { "epoch": 0.9193611987381703, "grad_norm": 0.5155278543678777, "learning_rate": 1.7520186303668722e-05, "loss": 0.4393, "step": 4663 }, { "epoch": 0.919558359621451, "grad_norm": 0.5019934238237479, "learning_rate": 1.751916452634105e-05, "loss": 0.4006, "step": 4664 }, { "epoch": 0.9197555205047319, "grad_norm": 0.5133536326390901, "learning_rate": 1.7518142568359395e-05, "loss": 0.4422, "step": 4665 }, { "epoch": 0.9199526813880127, "grad_norm": 1.1425170395813518, "learning_rate": 1.7517120429748305e-05, "loss": 0.4254, "step": 4666 }, { "epoch": 0.9201498422712934, "grad_norm": 0.5062808556522904, "learning_rate": 1.751609811053235e-05, "loss": 0.4093, "step": 4667 }, { "epoch": 0.9203470031545742, "grad_norm": 0.5329727124070021, "learning_rate": 1.7515075610736077e-05, "loss": 0.445, "step": 4668 }, { "epoch": 0.9205441640378549, "grad_norm": 0.5773802180358756, "learning_rate": 1.751405293038407e-05, "loss": 0.4348, "step": 4669 }, { "epoch": 0.9207413249211357, "grad_norm": 0.5692775257801391, "learning_rate": 1.7513030069500885e-05, "loss": 0.4427, "step": 4670 }, { "epoch": 0.9209384858044164, "grad_norm": 0.5036100073808859, "learning_rate": 1.7512007028111103e-05, "loss": 0.4376, "step": 4671 }, { "epoch": 0.9211356466876972, "grad_norm": 0.5271798471163804, "learning_rate": 1.75109838062393e-05, "loss": 0.4504, "step": 4672 }, { "epoch": 0.9213328075709779, "grad_norm": 0.5357814969525453, "learning_rate": 1.750996040391007e-05, "loss": 0.4528, "step": 4673 }, { "epoch": 0.9215299684542587, "grad_norm": 0.5702199988441526, "learning_rate": 1.7508936821147986e-05, "loss": 0.453, "step": 4674 }, { "epoch": 0.9217271293375394, "grad_norm": 0.5404783434059391, "learning_rate": 1.750791305797765e-05, "loss": 0.4315, "step": 4675 }, { "epoch": 0.9219242902208202, "grad_norm": 0.5084713263493303, "learning_rate": 1.7506889114423658e-05, "loss": 0.4159, "step": 4676 }, { "epoch": 0.9221214511041009, "grad_norm": 0.5201149594816759, "learning_rate": 1.750586499051061e-05, "loss": 0.4029, "step": 4677 }, { "epoch": 0.9223186119873817, "grad_norm": 0.5436352106831215, "learning_rate": 1.750484068626311e-05, "loss": 0.4239, "step": 4678 }, { "epoch": 0.9225157728706624, "grad_norm": 0.5231124883960365, "learning_rate": 1.7503816201705772e-05, "loss": 0.4557, "step": 4679 }, { "epoch": 0.9227129337539433, "grad_norm": 0.5377568165510164, "learning_rate": 1.75027915368632e-05, "loss": 0.4332, "step": 4680 }, { "epoch": 0.922910094637224, "grad_norm": 0.49556159503049685, "learning_rate": 1.7501766691760027e-05, "loss": 0.3759, "step": 4681 }, { "epoch": 0.9231072555205048, "grad_norm": 0.5366737464738414, "learning_rate": 1.7500741666420863e-05, "loss": 0.4272, "step": 4682 }, { "epoch": 0.9233044164037855, "grad_norm": 0.5333368284829383, "learning_rate": 1.749971646087034e-05, "loss": 0.4893, "step": 4683 }, { "epoch": 0.9235015772870663, "grad_norm": 0.5191084944970333, "learning_rate": 1.7498691075133094e-05, "loss": 0.4537, "step": 4684 }, { "epoch": 0.923698738170347, "grad_norm": 0.4874538317773819, "learning_rate": 1.7497665509233753e-05, "loss": 0.4073, "step": 4685 }, { "epoch": 0.9238958990536278, "grad_norm": 0.5841148468262191, "learning_rate": 1.7496639763196965e-05, "loss": 0.4568, "step": 4686 }, { "epoch": 0.9240930599369085, "grad_norm": 0.46497152647935325, "learning_rate": 1.7495613837047362e-05, "loss": 0.3768, "step": 4687 }, { "epoch": 0.9242902208201893, "grad_norm": 0.6951617732456002, "learning_rate": 1.7494587730809603e-05, "loss": 0.4369, "step": 4688 }, { "epoch": 0.92448738170347, "grad_norm": 0.5439225603081767, "learning_rate": 1.749356144450834e-05, "loss": 0.4204, "step": 4689 }, { "epoch": 0.9246845425867508, "grad_norm": 0.5545139274444996, "learning_rate": 1.749253497816823e-05, "loss": 0.4533, "step": 4690 }, { "epoch": 0.9248817034700315, "grad_norm": 0.5645877878868146, "learning_rate": 1.7491508331813928e-05, "loss": 0.4588, "step": 4691 }, { "epoch": 0.9250788643533123, "grad_norm": 0.5001014226230943, "learning_rate": 1.7490481505470112e-05, "loss": 0.413, "step": 4692 }, { "epoch": 0.925276025236593, "grad_norm": 0.5340731127970042, "learning_rate": 1.748945449916144e-05, "loss": 0.4355, "step": 4693 }, { "epoch": 0.9254731861198738, "grad_norm": 0.4812452465843482, "learning_rate": 1.7488427312912596e-05, "loss": 0.4345, "step": 4694 }, { "epoch": 0.9256703470031545, "grad_norm": 1.0890928667586521, "learning_rate": 1.7487399946748253e-05, "loss": 0.4734, "step": 4695 }, { "epoch": 0.9258675078864353, "grad_norm": 0.5505478980835827, "learning_rate": 1.74863724006931e-05, "loss": 0.4808, "step": 4696 }, { "epoch": 0.926064668769716, "grad_norm": 0.5008122123586376, "learning_rate": 1.7485344674771817e-05, "loss": 0.4225, "step": 4697 }, { "epoch": 0.9262618296529969, "grad_norm": 0.6702157966102137, "learning_rate": 1.7484316769009105e-05, "loss": 0.4698, "step": 4698 }, { "epoch": 0.9264589905362776, "grad_norm": 0.5272899289334974, "learning_rate": 1.7483288683429655e-05, "loss": 0.414, "step": 4699 }, { "epoch": 0.9266561514195584, "grad_norm": 0.5858391308330164, "learning_rate": 1.7482260418058167e-05, "loss": 0.4288, "step": 4700 }, { "epoch": 0.9268533123028391, "grad_norm": 0.5833034743596007, "learning_rate": 1.7481231972919346e-05, "loss": 0.4879, "step": 4701 }, { "epoch": 0.9270504731861199, "grad_norm": 0.5320561242621449, "learning_rate": 1.74802033480379e-05, "loss": 0.4144, "step": 4702 }, { "epoch": 0.9272476340694006, "grad_norm": 0.5387587973540497, "learning_rate": 1.7479174543438547e-05, "loss": 0.4487, "step": 4703 }, { "epoch": 0.9274447949526814, "grad_norm": 0.5997202359893333, "learning_rate": 1.7478145559146002e-05, "loss": 0.4796, "step": 4704 }, { "epoch": 0.9276419558359621, "grad_norm": 0.5857148110448572, "learning_rate": 1.747711639518499e-05, "loss": 0.4235, "step": 4705 }, { "epoch": 0.9278391167192429, "grad_norm": 0.5798656081956509, "learning_rate": 1.7476087051580235e-05, "loss": 0.4518, "step": 4706 }, { "epoch": 0.9280362776025236, "grad_norm": 0.5831794755793467, "learning_rate": 1.7475057528356466e-05, "loss": 0.4613, "step": 4707 }, { "epoch": 0.9282334384858044, "grad_norm": 0.5068716865871977, "learning_rate": 1.7474027825538422e-05, "loss": 0.4025, "step": 4708 }, { "epoch": 0.9284305993690851, "grad_norm": 0.5949150495584267, "learning_rate": 1.747299794315084e-05, "loss": 0.4116, "step": 4709 }, { "epoch": 0.9286277602523659, "grad_norm": 0.5125219184067067, "learning_rate": 1.7471967881218466e-05, "loss": 0.4338, "step": 4710 }, { "epoch": 0.9288249211356467, "grad_norm": 0.5270990266912229, "learning_rate": 1.7470937639766042e-05, "loss": 0.4209, "step": 4711 }, { "epoch": 0.9290220820189274, "grad_norm": 0.5495969435777367, "learning_rate": 1.746990721881833e-05, "loss": 0.4161, "step": 4712 }, { "epoch": 0.9292192429022083, "grad_norm": 0.519716074091138, "learning_rate": 1.746887661840008e-05, "loss": 0.4226, "step": 4713 }, { "epoch": 0.929416403785489, "grad_norm": 0.5108308327628213, "learning_rate": 1.7467845838536054e-05, "loss": 0.4335, "step": 4714 }, { "epoch": 0.9296135646687698, "grad_norm": 4.264832132353777, "learning_rate": 1.746681487925102e-05, "loss": 0.5218, "step": 4715 }, { "epoch": 0.9298107255520505, "grad_norm": 0.6179886893181556, "learning_rate": 1.746578374056974e-05, "loss": 0.4544, "step": 4716 }, { "epoch": 0.9300078864353313, "grad_norm": 0.5304906705707955, "learning_rate": 1.7464752422516996e-05, "loss": 0.4232, "step": 4717 }, { "epoch": 0.930205047318612, "grad_norm": 0.5331325354477906, "learning_rate": 1.7463720925117565e-05, "loss": 0.3829, "step": 4718 }, { "epoch": 0.9304022082018928, "grad_norm": 0.5765730607350507, "learning_rate": 1.7462689248396228e-05, "loss": 0.4317, "step": 4719 }, { "epoch": 0.9305993690851735, "grad_norm": 0.5313090247910929, "learning_rate": 1.7461657392377772e-05, "loss": 0.4628, "step": 4720 }, { "epoch": 0.9307965299684543, "grad_norm": 0.5604970128075755, "learning_rate": 1.746062535708699e-05, "loss": 0.4345, "step": 4721 }, { "epoch": 0.930993690851735, "grad_norm": 0.6273618716282534, "learning_rate": 1.7459593142548674e-05, "loss": 0.4068, "step": 4722 }, { "epoch": 0.9311908517350158, "grad_norm": 0.5829766636894154, "learning_rate": 1.7458560748787625e-05, "loss": 0.4577, "step": 4723 }, { "epoch": 0.9313880126182965, "grad_norm": 0.6025168414566998, "learning_rate": 1.7457528175828648e-05, "loss": 0.4054, "step": 4724 }, { "epoch": 0.9315851735015773, "grad_norm": 1.0019218230150604, "learning_rate": 1.745649542369655e-05, "loss": 0.4536, "step": 4725 }, { "epoch": 0.931782334384858, "grad_norm": 0.5862918544220463, "learning_rate": 1.7455462492416148e-05, "loss": 0.4495, "step": 4726 }, { "epoch": 0.9319794952681388, "grad_norm": 0.5461382110764923, "learning_rate": 1.7454429382012255e-05, "loss": 0.4391, "step": 4727 }, { "epoch": 0.9321766561514195, "grad_norm": 0.5936092061621204, "learning_rate": 1.745339609250969e-05, "loss": 0.4482, "step": 4728 }, { "epoch": 0.9323738170347003, "grad_norm": 0.5346698342600564, "learning_rate": 1.7452362623933283e-05, "loss": 0.4358, "step": 4729 }, { "epoch": 0.932570977917981, "grad_norm": 0.5604545243460433, "learning_rate": 1.7451328976307864e-05, "loss": 0.4072, "step": 4730 }, { "epoch": 0.9327681388012619, "grad_norm": 0.5624845340732466, "learning_rate": 1.7450295149658265e-05, "loss": 0.4933, "step": 4731 }, { "epoch": 0.9329652996845426, "grad_norm": 0.5321758225204737, "learning_rate": 1.7449261144009325e-05, "loss": 0.4828, "step": 4732 }, { "epoch": 0.9331624605678234, "grad_norm": 0.5442798225790718, "learning_rate": 1.744822695938589e-05, "loss": 0.4275, "step": 4733 }, { "epoch": 0.9333596214511041, "grad_norm": 2.0994490161932124, "learning_rate": 1.74471925958128e-05, "loss": 0.4325, "step": 4734 }, { "epoch": 0.9335567823343849, "grad_norm": 0.5447286623554133, "learning_rate": 1.744615805331491e-05, "loss": 0.4577, "step": 4735 }, { "epoch": 0.9337539432176656, "grad_norm": 1.0389460335213794, "learning_rate": 1.744512333191708e-05, "loss": 0.4268, "step": 4736 }, { "epoch": 0.9339511041009464, "grad_norm": 0.5585776839463066, "learning_rate": 1.7444088431644166e-05, "loss": 0.4436, "step": 4737 }, { "epoch": 0.9341482649842271, "grad_norm": 0.5415927198264985, "learning_rate": 1.7443053352521032e-05, "loss": 0.4297, "step": 4738 }, { "epoch": 0.9343454258675079, "grad_norm": 0.7020791964781086, "learning_rate": 1.7442018094572546e-05, "loss": 0.3968, "step": 4739 }, { "epoch": 0.9345425867507886, "grad_norm": 0.5293642076390404, "learning_rate": 1.7440982657823583e-05, "loss": 0.4414, "step": 4740 }, { "epoch": 0.9347397476340694, "grad_norm": 0.5699485347614668, "learning_rate": 1.743994704229902e-05, "loss": 0.4452, "step": 4741 }, { "epoch": 0.9349369085173501, "grad_norm": 0.9714674676297455, "learning_rate": 1.743891124802374e-05, "loss": 0.4315, "step": 4742 }, { "epoch": 0.9351340694006309, "grad_norm": 0.5220962127597865, "learning_rate": 1.7437875275022622e-05, "loss": 0.4145, "step": 4743 }, { "epoch": 0.9353312302839116, "grad_norm": 0.538501745627749, "learning_rate": 1.7436839123320566e-05, "loss": 0.46, "step": 4744 }, { "epoch": 0.9355283911671924, "grad_norm": 0.49639504847696175, "learning_rate": 1.743580279294246e-05, "loss": 0.4082, "step": 4745 }, { "epoch": 0.9357255520504731, "grad_norm": 0.531129670237936, "learning_rate": 1.74347662839132e-05, "loss": 0.4302, "step": 4746 }, { "epoch": 0.935922712933754, "grad_norm": 0.6286351478232094, "learning_rate": 1.7433729596257694e-05, "loss": 0.4533, "step": 4747 }, { "epoch": 0.9361198738170347, "grad_norm": 0.5416558622895243, "learning_rate": 1.743269273000085e-05, "loss": 0.4097, "step": 4748 }, { "epoch": 0.9363170347003155, "grad_norm": 0.5339455229568808, "learning_rate": 1.7431655685167578e-05, "loss": 0.439, "step": 4749 }, { "epoch": 0.9365141955835962, "grad_norm": 0.5760351851798424, "learning_rate": 1.743061846178279e-05, "loss": 0.4818, "step": 4750 }, { "epoch": 0.936711356466877, "grad_norm": 0.5917480235250198, "learning_rate": 1.742958105987141e-05, "loss": 0.4568, "step": 4751 }, { "epoch": 0.9369085173501577, "grad_norm": 0.8339420837901182, "learning_rate": 1.7428543479458367e-05, "loss": 0.4443, "step": 4752 }, { "epoch": 0.9371056782334385, "grad_norm": 0.5697309446607353, "learning_rate": 1.7427505720568583e-05, "loss": 0.4565, "step": 4753 }, { "epoch": 0.9373028391167192, "grad_norm": 0.5415366469073645, "learning_rate": 1.7426467783226992e-05, "loss": 0.4764, "step": 4754 }, { "epoch": 0.9375, "grad_norm": 0.6019352545618148, "learning_rate": 1.742542966745853e-05, "loss": 0.4492, "step": 4755 }, { "epoch": 0.9376971608832808, "grad_norm": 0.6685857679022774, "learning_rate": 1.7424391373288142e-05, "loss": 0.4581, "step": 4756 }, { "epoch": 0.9378943217665615, "grad_norm": 0.5894795119442221, "learning_rate": 1.742335290074077e-05, "loss": 0.4677, "step": 4757 }, { "epoch": 0.9380914826498423, "grad_norm": 0.5528329269741018, "learning_rate": 1.7422314249841373e-05, "loss": 0.4342, "step": 4758 }, { "epoch": 0.938288643533123, "grad_norm": 0.5633875242021811, "learning_rate": 1.7421275420614895e-05, "loss": 0.4297, "step": 4759 }, { "epoch": 0.9384858044164038, "grad_norm": 0.6057160126806879, "learning_rate": 1.7420236413086298e-05, "loss": 0.4469, "step": 4760 }, { "epoch": 0.9386829652996845, "grad_norm": 0.5062209536970022, "learning_rate": 1.7419197227280545e-05, "loss": 0.413, "step": 4761 }, { "epoch": 0.9388801261829653, "grad_norm": 0.5947801340129587, "learning_rate": 1.7418157863222608e-05, "loss": 0.4505, "step": 4762 }, { "epoch": 0.939077287066246, "grad_norm": 0.5473326180648399, "learning_rate": 1.7417118320937452e-05, "loss": 0.4298, "step": 4763 }, { "epoch": 0.9392744479495269, "grad_norm": 0.5842555110402521, "learning_rate": 1.7416078600450053e-05, "loss": 0.4366, "step": 4764 }, { "epoch": 0.9394716088328076, "grad_norm": 6.596709214102324, "learning_rate": 1.7415038701785397e-05, "loss": 0.4933, "step": 4765 }, { "epoch": 0.9396687697160884, "grad_norm": 0.5983220979388622, "learning_rate": 1.741399862496846e-05, "loss": 0.4303, "step": 4766 }, { "epoch": 0.9398659305993691, "grad_norm": 0.5605426934566005, "learning_rate": 1.741295837002424e-05, "loss": 0.4376, "step": 4767 }, { "epoch": 0.9400630914826499, "grad_norm": 0.6623850194449121, "learning_rate": 1.7411917936977728e-05, "loss": 0.4891, "step": 4768 }, { "epoch": 0.9402602523659306, "grad_norm": 0.5643318735416755, "learning_rate": 1.7410877325853914e-05, "loss": 0.4342, "step": 4769 }, { "epoch": 0.9404574132492114, "grad_norm": 0.510419807668752, "learning_rate": 1.7409836536677804e-05, "loss": 0.4094, "step": 4770 }, { "epoch": 0.9406545741324921, "grad_norm": 0.5957515521253349, "learning_rate": 1.7408795569474407e-05, "loss": 0.4534, "step": 4771 }, { "epoch": 0.9408517350157729, "grad_norm": 0.5938740829653067, "learning_rate": 1.7407754424268727e-05, "loss": 0.4209, "step": 4772 }, { "epoch": 0.9410488958990536, "grad_norm": 0.5430175081242069, "learning_rate": 1.7406713101085782e-05, "loss": 0.4428, "step": 4773 }, { "epoch": 0.9412460567823344, "grad_norm": 0.6092122034861576, "learning_rate": 1.7405671599950593e-05, "loss": 0.4552, "step": 4774 }, { "epoch": 0.9414432176656151, "grad_norm": 0.5301372981781962, "learning_rate": 1.7404629920888178e-05, "loss": 0.3949, "step": 4775 }, { "epoch": 0.9416403785488959, "grad_norm": 0.874324067325636, "learning_rate": 1.7403588063923565e-05, "loss": 0.4171, "step": 4776 }, { "epoch": 0.9418375394321766, "grad_norm": 0.562779555203244, "learning_rate": 1.7402546029081793e-05, "loss": 0.4508, "step": 4777 }, { "epoch": 0.9420347003154574, "grad_norm": 0.5751034428164898, "learning_rate": 1.7401503816387886e-05, "loss": 0.4648, "step": 4778 }, { "epoch": 0.9422318611987381, "grad_norm": 0.4928153903001422, "learning_rate": 1.740046142586689e-05, "loss": 0.4094, "step": 4779 }, { "epoch": 0.942429022082019, "grad_norm": 0.5593941847892724, "learning_rate": 1.7399418857543848e-05, "loss": 0.4448, "step": 4780 }, { "epoch": 0.9426261829652997, "grad_norm": 0.5120731327364189, "learning_rate": 1.739837611144381e-05, "loss": 0.4433, "step": 4781 }, { "epoch": 0.9428233438485805, "grad_norm": 0.5547010617938658, "learning_rate": 1.739733318759183e-05, "loss": 0.4517, "step": 4782 }, { "epoch": 0.9430205047318612, "grad_norm": 0.5279709568184927, "learning_rate": 1.739629008601296e-05, "loss": 0.4414, "step": 4783 }, { "epoch": 0.943217665615142, "grad_norm": 0.6546356596831223, "learning_rate": 1.7395246806732266e-05, "loss": 0.502, "step": 4784 }, { "epoch": 0.9434148264984227, "grad_norm": 0.5551495693288206, "learning_rate": 1.739420334977481e-05, "loss": 0.4212, "step": 4785 }, { "epoch": 0.9436119873817035, "grad_norm": 0.5092496805523852, "learning_rate": 1.7393159715165668e-05, "loss": 0.405, "step": 4786 }, { "epoch": 0.9438091482649842, "grad_norm": 0.5118148863889752, "learning_rate": 1.739211590292991e-05, "loss": 0.4185, "step": 4787 }, { "epoch": 0.944006309148265, "grad_norm": 0.5543188761300978, "learning_rate": 1.739107191309261e-05, "loss": 0.4229, "step": 4788 }, { "epoch": 0.9442034700315457, "grad_norm": 0.4947381983906698, "learning_rate": 1.7390027745678857e-05, "loss": 0.4347, "step": 4789 }, { "epoch": 0.9444006309148265, "grad_norm": 0.5333604588136764, "learning_rate": 1.7388983400713736e-05, "loss": 0.4273, "step": 4790 }, { "epoch": 0.9445977917981072, "grad_norm": 0.48436693257142593, "learning_rate": 1.7387938878222337e-05, "loss": 0.3842, "step": 4791 }, { "epoch": 0.944794952681388, "grad_norm": 0.48098662178856766, "learning_rate": 1.7386894178229764e-05, "loss": 0.4078, "step": 4792 }, { "epoch": 0.9449921135646687, "grad_norm": 0.5927563970559201, "learning_rate": 1.7385849300761104e-05, "loss": 0.4479, "step": 4793 }, { "epoch": 0.9451892744479495, "grad_norm": 0.5292188971068847, "learning_rate": 1.7384804245841468e-05, "loss": 0.4466, "step": 4794 }, { "epoch": 0.9453864353312302, "grad_norm": 0.5495901269166898, "learning_rate": 1.7383759013495965e-05, "loss": 0.423, "step": 4795 }, { "epoch": 0.945583596214511, "grad_norm": 0.5124360410567629, "learning_rate": 1.73827136037497e-05, "loss": 0.4669, "step": 4796 }, { "epoch": 0.9457807570977917, "grad_norm": 0.5728281769441116, "learning_rate": 1.7381668016627798e-05, "loss": 0.4712, "step": 4797 }, { "epoch": 0.9459779179810726, "grad_norm": 0.48078417126183814, "learning_rate": 1.738062225215538e-05, "loss": 0.4534, "step": 4798 }, { "epoch": 0.9461750788643533, "grad_norm": 0.4972157768011806, "learning_rate": 1.7379576310357568e-05, "loss": 0.4335, "step": 4799 }, { "epoch": 0.9463722397476341, "grad_norm": 0.6537133826858609, "learning_rate": 1.7378530191259492e-05, "loss": 0.4604, "step": 4800 }, { "epoch": 0.9465694006309149, "grad_norm": 0.5206764483089402, "learning_rate": 1.7377483894886285e-05, "loss": 0.4377, "step": 4801 }, { "epoch": 0.9467665615141956, "grad_norm": 0.5571810897809417, "learning_rate": 1.7376437421263088e-05, "loss": 0.4732, "step": 4802 }, { "epoch": 0.9469637223974764, "grad_norm": 0.5154485693291944, "learning_rate": 1.737539077041504e-05, "loss": 0.4422, "step": 4803 }, { "epoch": 0.9471608832807571, "grad_norm": 2.538605874424299, "learning_rate": 1.737434394236729e-05, "loss": 0.4926, "step": 4804 }, { "epoch": 0.9473580441640379, "grad_norm": 0.5854347379433069, "learning_rate": 1.737329693714499e-05, "loss": 0.4397, "step": 4805 }, { "epoch": 0.9475552050473186, "grad_norm": 0.5692750771949014, "learning_rate": 1.7372249754773292e-05, "loss": 0.4892, "step": 4806 }, { "epoch": 0.9477523659305994, "grad_norm": 0.5453924056164228, "learning_rate": 1.7371202395277357e-05, "loss": 0.4488, "step": 4807 }, { "epoch": 0.9479495268138801, "grad_norm": 0.5329921263116298, "learning_rate": 1.7370154858682347e-05, "loss": 0.4305, "step": 4808 }, { "epoch": 0.9481466876971609, "grad_norm": 0.5070927581481749, "learning_rate": 1.736910714501343e-05, "loss": 0.3879, "step": 4809 }, { "epoch": 0.9483438485804416, "grad_norm": 0.49761316318511184, "learning_rate": 1.7368059254295783e-05, "loss": 0.4439, "step": 4810 }, { "epoch": 0.9485410094637224, "grad_norm": 0.6116580466663731, "learning_rate": 1.736701118655458e-05, "loss": 0.4213, "step": 4811 }, { "epoch": 0.9487381703470031, "grad_norm": 0.6059773769906507, "learning_rate": 1.7365962941814998e-05, "loss": 0.4447, "step": 4812 }, { "epoch": 0.948935331230284, "grad_norm": 0.5302599878215019, "learning_rate": 1.7364914520102223e-05, "loss": 0.4219, "step": 4813 }, { "epoch": 0.9491324921135647, "grad_norm": 0.5346407557209393, "learning_rate": 1.736386592144145e-05, "loss": 0.4452, "step": 4814 }, { "epoch": 0.9493296529968455, "grad_norm": 2.5973339435009675, "learning_rate": 1.7362817145857866e-05, "loss": 0.5188, "step": 4815 }, { "epoch": 0.9495268138801262, "grad_norm": 0.6369521233467632, "learning_rate": 1.736176819337667e-05, "loss": 0.4557, "step": 4816 }, { "epoch": 0.949723974763407, "grad_norm": 0.7229696457122112, "learning_rate": 1.7360719064023067e-05, "loss": 0.4351, "step": 4817 }, { "epoch": 0.9499211356466877, "grad_norm": 0.5548249000280323, "learning_rate": 1.7359669757822256e-05, "loss": 0.4495, "step": 4818 }, { "epoch": 0.9501182965299685, "grad_norm": 0.5371920142295181, "learning_rate": 1.7358620274799455e-05, "loss": 0.4468, "step": 4819 }, { "epoch": 0.9503154574132492, "grad_norm": 0.5768920071321693, "learning_rate": 1.7357570614979878e-05, "loss": 0.4429, "step": 4820 }, { "epoch": 0.95051261829653, "grad_norm": 0.5491663645524869, "learning_rate": 1.735652077838874e-05, "loss": 0.4645, "step": 4821 }, { "epoch": 0.9507097791798107, "grad_norm": 0.5993991809124819, "learning_rate": 1.735547076505127e-05, "loss": 0.4071, "step": 4822 }, { "epoch": 0.9509069400630915, "grad_norm": 0.4880554858272554, "learning_rate": 1.7354420574992686e-05, "loss": 0.4059, "step": 4823 }, { "epoch": 0.9511041009463722, "grad_norm": 0.903588480396732, "learning_rate": 1.7353370208238226e-05, "loss": 0.4422, "step": 4824 }, { "epoch": 0.951301261829653, "grad_norm": 0.5459492044215943, "learning_rate": 1.7352319664813126e-05, "loss": 0.4284, "step": 4825 }, { "epoch": 0.9514984227129337, "grad_norm": 0.531596665432785, "learning_rate": 1.7351268944742626e-05, "loss": 0.4111, "step": 4826 }, { "epoch": 0.9516955835962145, "grad_norm": 0.5420852235661492, "learning_rate": 1.735021804805197e-05, "loss": 0.4238, "step": 4827 }, { "epoch": 0.9518927444794952, "grad_norm": 0.5118772611762697, "learning_rate": 1.7349166974766407e-05, "loss": 0.4063, "step": 4828 }, { "epoch": 0.952089905362776, "grad_norm": 0.6928236531905183, "learning_rate": 1.7348115724911188e-05, "loss": 0.4473, "step": 4829 }, { "epoch": 0.9522870662460567, "grad_norm": 0.5459811584499307, "learning_rate": 1.734706429851157e-05, "loss": 0.4339, "step": 4830 }, { "epoch": 0.9524842271293376, "grad_norm": 0.5306998499179901, "learning_rate": 1.7346012695592817e-05, "loss": 0.4496, "step": 4831 }, { "epoch": 0.9526813880126183, "grad_norm": 0.5488004261374878, "learning_rate": 1.7344960916180192e-05, "loss": 0.4504, "step": 4832 }, { "epoch": 0.9528785488958991, "grad_norm": 0.5667953843020231, "learning_rate": 1.734390896029897e-05, "loss": 0.4086, "step": 4833 }, { "epoch": 0.9530757097791798, "grad_norm": 0.5323191653010501, "learning_rate": 1.7342856827974417e-05, "loss": 0.4353, "step": 4834 }, { "epoch": 0.9532728706624606, "grad_norm": 0.5423134586875568, "learning_rate": 1.7341804519231815e-05, "loss": 0.4411, "step": 4835 }, { "epoch": 0.9534700315457413, "grad_norm": 0.548006233665663, "learning_rate": 1.734075203409645e-05, "loss": 0.4433, "step": 4836 }, { "epoch": 0.9536671924290221, "grad_norm": 0.5273485527910695, "learning_rate": 1.7339699372593605e-05, "loss": 0.4673, "step": 4837 }, { "epoch": 0.9538643533123028, "grad_norm": 0.5406256570631455, "learning_rate": 1.7338646534748572e-05, "loss": 0.4424, "step": 4838 }, { "epoch": 0.9540615141955836, "grad_norm": 0.5228289639533955, "learning_rate": 1.7337593520586645e-05, "loss": 0.4113, "step": 4839 }, { "epoch": 0.9542586750788643, "grad_norm": 0.5199695216281296, "learning_rate": 1.7336540330133126e-05, "loss": 0.4334, "step": 4840 }, { "epoch": 0.9544558359621451, "grad_norm": 0.5157055642134136, "learning_rate": 1.7335486963413318e-05, "loss": 0.4114, "step": 4841 }, { "epoch": 0.9546529968454258, "grad_norm": 0.5627963986729889, "learning_rate": 1.7334433420452527e-05, "loss": 0.454, "step": 4842 }, { "epoch": 0.9548501577287066, "grad_norm": 0.5354466461718106, "learning_rate": 1.7333379701276068e-05, "loss": 0.4418, "step": 4843 }, { "epoch": 0.9550473186119873, "grad_norm": 0.9673410881580022, "learning_rate": 1.7332325805909256e-05, "loss": 0.4306, "step": 4844 }, { "epoch": 0.9552444794952681, "grad_norm": 5.372837853533415, "learning_rate": 1.733127173437741e-05, "loss": 0.4094, "step": 4845 }, { "epoch": 0.955441640378549, "grad_norm": 0.6273908422804465, "learning_rate": 1.7330217486705862e-05, "loss": 0.4191, "step": 4846 }, { "epoch": 0.9556388012618297, "grad_norm": 0.5769927479813597, "learning_rate": 1.732916306291993e-05, "loss": 0.4635, "step": 4847 }, { "epoch": 0.9558359621451105, "grad_norm": 0.5345159528878386, "learning_rate": 1.7328108463044953e-05, "loss": 0.4228, "step": 4848 }, { "epoch": 0.9560331230283912, "grad_norm": 0.5196926420397846, "learning_rate": 1.7327053687106273e-05, "loss": 0.3951, "step": 4849 }, { "epoch": 0.956230283911672, "grad_norm": 0.4976003610694764, "learning_rate": 1.7325998735129227e-05, "loss": 0.3933, "step": 4850 }, { "epoch": 0.9564274447949527, "grad_norm": 0.5629160034402856, "learning_rate": 1.7324943607139158e-05, "loss": 0.4207, "step": 4851 }, { "epoch": 0.9566246056782335, "grad_norm": 0.541499287915572, "learning_rate": 1.7323888303161422e-05, "loss": 0.4565, "step": 4852 }, { "epoch": 0.9568217665615142, "grad_norm": 0.5494402412976808, "learning_rate": 1.732283282322137e-05, "loss": 0.4487, "step": 4853 }, { "epoch": 0.957018927444795, "grad_norm": 0.5145315890387855, "learning_rate": 1.7321777167344367e-05, "loss": 0.4067, "step": 4854 }, { "epoch": 0.9572160883280757, "grad_norm": 0.5356564633940517, "learning_rate": 1.732072133555577e-05, "loss": 0.4432, "step": 4855 }, { "epoch": 0.9574132492113565, "grad_norm": 0.498975459571729, "learning_rate": 1.7319665327880945e-05, "loss": 0.3735, "step": 4856 }, { "epoch": 0.9576104100946372, "grad_norm": 0.618141773791518, "learning_rate": 1.7318609144345265e-05, "loss": 0.4099, "step": 4857 }, { "epoch": 0.957807570977918, "grad_norm": 0.5160325448737755, "learning_rate": 1.7317552784974113e-05, "loss": 0.3971, "step": 4858 }, { "epoch": 0.9580047318611987, "grad_norm": 0.5514233004043473, "learning_rate": 1.7316496249792857e-05, "loss": 0.46, "step": 4859 }, { "epoch": 0.9582018927444795, "grad_norm": 0.54586722386132, "learning_rate": 1.7315439538826887e-05, "loss": 0.4271, "step": 4860 }, { "epoch": 0.9583990536277602, "grad_norm": 0.50615085528196, "learning_rate": 1.7314382652101595e-05, "loss": 0.4251, "step": 4861 }, { "epoch": 0.958596214511041, "grad_norm": 0.5311049525152373, "learning_rate": 1.7313325589642363e-05, "loss": 0.4354, "step": 4862 }, { "epoch": 0.9587933753943217, "grad_norm": 0.5243846048285586, "learning_rate": 1.7312268351474603e-05, "loss": 0.423, "step": 4863 }, { "epoch": 0.9589905362776026, "grad_norm": 0.5359735257750494, "learning_rate": 1.73112109376237e-05, "loss": 0.4092, "step": 4864 }, { "epoch": 0.9591876971608833, "grad_norm": 0.5599828574009664, "learning_rate": 1.7310153348115068e-05, "loss": 0.4465, "step": 4865 }, { "epoch": 0.9593848580441641, "grad_norm": 0.53367931056808, "learning_rate": 1.7309095582974115e-05, "loss": 0.4485, "step": 4866 }, { "epoch": 0.9595820189274448, "grad_norm": 0.5485569037489243, "learning_rate": 1.7308037642226258e-05, "loss": 0.4181, "step": 4867 }, { "epoch": 0.9597791798107256, "grad_norm": 0.5325054492925666, "learning_rate": 1.7306979525896907e-05, "loss": 0.4338, "step": 4868 }, { "epoch": 0.9599763406940063, "grad_norm": 1.2447432969875094, "learning_rate": 1.730592123401149e-05, "loss": 0.447, "step": 4869 }, { "epoch": 0.9601735015772871, "grad_norm": 0.5405855805944394, "learning_rate": 1.7304862766595433e-05, "loss": 0.4556, "step": 4870 }, { "epoch": 0.9603706624605678, "grad_norm": 0.5014468926425727, "learning_rate": 1.7303804123674165e-05, "loss": 0.4127, "step": 4871 }, { "epoch": 0.9605678233438486, "grad_norm": 0.5242180756855537, "learning_rate": 1.730274530527312e-05, "loss": 0.4488, "step": 4872 }, { "epoch": 0.9607649842271293, "grad_norm": 0.5590887485992455, "learning_rate": 1.730168631141774e-05, "loss": 0.4408, "step": 4873 }, { "epoch": 0.9609621451104101, "grad_norm": 0.5590641148729568, "learning_rate": 1.7300627142133466e-05, "loss": 0.4421, "step": 4874 }, { "epoch": 0.9611593059936908, "grad_norm": 0.5381918633394875, "learning_rate": 1.7299567797445744e-05, "loss": 0.4152, "step": 4875 }, { "epoch": 0.9613564668769716, "grad_norm": 0.5116503025098527, "learning_rate": 1.729850827738003e-05, "loss": 0.4137, "step": 4876 }, { "epoch": 0.9615536277602523, "grad_norm": 0.601103697573897, "learning_rate": 1.7297448581961775e-05, "loss": 0.4367, "step": 4877 }, { "epoch": 0.9617507886435331, "grad_norm": 0.5234000253181185, "learning_rate": 1.7296388711216442e-05, "loss": 0.4264, "step": 4878 }, { "epoch": 0.9619479495268138, "grad_norm": 0.5489406687628315, "learning_rate": 1.7295328665169495e-05, "loss": 0.4522, "step": 4879 }, { "epoch": 0.9621451104100947, "grad_norm": 0.5304149080662556, "learning_rate": 1.7294268443846403e-05, "loss": 0.4531, "step": 4880 }, { "epoch": 0.9623422712933754, "grad_norm": 0.5277630403000341, "learning_rate": 1.7293208047272635e-05, "loss": 0.4233, "step": 4881 }, { "epoch": 0.9625394321766562, "grad_norm": 0.5216948283800477, "learning_rate": 1.729214747547367e-05, "loss": 0.4634, "step": 4882 }, { "epoch": 0.9627365930599369, "grad_norm": 0.5128948306308777, "learning_rate": 1.7291086728474992e-05, "loss": 0.3948, "step": 4883 }, { "epoch": 0.9629337539432177, "grad_norm": 0.4864061144860882, "learning_rate": 1.729002580630208e-05, "loss": 0.4011, "step": 4884 }, { "epoch": 0.9631309148264984, "grad_norm": 1.3934757331709215, "learning_rate": 1.7288964708980432e-05, "loss": 0.4406, "step": 4885 }, { "epoch": 0.9633280757097792, "grad_norm": 0.5314428535537528, "learning_rate": 1.7287903436535535e-05, "loss": 0.4282, "step": 4886 }, { "epoch": 0.9635252365930599, "grad_norm": 0.5113959395669175, "learning_rate": 1.728684198899289e-05, "loss": 0.4459, "step": 4887 }, { "epoch": 0.9637223974763407, "grad_norm": 0.7377416915879886, "learning_rate": 1.7285780366377998e-05, "loss": 0.4226, "step": 4888 }, { "epoch": 0.9639195583596214, "grad_norm": 0.625103270758122, "learning_rate": 1.7284718568716362e-05, "loss": 0.4857, "step": 4889 }, { "epoch": 0.9641167192429022, "grad_norm": 0.49384862341141844, "learning_rate": 1.7283656596033502e-05, "loss": 0.4377, "step": 4890 }, { "epoch": 0.964313880126183, "grad_norm": 0.8265103239871188, "learning_rate": 1.7282594448354922e-05, "loss": 0.4826, "step": 4891 }, { "epoch": 0.9645110410094637, "grad_norm": 0.5660829434512461, "learning_rate": 1.728153212570615e-05, "loss": 0.436, "step": 4892 }, { "epoch": 0.9647082018927445, "grad_norm": 0.49876988837276626, "learning_rate": 1.7280469628112698e-05, "loss": 0.4192, "step": 4893 }, { "epoch": 0.9649053627760252, "grad_norm": 0.6150615512750844, "learning_rate": 1.7279406955600107e-05, "loss": 0.4016, "step": 4894 }, { "epoch": 0.965102523659306, "grad_norm": 0.536942351647561, "learning_rate": 1.7278344108193897e-05, "loss": 0.4207, "step": 4895 }, { "epoch": 0.9652996845425867, "grad_norm": 0.9470023911925746, "learning_rate": 1.7277281085919613e-05, "loss": 0.4274, "step": 4896 }, { "epoch": 0.9654968454258676, "grad_norm": 0.5235369204502012, "learning_rate": 1.727621788880279e-05, "loss": 0.4244, "step": 4897 }, { "epoch": 0.9656940063091483, "grad_norm": 0.6071731227037889, "learning_rate": 1.727515451686897e-05, "loss": 0.3893, "step": 4898 }, { "epoch": 0.9658911671924291, "grad_norm": 0.5903477067942368, "learning_rate": 1.7274090970143705e-05, "loss": 0.4356, "step": 4899 }, { "epoch": 0.9660883280757098, "grad_norm": 0.5376060698400171, "learning_rate": 1.7273027248652545e-05, "loss": 0.4561, "step": 4900 }, { "epoch": 0.9662854889589906, "grad_norm": 0.5838617039015437, "learning_rate": 1.727196335242105e-05, "loss": 0.4383, "step": 4901 }, { "epoch": 0.9664826498422713, "grad_norm": 0.5167788808435415, "learning_rate": 1.727089928147478e-05, "loss": 0.4024, "step": 4902 }, { "epoch": 0.9666798107255521, "grad_norm": 0.5624366894471787, "learning_rate": 1.72698350358393e-05, "loss": 0.4309, "step": 4903 }, { "epoch": 0.9668769716088328, "grad_norm": 0.753504301288875, "learning_rate": 1.726877061554018e-05, "loss": 0.3958, "step": 4904 }, { "epoch": 0.9670741324921136, "grad_norm": 0.5684825023726245, "learning_rate": 1.726770602060299e-05, "loss": 0.4328, "step": 4905 }, { "epoch": 0.9672712933753943, "grad_norm": 0.6897233391043007, "learning_rate": 1.726664125105331e-05, "loss": 0.4256, "step": 4906 }, { "epoch": 0.9674684542586751, "grad_norm": 0.5640572708097991, "learning_rate": 1.726557630691672e-05, "loss": 0.4343, "step": 4907 }, { "epoch": 0.9676656151419558, "grad_norm": 0.5552212931426607, "learning_rate": 1.7264511188218812e-05, "loss": 0.4653, "step": 4908 }, { "epoch": 0.9678627760252366, "grad_norm": 1.0633900724505037, "learning_rate": 1.726344589498517e-05, "loss": 0.4826, "step": 4909 }, { "epoch": 0.9680599369085173, "grad_norm": 0.7383588772446893, "learning_rate": 1.7262380427241394e-05, "loss": 0.4684, "step": 4910 }, { "epoch": 0.9682570977917981, "grad_norm": 0.6826171176254585, "learning_rate": 1.7261314785013078e-05, "loss": 0.4237, "step": 4911 }, { "epoch": 0.9684542586750788, "grad_norm": 0.6364722722252665, "learning_rate": 1.7260248968325828e-05, "loss": 0.4489, "step": 4912 }, { "epoch": 0.9686514195583596, "grad_norm": 0.5663790368688596, "learning_rate": 1.7259182977205248e-05, "loss": 0.4189, "step": 4913 }, { "epoch": 0.9688485804416404, "grad_norm": 0.5420023619747468, "learning_rate": 1.7258116811676956e-05, "loss": 0.4295, "step": 4914 }, { "epoch": 0.9690457413249212, "grad_norm": 0.5377721001533368, "learning_rate": 1.7257050471766558e-05, "loss": 0.4638, "step": 4915 }, { "epoch": 0.9692429022082019, "grad_norm": 0.6046459705426991, "learning_rate": 1.7255983957499676e-05, "loss": 0.4886, "step": 4916 }, { "epoch": 0.9694400630914827, "grad_norm": 0.5236164202200155, "learning_rate": 1.7254917268901942e-05, "loss": 0.4251, "step": 4917 }, { "epoch": 0.9696372239747634, "grad_norm": 0.5378880369504965, "learning_rate": 1.7253850405998976e-05, "loss": 0.4662, "step": 4918 }, { "epoch": 0.9698343848580442, "grad_norm": 0.5749445692065384, "learning_rate": 1.7252783368816413e-05, "loss": 0.4855, "step": 4919 }, { "epoch": 0.9700315457413249, "grad_norm": 0.49857545471871095, "learning_rate": 1.7251716157379887e-05, "loss": 0.4235, "step": 4920 }, { "epoch": 0.9702287066246057, "grad_norm": 0.5825438945665429, "learning_rate": 1.725064877171504e-05, "loss": 0.4292, "step": 4921 }, { "epoch": 0.9704258675078864, "grad_norm": 0.7116226202261247, "learning_rate": 1.724958121184752e-05, "loss": 0.4942, "step": 4922 }, { "epoch": 0.9706230283911672, "grad_norm": 0.5595515787710044, "learning_rate": 1.7248513477802973e-05, "loss": 0.4251, "step": 4923 }, { "epoch": 0.9708201892744479, "grad_norm": 0.5609266287961348, "learning_rate": 1.724744556960705e-05, "loss": 0.4362, "step": 4924 }, { "epoch": 0.9710173501577287, "grad_norm": 0.5332114412325697, "learning_rate": 1.7246377487285415e-05, "loss": 0.4222, "step": 4925 }, { "epoch": 0.9712145110410094, "grad_norm": 0.5579951518622186, "learning_rate": 1.7245309230863723e-05, "loss": 0.4351, "step": 4926 }, { "epoch": 0.9714116719242902, "grad_norm": 0.5504796489083765, "learning_rate": 1.7244240800367642e-05, "loss": 0.466, "step": 4927 }, { "epoch": 0.9716088328075709, "grad_norm": 0.5364866603592925, "learning_rate": 1.724317219582284e-05, "loss": 0.4356, "step": 4928 }, { "epoch": 0.9718059936908517, "grad_norm": 0.5307686124133709, "learning_rate": 1.7242103417255e-05, "loss": 0.3943, "step": 4929 }, { "epoch": 0.9720031545741324, "grad_norm": 0.535779232578516, "learning_rate": 1.724103446468979e-05, "loss": 0.4479, "step": 4930 }, { "epoch": 0.9722003154574133, "grad_norm": 0.8527430234546016, "learning_rate": 1.723996533815289e-05, "loss": 0.4271, "step": 4931 }, { "epoch": 0.972397476340694, "grad_norm": 0.4893815242972822, "learning_rate": 1.723889603767e-05, "loss": 0.4294, "step": 4932 }, { "epoch": 0.9725946372239748, "grad_norm": 0.5142240975494096, "learning_rate": 1.7237826563266797e-05, "loss": 0.4111, "step": 4933 }, { "epoch": 0.9727917981072555, "grad_norm": 0.5056050398485274, "learning_rate": 1.7236756914968985e-05, "loss": 0.4459, "step": 4934 }, { "epoch": 0.9729889589905363, "grad_norm": 0.6875911603497251, "learning_rate": 1.723568709280226e-05, "loss": 0.4393, "step": 4935 }, { "epoch": 0.973186119873817, "grad_norm": 0.5410138293600987, "learning_rate": 1.7234617096792328e-05, "loss": 0.4337, "step": 4936 }, { "epoch": 0.9733832807570978, "grad_norm": 0.5406394954943095, "learning_rate": 1.723354692696489e-05, "loss": 0.4787, "step": 4937 }, { "epoch": 0.9735804416403786, "grad_norm": 0.5039617193695565, "learning_rate": 1.7232476583345667e-05, "loss": 0.4269, "step": 4938 }, { "epoch": 0.9737776025236593, "grad_norm": 0.5667126358374103, "learning_rate": 1.7231406065960365e-05, "loss": 0.4314, "step": 4939 }, { "epoch": 0.9739747634069401, "grad_norm": 0.5223138203259416, "learning_rate": 1.723033537483471e-05, "loss": 0.4238, "step": 4940 }, { "epoch": 0.9741719242902208, "grad_norm": 0.5134992034885483, "learning_rate": 1.722926450999443e-05, "loss": 0.435, "step": 4941 }, { "epoch": 0.9743690851735016, "grad_norm": 0.5335314712456779, "learning_rate": 1.7228193471465243e-05, "loss": 0.4448, "step": 4942 }, { "epoch": 0.9745662460567823, "grad_norm": 0.5187220320984581, "learning_rate": 1.722712225927289e-05, "loss": 0.4305, "step": 4943 }, { "epoch": 0.9747634069400631, "grad_norm": 0.5236993505503822, "learning_rate": 1.7226050873443103e-05, "loss": 0.4324, "step": 4944 }, { "epoch": 0.9749605678233438, "grad_norm": 0.5501948139908517, "learning_rate": 1.7224979314001623e-05, "loss": 0.4445, "step": 4945 }, { "epoch": 0.9751577287066246, "grad_norm": 0.5025205140860234, "learning_rate": 1.72239075809742e-05, "loss": 0.4067, "step": 4946 }, { "epoch": 0.9753548895899053, "grad_norm": 0.509021011008656, "learning_rate": 1.722283567438658e-05, "loss": 0.4127, "step": 4947 }, { "epoch": 0.9755520504731862, "grad_norm": 0.5169322695492775, "learning_rate": 1.7221763594264513e-05, "loss": 0.4183, "step": 4948 }, { "epoch": 0.9757492113564669, "grad_norm": 0.5893693443229753, "learning_rate": 1.7220691340633762e-05, "loss": 0.413, "step": 4949 }, { "epoch": 0.9759463722397477, "grad_norm": 0.5032678433324692, "learning_rate": 1.7219618913520086e-05, "loss": 0.3999, "step": 4950 }, { "epoch": 0.9761435331230284, "grad_norm": 0.5611832448812477, "learning_rate": 1.7218546312949255e-05, "loss": 0.4208, "step": 4951 }, { "epoch": 0.9763406940063092, "grad_norm": 0.5402039539918267, "learning_rate": 1.7217473538947032e-05, "loss": 0.4592, "step": 4952 }, { "epoch": 0.9765378548895899, "grad_norm": 0.5312684422263533, "learning_rate": 1.7216400591539194e-05, "loss": 0.3958, "step": 4953 }, { "epoch": 0.9767350157728707, "grad_norm": 0.4838095794471189, "learning_rate": 1.7215327470751525e-05, "loss": 0.4012, "step": 4954 }, { "epoch": 0.9769321766561514, "grad_norm": 0.5459043063215803, "learning_rate": 1.72142541766098e-05, "loss": 0.4889, "step": 4955 }, { "epoch": 0.9771293375394322, "grad_norm": 0.4762352325018086, "learning_rate": 1.721318070913981e-05, "loss": 0.4194, "step": 4956 }, { "epoch": 0.9773264984227129, "grad_norm": 1.1041727399927024, "learning_rate": 1.7212107068367343e-05, "loss": 0.4595, "step": 4957 }, { "epoch": 0.9775236593059937, "grad_norm": 0.5072331377511347, "learning_rate": 1.7211033254318195e-05, "loss": 0.4192, "step": 4958 }, { "epoch": 0.9777208201892744, "grad_norm": 0.5051945491324831, "learning_rate": 1.720995926701817e-05, "loss": 0.4018, "step": 4959 }, { "epoch": 0.9779179810725552, "grad_norm": 0.5317945948669217, "learning_rate": 1.7208885106493068e-05, "loss": 0.4245, "step": 4960 }, { "epoch": 0.9781151419558359, "grad_norm": 0.5003276873104907, "learning_rate": 1.7207810772768692e-05, "loss": 0.4175, "step": 4961 }, { "epoch": 0.9783123028391167, "grad_norm": 0.5129068918599115, "learning_rate": 1.720673626587086e-05, "loss": 0.4263, "step": 4962 }, { "epoch": 0.9785094637223974, "grad_norm": 0.49567393969405865, "learning_rate": 1.7205661585825385e-05, "loss": 0.4224, "step": 4963 }, { "epoch": 0.9787066246056783, "grad_norm": 0.5210701609022403, "learning_rate": 1.7204586732658088e-05, "loss": 0.4054, "step": 4964 }, { "epoch": 0.978903785488959, "grad_norm": 0.5237810329795967, "learning_rate": 1.720351170639479e-05, "loss": 0.4373, "step": 4965 }, { "epoch": 0.9791009463722398, "grad_norm": 0.49172980877968686, "learning_rate": 1.7202436507061327e-05, "loss": 0.3981, "step": 4966 }, { "epoch": 0.9792981072555205, "grad_norm": 0.5033156694120647, "learning_rate": 1.7201361134683522e-05, "loss": 0.405, "step": 4967 }, { "epoch": 0.9794952681388013, "grad_norm": 0.532232285017258, "learning_rate": 1.720028558928722e-05, "loss": 0.4189, "step": 4968 }, { "epoch": 0.979692429022082, "grad_norm": 0.5084865188261308, "learning_rate": 1.7199209870898257e-05, "loss": 0.426, "step": 4969 }, { "epoch": 0.9798895899053628, "grad_norm": 0.5195794855402261, "learning_rate": 1.719813397954248e-05, "loss": 0.4279, "step": 4970 }, { "epoch": 0.9800867507886435, "grad_norm": 0.5120454170989818, "learning_rate": 1.7197057915245738e-05, "loss": 0.4465, "step": 4971 }, { "epoch": 0.9802839116719243, "grad_norm": 0.5104835439357177, "learning_rate": 1.7195981678033883e-05, "loss": 0.4551, "step": 4972 }, { "epoch": 0.980481072555205, "grad_norm": 0.545753139194094, "learning_rate": 1.7194905267932775e-05, "loss": 0.4515, "step": 4973 }, { "epoch": 0.9806782334384858, "grad_norm": 0.48063719552920575, "learning_rate": 1.719382868496827e-05, "loss": 0.3959, "step": 4974 }, { "epoch": 0.9808753943217665, "grad_norm": 0.5081444056915589, "learning_rate": 1.7192751929166237e-05, "loss": 0.4267, "step": 4975 }, { "epoch": 0.9810725552050473, "grad_norm": 0.7889320206894174, "learning_rate": 1.7191675000552552e-05, "loss": 0.4256, "step": 4976 }, { "epoch": 0.981269716088328, "grad_norm": 0.5161493421048737, "learning_rate": 1.719059789915308e-05, "loss": 0.3944, "step": 4977 }, { "epoch": 0.9814668769716088, "grad_norm": 0.5526599869469584, "learning_rate": 1.7189520624993706e-05, "loss": 0.4733, "step": 4978 }, { "epoch": 0.9816640378548895, "grad_norm": 0.5921214088474026, "learning_rate": 1.7188443178100306e-05, "loss": 0.4402, "step": 4979 }, { "epoch": 0.9818611987381703, "grad_norm": 0.4899387375326017, "learning_rate": 1.7187365558498772e-05, "loss": 0.403, "step": 4980 }, { "epoch": 0.982058359621451, "grad_norm": 0.6284691568651574, "learning_rate": 1.7186287766214992e-05, "loss": 0.4247, "step": 4981 }, { "epoch": 0.9822555205047319, "grad_norm": 0.5219769363772543, "learning_rate": 1.7185209801274863e-05, "loss": 0.4378, "step": 4982 }, { "epoch": 0.9824526813880127, "grad_norm": 0.561058268499637, "learning_rate": 1.718413166370428e-05, "loss": 0.4402, "step": 4983 }, { "epoch": 0.9826498422712934, "grad_norm": 0.5228751281665158, "learning_rate": 1.7183053353529146e-05, "loss": 0.4342, "step": 4984 }, { "epoch": 0.9828470031545742, "grad_norm": 0.6359423300026139, "learning_rate": 1.7181974870775374e-05, "loss": 0.4791, "step": 4985 }, { "epoch": 0.9830441640378549, "grad_norm": 0.5452791966413713, "learning_rate": 1.718089621546887e-05, "loss": 0.4311, "step": 4986 }, { "epoch": 0.9832413249211357, "grad_norm": 0.5585163320278087, "learning_rate": 1.7179817387635552e-05, "loss": 0.3905, "step": 4987 }, { "epoch": 0.9834384858044164, "grad_norm": 1.1312602954702684, "learning_rate": 1.7178738387301342e-05, "loss": 0.5146, "step": 4988 }, { "epoch": 0.9836356466876972, "grad_norm": 0.544619439438417, "learning_rate": 1.7177659214492162e-05, "loss": 0.4722, "step": 4989 }, { "epoch": 0.9838328075709779, "grad_norm": 0.48850131893298554, "learning_rate": 1.7176579869233935e-05, "loss": 0.387, "step": 4990 }, { "epoch": 0.9840299684542587, "grad_norm": 0.646046327237621, "learning_rate": 1.71755003515526e-05, "loss": 0.4457, "step": 4991 }, { "epoch": 0.9842271293375394, "grad_norm": 0.5156158640893732, "learning_rate": 1.717442066147409e-05, "loss": 0.4206, "step": 4992 }, { "epoch": 0.9844242902208202, "grad_norm": 0.5424189331510925, "learning_rate": 1.7173340799024346e-05, "loss": 0.397, "step": 4993 }, { "epoch": 0.9846214511041009, "grad_norm": 0.5235819964470497, "learning_rate": 1.7172260764229312e-05, "loss": 0.4467, "step": 4994 }, { "epoch": 0.9848186119873817, "grad_norm": 0.5909753498690635, "learning_rate": 1.717118055711494e-05, "loss": 0.4398, "step": 4995 }, { "epoch": 0.9850157728706624, "grad_norm": 0.4987198779431939, "learning_rate": 1.7170100177707177e-05, "loss": 0.4332, "step": 4996 }, { "epoch": 0.9852129337539433, "grad_norm": 0.5563239550956313, "learning_rate": 1.7169019626031985e-05, "loss": 0.4193, "step": 4997 }, { "epoch": 0.985410094637224, "grad_norm": 0.542094709088602, "learning_rate": 1.7167938902115323e-05, "loss": 0.4354, "step": 4998 }, { "epoch": 0.9856072555205048, "grad_norm": 0.6222848829402873, "learning_rate": 1.7166858005983154e-05, "loss": 0.4455, "step": 4999 }, { "epoch": 0.9858044164037855, "grad_norm": 0.5278021757853874, "learning_rate": 1.7165776937661453e-05, "loss": 0.4299, "step": 5000 }, { "epoch": 0.9860015772870663, "grad_norm": 0.6100419080088035, "learning_rate": 1.716469569717619e-05, "loss": 0.3916, "step": 5001 }, { "epoch": 0.986198738170347, "grad_norm": 0.5391831742527743, "learning_rate": 1.716361428455334e-05, "loss": 0.4103, "step": 5002 }, { "epoch": 0.9863958990536278, "grad_norm": 0.5826789977832045, "learning_rate": 1.7162532699818893e-05, "loss": 0.4617, "step": 5003 }, { "epoch": 0.9865930599369085, "grad_norm": 0.4977680784543337, "learning_rate": 1.7161450942998827e-05, "loss": 0.4102, "step": 5004 }, { "epoch": 0.9867902208201893, "grad_norm": 0.5793799937360323, "learning_rate": 1.7160369014119136e-05, "loss": 0.424, "step": 5005 }, { "epoch": 0.98698738170347, "grad_norm": 0.5113962294204524, "learning_rate": 1.7159286913205813e-05, "loss": 0.4317, "step": 5006 }, { "epoch": 0.9871845425867508, "grad_norm": 0.5398003533758813, "learning_rate": 1.7158204640284855e-05, "loss": 0.445, "step": 5007 }, { "epoch": 0.9873817034700315, "grad_norm": 0.5979699074869115, "learning_rate": 1.7157122195382267e-05, "loss": 0.452, "step": 5008 }, { "epoch": 0.9875788643533123, "grad_norm": 0.5285476936820424, "learning_rate": 1.7156039578524055e-05, "loss": 0.4193, "step": 5009 }, { "epoch": 0.987776025236593, "grad_norm": 0.5910776224079434, "learning_rate": 1.715495678973623e-05, "loss": 0.4859, "step": 5010 }, { "epoch": 0.9879731861198738, "grad_norm": 0.5036681780427391, "learning_rate": 1.7153873829044805e-05, "loss": 0.4086, "step": 5011 }, { "epoch": 0.9881703470031545, "grad_norm": 0.5363578632899204, "learning_rate": 1.7152790696475804e-05, "loss": 0.4109, "step": 5012 }, { "epoch": 0.9883675078864353, "grad_norm": 0.5058850256040565, "learning_rate": 1.715170739205524e-05, "loss": 0.417, "step": 5013 }, { "epoch": 0.988564668769716, "grad_norm": 0.5343518165708425, "learning_rate": 1.7150623915809154e-05, "loss": 0.4317, "step": 5014 }, { "epoch": 0.9887618296529969, "grad_norm": 0.4732197281950188, "learning_rate": 1.7149540267763566e-05, "loss": 0.419, "step": 5015 }, { "epoch": 0.9889589905362776, "grad_norm": 0.5460018301214419, "learning_rate": 1.7148456447944514e-05, "loss": 0.4371, "step": 5016 }, { "epoch": 0.9891561514195584, "grad_norm": 0.5228662093888589, "learning_rate": 1.714737245637804e-05, "loss": 0.4348, "step": 5017 }, { "epoch": 0.9893533123028391, "grad_norm": 0.5136606982937066, "learning_rate": 1.7146288293090187e-05, "loss": 0.4207, "step": 5018 }, { "epoch": 0.9895504731861199, "grad_norm": 0.49381060992162196, "learning_rate": 1.7145203958107005e-05, "loss": 0.4386, "step": 5019 }, { "epoch": 0.9897476340694006, "grad_norm": 0.5097224069523834, "learning_rate": 1.714411945145454e-05, "loss": 0.449, "step": 5020 }, { "epoch": 0.9899447949526814, "grad_norm": 0.48982369331094644, "learning_rate": 1.714303477315886e-05, "loss": 0.4135, "step": 5021 }, { "epoch": 0.9901419558359621, "grad_norm": 0.5348342417259278, "learning_rate": 1.7141949923246007e-05, "loss": 0.4645, "step": 5022 }, { "epoch": 0.9903391167192429, "grad_norm": 0.4935854194155088, "learning_rate": 1.7140864901742062e-05, "loss": 0.4156, "step": 5023 }, { "epoch": 0.9905362776025236, "grad_norm": 0.538994627268955, "learning_rate": 1.7139779708673084e-05, "loss": 0.434, "step": 5024 }, { "epoch": 0.9907334384858044, "grad_norm": 0.5072487466974747, "learning_rate": 1.7138694344065152e-05, "loss": 0.4265, "step": 5025 }, { "epoch": 0.9909305993690851, "grad_norm": 14.35107865712939, "learning_rate": 1.7137608807944337e-05, "loss": 0.4277, "step": 5026 }, { "epoch": 0.9911277602523659, "grad_norm": 0.5905049052128314, "learning_rate": 1.7136523100336725e-05, "loss": 0.4204, "step": 5027 }, { "epoch": 0.9913249211356467, "grad_norm": 0.5954561128927663, "learning_rate": 1.7135437221268397e-05, "loss": 0.4066, "step": 5028 }, { "epoch": 0.9915220820189274, "grad_norm": 0.5481450959305048, "learning_rate": 1.7134351170765443e-05, "loss": 0.434, "step": 5029 }, { "epoch": 0.9917192429022083, "grad_norm": 0.5757944206123269, "learning_rate": 1.7133264948853957e-05, "loss": 0.4157, "step": 5030 }, { "epoch": 0.991916403785489, "grad_norm": 0.5811091665563136, "learning_rate": 1.7132178555560038e-05, "loss": 0.463, "step": 5031 }, { "epoch": 0.9921135646687698, "grad_norm": 0.5095692078366557, "learning_rate": 1.7131091990909786e-05, "loss": 0.4403, "step": 5032 }, { "epoch": 0.9923107255520505, "grad_norm": 0.5285063457801826, "learning_rate": 1.71300052549293e-05, "loss": 0.4018, "step": 5033 }, { "epoch": 0.9925078864353313, "grad_norm": 0.5743875490503324, "learning_rate": 1.7128918347644704e-05, "loss": 0.463, "step": 5034 }, { "epoch": 0.992705047318612, "grad_norm": 0.5553066045950534, "learning_rate": 1.7127831269082103e-05, "loss": 0.4358, "step": 5035 }, { "epoch": 0.9929022082018928, "grad_norm": 0.5201145328150542, "learning_rate": 1.712674401926761e-05, "loss": 0.3973, "step": 5036 }, { "epoch": 0.9930993690851735, "grad_norm": 0.5308940515290245, "learning_rate": 1.7125656598227357e-05, "loss": 0.4505, "step": 5037 }, { "epoch": 0.9932965299684543, "grad_norm": 0.5307597719587108, "learning_rate": 1.7124569005987466e-05, "loss": 0.4296, "step": 5038 }, { "epoch": 0.993493690851735, "grad_norm": 0.5368886391034855, "learning_rate": 1.7123481242574066e-05, "loss": 0.4627, "step": 5039 }, { "epoch": 0.9936908517350158, "grad_norm": 0.5304804270019259, "learning_rate": 1.7122393308013294e-05, "loss": 0.4513, "step": 5040 }, { "epoch": 0.9938880126182965, "grad_norm": 0.4837269924708032, "learning_rate": 1.7121305202331284e-05, "loss": 0.4263, "step": 5041 }, { "epoch": 0.9940851735015773, "grad_norm": 0.538409666606145, "learning_rate": 1.7120216925554185e-05, "loss": 0.4424, "step": 5042 }, { "epoch": 0.994282334384858, "grad_norm": 0.5107171099135106, "learning_rate": 1.7119128477708137e-05, "loss": 0.4143, "step": 5043 }, { "epoch": 0.9944794952681388, "grad_norm": 0.5109327464115445, "learning_rate": 1.7118039858819297e-05, "loss": 0.4406, "step": 5044 }, { "epoch": 0.9946766561514195, "grad_norm": 0.528632568268682, "learning_rate": 1.711695106891382e-05, "loss": 0.4322, "step": 5045 }, { "epoch": 0.9948738170347003, "grad_norm": 0.5281250909959354, "learning_rate": 1.711586210801786e-05, "loss": 0.4486, "step": 5046 }, { "epoch": 0.995070977917981, "grad_norm": 0.5636600965669576, "learning_rate": 1.7114772976157578e-05, "loss": 0.4576, "step": 5047 }, { "epoch": 0.9952681388012619, "grad_norm": 0.5721847302432999, "learning_rate": 1.711368367335915e-05, "loss": 0.4841, "step": 5048 }, { "epoch": 0.9954652996845426, "grad_norm": 0.4984992031526205, "learning_rate": 1.7112594199648742e-05, "loss": 0.4029, "step": 5049 }, { "epoch": 0.9956624605678234, "grad_norm": 0.4985204521738928, "learning_rate": 1.7111504555052533e-05, "loss": 0.4062, "step": 5050 }, { "epoch": 0.9958596214511041, "grad_norm": 0.510291189780098, "learning_rate": 1.7110414739596697e-05, "loss": 0.4291, "step": 5051 }, { "epoch": 0.9960567823343849, "grad_norm": 0.5643661497979349, "learning_rate": 1.710932475330742e-05, "loss": 0.429, "step": 5052 }, { "epoch": 0.9962539432176656, "grad_norm": 0.5118534697302934, "learning_rate": 1.7108234596210892e-05, "loss": 0.4399, "step": 5053 }, { "epoch": 0.9964511041009464, "grad_norm": 0.49932647918668954, "learning_rate": 1.7107144268333307e-05, "loss": 0.383, "step": 5054 }, { "epoch": 0.9966482649842271, "grad_norm": 0.5207920247222175, "learning_rate": 1.7106053769700855e-05, "loss": 0.4259, "step": 5055 }, { "epoch": 0.9968454258675079, "grad_norm": 0.5040712672787065, "learning_rate": 1.7104963100339738e-05, "loss": 0.4396, "step": 5056 }, { "epoch": 0.9970425867507886, "grad_norm": 0.49364985751506657, "learning_rate": 1.7103872260276163e-05, "loss": 0.3975, "step": 5057 }, { "epoch": 0.9972397476340694, "grad_norm": 0.5562215293332458, "learning_rate": 1.7102781249536333e-05, "loss": 0.4626, "step": 5058 }, { "epoch": 0.9974369085173501, "grad_norm": 0.5298513969515571, "learning_rate": 1.7101690068146466e-05, "loss": 0.4142, "step": 5059 }, { "epoch": 0.9976340694006309, "grad_norm": 0.54683487357073, "learning_rate": 1.7100598716132775e-05, "loss": 0.4327, "step": 5060 }, { "epoch": 0.9978312302839116, "grad_norm": 0.5440040494015383, "learning_rate": 1.7099507193521482e-05, "loss": 0.462, "step": 5061 }, { "epoch": 0.9980283911671924, "grad_norm": 0.49509844160455657, "learning_rate": 1.709841550033881e-05, "loss": 0.4183, "step": 5062 }, { "epoch": 0.9982255520504731, "grad_norm": 1.056087374364193, "learning_rate": 1.7097323636610992e-05, "loss": 0.4725, "step": 5063 }, { "epoch": 0.998422712933754, "grad_norm": 0.5237038370525577, "learning_rate": 1.7096231602364257e-05, "loss": 0.414, "step": 5064 }, { "epoch": 0.9986198738170347, "grad_norm": 0.5948843667990235, "learning_rate": 1.7095139397624843e-05, "loss": 0.4527, "step": 5065 }, { "epoch": 0.9988170347003155, "grad_norm": 0.5150733173683878, "learning_rate": 1.7094047022418995e-05, "loss": 0.4904, "step": 5066 }, { "epoch": 0.9990141955835962, "grad_norm": 0.7227117490491202, "learning_rate": 1.709295447677295e-05, "loss": 0.4206, "step": 5067 }, { "epoch": 0.999211356466877, "grad_norm": 0.5268898131828004, "learning_rate": 1.7091861760712963e-05, "loss": 0.4402, "step": 5068 }, { "epoch": 0.9994085173501577, "grad_norm": 0.4895461883001276, "learning_rate": 1.7090768874265285e-05, "loss": 0.43, "step": 5069 }, { "epoch": 0.9996056782334385, "grad_norm": 0.5224645141727187, "learning_rate": 1.7089675817456175e-05, "loss": 0.4484, "step": 5070 }, { "epoch": 0.9998028391167192, "grad_norm": 0.5380192178833433, "learning_rate": 1.7088582590311896e-05, "loss": 0.4619, "step": 5071 }, { "epoch": 1.0, "grad_norm": 0.48949137873370546, "learning_rate": 1.708748919285871e-05, "loss": 0.4243, "step": 5072 }, { "epoch": 1.0, "eval_loss": 0.4332016706466675, "eval_runtime": 344.5029, "eval_samples_per_second": 23.599, "eval_steps_per_second": 1.477, "step": 5072 }, { "epoch": 1.0001971608832807, "grad_norm": 0.5212823866085835, "learning_rate": 1.7086395625122888e-05, "loss": 0.4414, "step": 5073 }, { "epoch": 1.0001971608832807, "grad_norm": 0.5373287163559608, "learning_rate": 1.7085301887130708e-05, "loss": 0.3227, "step": 5074 }, { "epoch": 1.0003943217665616, "grad_norm": 0.6038566158503993, "learning_rate": 1.708420797890844e-05, "loss": 0.3653, "step": 5075 }, { "epoch": 1.0005914826498423, "grad_norm": 0.6164839814322967, "learning_rate": 1.7083113900482374e-05, "loss": 0.3345, "step": 5076 }, { "epoch": 1.000788643533123, "grad_norm": 0.5898548067122023, "learning_rate": 1.708201965187879e-05, "loss": 0.2978, "step": 5077 }, { "epoch": 1.0009858044164037, "grad_norm": 0.6086636984908, "learning_rate": 1.708092523312398e-05, "loss": 0.3338, "step": 5078 }, { "epoch": 1.0011829652996846, "grad_norm": 0.85830944121542, "learning_rate": 1.707983064424424e-05, "loss": 0.3628, "step": 5079 }, { "epoch": 1.0013801261829653, "grad_norm": 0.7446958830040732, "learning_rate": 1.7078735885265872e-05, "loss": 0.3397, "step": 5080 }, { "epoch": 1.001577287066246, "grad_norm": 0.5753953072853087, "learning_rate": 1.707764095621517e-05, "loss": 0.3338, "step": 5081 }, { "epoch": 1.0017744479495267, "grad_norm": 0.5313937087435588, "learning_rate": 1.707654585711844e-05, "loss": 0.3294, "step": 5082 }, { "epoch": 1.0019716088328077, "grad_norm": 0.5433192534151587, "learning_rate": 1.7075450588002004e-05, "loss": 0.3005, "step": 5083 }, { "epoch": 1.0021687697160884, "grad_norm": 0.5974671475978584, "learning_rate": 1.7074355148892167e-05, "loss": 0.3355, "step": 5084 }, { "epoch": 1.002365930599369, "grad_norm": 0.5292036690619668, "learning_rate": 1.707325953981525e-05, "loss": 0.3133, "step": 5085 }, { "epoch": 1.0025630914826498, "grad_norm": 0.5791264643946715, "learning_rate": 1.707216376079758e-05, "loss": 0.3628, "step": 5086 }, { "epoch": 1.0027602523659307, "grad_norm": 0.5367457017037465, "learning_rate": 1.7071067811865477e-05, "loss": 0.323, "step": 5087 }, { "epoch": 1.0029574132492114, "grad_norm": 0.5463141350198112, "learning_rate": 1.7069971693045276e-05, "loss": 0.3191, "step": 5088 }, { "epoch": 1.003154574132492, "grad_norm": 0.5683958910749297, "learning_rate": 1.706887540436331e-05, "loss": 0.3338, "step": 5089 }, { "epoch": 1.0033517350157728, "grad_norm": 2.555600213008143, "learning_rate": 1.7067778945845923e-05, "loss": 0.3227, "step": 5090 }, { "epoch": 1.0035488958990537, "grad_norm": 0.602035566632611, "learning_rate": 1.7066682317519453e-05, "loss": 0.3194, "step": 5091 }, { "epoch": 1.0037460567823344, "grad_norm": 0.5294253122717554, "learning_rate": 1.7065585519410253e-05, "loss": 0.3259, "step": 5092 }, { "epoch": 1.0039432176656151, "grad_norm": 0.534836832974196, "learning_rate": 1.706448855154467e-05, "loss": 0.3003, "step": 5093 }, { "epoch": 1.0041403785488958, "grad_norm": 0.5351347670561281, "learning_rate": 1.7063391413949056e-05, "loss": 0.3055, "step": 5094 }, { "epoch": 1.0043375394321767, "grad_norm": 0.5619292444300296, "learning_rate": 1.7062294106649777e-05, "loss": 0.3276, "step": 5095 }, { "epoch": 1.0045347003154574, "grad_norm": 0.5124532518746542, "learning_rate": 1.7061196629673198e-05, "loss": 0.3216, "step": 5096 }, { "epoch": 1.0047318611987381, "grad_norm": 0.5657829767971062, "learning_rate": 1.706009898304568e-05, "loss": 0.3475, "step": 5097 }, { "epoch": 1.0049290220820188, "grad_norm": 0.5118747834196178, "learning_rate": 1.7059001166793604e-05, "loss": 0.3161, "step": 5098 }, { "epoch": 1.0051261829652998, "grad_norm": 0.5506568794847809, "learning_rate": 1.7057903180943334e-05, "loss": 0.3096, "step": 5099 }, { "epoch": 1.0053233438485805, "grad_norm": 0.831367439521069, "learning_rate": 1.7056805025521258e-05, "loss": 0.3331, "step": 5100 }, { "epoch": 1.0055205047318612, "grad_norm": 1.070362368316711, "learning_rate": 1.705570670055376e-05, "loss": 0.316, "step": 5101 }, { "epoch": 1.0057176656151419, "grad_norm": 0.5858155470361337, "learning_rate": 1.7054608206067225e-05, "loss": 0.3482, "step": 5102 }, { "epoch": 1.0059148264984228, "grad_norm": 1.0138659359886917, "learning_rate": 1.705350954208805e-05, "loss": 0.3311, "step": 5103 }, { "epoch": 1.0061119873817035, "grad_norm": 0.5577957395835643, "learning_rate": 1.705241070864262e-05, "loss": 0.3392, "step": 5104 }, { "epoch": 1.0063091482649842, "grad_norm": 0.5829503084317307, "learning_rate": 1.7051311705757353e-05, "loss": 0.3484, "step": 5105 }, { "epoch": 1.0065063091482649, "grad_norm": 0.5629335133052799, "learning_rate": 1.7050212533458637e-05, "loss": 0.3403, "step": 5106 }, { "epoch": 1.0067034700315458, "grad_norm": 0.545530969036501, "learning_rate": 1.7049113191772892e-05, "loss": 0.3129, "step": 5107 }, { "epoch": 1.0069006309148265, "grad_norm": 0.5766133922035589, "learning_rate": 1.7048013680726524e-05, "loss": 0.3228, "step": 5108 }, { "epoch": 1.0070977917981072, "grad_norm": 0.49331230147259286, "learning_rate": 1.7046914000345955e-05, "loss": 0.2823, "step": 5109 }, { "epoch": 1.007294952681388, "grad_norm": 10.932286242518197, "learning_rate": 1.7045814150657597e-05, "loss": 0.3303, "step": 5110 }, { "epoch": 1.0074921135646688, "grad_norm": 0.6743010499249699, "learning_rate": 1.704471413168788e-05, "loss": 0.3184, "step": 5111 }, { "epoch": 1.0076892744479495, "grad_norm": 0.5417814723560809, "learning_rate": 1.7043613943463236e-05, "loss": 0.3313, "step": 5112 }, { "epoch": 1.0078864353312302, "grad_norm": 0.6255338295712346, "learning_rate": 1.7042513586010096e-05, "loss": 0.3199, "step": 5113 }, { "epoch": 1.008083596214511, "grad_norm": 0.5321460876030243, "learning_rate": 1.7041413059354893e-05, "loss": 0.3015, "step": 5114 }, { "epoch": 1.0082807570977919, "grad_norm": 0.5526276788887993, "learning_rate": 1.704031236352407e-05, "loss": 0.3266, "step": 5115 }, { "epoch": 1.0084779179810726, "grad_norm": 0.6188868104429317, "learning_rate": 1.7039211498544075e-05, "loss": 0.347, "step": 5116 }, { "epoch": 1.0086750788643533, "grad_norm": 0.5552679635253073, "learning_rate": 1.7038110464441354e-05, "loss": 0.3225, "step": 5117 }, { "epoch": 1.008872239747634, "grad_norm": 0.525568513553716, "learning_rate": 1.703700926124236e-05, "loss": 0.3194, "step": 5118 }, { "epoch": 1.0090694006309149, "grad_norm": 0.6134298315197882, "learning_rate": 1.7035907888973556e-05, "loss": 0.3509, "step": 5119 }, { "epoch": 1.0092665615141956, "grad_norm": 0.5390374770864321, "learning_rate": 1.7034806347661398e-05, "loss": 0.3143, "step": 5120 }, { "epoch": 1.0094637223974763, "grad_norm": 0.5293103732186882, "learning_rate": 1.703370463733235e-05, "loss": 0.3249, "step": 5121 }, { "epoch": 1.0096608832807572, "grad_norm": 0.7725038009082341, "learning_rate": 1.7032602758012884e-05, "loss": 0.3411, "step": 5122 }, { "epoch": 1.009858044164038, "grad_norm": 0.5621329825507172, "learning_rate": 1.703150070972947e-05, "loss": 0.3549, "step": 5123 }, { "epoch": 1.0100552050473186, "grad_norm": 0.5337193860173397, "learning_rate": 1.7030398492508595e-05, "loss": 0.3117, "step": 5124 }, { "epoch": 1.0102523659305993, "grad_norm": 0.5648883470891011, "learning_rate": 1.7029296106376732e-05, "loss": 0.3125, "step": 5125 }, { "epoch": 1.0104495268138802, "grad_norm": 0.5193119110839971, "learning_rate": 1.702819355136037e-05, "loss": 0.3176, "step": 5126 }, { "epoch": 1.010646687697161, "grad_norm": 1.1538498444708232, "learning_rate": 1.7027090827486e-05, "loss": 0.3035, "step": 5127 }, { "epoch": 1.0108438485804416, "grad_norm": 0.5536713733939184, "learning_rate": 1.702598793478011e-05, "loss": 0.3113, "step": 5128 }, { "epoch": 1.0110410094637223, "grad_norm": 0.5121398955428231, "learning_rate": 1.7024884873269206e-05, "loss": 0.2913, "step": 5129 }, { "epoch": 1.0112381703470033, "grad_norm": 0.7433927419882325, "learning_rate": 1.7023781642979786e-05, "loss": 0.3399, "step": 5130 }, { "epoch": 1.011435331230284, "grad_norm": 0.5206836608515906, "learning_rate": 1.7022678243938352e-05, "loss": 0.2917, "step": 5131 }, { "epoch": 1.0116324921135647, "grad_norm": 0.652680038981418, "learning_rate": 1.7021574676171418e-05, "loss": 0.3144, "step": 5132 }, { "epoch": 1.0118296529968454, "grad_norm": 0.5271509086644123, "learning_rate": 1.70204709397055e-05, "loss": 0.3219, "step": 5133 }, { "epoch": 1.0120268138801263, "grad_norm": 0.6052127133577494, "learning_rate": 1.7019367034567115e-05, "loss": 0.3163, "step": 5134 }, { "epoch": 1.012223974763407, "grad_norm": 0.4968211645367436, "learning_rate": 1.7018262960782783e-05, "loss": 0.312, "step": 5135 }, { "epoch": 1.0124211356466877, "grad_norm": 0.5506646508851154, "learning_rate": 1.701715871837903e-05, "loss": 0.3277, "step": 5136 }, { "epoch": 1.0126182965299684, "grad_norm": 0.5474780371203971, "learning_rate": 1.7016054307382387e-05, "loss": 0.3104, "step": 5137 }, { "epoch": 1.0128154574132493, "grad_norm": 0.5652503845654994, "learning_rate": 1.7014949727819395e-05, "loss": 0.3375, "step": 5138 }, { "epoch": 1.01301261829653, "grad_norm": 0.5762231960940658, "learning_rate": 1.701384497971658e-05, "loss": 0.3515, "step": 5139 }, { "epoch": 1.0132097791798107, "grad_norm": 0.5555119994103955, "learning_rate": 1.7012740063100495e-05, "loss": 0.304, "step": 5140 }, { "epoch": 1.0134069400630914, "grad_norm": 0.5638374760070589, "learning_rate": 1.7011634977997683e-05, "loss": 0.3496, "step": 5141 }, { "epoch": 1.0136041009463723, "grad_norm": 0.5643948965503662, "learning_rate": 1.701052972443469e-05, "loss": 0.3397, "step": 5142 }, { "epoch": 1.013801261829653, "grad_norm": 0.5135163466842045, "learning_rate": 1.700942430243808e-05, "loss": 0.301, "step": 5143 }, { "epoch": 1.0139984227129337, "grad_norm": 0.5539748285463232, "learning_rate": 1.7008318712034405e-05, "loss": 0.3244, "step": 5144 }, { "epoch": 1.0141955835962144, "grad_norm": 0.5740027085216346, "learning_rate": 1.700721295325023e-05, "loss": 0.314, "step": 5145 }, { "epoch": 1.0143927444794953, "grad_norm": 0.4898966401251555, "learning_rate": 1.7006107026112117e-05, "loss": 0.3106, "step": 5146 }, { "epoch": 1.014589905362776, "grad_norm": 0.5414399089916463, "learning_rate": 1.7005000930646643e-05, "loss": 0.2973, "step": 5147 }, { "epoch": 1.0147870662460567, "grad_norm": 0.5389719439899145, "learning_rate": 1.700389466688038e-05, "loss": 0.3285, "step": 5148 }, { "epoch": 1.0149842271293374, "grad_norm": 0.5180514047808813, "learning_rate": 1.7002788234839908e-05, "loss": 0.3141, "step": 5149 }, { "epoch": 1.0151813880126184, "grad_norm": 0.5437706206122538, "learning_rate": 1.7001681634551813e-05, "loss": 0.3277, "step": 5150 }, { "epoch": 1.015378548895899, "grad_norm": 1.396200116042807, "learning_rate": 1.700057486604267e-05, "loss": 0.3558, "step": 5151 }, { "epoch": 1.0155757097791798, "grad_norm": 0.563327018363544, "learning_rate": 1.6999467929339086e-05, "loss": 0.3421, "step": 5152 }, { "epoch": 1.0157728706624605, "grad_norm": 0.5184359383889053, "learning_rate": 1.6998360824467644e-05, "loss": 0.3295, "step": 5153 }, { "epoch": 1.0159700315457414, "grad_norm": 0.5208557657594138, "learning_rate": 1.6997253551454948e-05, "loss": 0.2896, "step": 5154 }, { "epoch": 1.016167192429022, "grad_norm": 0.5291211490149857, "learning_rate": 1.6996146110327604e-05, "loss": 0.3234, "step": 5155 }, { "epoch": 1.0163643533123028, "grad_norm": 0.7659645289802868, "learning_rate": 1.699503850111221e-05, "loss": 0.3266, "step": 5156 }, { "epoch": 1.0165615141955835, "grad_norm": 0.5341642375091761, "learning_rate": 1.699393072383539e-05, "loss": 0.3227, "step": 5157 }, { "epoch": 1.0167586750788644, "grad_norm": 0.5237990761161321, "learning_rate": 1.6992822778523745e-05, "loss": 0.3156, "step": 5158 }, { "epoch": 1.0169558359621451, "grad_norm": 0.5556636270933202, "learning_rate": 1.6991714665203905e-05, "loss": 0.3382, "step": 5159 }, { "epoch": 1.0171529968454258, "grad_norm": 0.5169961857756208, "learning_rate": 1.699060638390249e-05, "loss": 0.3083, "step": 5160 }, { "epoch": 1.0173501577287065, "grad_norm": 0.577712432873463, "learning_rate": 1.6989497934646128e-05, "loss": 0.3235, "step": 5161 }, { "epoch": 1.0175473186119874, "grad_norm": 0.48166753712469484, "learning_rate": 1.6988389317461448e-05, "loss": 0.3296, "step": 5162 }, { "epoch": 1.0177444794952681, "grad_norm": 0.5588468932069051, "learning_rate": 1.6987280532375082e-05, "loss": 0.3205, "step": 5163 }, { "epoch": 1.0179416403785488, "grad_norm": 0.5022075140943705, "learning_rate": 1.698617157941368e-05, "loss": 0.3183, "step": 5164 }, { "epoch": 1.0181388012618298, "grad_norm": 0.5824010042694542, "learning_rate": 1.698506245860388e-05, "loss": 0.3339, "step": 5165 }, { "epoch": 1.0183359621451105, "grad_norm": 0.48671865197581843, "learning_rate": 1.6983953169972333e-05, "loss": 0.3063, "step": 5166 }, { "epoch": 1.0185331230283912, "grad_norm": 0.5664051734703313, "learning_rate": 1.6982843713545678e-05, "loss": 0.318, "step": 5167 }, { "epoch": 1.0187302839116719, "grad_norm": 0.5307701667465677, "learning_rate": 1.6981734089350585e-05, "loss": 0.3375, "step": 5168 }, { "epoch": 1.0189274447949528, "grad_norm": 0.5241759654651865, "learning_rate": 1.698062429741371e-05, "loss": 0.3219, "step": 5169 }, { "epoch": 1.0191246056782335, "grad_norm": 0.5450182402988996, "learning_rate": 1.697951433776171e-05, "loss": 0.3373, "step": 5170 }, { "epoch": 1.0193217665615142, "grad_norm": 0.5065193637889227, "learning_rate": 1.6978404210421257e-05, "loss": 0.3224, "step": 5171 }, { "epoch": 1.0195189274447949, "grad_norm": 0.5440437100960368, "learning_rate": 1.6977293915419025e-05, "loss": 0.346, "step": 5172 }, { "epoch": 1.0197160883280758, "grad_norm": 0.5167797629051604, "learning_rate": 1.697618345278169e-05, "loss": 0.3114, "step": 5173 }, { "epoch": 1.0199132492113565, "grad_norm": 0.5242186802344402, "learning_rate": 1.6975072822535924e-05, "loss": 0.3092, "step": 5174 }, { "epoch": 1.0201104100946372, "grad_norm": 0.5201227294258078, "learning_rate": 1.6973962024708425e-05, "loss": 0.331, "step": 5175 }, { "epoch": 1.020307570977918, "grad_norm": 0.5072548524687776, "learning_rate": 1.6972851059325866e-05, "loss": 0.3226, "step": 5176 }, { "epoch": 1.0205047318611988, "grad_norm": 0.5003313380328426, "learning_rate": 1.6971739926414946e-05, "loss": 0.3188, "step": 5177 }, { "epoch": 1.0207018927444795, "grad_norm": 0.5360098965820386, "learning_rate": 1.6970628626002362e-05, "loss": 0.3379, "step": 5178 }, { "epoch": 1.0208990536277602, "grad_norm": 0.5006169156718928, "learning_rate": 1.6969517158114807e-05, "loss": 0.33, "step": 5179 }, { "epoch": 1.021096214511041, "grad_norm": 0.4797157904337746, "learning_rate": 1.6968405522778996e-05, "loss": 0.3035, "step": 5180 }, { "epoch": 1.0212933753943219, "grad_norm": 0.5189338955937625, "learning_rate": 1.6967293720021628e-05, "loss": 0.3063, "step": 5181 }, { "epoch": 1.0214905362776026, "grad_norm": 0.5284709213206259, "learning_rate": 1.6966181749869417e-05, "loss": 0.3307, "step": 5182 }, { "epoch": 1.0216876971608833, "grad_norm": 0.5586108885453471, "learning_rate": 1.6965069612349082e-05, "loss": 0.3354, "step": 5183 }, { "epoch": 1.021884858044164, "grad_norm": 0.4825892059596201, "learning_rate": 1.6963957307487337e-05, "loss": 0.3116, "step": 5184 }, { "epoch": 1.0220820189274449, "grad_norm": 0.5626137147948211, "learning_rate": 1.6962844835310912e-05, "loss": 0.3449, "step": 5185 }, { "epoch": 1.0222791798107256, "grad_norm": 0.4957252119548732, "learning_rate": 1.696173219584653e-05, "loss": 0.3201, "step": 5186 }, { "epoch": 1.0224763406940063, "grad_norm": 0.5997034402929716, "learning_rate": 1.696061938912093e-05, "loss": 0.3329, "step": 5187 }, { "epoch": 1.022673501577287, "grad_norm": 0.5074547201736874, "learning_rate": 1.6959506415160838e-05, "loss": 0.3124, "step": 5188 }, { "epoch": 1.022870662460568, "grad_norm": 0.525820644885496, "learning_rate": 1.6958393273993004e-05, "loss": 0.3149, "step": 5189 }, { "epoch": 1.0230678233438486, "grad_norm": 0.49149463015492956, "learning_rate": 1.695727996564417e-05, "loss": 0.3033, "step": 5190 }, { "epoch": 1.0232649842271293, "grad_norm": 0.5812486165845497, "learning_rate": 1.6956166490141076e-05, "loss": 0.3592, "step": 5191 }, { "epoch": 1.02346214511041, "grad_norm": 0.47448602856705274, "learning_rate": 1.6955052847510486e-05, "loss": 0.298, "step": 5192 }, { "epoch": 1.023659305993691, "grad_norm": 0.4933808645190031, "learning_rate": 1.6953939037779147e-05, "loss": 0.3174, "step": 5193 }, { "epoch": 1.0238564668769716, "grad_norm": 0.49672191971801205, "learning_rate": 1.6952825060973826e-05, "loss": 0.3035, "step": 5194 }, { "epoch": 1.0240536277602523, "grad_norm": 0.5303109710476851, "learning_rate": 1.695171091712128e-05, "loss": 0.3174, "step": 5195 }, { "epoch": 1.024250788643533, "grad_norm": 0.4983343105685653, "learning_rate": 1.6950596606248283e-05, "loss": 0.325, "step": 5196 }, { "epoch": 1.024447949526814, "grad_norm": 0.5149354624056497, "learning_rate": 1.6949482128381607e-05, "loss": 0.3172, "step": 5197 }, { "epoch": 1.0246451104100947, "grad_norm": 0.505149696080138, "learning_rate": 1.694836748354802e-05, "loss": 0.3276, "step": 5198 }, { "epoch": 1.0248422712933754, "grad_norm": 0.5800181685156677, "learning_rate": 1.6947252671774317e-05, "loss": 0.3669, "step": 5199 }, { "epoch": 1.025039432176656, "grad_norm": 0.530041440210273, "learning_rate": 1.694613769308727e-05, "loss": 0.3162, "step": 5200 }, { "epoch": 1.025236593059937, "grad_norm": 0.5328709074168889, "learning_rate": 1.6945022547513672e-05, "loss": 0.3364, "step": 5201 }, { "epoch": 1.0254337539432177, "grad_norm": 0.5659050716130124, "learning_rate": 1.694390723508031e-05, "loss": 0.352, "step": 5202 }, { "epoch": 1.0256309148264984, "grad_norm": 0.47684840324515954, "learning_rate": 1.694279175581399e-05, "loss": 0.2927, "step": 5203 }, { "epoch": 1.025828075709779, "grad_norm": 0.5261599552118007, "learning_rate": 1.6941676109741506e-05, "loss": 0.3474, "step": 5204 }, { "epoch": 1.02602523659306, "grad_norm": 0.5295012959661953, "learning_rate": 1.694056029688966e-05, "loss": 0.3451, "step": 5205 }, { "epoch": 1.0262223974763407, "grad_norm": 0.520003663405905, "learning_rate": 1.693944431728527e-05, "loss": 0.3223, "step": 5206 }, { "epoch": 1.0264195583596214, "grad_norm": 0.4972533544450635, "learning_rate": 1.693832817095514e-05, "loss": 0.329, "step": 5207 }, { "epoch": 1.0266167192429023, "grad_norm": 0.49743818639590465, "learning_rate": 1.6937211857926087e-05, "loss": 0.3246, "step": 5208 }, { "epoch": 1.026813880126183, "grad_norm": 0.5108957204240979, "learning_rate": 1.693609537822493e-05, "loss": 0.3604, "step": 5209 }, { "epoch": 1.0270110410094637, "grad_norm": 0.5027559180142284, "learning_rate": 1.69349787318785e-05, "loss": 0.3127, "step": 5210 }, { "epoch": 1.0272082018927444, "grad_norm": 0.5060763508870827, "learning_rate": 1.6933861918913617e-05, "loss": 0.3157, "step": 5211 }, { "epoch": 1.0274053627760253, "grad_norm": 0.5587497322838068, "learning_rate": 1.693274493935712e-05, "loss": 0.3335, "step": 5212 }, { "epoch": 1.027602523659306, "grad_norm": 0.5328215936145846, "learning_rate": 1.6931627793235845e-05, "loss": 0.3401, "step": 5213 }, { "epoch": 1.0277996845425867, "grad_norm": 0.4746075756485485, "learning_rate": 1.693051048057663e-05, "loss": 0.2946, "step": 5214 }, { "epoch": 1.0279968454258674, "grad_norm": 0.5138896173325503, "learning_rate": 1.6929393001406317e-05, "loss": 0.3183, "step": 5215 }, { "epoch": 1.0281940063091484, "grad_norm": 0.6224304648800318, "learning_rate": 1.6928275355751758e-05, "loss": 0.3385, "step": 5216 }, { "epoch": 1.028391167192429, "grad_norm": 0.4778598347974904, "learning_rate": 1.69271575436398e-05, "loss": 0.2942, "step": 5217 }, { "epoch": 1.0285883280757098, "grad_norm": 0.5206657983017704, "learning_rate": 1.6926039565097313e-05, "loss": 0.3232, "step": 5218 }, { "epoch": 1.0287854889589905, "grad_norm": 0.517735918520925, "learning_rate": 1.6924921420151143e-05, "loss": 0.3415, "step": 5219 }, { "epoch": 1.0289826498422714, "grad_norm": 0.49436293737722586, "learning_rate": 1.6923803108828155e-05, "loss": 0.2997, "step": 5220 }, { "epoch": 1.029179810725552, "grad_norm": 0.48751091503344113, "learning_rate": 1.6922684631155226e-05, "loss": 0.3047, "step": 5221 }, { "epoch": 1.0293769716088328, "grad_norm": 0.520881620751667, "learning_rate": 1.6921565987159226e-05, "loss": 0.335, "step": 5222 }, { "epoch": 1.0295741324921135, "grad_norm": 0.6347905848505744, "learning_rate": 1.6920447176867022e-05, "loss": 0.2997, "step": 5223 }, { "epoch": 1.0297712933753944, "grad_norm": 0.5493020000879799, "learning_rate": 1.6919328200305507e-05, "loss": 0.3646, "step": 5224 }, { "epoch": 1.0299684542586751, "grad_norm": 0.5057286317589824, "learning_rate": 1.691820905750156e-05, "loss": 0.3125, "step": 5225 }, { "epoch": 1.0301656151419558, "grad_norm": 0.5040769564600079, "learning_rate": 1.691708974848207e-05, "loss": 0.3069, "step": 5226 }, { "epoch": 1.0303627760252365, "grad_norm": 0.5449234431968593, "learning_rate": 1.6915970273273927e-05, "loss": 0.3322, "step": 5227 }, { "epoch": 1.0305599369085174, "grad_norm": 0.5763354754529781, "learning_rate": 1.6914850631904027e-05, "loss": 0.3389, "step": 5228 }, { "epoch": 1.0307570977917981, "grad_norm": 0.51550599843538, "learning_rate": 1.6913730824399274e-05, "loss": 0.3618, "step": 5229 }, { "epoch": 1.0309542586750788, "grad_norm": 0.5422495461192716, "learning_rate": 1.691261085078657e-05, "loss": 0.328, "step": 5230 }, { "epoch": 1.0311514195583595, "grad_norm": 0.49409053708036643, "learning_rate": 1.6911490711092824e-05, "loss": 0.3175, "step": 5231 }, { "epoch": 1.0313485804416405, "grad_norm": 0.5082339134425249, "learning_rate": 1.6910370405344948e-05, "loss": 0.3372, "step": 5232 }, { "epoch": 1.0315457413249212, "grad_norm": 0.5304668788200992, "learning_rate": 1.6909249933569856e-05, "loss": 0.3237, "step": 5233 }, { "epoch": 1.0317429022082019, "grad_norm": 0.5133955618334345, "learning_rate": 1.690812929579447e-05, "loss": 0.3391, "step": 5234 }, { "epoch": 1.0319400630914826, "grad_norm": 0.5657397770422078, "learning_rate": 1.690700849204572e-05, "loss": 0.3315, "step": 5235 }, { "epoch": 1.0321372239747635, "grad_norm": 0.5017068565390534, "learning_rate": 1.6905887522350522e-05, "loss": 0.3137, "step": 5236 }, { "epoch": 1.0323343848580442, "grad_norm": 0.49468986516238367, "learning_rate": 1.690476638673582e-05, "loss": 0.3166, "step": 5237 }, { "epoch": 1.0325315457413249, "grad_norm": 0.4900916730092504, "learning_rate": 1.690364508522854e-05, "loss": 0.3041, "step": 5238 }, { "epoch": 1.0327287066246056, "grad_norm": 0.4786911698428022, "learning_rate": 1.6902523617855633e-05, "loss": 0.3154, "step": 5239 }, { "epoch": 1.0329258675078865, "grad_norm": 0.5146263841992783, "learning_rate": 1.6901401984644034e-05, "loss": 0.2831, "step": 5240 }, { "epoch": 1.0331230283911672, "grad_norm": 0.5189552944117162, "learning_rate": 1.6900280185620697e-05, "loss": 0.31, "step": 5241 }, { "epoch": 1.033320189274448, "grad_norm": 0.5532426804255108, "learning_rate": 1.689915822081257e-05, "loss": 0.329, "step": 5242 }, { "epoch": 1.0335173501577286, "grad_norm": 0.8076083652448988, "learning_rate": 1.689803609024661e-05, "loss": 0.3378, "step": 5243 }, { "epoch": 1.0337145110410095, "grad_norm": 0.5725048421083666, "learning_rate": 1.6896913793949782e-05, "loss": 0.3247, "step": 5244 }, { "epoch": 1.0339116719242902, "grad_norm": 0.49855659279047376, "learning_rate": 1.6895791331949045e-05, "loss": 0.32, "step": 5245 }, { "epoch": 1.034108832807571, "grad_norm": 0.6416688834979003, "learning_rate": 1.6894668704271363e-05, "loss": 0.3466, "step": 5246 }, { "epoch": 1.0343059936908516, "grad_norm": 0.51271995997633, "learning_rate": 1.6893545910943718e-05, "loss": 0.3314, "step": 5247 }, { "epoch": 1.0345031545741326, "grad_norm": 0.5357629294597502, "learning_rate": 1.689242295199308e-05, "loss": 0.3238, "step": 5248 }, { "epoch": 1.0347003154574133, "grad_norm": 2.907322545828229, "learning_rate": 1.6891299827446428e-05, "loss": 0.38, "step": 5249 }, { "epoch": 1.034897476340694, "grad_norm": 0.5492499750569434, "learning_rate": 1.689017653733075e-05, "loss": 0.3136, "step": 5250 }, { "epoch": 1.0350946372239747, "grad_norm": 0.5411422818363292, "learning_rate": 1.688905308167303e-05, "loss": 0.3282, "step": 5251 }, { "epoch": 1.0352917981072556, "grad_norm": 0.5143337197907343, "learning_rate": 1.6887929460500264e-05, "loss": 0.2836, "step": 5252 }, { "epoch": 1.0354889589905363, "grad_norm": 0.5153934469793959, "learning_rate": 1.688680567383945e-05, "loss": 0.3336, "step": 5253 }, { "epoch": 1.035686119873817, "grad_norm": 0.5296091136769037, "learning_rate": 1.6885681721717575e-05, "loss": 0.3246, "step": 5254 }, { "epoch": 1.0358832807570977, "grad_norm": 0.5144127492160583, "learning_rate": 1.688455760416166e-05, "loss": 0.3193, "step": 5255 }, { "epoch": 1.0360804416403786, "grad_norm": 0.5575440600913101, "learning_rate": 1.6883433321198697e-05, "loss": 0.3244, "step": 5256 }, { "epoch": 1.0362776025236593, "grad_norm": 0.4886892524188935, "learning_rate": 1.688230887285571e-05, "loss": 0.2849, "step": 5257 }, { "epoch": 1.03647476340694, "grad_norm": 0.5024265088647863, "learning_rate": 1.6881184259159708e-05, "loss": 0.3105, "step": 5258 }, { "epoch": 1.036671924290221, "grad_norm": 0.5130152621541834, "learning_rate": 1.6880059480137715e-05, "loss": 0.3275, "step": 5259 }, { "epoch": 1.0368690851735016, "grad_norm": 0.4909264681540603, "learning_rate": 1.687893453581675e-05, "loss": 0.3102, "step": 5260 }, { "epoch": 1.0370662460567823, "grad_norm": 0.5196024238559768, "learning_rate": 1.6877809426223846e-05, "loss": 0.3231, "step": 5261 }, { "epoch": 1.037263406940063, "grad_norm": 0.5799374687724274, "learning_rate": 1.6876684151386028e-05, "loss": 0.3441, "step": 5262 }, { "epoch": 1.037460567823344, "grad_norm": 0.5397818543610164, "learning_rate": 1.687555871133034e-05, "loss": 0.3306, "step": 5263 }, { "epoch": 1.0376577287066246, "grad_norm": 8.650236468601472, "learning_rate": 1.6874433106083815e-05, "loss": 0.3565, "step": 5264 }, { "epoch": 1.0378548895899053, "grad_norm": 0.653217018292931, "learning_rate": 1.6873307335673498e-05, "loss": 0.3491, "step": 5265 }, { "epoch": 1.038052050473186, "grad_norm": 0.5706483442294323, "learning_rate": 1.6872181400126434e-05, "loss": 0.3686, "step": 5266 }, { "epoch": 1.038249211356467, "grad_norm": 1.8045203865710944, "learning_rate": 1.6871055299469686e-05, "loss": 0.3007, "step": 5267 }, { "epoch": 1.0384463722397477, "grad_norm": 0.5787568141280821, "learning_rate": 1.6869929033730294e-05, "loss": 0.339, "step": 5268 }, { "epoch": 1.0386435331230284, "grad_norm": 0.5659159080692836, "learning_rate": 1.6868802602935327e-05, "loss": 0.3399, "step": 5269 }, { "epoch": 1.038840694006309, "grad_norm": 0.6818154220606805, "learning_rate": 1.6867676007111847e-05, "loss": 0.3008, "step": 5270 }, { "epoch": 1.03903785488959, "grad_norm": 0.617492098823351, "learning_rate": 1.6866549246286918e-05, "loss": 0.3438, "step": 5271 }, { "epoch": 1.0392350157728707, "grad_norm": 0.49987762250168705, "learning_rate": 1.6865422320487617e-05, "loss": 0.295, "step": 5272 }, { "epoch": 1.0394321766561514, "grad_norm": 0.5542454344482823, "learning_rate": 1.6864295229741014e-05, "loss": 0.3195, "step": 5273 }, { "epoch": 1.039629337539432, "grad_norm": 0.5477043440442139, "learning_rate": 1.686316797407419e-05, "loss": 0.3124, "step": 5274 }, { "epoch": 1.039826498422713, "grad_norm": 0.541129200628956, "learning_rate": 1.6862040553514227e-05, "loss": 0.3037, "step": 5275 }, { "epoch": 1.0400236593059937, "grad_norm": 0.5301479545638493, "learning_rate": 1.686091296808822e-05, "loss": 0.3073, "step": 5276 }, { "epoch": 1.0402208201892744, "grad_norm": 0.5425477118739194, "learning_rate": 1.6859785217823247e-05, "loss": 0.3094, "step": 5277 }, { "epoch": 1.0404179810725551, "grad_norm": 20.30229094176611, "learning_rate": 1.6858657302746412e-05, "loss": 0.3857, "step": 5278 }, { "epoch": 1.040615141955836, "grad_norm": 0.5535859868074469, "learning_rate": 1.6857529222884813e-05, "loss": 0.3197, "step": 5279 }, { "epoch": 1.0408123028391167, "grad_norm": 0.5253753488109291, "learning_rate": 1.685640097826555e-05, "loss": 0.3229, "step": 5280 }, { "epoch": 1.0410094637223974, "grad_norm": 0.5265804552077402, "learning_rate": 1.6855272568915738e-05, "loss": 0.3349, "step": 5281 }, { "epoch": 1.0412066246056781, "grad_norm": 0.5239300594186814, "learning_rate": 1.6854143994862476e-05, "loss": 0.3265, "step": 5282 }, { "epoch": 1.041403785488959, "grad_norm": 0.503539634551038, "learning_rate": 1.685301525613289e-05, "loss": 0.3356, "step": 5283 }, { "epoch": 1.0416009463722398, "grad_norm": 0.540113881476233, "learning_rate": 1.685188635275409e-05, "loss": 0.3271, "step": 5284 }, { "epoch": 1.0417981072555205, "grad_norm": 0.5497912101764829, "learning_rate": 1.6850757284753202e-05, "loss": 0.3238, "step": 5285 }, { "epoch": 1.0419952681388012, "grad_norm": 0.5419609896862225, "learning_rate": 1.6849628052157353e-05, "loss": 0.328, "step": 5286 }, { "epoch": 1.042192429022082, "grad_norm": 0.5282911482063344, "learning_rate": 1.6848498654993676e-05, "loss": 0.3391, "step": 5287 }, { "epoch": 1.0423895899053628, "grad_norm": 0.5365488857264754, "learning_rate": 1.6847369093289304e-05, "loss": 0.3443, "step": 5288 }, { "epoch": 1.0425867507886435, "grad_norm": 0.4884175319952028, "learning_rate": 1.6846239367071372e-05, "loss": 0.33, "step": 5289 }, { "epoch": 1.0427839116719242, "grad_norm": 0.564983002840711, "learning_rate": 1.684510947636703e-05, "loss": 0.3469, "step": 5290 }, { "epoch": 1.0429810725552051, "grad_norm": 0.5449133450818839, "learning_rate": 1.684397942120342e-05, "loss": 0.3259, "step": 5291 }, { "epoch": 1.0431782334384858, "grad_norm": 0.5622289793546967, "learning_rate": 1.684284920160769e-05, "loss": 0.3485, "step": 5292 }, { "epoch": 1.0433753943217665, "grad_norm": 0.5340801322832177, "learning_rate": 1.6841718817607003e-05, "loss": 0.3358, "step": 5293 }, { "epoch": 1.0435725552050472, "grad_norm": 0.6083293183480287, "learning_rate": 1.6840588269228507e-05, "loss": 0.3317, "step": 5294 }, { "epoch": 1.0437697160883281, "grad_norm": 0.4950617147026909, "learning_rate": 1.6839457556499372e-05, "loss": 0.3276, "step": 5295 }, { "epoch": 1.0439668769716088, "grad_norm": 0.5662216265381399, "learning_rate": 1.6838326679446756e-05, "loss": 0.3515, "step": 5296 }, { "epoch": 1.0441640378548895, "grad_norm": 0.5275768249043005, "learning_rate": 1.683719563809784e-05, "loss": 0.3291, "step": 5297 }, { "epoch": 1.0443611987381702, "grad_norm": 0.5363502739274106, "learning_rate": 1.683606443247979e-05, "loss": 0.3348, "step": 5298 }, { "epoch": 1.0445583596214512, "grad_norm": 0.5434941004906498, "learning_rate": 1.6834933062619788e-05, "loss": 0.3339, "step": 5299 }, { "epoch": 1.0447555205047319, "grad_norm": 0.49586659679323425, "learning_rate": 1.6833801528545016e-05, "loss": 0.3006, "step": 5300 }, { "epoch": 1.0449526813880126, "grad_norm": 0.5211848831043154, "learning_rate": 1.6832669830282658e-05, "loss": 0.3197, "step": 5301 }, { "epoch": 1.0451498422712935, "grad_norm": 0.531819238704122, "learning_rate": 1.6831537967859904e-05, "loss": 0.3438, "step": 5302 }, { "epoch": 1.0453470031545742, "grad_norm": 0.5489480069934828, "learning_rate": 1.6830405941303948e-05, "loss": 0.3295, "step": 5303 }, { "epoch": 1.0455441640378549, "grad_norm": 0.5107386270726029, "learning_rate": 1.6829273750641995e-05, "loss": 0.3047, "step": 5304 }, { "epoch": 1.0457413249211356, "grad_norm": 0.5400652269430585, "learning_rate": 1.6828141395901236e-05, "loss": 0.3339, "step": 5305 }, { "epoch": 1.0459384858044165, "grad_norm": 0.5346562175161331, "learning_rate": 1.682700887710888e-05, "loss": 0.3171, "step": 5306 }, { "epoch": 1.0461356466876972, "grad_norm": 0.5453238920962701, "learning_rate": 1.682587619429214e-05, "loss": 0.3333, "step": 5307 }, { "epoch": 1.046332807570978, "grad_norm": 0.5404341648713517, "learning_rate": 1.6824743347478224e-05, "loss": 0.3207, "step": 5308 }, { "epoch": 1.0465299684542586, "grad_norm": 0.48716882855128474, "learning_rate": 1.682361033669436e-05, "loss": 0.3093, "step": 5309 }, { "epoch": 1.0467271293375395, "grad_norm": 0.5182039037513314, "learning_rate": 1.6822477161967757e-05, "loss": 0.3075, "step": 5310 }, { "epoch": 1.0469242902208202, "grad_norm": 0.5052936292274123, "learning_rate": 1.682134382332565e-05, "loss": 0.3087, "step": 5311 }, { "epoch": 1.047121451104101, "grad_norm": 0.5089202938636761, "learning_rate": 1.682021032079526e-05, "loss": 0.3342, "step": 5312 }, { "epoch": 1.0473186119873816, "grad_norm": 0.5487804098811082, "learning_rate": 1.681907665440383e-05, "loss": 0.326, "step": 5313 }, { "epoch": 1.0475157728706626, "grad_norm": 0.581936936175654, "learning_rate": 1.6817942824178587e-05, "loss": 0.3605, "step": 5314 }, { "epoch": 1.0477129337539433, "grad_norm": 0.5264515076671745, "learning_rate": 1.681680883014678e-05, "loss": 0.3541, "step": 5315 }, { "epoch": 1.047910094637224, "grad_norm": 0.49271519940647124, "learning_rate": 1.6815674672335652e-05, "loss": 0.3023, "step": 5316 }, { "epoch": 1.0481072555205047, "grad_norm": 0.5364492375977938, "learning_rate": 1.681454035077245e-05, "loss": 0.3016, "step": 5317 }, { "epoch": 1.0483044164037856, "grad_norm": 1.5271525514198456, "learning_rate": 1.6813405865484426e-05, "loss": 0.3644, "step": 5318 }, { "epoch": 1.0485015772870663, "grad_norm": 0.5513325850528616, "learning_rate": 1.6812271216498842e-05, "loss": 0.3354, "step": 5319 }, { "epoch": 1.048698738170347, "grad_norm": 0.5098889426381192, "learning_rate": 1.6811136403842955e-05, "loss": 0.3291, "step": 5320 }, { "epoch": 1.0488958990536277, "grad_norm": 0.5852649954559482, "learning_rate": 1.681000142754403e-05, "loss": 0.2934, "step": 5321 }, { "epoch": 1.0490930599369086, "grad_norm": 0.5187903351376137, "learning_rate": 1.680886628762934e-05, "loss": 0.29, "step": 5322 }, { "epoch": 1.0492902208201893, "grad_norm": 0.5150689333720393, "learning_rate": 1.6807730984126155e-05, "loss": 0.3159, "step": 5323 }, { "epoch": 1.04948738170347, "grad_norm": 0.5865797200646796, "learning_rate": 1.6806595517061744e-05, "loss": 0.3345, "step": 5324 }, { "epoch": 1.0496845425867507, "grad_norm": 0.5411960758909427, "learning_rate": 1.68054598864634e-05, "loss": 0.3213, "step": 5325 }, { "epoch": 1.0498817034700316, "grad_norm": 0.5402846419962667, "learning_rate": 1.6804324092358402e-05, "loss": 0.32, "step": 5326 }, { "epoch": 1.0500788643533123, "grad_norm": 0.5039112744373472, "learning_rate": 1.6803188134774037e-05, "loss": 0.3259, "step": 5327 }, { "epoch": 1.050276025236593, "grad_norm": 0.5462197388737413, "learning_rate": 1.6802052013737595e-05, "loss": 0.3309, "step": 5328 }, { "epoch": 1.0504731861198737, "grad_norm": 0.5522764205643538, "learning_rate": 1.680091572927638e-05, "loss": 0.3086, "step": 5329 }, { "epoch": 1.0506703470031546, "grad_norm": 0.5647445368693557, "learning_rate": 1.6799779281417685e-05, "loss": 0.349, "step": 5330 }, { "epoch": 1.0508675078864353, "grad_norm": 0.7627307676176236, "learning_rate": 1.679864267018882e-05, "loss": 0.3259, "step": 5331 }, { "epoch": 1.051064668769716, "grad_norm": 0.5107897902631611, "learning_rate": 1.6797505895617087e-05, "loss": 0.2968, "step": 5332 }, { "epoch": 1.0512618296529967, "grad_norm": 0.525494347575112, "learning_rate": 1.6796368957729802e-05, "loss": 0.3038, "step": 5333 }, { "epoch": 1.0514589905362777, "grad_norm": 0.5371290338339879, "learning_rate": 1.679523185655428e-05, "loss": 0.3366, "step": 5334 }, { "epoch": 1.0516561514195584, "grad_norm": 0.5533840964718997, "learning_rate": 1.6794094592117846e-05, "loss": 0.3443, "step": 5335 }, { "epoch": 1.051853312302839, "grad_norm": 0.571584471042435, "learning_rate": 1.6792957164447807e-05, "loss": 0.3503, "step": 5336 }, { "epoch": 1.0520504731861198, "grad_norm": 0.5233336933797986, "learning_rate": 1.6791819573571507e-05, "loss": 0.3158, "step": 5337 }, { "epoch": 1.0522476340694007, "grad_norm": 0.5494692589886215, "learning_rate": 1.6790681819516275e-05, "loss": 0.3101, "step": 5338 }, { "epoch": 1.0524447949526814, "grad_norm": 0.5188119339090779, "learning_rate": 1.6789543902309443e-05, "loss": 0.3182, "step": 5339 }, { "epoch": 1.052641955835962, "grad_norm": 0.4981089705542774, "learning_rate": 1.6788405821978347e-05, "loss": 0.3138, "step": 5340 }, { "epoch": 1.0528391167192428, "grad_norm": 0.5222423077867616, "learning_rate": 1.6787267578550338e-05, "loss": 0.3465, "step": 5341 }, { "epoch": 1.0530362776025237, "grad_norm": 0.6072170278417297, "learning_rate": 1.678612917205276e-05, "loss": 0.3759, "step": 5342 }, { "epoch": 1.0532334384858044, "grad_norm": 0.49978080972584255, "learning_rate": 1.6784990602512962e-05, "loss": 0.3196, "step": 5343 }, { "epoch": 1.0534305993690851, "grad_norm": 0.5037981316390495, "learning_rate": 1.67838518699583e-05, "loss": 0.3545, "step": 5344 }, { "epoch": 1.053627760252366, "grad_norm": 0.48805755679759366, "learning_rate": 1.6782712974416136e-05, "loss": 0.3178, "step": 5345 }, { "epoch": 1.0538249211356467, "grad_norm": 0.528992909703354, "learning_rate": 1.678157391591383e-05, "loss": 0.3609, "step": 5346 }, { "epoch": 1.0540220820189274, "grad_norm": 0.48711439504089044, "learning_rate": 1.6780434694478748e-05, "loss": 0.3306, "step": 5347 }, { "epoch": 1.0542192429022081, "grad_norm": 0.5482559735722008, "learning_rate": 1.6779295310138264e-05, "loss": 0.3241, "step": 5348 }, { "epoch": 1.054416403785489, "grad_norm": 0.526517838176848, "learning_rate": 1.677815576291975e-05, "loss": 0.3523, "step": 5349 }, { "epoch": 1.0546135646687698, "grad_norm": 0.560425688696832, "learning_rate": 1.6777016052850586e-05, "loss": 0.3306, "step": 5350 }, { "epoch": 1.0548107255520505, "grad_norm": 0.5204711823514945, "learning_rate": 1.6775876179958154e-05, "loss": 0.31, "step": 5351 }, { "epoch": 1.0550078864353312, "grad_norm": 0.5230775365579958, "learning_rate": 1.677473614426984e-05, "loss": 0.3317, "step": 5352 }, { "epoch": 1.055205047318612, "grad_norm": 0.503669553362592, "learning_rate": 1.6773595945813033e-05, "loss": 0.31, "step": 5353 }, { "epoch": 1.0554022082018928, "grad_norm": 0.5353586843398921, "learning_rate": 1.677245558461513e-05, "loss": 0.34, "step": 5354 }, { "epoch": 1.0555993690851735, "grad_norm": 0.5027562913026351, "learning_rate": 1.6771315060703525e-05, "loss": 0.3345, "step": 5355 }, { "epoch": 1.0557965299684542, "grad_norm": 0.5357732118957388, "learning_rate": 1.6770174374105626e-05, "loss": 0.3313, "step": 5356 }, { "epoch": 1.0559936908517351, "grad_norm": 0.5261857108517175, "learning_rate": 1.6769033524848833e-05, "loss": 0.3115, "step": 5357 }, { "epoch": 1.0561908517350158, "grad_norm": 0.5009914049381263, "learning_rate": 1.6767892512960565e-05, "loss": 0.3274, "step": 5358 }, { "epoch": 1.0563880126182965, "grad_norm": 0.5301353617492294, "learning_rate": 1.6766751338468222e-05, "loss": 0.3338, "step": 5359 }, { "epoch": 1.0565851735015772, "grad_norm": 0.4889759349749068, "learning_rate": 1.6765610001399232e-05, "loss": 0.3063, "step": 5360 }, { "epoch": 1.0567823343848581, "grad_norm": 0.5153052654541606, "learning_rate": 1.676446850178101e-05, "loss": 0.3212, "step": 5361 }, { "epoch": 1.0569794952681388, "grad_norm": 0.5432847626107267, "learning_rate": 1.6763326839640993e-05, "loss": 0.3401, "step": 5362 }, { "epoch": 1.0571766561514195, "grad_norm": 0.5095940207939735, "learning_rate": 1.6762185015006597e-05, "loss": 0.3035, "step": 5363 }, { "epoch": 1.0573738170347002, "grad_norm": 0.5082084315576471, "learning_rate": 1.676104302790526e-05, "loss": 0.3164, "step": 5364 }, { "epoch": 1.0575709779179812, "grad_norm": 0.5317861846949568, "learning_rate": 1.675990087836442e-05, "loss": 0.3234, "step": 5365 }, { "epoch": 1.0577681388012619, "grad_norm": 0.5438204191584657, "learning_rate": 1.6758758566411516e-05, "loss": 0.3276, "step": 5366 }, { "epoch": 1.0579652996845426, "grad_norm": 0.5378516453459048, "learning_rate": 1.6757616092073993e-05, "loss": 0.3213, "step": 5367 }, { "epoch": 1.0581624605678233, "grad_norm": 0.5241736593808551, "learning_rate": 1.6756473455379306e-05, "loss": 0.347, "step": 5368 }, { "epoch": 1.0583596214511042, "grad_norm": 0.5281704964843222, "learning_rate": 1.67553306563549e-05, "loss": 0.3509, "step": 5369 }, { "epoch": 1.0585567823343849, "grad_norm": 0.519218991265461, "learning_rate": 1.675418769502824e-05, "loss": 0.3438, "step": 5370 }, { "epoch": 1.0587539432176656, "grad_norm": 0.5056062844302116, "learning_rate": 1.6753044571426777e-05, "loss": 0.3209, "step": 5371 }, { "epoch": 1.0589511041009463, "grad_norm": 0.547727929795884, "learning_rate": 1.6751901285577986e-05, "loss": 0.3459, "step": 5372 }, { "epoch": 1.0591482649842272, "grad_norm": 0.5147703062425203, "learning_rate": 1.675075783750932e-05, "loss": 0.309, "step": 5373 }, { "epoch": 1.059345425867508, "grad_norm": 0.5136539353424504, "learning_rate": 1.6749614227248265e-05, "loss": 0.3356, "step": 5374 }, { "epoch": 1.0595425867507886, "grad_norm": 0.5065991374201936, "learning_rate": 1.6748470454822295e-05, "loss": 0.3117, "step": 5375 }, { "epoch": 1.0597397476340693, "grad_norm": 0.4802852456122994, "learning_rate": 1.6747326520258884e-05, "loss": 0.2974, "step": 5376 }, { "epoch": 1.0599369085173502, "grad_norm": 0.590323007451885, "learning_rate": 1.6746182423585524e-05, "loss": 0.3275, "step": 5377 }, { "epoch": 1.060134069400631, "grad_norm": 0.5031627981781183, "learning_rate": 1.6745038164829695e-05, "loss": 0.3443, "step": 5378 }, { "epoch": 1.0603312302839116, "grad_norm": 0.5335220847872871, "learning_rate": 1.6743893744018892e-05, "loss": 0.3522, "step": 5379 }, { "epoch": 1.0605283911671923, "grad_norm": 0.4886051328326671, "learning_rate": 1.6742749161180614e-05, "loss": 0.3241, "step": 5380 }, { "epoch": 1.0607255520504733, "grad_norm": 0.5426979534289905, "learning_rate": 1.6741604416342355e-05, "loss": 0.3307, "step": 5381 }, { "epoch": 1.060922712933754, "grad_norm": 0.4747697116176086, "learning_rate": 1.674045950953162e-05, "loss": 0.3072, "step": 5382 }, { "epoch": 1.0611198738170347, "grad_norm": 0.4990309113836239, "learning_rate": 1.673931444077592e-05, "loss": 0.3243, "step": 5383 }, { "epoch": 1.0613170347003154, "grad_norm": 0.5391230910740937, "learning_rate": 1.6738169210102765e-05, "loss": 0.3258, "step": 5384 }, { "epoch": 1.0615141955835963, "grad_norm": 0.5003732086840169, "learning_rate": 1.6737023817539665e-05, "loss": 0.3081, "step": 5385 }, { "epoch": 1.061711356466877, "grad_norm": 0.511666602281063, "learning_rate": 1.6735878263114146e-05, "loss": 0.3251, "step": 5386 }, { "epoch": 1.0619085173501577, "grad_norm": 0.5342493746181992, "learning_rate": 1.673473254685372e-05, "loss": 0.3242, "step": 5387 }, { "epoch": 1.0621056782334386, "grad_norm": 0.5066446089845047, "learning_rate": 1.6733586668785926e-05, "loss": 0.3415, "step": 5388 }, { "epoch": 1.0623028391167193, "grad_norm": 0.49933317033206337, "learning_rate": 1.6732440628938293e-05, "loss": 0.3069, "step": 5389 }, { "epoch": 1.0625, "grad_norm": 0.5395443938451984, "learning_rate": 1.6731294427338344e-05, "loss": 0.3343, "step": 5390 }, { "epoch": 1.0626971608832807, "grad_norm": 0.5154874242963391, "learning_rate": 1.6730148064013633e-05, "loss": 0.3353, "step": 5391 }, { "epoch": 1.0628943217665614, "grad_norm": 0.549850655755667, "learning_rate": 1.672900153899169e-05, "loss": 0.3339, "step": 5392 }, { "epoch": 1.0630914826498423, "grad_norm": 0.5437692223908803, "learning_rate": 1.6727854852300073e-05, "loss": 0.325, "step": 5393 }, { "epoch": 1.063288643533123, "grad_norm": 0.5319120551752757, "learning_rate": 1.672670800396632e-05, "loss": 0.3333, "step": 5394 }, { "epoch": 1.0634858044164037, "grad_norm": 0.5888809384452421, "learning_rate": 1.672556099401799e-05, "loss": 0.3289, "step": 5395 }, { "epoch": 1.0636829652996846, "grad_norm": 0.5174224708917468, "learning_rate": 1.672441382248264e-05, "loss": 0.325, "step": 5396 }, { "epoch": 1.0638801261829653, "grad_norm": 0.4940106410321626, "learning_rate": 1.6723266489387837e-05, "loss": 0.3277, "step": 5397 }, { "epoch": 1.064077287066246, "grad_norm": 0.5924938542192701, "learning_rate": 1.672211899476114e-05, "loss": 0.3159, "step": 5398 }, { "epoch": 1.0642744479495267, "grad_norm": 0.6030406417673295, "learning_rate": 1.672097133863012e-05, "loss": 0.352, "step": 5399 }, { "epoch": 1.0644716088328077, "grad_norm": 0.46675304029243586, "learning_rate": 1.6719823521022355e-05, "loss": 0.273, "step": 5400 }, { "epoch": 1.0646687697160884, "grad_norm": 10.313913415266276, "learning_rate": 1.6718675541965413e-05, "loss": 0.3712, "step": 5401 }, { "epoch": 1.064865930599369, "grad_norm": 0.650331375307367, "learning_rate": 1.6717527401486882e-05, "loss": 0.3173, "step": 5402 }, { "epoch": 1.0650630914826498, "grad_norm": 0.5516904225558585, "learning_rate": 1.6716379099614348e-05, "loss": 0.3332, "step": 5403 }, { "epoch": 1.0652602523659307, "grad_norm": 0.5826270309682446, "learning_rate": 1.6715230636375397e-05, "loss": 0.3162, "step": 5404 }, { "epoch": 1.0654574132492114, "grad_norm": 0.5567258991880055, "learning_rate": 1.6714082011797625e-05, "loss": 0.3091, "step": 5405 }, { "epoch": 1.065654574132492, "grad_norm": 0.5797543264480705, "learning_rate": 1.6712933225908618e-05, "loss": 0.3638, "step": 5406 }, { "epoch": 1.0658517350157728, "grad_norm": 0.500686608596155, "learning_rate": 1.6711784278735993e-05, "loss": 0.3183, "step": 5407 }, { "epoch": 1.0660488958990537, "grad_norm": 0.5482037516123067, "learning_rate": 1.6710635170307336e-05, "loss": 0.3282, "step": 5408 }, { "epoch": 1.0662460567823344, "grad_norm": 0.5003783186696478, "learning_rate": 1.6709485900650274e-05, "loss": 0.2917, "step": 5409 }, { "epoch": 1.0664432176656151, "grad_norm": 0.5886317793002753, "learning_rate": 1.6708336469792407e-05, "loss": 0.3297, "step": 5410 }, { "epoch": 1.0666403785488958, "grad_norm": 0.5466590442325215, "learning_rate": 1.670718687776135e-05, "loss": 0.3549, "step": 5411 }, { "epoch": 1.0668375394321767, "grad_norm": 0.5540316157067651, "learning_rate": 1.670603712458473e-05, "loss": 0.3215, "step": 5412 }, { "epoch": 1.0670347003154574, "grad_norm": 0.593181096332669, "learning_rate": 1.670488721029017e-05, "loss": 0.3648, "step": 5413 }, { "epoch": 1.0672318611987381, "grad_norm": 0.5482843023549779, "learning_rate": 1.6703737134905296e-05, "loss": 0.3269, "step": 5414 }, { "epoch": 1.0674290220820188, "grad_norm": 0.5958232910254453, "learning_rate": 1.6702586898457737e-05, "loss": 0.3219, "step": 5415 }, { "epoch": 1.0676261829652998, "grad_norm": 0.5248785667847379, "learning_rate": 1.6701436500975127e-05, "loss": 0.3113, "step": 5416 }, { "epoch": 1.0678233438485805, "grad_norm": 0.506343180110072, "learning_rate": 1.6700285942485113e-05, "loss": 0.2969, "step": 5417 }, { "epoch": 1.0680205047318612, "grad_norm": 0.6323082926274546, "learning_rate": 1.669913522301533e-05, "loss": 0.3197, "step": 5418 }, { "epoch": 1.0682176656151419, "grad_norm": 0.5020570452865531, "learning_rate": 1.669798434259343e-05, "loss": 0.3101, "step": 5419 }, { "epoch": 1.0684148264984228, "grad_norm": 0.7538065179489385, "learning_rate": 1.669683330124706e-05, "loss": 0.3097, "step": 5420 }, { "epoch": 1.0686119873817035, "grad_norm": 0.7336379735027907, "learning_rate": 1.669568209900388e-05, "loss": 0.2805, "step": 5421 }, { "epoch": 1.0688091482649842, "grad_norm": 0.5148666894817214, "learning_rate": 1.6694530735891548e-05, "loss": 0.3298, "step": 5422 }, { "epoch": 1.0690063091482649, "grad_norm": 0.5938791350921867, "learning_rate": 1.6693379211937717e-05, "loss": 0.3594, "step": 5423 }, { "epoch": 1.0692034700315458, "grad_norm": 0.5268352695165609, "learning_rate": 1.6692227527170067e-05, "loss": 0.3245, "step": 5424 }, { "epoch": 1.0694006309148265, "grad_norm": 0.8124267042610954, "learning_rate": 1.6691075681616257e-05, "loss": 0.3672, "step": 5425 }, { "epoch": 1.0695977917981072, "grad_norm": 0.5680106061709315, "learning_rate": 1.6689923675303967e-05, "loss": 0.3314, "step": 5426 }, { "epoch": 1.069794952681388, "grad_norm": 0.5607346328108409, "learning_rate": 1.6688771508260876e-05, "loss": 0.3461, "step": 5427 }, { "epoch": 1.0699921135646688, "grad_norm": 0.5433165552798762, "learning_rate": 1.668761918051466e-05, "loss": 0.3425, "step": 5428 }, { "epoch": 1.0701892744479495, "grad_norm": 0.5476574461852847, "learning_rate": 1.6686466692093007e-05, "loss": 0.3172, "step": 5429 }, { "epoch": 1.0703864353312302, "grad_norm": 0.6111021096749791, "learning_rate": 1.6685314043023608e-05, "loss": 0.3577, "step": 5430 }, { "epoch": 1.0705835962145112, "grad_norm": 0.5552410097887198, "learning_rate": 1.6684161233334157e-05, "loss": 0.3154, "step": 5431 }, { "epoch": 1.0707807570977919, "grad_norm": 0.5107392149662423, "learning_rate": 1.6683008263052344e-05, "loss": 0.3275, "step": 5432 }, { "epoch": 1.0709779179810726, "grad_norm": 0.6074720035217149, "learning_rate": 1.6681855132205882e-05, "loss": 0.3597, "step": 5433 }, { "epoch": 1.0711750788643533, "grad_norm": 0.515248985051321, "learning_rate": 1.6680701840822468e-05, "loss": 0.3278, "step": 5434 }, { "epoch": 1.071372239747634, "grad_norm": 0.5676731678796564, "learning_rate": 1.667954838892981e-05, "loss": 0.324, "step": 5435 }, { "epoch": 1.0715694006309149, "grad_norm": 0.5266114021796704, "learning_rate": 1.6678394776555625e-05, "loss": 0.3298, "step": 5436 }, { "epoch": 1.0717665615141956, "grad_norm": 0.515711098109902, "learning_rate": 1.667724100372763e-05, "loss": 0.3301, "step": 5437 }, { "epoch": 1.0719637223974763, "grad_norm": 0.5413115348636542, "learning_rate": 1.667608707047354e-05, "loss": 0.329, "step": 5438 }, { "epoch": 1.0721608832807572, "grad_norm": 0.5133409836464313, "learning_rate": 1.6674932976821078e-05, "loss": 0.3062, "step": 5439 }, { "epoch": 1.072358044164038, "grad_norm": 0.5514269531982732, "learning_rate": 1.667377872279798e-05, "loss": 0.3224, "step": 5440 }, { "epoch": 1.0725552050473186, "grad_norm": 0.5381267123295007, "learning_rate": 1.6672624308431977e-05, "loss": 0.3554, "step": 5441 }, { "epoch": 1.0727523659305993, "grad_norm": 0.5357973892732231, "learning_rate": 1.6671469733750795e-05, "loss": 0.3306, "step": 5442 }, { "epoch": 1.0729495268138802, "grad_norm": 0.5334845958350081, "learning_rate": 1.6670314998782183e-05, "loss": 0.3522, "step": 5443 }, { "epoch": 1.073146687697161, "grad_norm": 0.5078507960335631, "learning_rate": 1.6669160103553884e-05, "loss": 0.3192, "step": 5444 }, { "epoch": 1.0733438485804416, "grad_norm": 0.526273076314691, "learning_rate": 1.666800504809364e-05, "loss": 0.3295, "step": 5445 }, { "epoch": 1.0735410094637223, "grad_norm": 0.5501746473063156, "learning_rate": 1.6666849832429207e-05, "loss": 0.3559, "step": 5446 }, { "epoch": 1.0737381703470033, "grad_norm": 0.5417331807201895, "learning_rate": 1.6665694456588335e-05, "loss": 0.3395, "step": 5447 }, { "epoch": 1.073935331230284, "grad_norm": 0.4892217439505845, "learning_rate": 1.666453892059879e-05, "loss": 0.2989, "step": 5448 }, { "epoch": 1.0741324921135647, "grad_norm": 0.5427686650470855, "learning_rate": 1.6663383224488323e-05, "loss": 0.3249, "step": 5449 }, { "epoch": 1.0743296529968454, "grad_norm": 0.5467284567115355, "learning_rate": 1.6662227368284716e-05, "loss": 0.3353, "step": 5450 }, { "epoch": 1.0745268138801263, "grad_norm": 0.49209796772640735, "learning_rate": 1.6661071352015725e-05, "loss": 0.3205, "step": 5451 }, { "epoch": 1.074723974763407, "grad_norm": 0.5566384176423259, "learning_rate": 1.6659915175709135e-05, "loss": 0.3221, "step": 5452 }, { "epoch": 1.0749211356466877, "grad_norm": 0.526499393616949, "learning_rate": 1.665875883939272e-05, "loss": 0.3263, "step": 5453 }, { "epoch": 1.0751182965299684, "grad_norm": 2.739695761851264, "learning_rate": 1.665760234309426e-05, "loss": 0.3506, "step": 5454 }, { "epoch": 1.0753154574132493, "grad_norm": 0.6593642459442813, "learning_rate": 1.665644568684154e-05, "loss": 0.3426, "step": 5455 }, { "epoch": 1.07551261829653, "grad_norm": 0.5151614291220461, "learning_rate": 1.6655288870662354e-05, "loss": 0.328, "step": 5456 }, { "epoch": 1.0757097791798107, "grad_norm": 0.5714376826042502, "learning_rate": 1.6654131894584494e-05, "loss": 0.318, "step": 5457 }, { "epoch": 1.0759069400630914, "grad_norm": 0.5326114164538318, "learning_rate": 1.665297475863576e-05, "loss": 0.3268, "step": 5458 }, { "epoch": 1.0761041009463723, "grad_norm": 0.5318642965648936, "learning_rate": 1.6651817462843944e-05, "loss": 0.3393, "step": 5459 }, { "epoch": 1.076301261829653, "grad_norm": 0.5208615095763689, "learning_rate": 1.665066000723686e-05, "loss": 0.3048, "step": 5460 }, { "epoch": 1.0764984227129337, "grad_norm": 0.5112237680335591, "learning_rate": 1.6649502391842313e-05, "loss": 0.3306, "step": 5461 }, { "epoch": 1.0766955835962144, "grad_norm": 0.5020109988963485, "learning_rate": 1.6648344616688116e-05, "loss": 0.3079, "step": 5462 }, { "epoch": 1.0768927444794953, "grad_norm": 0.5141839034905135, "learning_rate": 1.664718668180208e-05, "loss": 0.3433, "step": 5463 }, { "epoch": 1.077089905362776, "grad_norm": 0.522715217511301, "learning_rate": 1.664602858721204e-05, "loss": 0.3273, "step": 5464 }, { "epoch": 1.0772870662460567, "grad_norm": 0.4873672081469081, "learning_rate": 1.6644870332945807e-05, "loss": 0.3131, "step": 5465 }, { "epoch": 1.0774842271293374, "grad_norm": 0.5208359970585005, "learning_rate": 1.6643711919031217e-05, "loss": 0.3326, "step": 5466 }, { "epoch": 1.0776813880126184, "grad_norm": 0.5085175763798949, "learning_rate": 1.6642553345496093e-05, "loss": 0.336, "step": 5467 }, { "epoch": 1.077878548895899, "grad_norm": 0.5395303777396451, "learning_rate": 1.664139461236828e-05, "loss": 0.3519, "step": 5468 }, { "epoch": 1.0780757097791798, "grad_norm": 0.4939691741186544, "learning_rate": 1.6640235719675607e-05, "loss": 0.3149, "step": 5469 }, { "epoch": 1.0782728706624605, "grad_norm": 0.5045822941059132, "learning_rate": 1.663907666744593e-05, "loss": 0.3257, "step": 5470 }, { "epoch": 1.0784700315457414, "grad_norm": 0.4842189935972799, "learning_rate": 1.6637917455707085e-05, "loss": 0.2941, "step": 5471 }, { "epoch": 1.078667192429022, "grad_norm": 0.488411947048396, "learning_rate": 1.663675808448693e-05, "loss": 0.3263, "step": 5472 }, { "epoch": 1.0788643533123028, "grad_norm": 0.5204936040738815, "learning_rate": 1.663559855381332e-05, "loss": 0.337, "step": 5473 }, { "epoch": 1.0790615141955835, "grad_norm": 0.4907540263785109, "learning_rate": 1.6634438863714108e-05, "loss": 0.3341, "step": 5474 }, { "epoch": 1.0792586750788644, "grad_norm": 0.5341500420019042, "learning_rate": 1.6633279014217158e-05, "loss": 0.361, "step": 5475 }, { "epoch": 1.0794558359621451, "grad_norm": 0.566795481318772, "learning_rate": 1.663211900535034e-05, "loss": 0.3565, "step": 5476 }, { "epoch": 1.0796529968454258, "grad_norm": 0.5104378663095975, "learning_rate": 1.663095883714152e-05, "loss": 0.3172, "step": 5477 }, { "epoch": 1.0798501577287065, "grad_norm": 0.7836320347832308, "learning_rate": 1.6629798509618575e-05, "loss": 0.3279, "step": 5478 }, { "epoch": 1.0800473186119874, "grad_norm": 0.4941220351531997, "learning_rate": 1.6628638022809384e-05, "loss": 0.3237, "step": 5479 }, { "epoch": 1.0802444794952681, "grad_norm": 0.538070953999582, "learning_rate": 1.6627477376741824e-05, "loss": 0.3186, "step": 5480 }, { "epoch": 1.0804416403785488, "grad_norm": 0.49651925324616386, "learning_rate": 1.6626316571443782e-05, "loss": 0.3328, "step": 5481 }, { "epoch": 1.0806388012618298, "grad_norm": 0.6699435537682001, "learning_rate": 1.662515560694315e-05, "loss": 0.3353, "step": 5482 }, { "epoch": 1.0808359621451105, "grad_norm": 0.5278612363315855, "learning_rate": 1.6623994483267823e-05, "loss": 0.3275, "step": 5483 }, { "epoch": 1.0810331230283912, "grad_norm": 0.5521883665813071, "learning_rate": 1.6622833200445688e-05, "loss": 0.3395, "step": 5484 }, { "epoch": 1.0812302839116719, "grad_norm": 0.5101814960888886, "learning_rate": 1.6621671758504656e-05, "loss": 0.3229, "step": 5485 }, { "epoch": 1.0814274447949526, "grad_norm": 0.5082133461105458, "learning_rate": 1.6620510157472626e-05, "loss": 0.3416, "step": 5486 }, { "epoch": 1.0816246056782335, "grad_norm": 0.5152393878133755, "learning_rate": 1.6619348397377508e-05, "loss": 0.2995, "step": 5487 }, { "epoch": 1.0818217665615142, "grad_norm": 0.5030381994487839, "learning_rate": 1.6618186478247214e-05, "loss": 0.3219, "step": 5488 }, { "epoch": 1.0820189274447949, "grad_norm": 0.5033251374070803, "learning_rate": 1.661702440010966e-05, "loss": 0.3278, "step": 5489 }, { "epoch": 1.0822160883280758, "grad_norm": 0.4858411621778881, "learning_rate": 1.6615862162992765e-05, "loss": 0.321, "step": 5490 }, { "epoch": 1.0824132492113565, "grad_norm": 0.5299981365610831, "learning_rate": 1.6614699766924457e-05, "loss": 0.334, "step": 5491 }, { "epoch": 1.0826104100946372, "grad_norm": 0.5398683026062827, "learning_rate": 1.661353721193266e-05, "loss": 0.3679, "step": 5492 }, { "epoch": 1.082807570977918, "grad_norm": 0.4922123143204083, "learning_rate": 1.6612374498045303e-05, "loss": 0.3252, "step": 5493 }, { "epoch": 1.0830047318611988, "grad_norm": 0.5402134649137565, "learning_rate": 1.6611211625290328e-05, "loss": 0.3488, "step": 5494 }, { "epoch": 1.0832018927444795, "grad_norm": 0.49308565622911255, "learning_rate": 1.6610048593695665e-05, "loss": 0.3274, "step": 5495 }, { "epoch": 1.0833990536277602, "grad_norm": 0.5330415075689533, "learning_rate": 1.6608885403289264e-05, "loss": 0.3416, "step": 5496 }, { "epoch": 1.083596214511041, "grad_norm": 0.5546315403447396, "learning_rate": 1.6607722054099066e-05, "loss": 0.3428, "step": 5497 }, { "epoch": 1.0837933753943219, "grad_norm": 0.548153275553841, "learning_rate": 1.6606558546153027e-05, "loss": 0.3274, "step": 5498 }, { "epoch": 1.0839905362776026, "grad_norm": 0.5741893560590743, "learning_rate": 1.6605394879479102e-05, "loss": 0.3408, "step": 5499 }, { "epoch": 1.0841876971608833, "grad_norm": 0.563655275701286, "learning_rate": 1.660423105410524e-05, "loss": 0.3535, "step": 5500 }, { "epoch": 1.084384858044164, "grad_norm": 0.5292670828512388, "learning_rate": 1.6603067070059413e-05, "loss": 0.3451, "step": 5501 }, { "epoch": 1.0845820189274449, "grad_norm": 0.4943062953455165, "learning_rate": 1.6601902927369577e-05, "loss": 0.305, "step": 5502 }, { "epoch": 1.0847791798107256, "grad_norm": 0.5684207519421706, "learning_rate": 1.660073862606371e-05, "loss": 0.3544, "step": 5503 }, { "epoch": 1.0849763406940063, "grad_norm": 0.5269370109869425, "learning_rate": 1.6599574166169783e-05, "loss": 0.3364, "step": 5504 }, { "epoch": 1.085173501577287, "grad_norm": 0.5930908127100334, "learning_rate": 1.659840954771577e-05, "loss": 0.3293, "step": 5505 }, { "epoch": 1.085370662460568, "grad_norm": 0.5064001248459025, "learning_rate": 1.6597244770729656e-05, "loss": 0.3364, "step": 5506 }, { "epoch": 1.0855678233438486, "grad_norm": 0.5583107476093074, "learning_rate": 1.659607983523942e-05, "loss": 0.344, "step": 5507 }, { "epoch": 1.0857649842271293, "grad_norm": 0.5186990686312586, "learning_rate": 1.6594914741273058e-05, "loss": 0.3203, "step": 5508 }, { "epoch": 1.08596214511041, "grad_norm": 0.5412209056965988, "learning_rate": 1.6593749488858554e-05, "loss": 0.3514, "step": 5509 }, { "epoch": 1.086159305993691, "grad_norm": 0.5847946899685581, "learning_rate": 1.6592584078023915e-05, "loss": 0.3498, "step": 5510 }, { "epoch": 1.0863564668769716, "grad_norm": 0.49519738287188864, "learning_rate": 1.659141850879713e-05, "loss": 0.3065, "step": 5511 }, { "epoch": 1.0865536277602523, "grad_norm": 0.49468808903487677, "learning_rate": 1.659025278120621e-05, "loss": 0.3129, "step": 5512 }, { "epoch": 1.086750788643533, "grad_norm": 1.0581729165868015, "learning_rate": 1.6589086895279156e-05, "loss": 0.3673, "step": 5513 }, { "epoch": 1.086947949526814, "grad_norm": 0.49002563953861666, "learning_rate": 1.6587920851043986e-05, "loss": 0.3223, "step": 5514 }, { "epoch": 1.0871451104100947, "grad_norm": 0.4905578731396364, "learning_rate": 1.6586754648528712e-05, "loss": 0.3043, "step": 5515 }, { "epoch": 1.0873422712933754, "grad_norm": 0.667527289823531, "learning_rate": 1.658558828776135e-05, "loss": 0.2788, "step": 5516 }, { "epoch": 1.087539432176656, "grad_norm": 0.8459394709559949, "learning_rate": 1.6584421768769933e-05, "loss": 0.3415, "step": 5517 }, { "epoch": 1.087736593059937, "grad_norm": 0.47906959750711475, "learning_rate": 1.6583255091582474e-05, "loss": 0.3405, "step": 5518 }, { "epoch": 1.0879337539432177, "grad_norm": 0.48428073694586943, "learning_rate": 1.658208825622701e-05, "loss": 0.3322, "step": 5519 }, { "epoch": 1.0881309148264984, "grad_norm": 0.7968161512793981, "learning_rate": 1.6580921262731582e-05, "loss": 0.3291, "step": 5520 }, { "epoch": 1.088328075709779, "grad_norm": 0.519690983389764, "learning_rate": 1.6579754111124215e-05, "loss": 0.3248, "step": 5521 }, { "epoch": 1.08852523659306, "grad_norm": 0.5091809572661937, "learning_rate": 1.6578586801432958e-05, "loss": 0.3133, "step": 5522 }, { "epoch": 1.0887223974763407, "grad_norm": 0.5549122652560055, "learning_rate": 1.6577419333685855e-05, "loss": 0.3462, "step": 5523 }, { "epoch": 1.0889195583596214, "grad_norm": 6.326360585378315, "learning_rate": 1.6576251707910955e-05, "loss": 0.3586, "step": 5524 }, { "epoch": 1.0891167192429023, "grad_norm": 0.594210310368939, "learning_rate": 1.6575083924136313e-05, "loss": 0.2988, "step": 5525 }, { "epoch": 1.089313880126183, "grad_norm": 0.5653001651310844, "learning_rate": 1.6573915982389986e-05, "loss": 0.3427, "step": 5526 }, { "epoch": 1.0895110410094637, "grad_norm": 0.6419933562592661, "learning_rate": 1.6572747882700034e-05, "loss": 0.3479, "step": 5527 }, { "epoch": 1.0897082018927444, "grad_norm": 0.549229419744898, "learning_rate": 1.657157962509452e-05, "loss": 0.3367, "step": 5528 }, { "epoch": 1.0899053627760251, "grad_norm": 0.5649927832731517, "learning_rate": 1.6570411209601515e-05, "loss": 0.3288, "step": 5529 }, { "epoch": 1.090102523659306, "grad_norm": 0.5593298618542406, "learning_rate": 1.656924263624909e-05, "loss": 0.3552, "step": 5530 }, { "epoch": 1.0902996845425867, "grad_norm": 0.5487752889109879, "learning_rate": 1.6568073905065313e-05, "loss": 0.3467, "step": 5531 }, { "epoch": 1.0904968454258674, "grad_norm": 0.517747474702925, "learning_rate": 1.656690501607828e-05, "loss": 0.3165, "step": 5532 }, { "epoch": 1.0906940063091484, "grad_norm": 0.5187685507630857, "learning_rate": 1.656573596931606e-05, "loss": 0.3284, "step": 5533 }, { "epoch": 1.090891167192429, "grad_norm": 0.5501636063381948, "learning_rate": 1.656456676480675e-05, "loss": 0.3071, "step": 5534 }, { "epoch": 1.0910883280757098, "grad_norm": 0.5456326413445273, "learning_rate": 1.6563397402578432e-05, "loss": 0.3125, "step": 5535 }, { "epoch": 1.0912854889589905, "grad_norm": 0.5648819649096508, "learning_rate": 1.6562227882659213e-05, "loss": 0.3379, "step": 5536 }, { "epoch": 1.0914826498422714, "grad_norm": 0.5019198980389612, "learning_rate": 1.656105820507718e-05, "loss": 0.32, "step": 5537 }, { "epoch": 1.091679810725552, "grad_norm": 0.5549566520131515, "learning_rate": 1.655988836986044e-05, "loss": 0.352, "step": 5538 }, { "epoch": 1.0918769716088328, "grad_norm": 0.47299201163230054, "learning_rate": 1.65587183770371e-05, "loss": 0.2692, "step": 5539 }, { "epoch": 1.0920741324921135, "grad_norm": 0.54789761189984, "learning_rate": 1.6557548226635266e-05, "loss": 0.3395, "step": 5540 }, { "epoch": 1.0922712933753944, "grad_norm": 0.5853498159297642, "learning_rate": 1.655637791868306e-05, "loss": 0.336, "step": 5541 }, { "epoch": 1.0924684542586751, "grad_norm": 0.630929601289797, "learning_rate": 1.6555207453208596e-05, "loss": 0.3531, "step": 5542 }, { "epoch": 1.0926656151419558, "grad_norm": 0.5228836259363071, "learning_rate": 1.655403683023999e-05, "loss": 0.3291, "step": 5543 }, { "epoch": 1.0928627760252365, "grad_norm": 0.5556250366890163, "learning_rate": 1.655286604980537e-05, "loss": 0.3462, "step": 5544 }, { "epoch": 1.0930599369085174, "grad_norm": 0.5731618356985932, "learning_rate": 1.655169511193287e-05, "loss": 0.3276, "step": 5545 }, { "epoch": 1.0932570977917981, "grad_norm": 0.5172208442087932, "learning_rate": 1.6550524016650617e-05, "loss": 0.3295, "step": 5546 }, { "epoch": 1.0934542586750788, "grad_norm": 0.5971863550898441, "learning_rate": 1.6549352763986747e-05, "loss": 0.3635, "step": 5547 }, { "epoch": 1.0936514195583595, "grad_norm": 0.5043253634738109, "learning_rate": 1.6548181353969407e-05, "loss": 0.3164, "step": 5548 }, { "epoch": 1.0938485804416405, "grad_norm": 0.5441825059408084, "learning_rate": 1.654700978662674e-05, "loss": 0.3326, "step": 5549 }, { "epoch": 1.0940457413249212, "grad_norm": 0.5397553446197998, "learning_rate": 1.654583806198688e-05, "loss": 0.3658, "step": 5550 }, { "epoch": 1.0942429022082019, "grad_norm": 0.5504743804813628, "learning_rate": 1.6544666180077996e-05, "loss": 0.3526, "step": 5551 }, { "epoch": 1.0944400630914826, "grad_norm": 0.5424844935779093, "learning_rate": 1.6543494140928236e-05, "loss": 0.3307, "step": 5552 }, { "epoch": 1.0946372239747635, "grad_norm": 0.526220112929215, "learning_rate": 1.654232194456576e-05, "loss": 0.3308, "step": 5553 }, { "epoch": 1.0948343848580442, "grad_norm": 0.5236749400742969, "learning_rate": 1.6541149591018727e-05, "loss": 0.3453, "step": 5554 }, { "epoch": 1.0950315457413249, "grad_norm": 0.5266507318689269, "learning_rate": 1.6539977080315313e-05, "loss": 0.3305, "step": 5555 }, { "epoch": 1.0952287066246056, "grad_norm": 0.5216109610169218, "learning_rate": 1.653880441248368e-05, "loss": 0.3316, "step": 5556 }, { "epoch": 1.0954258675078865, "grad_norm": 0.5163493651525781, "learning_rate": 1.6537631587552007e-05, "loss": 0.342, "step": 5557 }, { "epoch": 1.0956230283911672, "grad_norm": 0.5078186202086805, "learning_rate": 1.6536458605548467e-05, "loss": 0.3249, "step": 5558 }, { "epoch": 1.095820189274448, "grad_norm": 0.5276901659588409, "learning_rate": 1.6535285466501247e-05, "loss": 0.3326, "step": 5559 }, { "epoch": 1.0960173501577286, "grad_norm": 0.5355919928397173, "learning_rate": 1.653411217043853e-05, "loss": 0.352, "step": 5560 }, { "epoch": 1.0962145110410095, "grad_norm": 0.4763053400285673, "learning_rate": 1.653293871738851e-05, "loss": 0.2937, "step": 5561 }, { "epoch": 1.0964116719242902, "grad_norm": 0.5380622063499523, "learning_rate": 1.6531765107379374e-05, "loss": 0.3585, "step": 5562 }, { "epoch": 1.096608832807571, "grad_norm": 0.5111119725625164, "learning_rate": 1.653059134043932e-05, "loss": 0.3336, "step": 5563 }, { "epoch": 1.0968059936908516, "grad_norm": 0.49455292690513286, "learning_rate": 1.652941741659655e-05, "loss": 0.3136, "step": 5564 }, { "epoch": 1.0970031545741326, "grad_norm": 0.6845117374932369, "learning_rate": 1.652824333587927e-05, "loss": 0.332, "step": 5565 }, { "epoch": 1.0972003154574133, "grad_norm": 0.6523179647836078, "learning_rate": 1.652706909831569e-05, "loss": 0.3441, "step": 5566 }, { "epoch": 1.097397476340694, "grad_norm": 0.4963174475713189, "learning_rate": 1.6525894703934013e-05, "loss": 0.3077, "step": 5567 }, { "epoch": 1.0975946372239749, "grad_norm": 0.508004655738834, "learning_rate": 1.6524720152762462e-05, "loss": 0.3201, "step": 5568 }, { "epoch": 1.0977917981072556, "grad_norm": 0.5984239179107959, "learning_rate": 1.6523545444829254e-05, "loss": 0.3592, "step": 5569 }, { "epoch": 1.0979889589905363, "grad_norm": 0.5360407905331267, "learning_rate": 1.6522370580162614e-05, "loss": 0.3261, "step": 5570 }, { "epoch": 1.098186119873817, "grad_norm": 0.5556220479614998, "learning_rate": 1.652119555879077e-05, "loss": 0.3431, "step": 5571 }, { "epoch": 1.0983832807570977, "grad_norm": 0.5604439658652886, "learning_rate": 1.652002038074195e-05, "loss": 0.3145, "step": 5572 }, { "epoch": 1.0985804416403786, "grad_norm": 0.5477226089327527, "learning_rate": 1.651884504604439e-05, "loss": 0.3404, "step": 5573 }, { "epoch": 1.0987776025236593, "grad_norm": 0.5145353614083515, "learning_rate": 1.6517669554726327e-05, "loss": 0.3197, "step": 5574 }, { "epoch": 1.09897476340694, "grad_norm": 0.5111044242152223, "learning_rate": 1.6516493906816005e-05, "loss": 0.3053, "step": 5575 }, { "epoch": 1.099171924290221, "grad_norm": 0.5405424477707778, "learning_rate": 1.6515318102341672e-05, "loss": 0.3324, "step": 5576 }, { "epoch": 1.0993690851735016, "grad_norm": 0.5193363546526818, "learning_rate": 1.651414214133157e-05, "loss": 0.3423, "step": 5577 }, { "epoch": 1.0995662460567823, "grad_norm": 0.5144317929362779, "learning_rate": 1.651296602381396e-05, "loss": 0.3228, "step": 5578 }, { "epoch": 1.099763406940063, "grad_norm": 0.5119676049482131, "learning_rate": 1.6511789749817095e-05, "loss": 0.3118, "step": 5579 }, { "epoch": 1.099960567823344, "grad_norm": 0.5105599242984479, "learning_rate": 1.651061331936924e-05, "loss": 0.3323, "step": 5580 }, { "epoch": 1.1001577287066246, "grad_norm": 0.5080653593030011, "learning_rate": 1.6509436732498656e-05, "loss": 0.3318, "step": 5581 }, { "epoch": 1.1003548895899053, "grad_norm": 0.5031898165168965, "learning_rate": 1.650825998923361e-05, "loss": 0.3222, "step": 5582 }, { "epoch": 1.100552050473186, "grad_norm": 0.5226955063741691, "learning_rate": 1.650708308960238e-05, "loss": 0.355, "step": 5583 }, { "epoch": 1.100749211356467, "grad_norm": 0.5038434520036276, "learning_rate": 1.6505906033633236e-05, "loss": 0.3391, "step": 5584 }, { "epoch": 1.1009463722397477, "grad_norm": 0.5059125745144066, "learning_rate": 1.650472882135446e-05, "loss": 0.3039, "step": 5585 }, { "epoch": 1.1011435331230284, "grad_norm": 0.5040045595649187, "learning_rate": 1.650355145279434e-05, "loss": 0.3477, "step": 5586 }, { "epoch": 1.101340694006309, "grad_norm": 0.5192804609775385, "learning_rate": 1.6502373927981154e-05, "loss": 0.33, "step": 5587 }, { "epoch": 1.10153785488959, "grad_norm": 0.5922051341334427, "learning_rate": 1.6501196246943202e-05, "loss": 0.3541, "step": 5588 }, { "epoch": 1.1017350157728707, "grad_norm": 0.49875642200196624, "learning_rate": 1.650001840970877e-05, "loss": 0.2838, "step": 5589 }, { "epoch": 1.1019321766561514, "grad_norm": 0.4856484105077422, "learning_rate": 1.6498840416306168e-05, "loss": 0.3083, "step": 5590 }, { "epoch": 1.102129337539432, "grad_norm": 0.5132652807308573, "learning_rate": 1.6497662266763685e-05, "loss": 0.315, "step": 5591 }, { "epoch": 1.102326498422713, "grad_norm": 0.5193099983968429, "learning_rate": 1.6496483961109638e-05, "loss": 0.3029, "step": 5592 }, { "epoch": 1.1025236593059937, "grad_norm": 0.5331749318093294, "learning_rate": 1.649530549937233e-05, "loss": 0.3383, "step": 5593 }, { "epoch": 1.1027208201892744, "grad_norm": 0.5453618564420977, "learning_rate": 1.6494126881580077e-05, "loss": 0.3239, "step": 5594 }, { "epoch": 1.1029179810725551, "grad_norm": 0.49856539989644866, "learning_rate": 1.64929481077612e-05, "loss": 0.3423, "step": 5595 }, { "epoch": 1.103115141955836, "grad_norm": 0.5012478925347682, "learning_rate": 1.649176917794401e-05, "loss": 0.3283, "step": 5596 }, { "epoch": 1.1033123028391167, "grad_norm": 0.5109791832923377, "learning_rate": 1.649059009215684e-05, "loss": 0.3039, "step": 5597 }, { "epoch": 1.1035094637223974, "grad_norm": 0.5323841890352232, "learning_rate": 1.6489410850428017e-05, "loss": 0.3441, "step": 5598 }, { "epoch": 1.1037066246056781, "grad_norm": 0.5666261888064066, "learning_rate": 1.6488231452785867e-05, "loss": 0.3586, "step": 5599 }, { "epoch": 1.103903785488959, "grad_norm": 0.49589632413833346, "learning_rate": 1.6487051899258738e-05, "loss": 0.3359, "step": 5600 }, { "epoch": 1.1041009463722398, "grad_norm": 0.522410156012698, "learning_rate": 1.6485872189874962e-05, "loss": 0.3242, "step": 5601 }, { "epoch": 1.1042981072555205, "grad_norm": 0.5598921164556169, "learning_rate": 1.6484692324662883e-05, "loss": 0.3353, "step": 5602 }, { "epoch": 1.1044952681388012, "grad_norm": 0.565537446173838, "learning_rate": 1.6483512303650847e-05, "loss": 0.3196, "step": 5603 }, { "epoch": 1.104692429022082, "grad_norm": 0.5611042309344503, "learning_rate": 1.648233212686721e-05, "loss": 0.3531, "step": 5604 }, { "epoch": 1.1048895899053628, "grad_norm": 0.6104352248715608, "learning_rate": 1.648115179434032e-05, "loss": 0.3411, "step": 5605 }, { "epoch": 1.1050867507886435, "grad_norm": 0.5182267968343534, "learning_rate": 1.647997130609854e-05, "loss": 0.3328, "step": 5606 }, { "epoch": 1.1052839116719242, "grad_norm": 0.6629279110385724, "learning_rate": 1.647879066217023e-05, "loss": 0.3664, "step": 5607 }, { "epoch": 1.1054810725552051, "grad_norm": 0.48783718160836653, "learning_rate": 1.6477609862583758e-05, "loss": 0.3184, "step": 5608 }, { "epoch": 1.1056782334384858, "grad_norm": 0.5388589810531627, "learning_rate": 1.6476428907367497e-05, "loss": 0.322, "step": 5609 }, { "epoch": 1.1058753943217665, "grad_norm": 0.6019162312791431, "learning_rate": 1.647524779654981e-05, "loss": 0.35, "step": 5610 }, { "epoch": 1.1060725552050474, "grad_norm": 0.5227308598811116, "learning_rate": 1.6474066530159083e-05, "loss": 0.346, "step": 5611 }, { "epoch": 1.1062697160883281, "grad_norm": 0.5024291222077405, "learning_rate": 1.6472885108223694e-05, "loss": 0.3115, "step": 5612 }, { "epoch": 1.1064668769716088, "grad_norm": 0.5971378494359267, "learning_rate": 1.647170353077203e-05, "loss": 0.3421, "step": 5613 }, { "epoch": 1.1066640378548895, "grad_norm": 0.5323688610636911, "learning_rate": 1.647052179783247e-05, "loss": 0.3685, "step": 5614 }, { "epoch": 1.1068611987381702, "grad_norm": 0.5494630004599759, "learning_rate": 1.646933990943342e-05, "loss": 0.3596, "step": 5615 }, { "epoch": 1.1070583596214512, "grad_norm": 0.5315790166072238, "learning_rate": 1.6468157865603265e-05, "loss": 0.326, "step": 5616 }, { "epoch": 1.1072555205047319, "grad_norm": 1.095318117653315, "learning_rate": 1.646697566637041e-05, "loss": 0.3391, "step": 5617 }, { "epoch": 1.1074526813880126, "grad_norm": 0.4910427373938156, "learning_rate": 1.6465793311763255e-05, "loss": 0.3082, "step": 5618 }, { "epoch": 1.1076498422712935, "grad_norm": 0.5479397492318889, "learning_rate": 1.646461080181021e-05, "loss": 0.3344, "step": 5619 }, { "epoch": 1.1078470031545742, "grad_norm": 0.5103248145847307, "learning_rate": 1.6463428136539684e-05, "loss": 0.324, "step": 5620 }, { "epoch": 1.1080441640378549, "grad_norm": 0.5630855688757884, "learning_rate": 1.6462245315980094e-05, "loss": 0.3275, "step": 5621 }, { "epoch": 1.1082413249211356, "grad_norm": 0.5263509918106348, "learning_rate": 1.6461062340159853e-05, "loss": 0.3189, "step": 5622 }, { "epoch": 1.1084384858044165, "grad_norm": 0.6493282607434927, "learning_rate": 1.6459879209107394e-05, "loss": 0.3394, "step": 5623 }, { "epoch": 1.1086356466876972, "grad_norm": 0.6049538585958488, "learning_rate": 1.6458695922851126e-05, "loss": 0.3308, "step": 5624 }, { "epoch": 1.108832807570978, "grad_norm": 0.531824713225462, "learning_rate": 1.6457512481419492e-05, "loss": 0.3412, "step": 5625 }, { "epoch": 1.1090299684542586, "grad_norm": 0.5404382119881224, "learning_rate": 1.6456328884840917e-05, "loss": 0.3427, "step": 5626 }, { "epoch": 1.1092271293375395, "grad_norm": 0.5405048280095781, "learning_rate": 1.6455145133143843e-05, "loss": 0.3322, "step": 5627 }, { "epoch": 1.1094242902208202, "grad_norm": 0.5808034796183005, "learning_rate": 1.645396122635671e-05, "loss": 0.3764, "step": 5628 }, { "epoch": 1.109621451104101, "grad_norm": 0.5655517916406795, "learning_rate": 1.6452777164507957e-05, "loss": 0.3236, "step": 5629 }, { "epoch": 1.1098186119873816, "grad_norm": 0.5394522259534293, "learning_rate": 1.6451592947626043e-05, "loss": 0.3317, "step": 5630 }, { "epoch": 1.1100157728706626, "grad_norm": 0.5151963061557067, "learning_rate": 1.6450408575739407e-05, "loss": 0.3467, "step": 5631 }, { "epoch": 1.1102129337539433, "grad_norm": 0.5279890856885844, "learning_rate": 1.6449224048876512e-05, "loss": 0.3258, "step": 5632 }, { "epoch": 1.110410094637224, "grad_norm": 0.5779114047767783, "learning_rate": 1.6448039367065816e-05, "loss": 0.3662, "step": 5633 }, { "epoch": 1.1106072555205047, "grad_norm": 0.5019131557840137, "learning_rate": 1.6446854530335783e-05, "loss": 0.3201, "step": 5634 }, { "epoch": 1.1108044164037856, "grad_norm": 0.5524102955169621, "learning_rate": 1.6445669538714878e-05, "loss": 0.3678, "step": 5635 }, { "epoch": 1.1110015772870663, "grad_norm": 0.5274386217024158, "learning_rate": 1.6444484392231574e-05, "loss": 0.3452, "step": 5636 }, { "epoch": 1.111198738170347, "grad_norm": 0.5065419330922859, "learning_rate": 1.6443299090914336e-05, "loss": 0.3135, "step": 5637 }, { "epoch": 1.1113958990536277, "grad_norm": 0.5321762740305643, "learning_rate": 1.6442113634791653e-05, "loss": 0.36, "step": 5638 }, { "epoch": 1.1115930599369086, "grad_norm": 0.4997951806163294, "learning_rate": 1.6440928023892e-05, "loss": 0.3222, "step": 5639 }, { "epoch": 1.1117902208201893, "grad_norm": 0.4876569661311848, "learning_rate": 1.643974225824387e-05, "loss": 0.3212, "step": 5640 }, { "epoch": 1.11198738170347, "grad_norm": 0.5216914599856519, "learning_rate": 1.643855633787574e-05, "loss": 0.3373, "step": 5641 }, { "epoch": 1.1121845425867507, "grad_norm": 0.48155980759723177, "learning_rate": 1.643737026281611e-05, "loss": 0.3003, "step": 5642 }, { "epoch": 1.1123817034700316, "grad_norm": 0.5168478481124772, "learning_rate": 1.643618403309348e-05, "loss": 0.3435, "step": 5643 }, { "epoch": 1.1125788643533123, "grad_norm": 0.5105355384691713, "learning_rate": 1.643499764873634e-05, "loss": 0.3098, "step": 5644 }, { "epoch": 1.112776025236593, "grad_norm": 0.5395714096049974, "learning_rate": 1.6433811109773202e-05, "loss": 0.3495, "step": 5645 }, { "epoch": 1.1129731861198737, "grad_norm": 0.5042000786890821, "learning_rate": 1.643262441623257e-05, "loss": 0.3231, "step": 5646 }, { "epoch": 1.1131703470031546, "grad_norm": 0.5278311430339191, "learning_rate": 1.6431437568142956e-05, "loss": 0.3529, "step": 5647 }, { "epoch": 1.1133675078864353, "grad_norm": 0.5180324474360586, "learning_rate": 1.6430250565532878e-05, "loss": 0.3456, "step": 5648 }, { "epoch": 1.113564668769716, "grad_norm": 0.5081894704748161, "learning_rate": 1.642906340843085e-05, "loss": 0.3288, "step": 5649 }, { "epoch": 1.1137618296529967, "grad_norm": 0.6664115588691337, "learning_rate": 1.6427876096865394e-05, "loss": 0.3455, "step": 5650 }, { "epoch": 1.1139589905362777, "grad_norm": 3.5585025573079028, "learning_rate": 1.642668863086504e-05, "loss": 0.3337, "step": 5651 }, { "epoch": 1.1141561514195584, "grad_norm": 0.5420249391026032, "learning_rate": 1.642550101045832e-05, "loss": 0.3192, "step": 5652 }, { "epoch": 1.114353312302839, "grad_norm": 0.5862664575213457, "learning_rate": 1.6424313235673758e-05, "loss": 0.3308, "step": 5653 }, { "epoch": 1.1145504731861198, "grad_norm": 1.8318481156437365, "learning_rate": 1.6423125306539903e-05, "loss": 0.3252, "step": 5654 }, { "epoch": 1.1147476340694007, "grad_norm": 0.6180923856298878, "learning_rate": 1.6421937223085284e-05, "loss": 0.3386, "step": 5655 }, { "epoch": 1.1149447949526814, "grad_norm": 0.5427448619206314, "learning_rate": 1.6420748985338454e-05, "loss": 0.3212, "step": 5656 }, { "epoch": 1.115141955835962, "grad_norm": 0.5267157193078928, "learning_rate": 1.641956059332796e-05, "loss": 0.3326, "step": 5657 }, { "epoch": 1.1153391167192428, "grad_norm": 0.5512555264071693, "learning_rate": 1.6418372047082352e-05, "loss": 0.3224, "step": 5658 }, { "epoch": 1.1155362776025237, "grad_norm": 0.5720422317475261, "learning_rate": 1.6417183346630188e-05, "loss": 0.334, "step": 5659 }, { "epoch": 1.1157334384858044, "grad_norm": 0.5754561614830481, "learning_rate": 1.6415994492000026e-05, "loss": 0.3141, "step": 5660 }, { "epoch": 1.1159305993690851, "grad_norm": 0.6375296150804535, "learning_rate": 1.6414805483220426e-05, "loss": 0.3275, "step": 5661 }, { "epoch": 1.116127760252366, "grad_norm": 0.509049237537517, "learning_rate": 1.641361632031996e-05, "loss": 0.3239, "step": 5662 }, { "epoch": 1.1163249211356467, "grad_norm": 0.5128636795892693, "learning_rate": 1.64124270033272e-05, "loss": 0.3159, "step": 5663 }, { "epoch": 1.1165220820189274, "grad_norm": 0.4912807390670567, "learning_rate": 1.6411237532270718e-05, "loss": 0.3144, "step": 5664 }, { "epoch": 1.1167192429022081, "grad_norm": 0.5397224874275655, "learning_rate": 1.641004790717909e-05, "loss": 0.3313, "step": 5665 }, { "epoch": 1.1169164037854888, "grad_norm": 0.5253163708246812, "learning_rate": 1.64088581280809e-05, "loss": 0.3363, "step": 5666 }, { "epoch": 1.1171135646687698, "grad_norm": 0.6246187071629071, "learning_rate": 1.6407668195004726e-05, "loss": 0.3517, "step": 5667 }, { "epoch": 1.1173107255520505, "grad_norm": 0.510212683182266, "learning_rate": 1.640647810797917e-05, "loss": 0.3423, "step": 5668 }, { "epoch": 1.1175078864353312, "grad_norm": 0.5433994977386873, "learning_rate": 1.6405287867032816e-05, "loss": 0.3364, "step": 5669 }, { "epoch": 1.117705047318612, "grad_norm": 0.5114129587959397, "learning_rate": 1.6404097472194264e-05, "loss": 0.3212, "step": 5670 }, { "epoch": 1.1179022082018928, "grad_norm": 0.6184701890817311, "learning_rate": 1.6402906923492113e-05, "loss": 0.3586, "step": 5671 }, { "epoch": 1.1180993690851735, "grad_norm": 0.5157737817133912, "learning_rate": 1.6401716220954968e-05, "loss": 0.3466, "step": 5672 }, { "epoch": 1.1182965299684542, "grad_norm": 0.5239425894073806, "learning_rate": 1.640052536461143e-05, "loss": 0.3367, "step": 5673 }, { "epoch": 1.1184936908517351, "grad_norm": 0.6216718049365948, "learning_rate": 1.6399334354490123e-05, "loss": 0.3495, "step": 5674 }, { "epoch": 1.1186908517350158, "grad_norm": 0.5111026021999556, "learning_rate": 1.6398143190619648e-05, "loss": 0.3212, "step": 5675 }, { "epoch": 1.1188880126182965, "grad_norm": 0.49466574177659056, "learning_rate": 1.6396951873028634e-05, "loss": 0.3287, "step": 5676 }, { "epoch": 1.1190851735015772, "grad_norm": 0.48459040865141534, "learning_rate": 1.63957604017457e-05, "loss": 0.3209, "step": 5677 }, { "epoch": 1.1192823343848581, "grad_norm": 0.5002774049961711, "learning_rate": 1.6394568776799472e-05, "loss": 0.3325, "step": 5678 }, { "epoch": 1.1194794952681388, "grad_norm": 0.49713568658847396, "learning_rate": 1.6393376998218583e-05, "loss": 0.3456, "step": 5679 }, { "epoch": 1.1196766561514195, "grad_norm": 0.49493207831234715, "learning_rate": 1.6392185066031657e-05, "loss": 0.3127, "step": 5680 }, { "epoch": 1.1198738170347002, "grad_norm": 0.5362871237694371, "learning_rate": 1.639099298026734e-05, "loss": 0.3451, "step": 5681 }, { "epoch": 1.1200709779179812, "grad_norm": 0.5294651321602112, "learning_rate": 1.6389800740954268e-05, "loss": 0.362, "step": 5682 }, { "epoch": 1.1202681388012619, "grad_norm": 0.48735850316187274, "learning_rate": 1.6388608348121088e-05, "loss": 0.341, "step": 5683 }, { "epoch": 1.1204652996845426, "grad_norm": 0.5248544933902106, "learning_rate": 1.638741580179645e-05, "loss": 0.3176, "step": 5684 }, { "epoch": 1.1206624605678233, "grad_norm": 0.5439891987403144, "learning_rate": 1.6386223102009e-05, "loss": 0.359, "step": 5685 }, { "epoch": 1.1208596214511042, "grad_norm": 0.5011856573812568, "learning_rate": 1.6385030248787402e-05, "loss": 0.33, "step": 5686 }, { "epoch": 1.1210567823343849, "grad_norm": 0.4973353261615954, "learning_rate": 1.6383837242160304e-05, "loss": 0.3253, "step": 5687 }, { "epoch": 1.1212539432176656, "grad_norm": 0.5271878211430877, "learning_rate": 1.638264408215638e-05, "loss": 0.3273, "step": 5688 }, { "epoch": 1.1214511041009463, "grad_norm": 0.5007462516405465, "learning_rate": 1.6381450768804293e-05, "loss": 0.3566, "step": 5689 }, { "epoch": 1.1216482649842272, "grad_norm": 0.5032257739082565, "learning_rate": 1.638025730213271e-05, "loss": 0.3189, "step": 5690 }, { "epoch": 1.121845425867508, "grad_norm": 0.5057418106825747, "learning_rate": 1.637906368217031e-05, "loss": 0.3296, "step": 5691 }, { "epoch": 1.1220425867507886, "grad_norm": 0.5220864858440248, "learning_rate": 1.6377869908945763e-05, "loss": 0.3013, "step": 5692 }, { "epoch": 1.1222397476340693, "grad_norm": 0.49972633540948586, "learning_rate": 1.637667598248776e-05, "loss": 0.3246, "step": 5693 }, { "epoch": 1.1224369085173502, "grad_norm": 0.48521860052546617, "learning_rate": 1.6375481902824975e-05, "loss": 0.3067, "step": 5694 }, { "epoch": 1.122634069400631, "grad_norm": 0.5137855325686259, "learning_rate": 1.6374287669986108e-05, "loss": 0.333, "step": 5695 }, { "epoch": 1.1228312302839116, "grad_norm": 0.5053341438549699, "learning_rate": 1.637309328399985e-05, "loss": 0.3131, "step": 5696 }, { "epoch": 1.1230283911671923, "grad_norm": 0.4811130427810911, "learning_rate": 1.6371898744894888e-05, "loss": 0.2932, "step": 5697 }, { "epoch": 1.1232255520504733, "grad_norm": 0.4692455927554586, "learning_rate": 1.6370704052699927e-05, "loss": 0.3081, "step": 5698 }, { "epoch": 1.123422712933754, "grad_norm": 0.46390772652017165, "learning_rate": 1.6369509207443676e-05, "loss": 0.2891, "step": 5699 }, { "epoch": 1.1236198738170347, "grad_norm": 0.5387624182052491, "learning_rate": 1.6368314209154836e-05, "loss": 0.357, "step": 5700 }, { "epoch": 1.1238170347003154, "grad_norm": 0.4959986328725572, "learning_rate": 1.6367119057862115e-05, "loss": 0.3191, "step": 5701 }, { "epoch": 1.1240141955835963, "grad_norm": 0.5071814974603067, "learning_rate": 1.636592375359423e-05, "loss": 0.3324, "step": 5702 }, { "epoch": 1.124211356466877, "grad_norm": 0.5104341041644923, "learning_rate": 1.6364728296379904e-05, "loss": 0.3506, "step": 5703 }, { "epoch": 1.1244085173501577, "grad_norm": 0.5464624679656105, "learning_rate": 1.6363532686247853e-05, "loss": 0.3123, "step": 5704 }, { "epoch": 1.1246056782334386, "grad_norm": 0.5028129404728894, "learning_rate": 1.636233692322681e-05, "loss": 0.307, "step": 5705 }, { "epoch": 1.1248028391167193, "grad_norm": 0.6152702419910974, "learning_rate": 1.6361141007345494e-05, "loss": 0.3553, "step": 5706 }, { "epoch": 1.125, "grad_norm": 0.48179193603259657, "learning_rate": 1.6359944938632645e-05, "loss": 0.3147, "step": 5707 }, { "epoch": 1.1251971608832807, "grad_norm": 0.5598824963334975, "learning_rate": 1.6358748717116993e-05, "loss": 0.3402, "step": 5708 }, { "epoch": 1.1253943217665614, "grad_norm": 0.5571114963157853, "learning_rate": 1.6357552342827284e-05, "loss": 0.3206, "step": 5709 }, { "epoch": 1.1255914826498423, "grad_norm": 0.4925155006030104, "learning_rate": 1.6356355815792263e-05, "loss": 0.3077, "step": 5710 }, { "epoch": 1.125788643533123, "grad_norm": 0.6517198961949976, "learning_rate": 1.635515913604067e-05, "loss": 0.3521, "step": 5711 }, { "epoch": 1.1259858044164037, "grad_norm": 0.5747480536099305, "learning_rate": 1.6353962303601266e-05, "loss": 0.364, "step": 5712 }, { "epoch": 1.1261829652996846, "grad_norm": 0.5712666247584104, "learning_rate": 1.63527653185028e-05, "loss": 0.3705, "step": 5713 }, { "epoch": 1.1263801261829653, "grad_norm": 0.5587416233409346, "learning_rate": 1.6351568180774033e-05, "loss": 0.319, "step": 5714 }, { "epoch": 1.126577287066246, "grad_norm": 0.5358691155957552, "learning_rate": 1.635037089044372e-05, "loss": 0.3336, "step": 5715 }, { "epoch": 1.1267744479495267, "grad_norm": 0.5829710216948807, "learning_rate": 1.6349173447540634e-05, "loss": 0.3508, "step": 5716 }, { "epoch": 1.1269716088328074, "grad_norm": 0.5281339705441374, "learning_rate": 1.6347975852093544e-05, "loss": 0.3266, "step": 5717 }, { "epoch": 1.1271687697160884, "grad_norm": 0.5300750941636183, "learning_rate": 1.6346778104131222e-05, "loss": 0.3367, "step": 5718 }, { "epoch": 1.127365930599369, "grad_norm": 0.5660626646152135, "learning_rate": 1.634558020368245e-05, "loss": 0.3543, "step": 5719 }, { "epoch": 1.1275630914826498, "grad_norm": 0.4900690395273459, "learning_rate": 1.6344382150775994e-05, "loss": 0.2959, "step": 5720 }, { "epoch": 1.1277602523659307, "grad_norm": 0.5929670034437629, "learning_rate": 1.6343183945440656e-05, "loss": 0.3446, "step": 5721 }, { "epoch": 1.1279574132492114, "grad_norm": 0.5581429670822288, "learning_rate": 1.634198558770521e-05, "loss": 0.3424, "step": 5722 }, { "epoch": 1.128154574132492, "grad_norm": 0.5554338618967727, "learning_rate": 1.6340787077598456e-05, "loss": 0.3407, "step": 5723 }, { "epoch": 1.1283517350157728, "grad_norm": 0.4917238000036681, "learning_rate": 1.6339588415149186e-05, "loss": 0.3234, "step": 5724 }, { "epoch": 1.1285488958990537, "grad_norm": 0.6106081461017013, "learning_rate": 1.63383896003862e-05, "loss": 0.3239, "step": 5725 }, { "epoch": 1.1287460567823344, "grad_norm": 0.5010753713189542, "learning_rate": 1.6337190633338294e-05, "loss": 0.3191, "step": 5726 }, { "epoch": 1.1289432176656151, "grad_norm": 0.48674816598759957, "learning_rate": 1.6335991514034283e-05, "loss": 0.2984, "step": 5727 }, { "epoch": 1.1291403785488958, "grad_norm": 0.505286321715634, "learning_rate": 1.6334792242502978e-05, "loss": 0.3447, "step": 5728 }, { "epoch": 1.1293375394321767, "grad_norm": 0.5524393690270945, "learning_rate": 1.633359281877318e-05, "loss": 0.3392, "step": 5729 }, { "epoch": 1.1295347003154574, "grad_norm": 0.49196800593653534, "learning_rate": 1.633239324287372e-05, "loss": 0.3213, "step": 5730 }, { "epoch": 1.1297318611987381, "grad_norm": 0.5143287994894329, "learning_rate": 1.6331193514833413e-05, "loss": 0.3173, "step": 5731 }, { "epoch": 1.1299290220820188, "grad_norm": 0.5176908223126655, "learning_rate": 1.632999363468108e-05, "loss": 0.331, "step": 5732 }, { "epoch": 1.1301261829652998, "grad_norm": 0.5267013800473567, "learning_rate": 1.6328793602445553e-05, "loss": 0.3564, "step": 5733 }, { "epoch": 1.1303233438485805, "grad_norm": 0.5075643399395108, "learning_rate": 1.6327593418155667e-05, "loss": 0.3161, "step": 5734 }, { "epoch": 1.1305205047318612, "grad_norm": 0.5262025072548631, "learning_rate": 1.632639308184025e-05, "loss": 0.3345, "step": 5735 }, { "epoch": 1.1307176656151419, "grad_norm": 0.5292704648957198, "learning_rate": 1.632519259352814e-05, "loss": 0.3492, "step": 5736 }, { "epoch": 1.1309148264984228, "grad_norm": 0.48829983713727304, "learning_rate": 1.6323991953248192e-05, "loss": 0.3171, "step": 5737 }, { "epoch": 1.1311119873817035, "grad_norm": 0.4988512975455296, "learning_rate": 1.6322791161029245e-05, "loss": 0.3317, "step": 5738 }, { "epoch": 1.1313091482649842, "grad_norm": 0.5269442131354157, "learning_rate": 1.6321590216900146e-05, "loss": 0.3308, "step": 5739 }, { "epoch": 1.1315063091482649, "grad_norm": 0.4902800227785573, "learning_rate": 1.632038912088975e-05, "loss": 0.3061, "step": 5740 }, { "epoch": 1.1317034700315458, "grad_norm": 0.5247877126830968, "learning_rate": 1.6319187873026917e-05, "loss": 0.3197, "step": 5741 }, { "epoch": 1.1319006309148265, "grad_norm": 0.5541688059243428, "learning_rate": 1.6317986473340504e-05, "loss": 0.3295, "step": 5742 }, { "epoch": 1.1320977917981072, "grad_norm": 0.5564999524149586, "learning_rate": 1.631678492185938e-05, "loss": 0.3419, "step": 5743 }, { "epoch": 1.132294952681388, "grad_norm": 0.4751388780686587, "learning_rate": 1.631558321861241e-05, "loss": 0.3068, "step": 5744 }, { "epoch": 1.1324921135646688, "grad_norm": 0.5251581221447832, "learning_rate": 1.631438136362847e-05, "loss": 0.3438, "step": 5745 }, { "epoch": 1.1326892744479495, "grad_norm": 0.49557848543104777, "learning_rate": 1.6313179356936432e-05, "loss": 0.3336, "step": 5746 }, { "epoch": 1.1328864353312302, "grad_norm": 0.5759205928280032, "learning_rate": 1.631197719856518e-05, "loss": 0.341, "step": 5747 }, { "epoch": 1.1330835962145112, "grad_norm": 0.5279003018602465, "learning_rate": 1.6310774888543584e-05, "loss": 0.3361, "step": 5748 }, { "epoch": 1.1332807570977919, "grad_norm": 0.5105221641137494, "learning_rate": 1.6309572426900544e-05, "loss": 0.3463, "step": 5749 }, { "epoch": 1.1334779179810726, "grad_norm": 0.4979241157777894, "learning_rate": 1.6308369813664945e-05, "loss": 0.3316, "step": 5750 }, { "epoch": 1.1336750788643533, "grad_norm": 0.480147781148586, "learning_rate": 1.630716704886568e-05, "loss": 0.3278, "step": 5751 }, { "epoch": 1.133872239747634, "grad_norm": 0.5190158936682682, "learning_rate": 1.630596413253165e-05, "loss": 0.3487, "step": 5752 }, { "epoch": 1.1340694006309149, "grad_norm": 0.48721620702672924, "learning_rate": 1.6304761064691752e-05, "loss": 0.331, "step": 5753 }, { "epoch": 1.1342665615141956, "grad_norm": 0.48669878145102674, "learning_rate": 1.6303557845374894e-05, "loss": 0.3157, "step": 5754 }, { "epoch": 1.1344637223974763, "grad_norm": 0.48762645113611275, "learning_rate": 1.630235447460998e-05, "loss": 0.3414, "step": 5755 }, { "epoch": 1.1346608832807572, "grad_norm": 0.45531042840793384, "learning_rate": 1.6301150952425925e-05, "loss": 0.3097, "step": 5756 }, { "epoch": 1.134858044164038, "grad_norm": 0.5085651272774692, "learning_rate": 1.6299947278851644e-05, "loss": 0.333, "step": 5757 }, { "epoch": 1.1350552050473186, "grad_norm": 0.4893879791327469, "learning_rate": 1.6298743453916057e-05, "loss": 0.3044, "step": 5758 }, { "epoch": 1.1352523659305993, "grad_norm": 0.5589441744381375, "learning_rate": 1.6297539477648087e-05, "loss": 0.3467, "step": 5759 }, { "epoch": 1.13544952681388, "grad_norm": 0.4815754124533146, "learning_rate": 1.6296335350076658e-05, "loss": 0.3328, "step": 5760 }, { "epoch": 1.135646687697161, "grad_norm": 0.4867892480819176, "learning_rate": 1.6295131071230704e-05, "loss": 0.3256, "step": 5761 }, { "epoch": 1.1358438485804416, "grad_norm": 0.5067944820954798, "learning_rate": 1.6293926641139154e-05, "loss": 0.3323, "step": 5762 }, { "epoch": 1.1360410094637223, "grad_norm": 0.5113632611165255, "learning_rate": 1.629272205983095e-05, "loss": 0.3311, "step": 5763 }, { "epoch": 1.1362381703470033, "grad_norm": 0.5560655366878887, "learning_rate": 1.6291517327335027e-05, "loss": 0.3531, "step": 5764 }, { "epoch": 1.136435331230284, "grad_norm": 0.5079516665115742, "learning_rate": 1.6290312443680335e-05, "loss": 0.3268, "step": 5765 }, { "epoch": 1.1366324921135647, "grad_norm": 0.49476357370054935, "learning_rate": 1.6289107408895827e-05, "loss": 0.3199, "step": 5766 }, { "epoch": 1.1368296529968454, "grad_norm": 0.5068561554934843, "learning_rate": 1.6287902223010442e-05, "loss": 0.3093, "step": 5767 }, { "epoch": 1.1370268138801263, "grad_norm": 0.5241913684270388, "learning_rate": 1.6286696886053146e-05, "loss": 0.3442, "step": 5768 }, { "epoch": 1.137223974763407, "grad_norm": 0.5303723352436314, "learning_rate": 1.6285491398052892e-05, "loss": 0.3473, "step": 5769 }, { "epoch": 1.1374211356466877, "grad_norm": 0.5013219330856153, "learning_rate": 1.6284285759038647e-05, "loss": 0.3334, "step": 5770 }, { "epoch": 1.1376182965299684, "grad_norm": 0.5116295193399402, "learning_rate": 1.6283079969039376e-05, "loss": 0.3348, "step": 5771 }, { "epoch": 1.1378154574132493, "grad_norm": 0.5072830412189899, "learning_rate": 1.6281874028084048e-05, "loss": 0.3304, "step": 5772 }, { "epoch": 1.13801261829653, "grad_norm": 0.5969770845470891, "learning_rate": 1.6280667936201638e-05, "loss": 0.3693, "step": 5773 }, { "epoch": 1.1382097791798107, "grad_norm": 0.49707202160221337, "learning_rate": 1.6279461693421122e-05, "loss": 0.3092, "step": 5774 }, { "epoch": 1.1384069400630914, "grad_norm": 0.48306958026968283, "learning_rate": 1.6278255299771485e-05, "loss": 0.3133, "step": 5775 }, { "epoch": 1.1386041009463723, "grad_norm": 0.49732568766268076, "learning_rate": 1.6277048755281706e-05, "loss": 0.3133, "step": 5776 }, { "epoch": 1.138801261829653, "grad_norm": 0.49440667619738315, "learning_rate": 1.6275842059980777e-05, "loss": 0.3081, "step": 5777 }, { "epoch": 1.1389984227129337, "grad_norm": 0.49394133306858135, "learning_rate": 1.627463521389769e-05, "loss": 0.3075, "step": 5778 }, { "epoch": 1.1391955835962144, "grad_norm": 0.5006133223548448, "learning_rate": 1.6273428217061438e-05, "loss": 0.3382, "step": 5779 }, { "epoch": 1.1393927444794953, "grad_norm": 0.5094072983950105, "learning_rate": 1.627222106950102e-05, "loss": 0.3195, "step": 5780 }, { "epoch": 1.139589905362776, "grad_norm": 0.4911846217605256, "learning_rate": 1.627101377124544e-05, "loss": 0.3157, "step": 5781 }, { "epoch": 1.1397870662460567, "grad_norm": 0.5197657629535235, "learning_rate": 1.626980632232371e-05, "loss": 0.3455, "step": 5782 }, { "epoch": 1.1399842271293374, "grad_norm": 0.5165791218182875, "learning_rate": 1.6268598722764825e-05, "loss": 0.3571, "step": 5783 }, { "epoch": 1.1401813880126184, "grad_norm": 0.4911136281917946, "learning_rate": 1.6267390972597808e-05, "loss": 0.3258, "step": 5784 }, { "epoch": 1.140378548895899, "grad_norm": 0.4791334939004954, "learning_rate": 1.626618307185168e-05, "loss": 0.313, "step": 5785 }, { "epoch": 1.1405757097791798, "grad_norm": 0.507257163220278, "learning_rate": 1.626497502055546e-05, "loss": 0.3348, "step": 5786 }, { "epoch": 1.1407728706624605, "grad_norm": 0.5095261380886309, "learning_rate": 1.6263766818738163e-05, "loss": 0.3352, "step": 5787 }, { "epoch": 1.1409700315457414, "grad_norm": 0.489168264070942, "learning_rate": 1.6262558466428827e-05, "loss": 0.3181, "step": 5788 }, { "epoch": 1.141167192429022, "grad_norm": 0.5758632623492379, "learning_rate": 1.6261349963656482e-05, "loss": 0.3221, "step": 5789 }, { "epoch": 1.1413643533123028, "grad_norm": 0.5155189277830298, "learning_rate": 1.6260141310450158e-05, "loss": 0.3378, "step": 5790 }, { "epoch": 1.1415615141955837, "grad_norm": 0.5361821367760298, "learning_rate": 1.6258932506838903e-05, "loss": 0.3548, "step": 5791 }, { "epoch": 1.1417586750788644, "grad_norm": 0.510073475492786, "learning_rate": 1.6257723552851752e-05, "loss": 0.3259, "step": 5792 }, { "epoch": 1.1419558359621451, "grad_norm": 0.5181336652283103, "learning_rate": 1.6256514448517753e-05, "loss": 0.3181, "step": 5793 }, { "epoch": 1.1421529968454258, "grad_norm": 0.5218135564565195, "learning_rate": 1.6255305193865957e-05, "loss": 0.3269, "step": 5794 }, { "epoch": 1.1423501577287065, "grad_norm": 0.6090159220142208, "learning_rate": 1.6254095788925413e-05, "loss": 0.3487, "step": 5795 }, { "epoch": 1.1425473186119874, "grad_norm": 0.4918572565910208, "learning_rate": 1.6252886233725186e-05, "loss": 0.3222, "step": 5796 }, { "epoch": 1.1427444794952681, "grad_norm": 0.5170092942493402, "learning_rate": 1.625167652829433e-05, "loss": 0.3075, "step": 5797 }, { "epoch": 1.1429416403785488, "grad_norm": 0.5153552400826925, "learning_rate": 1.625046667266191e-05, "loss": 0.3552, "step": 5798 }, { "epoch": 1.1431388012618298, "grad_norm": 0.5092426572265226, "learning_rate": 1.6249256666856995e-05, "loss": 0.2974, "step": 5799 }, { "epoch": 1.1433359621451105, "grad_norm": 0.5381814851040011, "learning_rate": 1.6248046510908654e-05, "loss": 0.3186, "step": 5800 }, { "epoch": 1.1435331230283912, "grad_norm": 0.5393871623205769, "learning_rate": 1.6246836204845967e-05, "loss": 0.3503, "step": 5801 }, { "epoch": 1.1437302839116719, "grad_norm": 0.5047753301047979, "learning_rate": 1.624562574869801e-05, "loss": 0.3232, "step": 5802 }, { "epoch": 1.1439274447949526, "grad_norm": 0.5134380411824181, "learning_rate": 1.6244415142493867e-05, "loss": 0.3374, "step": 5803 }, { "epoch": 1.1441246056782335, "grad_norm": 0.5382530413103754, "learning_rate": 1.6243204386262618e-05, "loss": 0.3472, "step": 5804 }, { "epoch": 1.1443217665615142, "grad_norm": 0.4860783558028898, "learning_rate": 1.6241993480033353e-05, "loss": 0.2894, "step": 5805 }, { "epoch": 1.1445189274447949, "grad_norm": 0.5360606741317754, "learning_rate": 1.6240782423835174e-05, "loss": 0.3526, "step": 5806 }, { "epoch": 1.1447160883280758, "grad_norm": 0.4950368935661035, "learning_rate": 1.6239571217697164e-05, "loss": 0.3282, "step": 5807 }, { "epoch": 1.1449132492113565, "grad_norm": 0.5007041011557887, "learning_rate": 1.6238359861648438e-05, "loss": 0.3199, "step": 5808 }, { "epoch": 1.1451104100946372, "grad_norm": 0.4866543775966263, "learning_rate": 1.6237148355718092e-05, "loss": 0.3306, "step": 5809 }, { "epoch": 1.145307570977918, "grad_norm": 0.4824258516786147, "learning_rate": 1.623593669993523e-05, "loss": 0.3098, "step": 5810 }, { "epoch": 1.1455047318611988, "grad_norm": 0.4805879120897678, "learning_rate": 1.623472489432897e-05, "loss": 0.3024, "step": 5811 }, { "epoch": 1.1457018927444795, "grad_norm": 0.5450607872650646, "learning_rate": 1.623351293892842e-05, "loss": 0.3426, "step": 5812 }, { "epoch": 1.1458990536277602, "grad_norm": 0.49414340389336664, "learning_rate": 1.623230083376271e-05, "loss": 0.3353, "step": 5813 }, { "epoch": 1.146096214511041, "grad_norm": 0.5993703898368199, "learning_rate": 1.6231088578860946e-05, "loss": 0.3437, "step": 5814 }, { "epoch": 1.1462933753943219, "grad_norm": 0.5174297786339515, "learning_rate": 1.6229876174252265e-05, "loss": 0.3284, "step": 5815 }, { "epoch": 1.1464905362776026, "grad_norm": 0.49563315628009424, "learning_rate": 1.6228663619965787e-05, "loss": 0.316, "step": 5816 }, { "epoch": 1.1466876971608833, "grad_norm": 0.5185380709534432, "learning_rate": 1.6227450916030655e-05, "loss": 0.3324, "step": 5817 }, { "epoch": 1.146884858044164, "grad_norm": 0.517728568283838, "learning_rate": 1.6226238062476e-05, "loss": 0.3301, "step": 5818 }, { "epoch": 1.1470820189274449, "grad_norm": 0.5337322717327413, "learning_rate": 1.6225025059330954e-05, "loss": 0.3549, "step": 5819 }, { "epoch": 1.1472791798107256, "grad_norm": 0.51659644720446, "learning_rate": 1.6223811906624675e-05, "loss": 0.3304, "step": 5820 }, { "epoch": 1.1474763406940063, "grad_norm": 0.5135534115538601, "learning_rate": 1.6222598604386303e-05, "loss": 0.3253, "step": 5821 }, { "epoch": 1.147673501577287, "grad_norm": 0.5683240033324098, "learning_rate": 1.6221385152644986e-05, "loss": 0.3719, "step": 5822 }, { "epoch": 1.147870662460568, "grad_norm": 0.5006723049086741, "learning_rate": 1.622017155142988e-05, "loss": 0.3163, "step": 5823 }, { "epoch": 1.1480678233438486, "grad_norm": 0.5148405356777013, "learning_rate": 1.6218957800770146e-05, "loss": 0.3548, "step": 5824 }, { "epoch": 1.1482649842271293, "grad_norm": 0.5750083625058464, "learning_rate": 1.621774390069494e-05, "loss": 0.3561, "step": 5825 }, { "epoch": 1.14846214511041, "grad_norm": 0.5165474137396464, "learning_rate": 1.621652985123343e-05, "loss": 0.3339, "step": 5826 }, { "epoch": 1.148659305993691, "grad_norm": 0.5084589221237357, "learning_rate": 1.6215315652414786e-05, "loss": 0.3318, "step": 5827 }, { "epoch": 1.1488564668769716, "grad_norm": 0.5035248616427002, "learning_rate": 1.6214101304268177e-05, "loss": 0.338, "step": 5828 }, { "epoch": 1.1490536277602523, "grad_norm": 0.5544001188077334, "learning_rate": 1.621288680682278e-05, "loss": 0.3366, "step": 5829 }, { "epoch": 1.149250788643533, "grad_norm": 0.5160867252412951, "learning_rate": 1.6211672160107776e-05, "loss": 0.3518, "step": 5830 }, { "epoch": 1.149447949526814, "grad_norm": 0.5010643439725077, "learning_rate": 1.6210457364152345e-05, "loss": 0.3436, "step": 5831 }, { "epoch": 1.1496451104100947, "grad_norm": 0.5106652676242482, "learning_rate": 1.6209242418985673e-05, "loss": 0.3226, "step": 5832 }, { "epoch": 1.1498422712933754, "grad_norm": 0.48330155765068944, "learning_rate": 1.6208027324636956e-05, "loss": 0.3091, "step": 5833 }, { "epoch": 1.1500394321766563, "grad_norm": 0.5026909229195045, "learning_rate": 1.620681208113538e-05, "loss": 0.313, "step": 5834 }, { "epoch": 1.150236593059937, "grad_norm": 0.4919091752987136, "learning_rate": 1.6205596688510144e-05, "loss": 0.3118, "step": 5835 }, { "epoch": 1.1504337539432177, "grad_norm": 0.4987724584414972, "learning_rate": 1.6204381146790452e-05, "loss": 0.3159, "step": 5836 }, { "epoch": 1.1506309148264984, "grad_norm": 0.47351362341946696, "learning_rate": 1.6203165456005505e-05, "loss": 0.3215, "step": 5837 }, { "epoch": 1.150828075709779, "grad_norm": 0.5390815160396467, "learning_rate": 1.6201949616184515e-05, "loss": 0.3512, "step": 5838 }, { "epoch": 1.15102523659306, "grad_norm": 0.4907164796379062, "learning_rate": 1.620073362735669e-05, "loss": 0.3351, "step": 5839 }, { "epoch": 1.1512223974763407, "grad_norm": 0.5183309334339059, "learning_rate": 1.6199517489551246e-05, "loss": 0.3471, "step": 5840 }, { "epoch": 1.1514195583596214, "grad_norm": 0.5094113386762072, "learning_rate": 1.61983012027974e-05, "loss": 0.3393, "step": 5841 }, { "epoch": 1.1516167192429023, "grad_norm": 0.47916372568319, "learning_rate": 1.6197084767124378e-05, "loss": 0.3002, "step": 5842 }, { "epoch": 1.151813880126183, "grad_norm": 0.5554961928781669, "learning_rate": 1.619586818256141e-05, "loss": 0.3465, "step": 5843 }, { "epoch": 1.1520110410094637, "grad_norm": 0.4760523166959915, "learning_rate": 1.6194651449137708e-05, "loss": 0.315, "step": 5844 }, { "epoch": 1.1522082018927444, "grad_norm": 0.49772863471397966, "learning_rate": 1.6193434566882522e-05, "loss": 0.3392, "step": 5845 }, { "epoch": 1.1524053627760251, "grad_norm": 0.49931835312839523, "learning_rate": 1.6192217535825084e-05, "loss": 0.3259, "step": 5846 }, { "epoch": 1.152602523659306, "grad_norm": 0.4935523430213534, "learning_rate": 1.619100035599463e-05, "loss": 0.3188, "step": 5847 }, { "epoch": 1.1527996845425867, "grad_norm": 0.5009652113164703, "learning_rate": 1.618978302742041e-05, "loss": 0.3017, "step": 5848 }, { "epoch": 1.1529968454258674, "grad_norm": 0.513184184558755, "learning_rate": 1.6188565550131667e-05, "loss": 0.3389, "step": 5849 }, { "epoch": 1.1531940063091484, "grad_norm": 0.5120199704705086, "learning_rate": 1.6187347924157654e-05, "loss": 0.3272, "step": 5850 }, { "epoch": 1.153391167192429, "grad_norm": 0.55709606253943, "learning_rate": 1.618613014952762e-05, "loss": 0.3511, "step": 5851 }, { "epoch": 1.1535883280757098, "grad_norm": 0.528693155299282, "learning_rate": 1.6184912226270833e-05, "loss": 0.3246, "step": 5852 }, { "epoch": 1.1537854889589905, "grad_norm": 0.4926792899919863, "learning_rate": 1.6183694154416548e-05, "loss": 0.3401, "step": 5853 }, { "epoch": 1.1539826498422712, "grad_norm": 0.5247479545911596, "learning_rate": 1.618247593399403e-05, "loss": 0.3147, "step": 5854 }, { "epoch": 1.154179810725552, "grad_norm": 0.5391139938461473, "learning_rate": 1.6181257565032548e-05, "loss": 0.3424, "step": 5855 }, { "epoch": 1.1543769716088328, "grad_norm": 0.5379238676508531, "learning_rate": 1.6180039047561375e-05, "loss": 0.3489, "step": 5856 }, { "epoch": 1.1545741324921135, "grad_norm": 0.5433587033942305, "learning_rate": 1.6178820381609793e-05, "loss": 0.3583, "step": 5857 }, { "epoch": 1.1547712933753944, "grad_norm": 0.5515081493988264, "learning_rate": 1.617760156720707e-05, "loss": 0.3409, "step": 5858 }, { "epoch": 1.1549684542586751, "grad_norm": 0.5211754285676179, "learning_rate": 1.617638260438249e-05, "loss": 0.3441, "step": 5859 }, { "epoch": 1.1551656151419558, "grad_norm": 0.5728695227853102, "learning_rate": 1.6175163493165353e-05, "loss": 0.3692, "step": 5860 }, { "epoch": 1.1553627760252365, "grad_norm": 0.5398536734238296, "learning_rate": 1.6173944233584936e-05, "loss": 0.3333, "step": 5861 }, { "epoch": 1.1555599369085174, "grad_norm": 0.5261150430536126, "learning_rate": 1.6172724825670537e-05, "loss": 0.3279, "step": 5862 }, { "epoch": 1.1557570977917981, "grad_norm": 0.5106454983015667, "learning_rate": 1.6171505269451456e-05, "loss": 0.339, "step": 5863 }, { "epoch": 1.1559542586750788, "grad_norm": 0.5055443785259361, "learning_rate": 1.617028556495699e-05, "loss": 0.2946, "step": 5864 }, { "epoch": 1.1561514195583595, "grad_norm": 0.5803137026274476, "learning_rate": 1.6169065712216444e-05, "loss": 0.3789, "step": 5865 }, { "epoch": 1.1563485804416405, "grad_norm": 0.512878015699482, "learning_rate": 1.6167845711259123e-05, "loss": 0.3469, "step": 5866 }, { "epoch": 1.1565457413249212, "grad_norm": 0.5349976563998219, "learning_rate": 1.6166625562114347e-05, "loss": 0.3428, "step": 5867 }, { "epoch": 1.1567429022082019, "grad_norm": 0.46278679315128507, "learning_rate": 1.616540526481142e-05, "loss": 0.2879, "step": 5868 }, { "epoch": 1.1569400630914826, "grad_norm": 0.5228615238771711, "learning_rate": 1.6164184819379673e-05, "loss": 0.33, "step": 5869 }, { "epoch": 1.1571372239747635, "grad_norm": 0.5253461017754688, "learning_rate": 1.6162964225848416e-05, "loss": 0.3451, "step": 5870 }, { "epoch": 1.1573343848580442, "grad_norm": 0.5368330566994861, "learning_rate": 1.6161743484246987e-05, "loss": 0.32, "step": 5871 }, { "epoch": 1.1575315457413249, "grad_norm": 0.5655488658811362, "learning_rate": 1.6160522594604704e-05, "loss": 0.3267, "step": 5872 }, { "epoch": 1.1577287066246056, "grad_norm": 0.553771815896319, "learning_rate": 1.6159301556950904e-05, "loss": 0.3338, "step": 5873 }, { "epoch": 1.1579258675078865, "grad_norm": 0.49679032602454654, "learning_rate": 1.6158080371314926e-05, "loss": 0.3207, "step": 5874 }, { "epoch": 1.1581230283911672, "grad_norm": 0.5110047440222282, "learning_rate": 1.6156859037726108e-05, "loss": 0.3273, "step": 5875 }, { "epoch": 1.158320189274448, "grad_norm": 0.5164914172279298, "learning_rate": 1.6155637556213793e-05, "loss": 0.3248, "step": 5876 }, { "epoch": 1.1585173501577288, "grad_norm": 0.5120868579780778, "learning_rate": 1.6154415926807327e-05, "loss": 0.3603, "step": 5877 }, { "epoch": 1.1587145110410095, "grad_norm": 0.4585506611956392, "learning_rate": 1.6153194149536064e-05, "loss": 0.2974, "step": 5878 }, { "epoch": 1.1589116719242902, "grad_norm": 0.5616942590634053, "learning_rate": 1.6151972224429356e-05, "loss": 0.3255, "step": 5879 }, { "epoch": 1.159108832807571, "grad_norm": 0.5276396699788394, "learning_rate": 1.615075015151656e-05, "loss": 0.3336, "step": 5880 }, { "epoch": 1.1593059936908516, "grad_norm": 0.48785185958924593, "learning_rate": 1.6149527930827043e-05, "loss": 0.3183, "step": 5881 }, { "epoch": 1.1595031545741326, "grad_norm": 0.5070222632900729, "learning_rate": 1.614830556239016e-05, "loss": 0.3392, "step": 5882 }, { "epoch": 1.1597003154574133, "grad_norm": 0.5357213010735683, "learning_rate": 1.6147083046235287e-05, "loss": 0.3302, "step": 5883 }, { "epoch": 1.159897476340694, "grad_norm": 0.492115580680464, "learning_rate": 1.6145860382391792e-05, "loss": 0.3062, "step": 5884 }, { "epoch": 1.1600946372239749, "grad_norm": 0.4775935270120072, "learning_rate": 1.6144637570889055e-05, "loss": 0.3084, "step": 5885 }, { "epoch": 1.1602917981072556, "grad_norm": 0.5574502509553525, "learning_rate": 1.6143414611756448e-05, "loss": 0.3762, "step": 5886 }, { "epoch": 1.1604889589905363, "grad_norm": 0.5324231837542485, "learning_rate": 1.6142191505023362e-05, "loss": 0.3394, "step": 5887 }, { "epoch": 1.160686119873817, "grad_norm": 0.5540695998548545, "learning_rate": 1.6140968250719177e-05, "loss": 0.3334, "step": 5888 }, { "epoch": 1.1608832807570977, "grad_norm": 0.5152035583057871, "learning_rate": 1.6139744848873283e-05, "loss": 0.333, "step": 5889 }, { "epoch": 1.1610804416403786, "grad_norm": 0.5016308910529644, "learning_rate": 1.613852129951508e-05, "loss": 0.328, "step": 5890 }, { "epoch": 1.1612776025236593, "grad_norm": 0.4671308137477831, "learning_rate": 1.6137297602673955e-05, "loss": 0.3006, "step": 5891 }, { "epoch": 1.16147476340694, "grad_norm": 0.47717698267201425, "learning_rate": 1.613607375837931e-05, "loss": 0.3085, "step": 5892 }, { "epoch": 1.161671924290221, "grad_norm": 0.5411199798727852, "learning_rate": 1.6134849766660557e-05, "loss": 0.3512, "step": 5893 }, { "epoch": 1.1618690851735016, "grad_norm": 0.48072485668659576, "learning_rate": 1.6133625627547096e-05, "loss": 0.2991, "step": 5894 }, { "epoch": 1.1620662460567823, "grad_norm": 0.49074020021120335, "learning_rate": 1.613240134106834e-05, "loss": 0.3185, "step": 5895 }, { "epoch": 1.162263406940063, "grad_norm": 0.4742652414752287, "learning_rate": 1.6131176907253703e-05, "loss": 0.3219, "step": 5896 }, { "epoch": 1.1624605678233437, "grad_norm": 0.4926211188349463, "learning_rate": 1.6129952326132603e-05, "loss": 0.3213, "step": 5897 }, { "epoch": 1.1626577287066246, "grad_norm": 0.5260995869588211, "learning_rate": 1.6128727597734465e-05, "loss": 0.3266, "step": 5898 }, { "epoch": 1.1628548895899053, "grad_norm": 0.5005689603012923, "learning_rate": 1.6127502722088703e-05, "loss": 0.3281, "step": 5899 }, { "epoch": 1.163052050473186, "grad_norm": 0.5286226637548996, "learning_rate": 1.612627769922476e-05, "loss": 0.3383, "step": 5900 }, { "epoch": 1.163249211356467, "grad_norm": 0.5241554407104515, "learning_rate": 1.612505252917206e-05, "loss": 0.3442, "step": 5901 }, { "epoch": 1.1634463722397477, "grad_norm": 0.4969893983731258, "learning_rate": 1.6123827211960044e-05, "loss": 0.3355, "step": 5902 }, { "epoch": 1.1636435331230284, "grad_norm": 0.5176767431545333, "learning_rate": 1.6122601747618144e-05, "loss": 0.3456, "step": 5903 }, { "epoch": 1.163840694006309, "grad_norm": 0.5541524213460989, "learning_rate": 1.612137613617581e-05, "loss": 0.3082, "step": 5904 }, { "epoch": 1.16403785488959, "grad_norm": 0.5130645123161176, "learning_rate": 1.612015037766248e-05, "loss": 0.3595, "step": 5905 }, { "epoch": 1.1642350157728707, "grad_norm": 0.49077914052450783, "learning_rate": 1.611892447210761e-05, "loss": 0.3201, "step": 5906 }, { "epoch": 1.1644321766561514, "grad_norm": 0.5154065030810983, "learning_rate": 1.6117698419540655e-05, "loss": 0.345, "step": 5907 }, { "epoch": 1.164629337539432, "grad_norm": 0.5137017205525405, "learning_rate": 1.6116472219991066e-05, "loss": 0.321, "step": 5908 }, { "epoch": 1.164826498422713, "grad_norm": 0.5185538568944038, "learning_rate": 1.6115245873488308e-05, "loss": 0.3349, "step": 5909 }, { "epoch": 1.1650236593059937, "grad_norm": 0.47872637622833353, "learning_rate": 1.6114019380061844e-05, "loss": 0.3091, "step": 5910 }, { "epoch": 1.1652208201892744, "grad_norm": 0.5349706635614739, "learning_rate": 1.6112792739741138e-05, "loss": 0.3416, "step": 5911 }, { "epoch": 1.1654179810725551, "grad_norm": 0.494058437524682, "learning_rate": 1.6111565952555666e-05, "loss": 0.3411, "step": 5912 }, { "epoch": 1.165615141955836, "grad_norm": 0.5148890708118689, "learning_rate": 1.6110339018534898e-05, "loss": 0.3379, "step": 5913 }, { "epoch": 1.1658123028391167, "grad_norm": 0.48546285073114465, "learning_rate": 1.6109111937708317e-05, "loss": 0.3157, "step": 5914 }, { "epoch": 1.1660094637223974, "grad_norm": 0.5386295226523584, "learning_rate": 1.61078847101054e-05, "loss": 0.3524, "step": 5915 }, { "epoch": 1.1662066246056781, "grad_norm": 0.5357486231753327, "learning_rate": 1.6106657335755636e-05, "loss": 0.3296, "step": 5916 }, { "epoch": 1.166403785488959, "grad_norm": 0.5071347852474872, "learning_rate": 1.610542981468851e-05, "loss": 0.2972, "step": 5917 }, { "epoch": 1.1666009463722398, "grad_norm": 0.5136673972393638, "learning_rate": 1.6104202146933517e-05, "loss": 0.3427, "step": 5918 }, { "epoch": 1.1667981072555205, "grad_norm": 0.5366052967903738, "learning_rate": 1.6102974332520155e-05, "loss": 0.3348, "step": 5919 }, { "epoch": 1.1669952681388012, "grad_norm": 0.5142076244004472, "learning_rate": 1.6101746371477915e-05, "loss": 0.3409, "step": 5920 }, { "epoch": 1.167192429022082, "grad_norm": 0.5156383871361846, "learning_rate": 1.6100518263836305e-05, "loss": 0.3441, "step": 5921 }, { "epoch": 1.1673895899053628, "grad_norm": 0.48857227681411236, "learning_rate": 1.609929000962483e-05, "loss": 0.3249, "step": 5922 }, { "epoch": 1.1675867507886435, "grad_norm": 0.5116415565941662, "learning_rate": 1.6098061608873006e-05, "loss": 0.3347, "step": 5923 }, { "epoch": 1.1677839116719242, "grad_norm": 0.476427280273157, "learning_rate": 1.609683306161034e-05, "loss": 0.2935, "step": 5924 }, { "epoch": 1.1679810725552051, "grad_norm": 0.5315870178041014, "learning_rate": 1.6095604367866348e-05, "loss": 0.3298, "step": 5925 }, { "epoch": 1.1681782334384858, "grad_norm": 0.4885155093607642, "learning_rate": 1.6094375527670553e-05, "loss": 0.2914, "step": 5926 }, { "epoch": 1.1683753943217665, "grad_norm": 0.4958137107083927, "learning_rate": 1.6093146541052472e-05, "loss": 0.3236, "step": 5927 }, { "epoch": 1.1685725552050474, "grad_norm": 0.5186709145538795, "learning_rate": 1.609191740804165e-05, "loss": 0.3474, "step": 5928 }, { "epoch": 1.1687697160883281, "grad_norm": 0.5431827154799177, "learning_rate": 1.6090688128667597e-05, "loss": 0.2922, "step": 5929 }, { "epoch": 1.1689668769716088, "grad_norm": 0.5276177466035217, "learning_rate": 1.608945870295986e-05, "loss": 0.3273, "step": 5930 }, { "epoch": 1.1691640378548895, "grad_norm": 0.5403429181736961, "learning_rate": 1.6088229130947976e-05, "loss": 0.3573, "step": 5931 }, { "epoch": 1.1693611987381702, "grad_norm": 0.5682917162052413, "learning_rate": 1.6086999412661483e-05, "loss": 0.3577, "step": 5932 }, { "epoch": 1.1695583596214512, "grad_norm": 0.5092581015992266, "learning_rate": 1.6085769548129928e-05, "loss": 0.3178, "step": 5933 }, { "epoch": 1.1697555205047319, "grad_norm": 0.5341988017389373, "learning_rate": 1.6084539537382853e-05, "loss": 0.318, "step": 5934 }, { "epoch": 1.1699526813880126, "grad_norm": 0.5508242925818101, "learning_rate": 1.6083309380449822e-05, "loss": 0.3399, "step": 5935 }, { "epoch": 1.1701498422712935, "grad_norm": 0.5516468570148881, "learning_rate": 1.6082079077360382e-05, "loss": 0.3464, "step": 5936 }, { "epoch": 1.1703470031545742, "grad_norm": 0.5310729616685209, "learning_rate": 1.6080848628144097e-05, "loss": 0.3549, "step": 5937 }, { "epoch": 1.1705441640378549, "grad_norm": 0.5007231153163549, "learning_rate": 1.6079618032830523e-05, "loss": 0.317, "step": 5938 }, { "epoch": 1.1707413249211356, "grad_norm": 0.5205324245736465, "learning_rate": 1.6078387291449234e-05, "loss": 0.3343, "step": 5939 }, { "epoch": 1.1709384858044163, "grad_norm": 0.5293927543610932, "learning_rate": 1.607715640402979e-05, "loss": 0.3364, "step": 5940 }, { "epoch": 1.1711356466876972, "grad_norm": 0.5437502522427697, "learning_rate": 1.607592537060177e-05, "loss": 0.3625, "step": 5941 }, { "epoch": 1.171332807570978, "grad_norm": 0.5852873503945257, "learning_rate": 1.6074694191194758e-05, "loss": 0.343, "step": 5942 }, { "epoch": 1.1715299684542586, "grad_norm": 0.5258888667294329, "learning_rate": 1.607346286583832e-05, "loss": 0.3525, "step": 5943 }, { "epoch": 1.1717271293375395, "grad_norm": 0.49502127400893137, "learning_rate": 1.6072231394562045e-05, "loss": 0.3429, "step": 5944 }, { "epoch": 1.1719242902208202, "grad_norm": 0.5340808573650885, "learning_rate": 1.6070999777395522e-05, "loss": 0.3265, "step": 5945 }, { "epoch": 1.172121451104101, "grad_norm": 0.4913659047348541, "learning_rate": 1.6069768014368344e-05, "loss": 0.3264, "step": 5946 }, { "epoch": 1.1723186119873816, "grad_norm": 0.5796915968301445, "learning_rate": 1.6068536105510095e-05, "loss": 0.3314, "step": 5947 }, { "epoch": 1.1725157728706626, "grad_norm": 0.48015072710313483, "learning_rate": 1.606730405085038e-05, "loss": 0.3063, "step": 5948 }, { "epoch": 1.1727129337539433, "grad_norm": 0.5798304016447527, "learning_rate": 1.60660718504188e-05, "loss": 0.3673, "step": 5949 }, { "epoch": 1.172910094637224, "grad_norm": 0.49383450156062897, "learning_rate": 1.6064839504244964e-05, "loss": 0.3137, "step": 5950 }, { "epoch": 1.1731072555205047, "grad_norm": 0.49990802164314824, "learning_rate": 1.6063607012358474e-05, "loss": 0.345, "step": 5951 }, { "epoch": 1.1733044164037856, "grad_norm": 0.5710511478007725, "learning_rate": 1.6062374374788938e-05, "loss": 0.3489, "step": 5952 }, { "epoch": 1.1735015772870663, "grad_norm": 0.5406553339299977, "learning_rate": 1.6061141591565977e-05, "loss": 0.3376, "step": 5953 }, { "epoch": 1.173698738170347, "grad_norm": 0.5266705373762315, "learning_rate": 1.605990866271921e-05, "loss": 0.3169, "step": 5954 }, { "epoch": 1.1738958990536277, "grad_norm": 0.5290688903751791, "learning_rate": 1.605867558827825e-05, "loss": 0.3314, "step": 5955 }, { "epoch": 1.1740930599369086, "grad_norm": 0.4981019544107017, "learning_rate": 1.605744236827274e-05, "loss": 0.3499, "step": 5956 }, { "epoch": 1.1742902208201893, "grad_norm": 0.4966110971975489, "learning_rate": 1.6056209002732293e-05, "loss": 0.3241, "step": 5957 }, { "epoch": 1.17448738170347, "grad_norm": 0.5075484095401199, "learning_rate": 1.605497549168655e-05, "loss": 0.3164, "step": 5958 }, { "epoch": 1.1746845425867507, "grad_norm": 0.7450970335942361, "learning_rate": 1.6053741835165146e-05, "loss": 0.3461, "step": 5959 }, { "epoch": 1.1748817034700316, "grad_norm": 0.5291598106311712, "learning_rate": 1.6052508033197713e-05, "loss": 0.323, "step": 5960 }, { "epoch": 1.1750788643533123, "grad_norm": 0.5970311083346598, "learning_rate": 1.6051274085813906e-05, "loss": 0.3433, "step": 5961 }, { "epoch": 1.175276025236593, "grad_norm": 0.5257206942997955, "learning_rate": 1.6050039993043366e-05, "loss": 0.3312, "step": 5962 }, { "epoch": 1.1754731861198737, "grad_norm": 0.5470667266403514, "learning_rate": 1.604880575491574e-05, "loss": 0.3241, "step": 5963 }, { "epoch": 1.1756703470031546, "grad_norm": 0.526634277205355, "learning_rate": 1.6047571371460688e-05, "loss": 0.3486, "step": 5964 }, { "epoch": 1.1758675078864353, "grad_norm": 0.5087896217443573, "learning_rate": 1.6046336842707862e-05, "loss": 0.3373, "step": 5965 }, { "epoch": 1.176064668769716, "grad_norm": 0.5213596470506094, "learning_rate": 1.6045102168686925e-05, "loss": 0.3399, "step": 5966 }, { "epoch": 1.1762618296529967, "grad_norm": 0.5025963895764581, "learning_rate": 1.604386734942754e-05, "loss": 0.3309, "step": 5967 }, { "epoch": 1.1764589905362777, "grad_norm": 0.4947895294362806, "learning_rate": 1.6042632384959377e-05, "loss": 0.3266, "step": 5968 }, { "epoch": 1.1766561514195584, "grad_norm": 0.5280726977614281, "learning_rate": 1.6041397275312102e-05, "loss": 0.3347, "step": 5969 }, { "epoch": 1.176853312302839, "grad_norm": 0.49491475836802235, "learning_rate": 1.6040162020515394e-05, "loss": 0.3107, "step": 5970 }, { "epoch": 1.17705047318612, "grad_norm": 0.50955563379636, "learning_rate": 1.6038926620598924e-05, "loss": 0.3438, "step": 5971 }, { "epoch": 1.1772476340694007, "grad_norm": 0.47434081410915296, "learning_rate": 1.6037691075592384e-05, "loss": 0.2874, "step": 5972 }, { "epoch": 1.1774447949526814, "grad_norm": 0.5265992460388663, "learning_rate": 1.6036455385525452e-05, "loss": 0.3138, "step": 5973 }, { "epoch": 1.177641955835962, "grad_norm": 0.508538165601405, "learning_rate": 1.603521955042782e-05, "loss": 0.3364, "step": 5974 }, { "epoch": 1.1778391167192428, "grad_norm": 0.536584712113376, "learning_rate": 1.603398357032918e-05, "loss": 0.3425, "step": 5975 }, { "epoch": 1.1780362776025237, "grad_norm": 0.5637925361320564, "learning_rate": 1.603274744525922e-05, "loss": 0.3751, "step": 5976 }, { "epoch": 1.1782334384858044, "grad_norm": 0.4828201596184613, "learning_rate": 1.6031511175247648e-05, "loss": 0.3267, "step": 5977 }, { "epoch": 1.1784305993690851, "grad_norm": 0.5702750284269689, "learning_rate": 1.6030274760324163e-05, "loss": 0.3426, "step": 5978 }, { "epoch": 1.178627760252366, "grad_norm": 0.5166744599492986, "learning_rate": 1.602903820051847e-05, "loss": 0.3443, "step": 5979 }, { "epoch": 1.1788249211356467, "grad_norm": 0.4553996700869784, "learning_rate": 1.602780149586028e-05, "loss": 0.2921, "step": 5980 }, { "epoch": 1.1790220820189274, "grad_norm": 0.5395408232911948, "learning_rate": 1.60265646463793e-05, "loss": 0.3487, "step": 5981 }, { "epoch": 1.1792192429022081, "grad_norm": 0.5246852833105909, "learning_rate": 1.6025327652105256e-05, "loss": 0.353, "step": 5982 }, { "epoch": 1.1794164037854888, "grad_norm": 0.5504345490202539, "learning_rate": 1.6024090513067864e-05, "loss": 0.3337, "step": 5983 }, { "epoch": 1.1796135646687698, "grad_norm": 0.5273314886798296, "learning_rate": 1.6022853229296844e-05, "loss": 0.3613, "step": 5984 }, { "epoch": 1.1798107255520505, "grad_norm": 0.4976467528520897, "learning_rate": 1.6021615800821923e-05, "loss": 0.3119, "step": 5985 }, { "epoch": 1.1800078864353312, "grad_norm": 0.544586740515642, "learning_rate": 1.6020378227672834e-05, "loss": 0.3593, "step": 5986 }, { "epoch": 1.180205047318612, "grad_norm": 0.5288888595664981, "learning_rate": 1.6019140509879312e-05, "loss": 0.365, "step": 5987 }, { "epoch": 1.1804022082018928, "grad_norm": 0.5499369482970153, "learning_rate": 1.601790264747109e-05, "loss": 0.3673, "step": 5988 }, { "epoch": 1.1805993690851735, "grad_norm": 0.5132871954718244, "learning_rate": 1.6016664640477912e-05, "loss": 0.349, "step": 5989 }, { "epoch": 1.1807965299684542, "grad_norm": 0.5107401573623064, "learning_rate": 1.601542648892952e-05, "loss": 0.3328, "step": 5990 }, { "epoch": 1.1809936908517351, "grad_norm": 0.5003005634953831, "learning_rate": 1.6014188192855667e-05, "loss": 0.3325, "step": 5991 }, { "epoch": 1.1811908517350158, "grad_norm": 0.5419236213545058, "learning_rate": 1.6012949752286093e-05, "loss": 0.3567, "step": 5992 }, { "epoch": 1.1813880126182965, "grad_norm": 0.46447030510821447, "learning_rate": 1.6011711167250563e-05, "loss": 0.3133, "step": 5993 }, { "epoch": 1.1815851735015772, "grad_norm": 0.5146898936963578, "learning_rate": 1.6010472437778827e-05, "loss": 0.3507, "step": 5994 }, { "epoch": 1.1817823343848581, "grad_norm": 1.9545816777357674, "learning_rate": 1.6009233563900654e-05, "loss": 0.3232, "step": 5995 }, { "epoch": 1.1819794952681388, "grad_norm": 0.504150566251979, "learning_rate": 1.6007994545645807e-05, "loss": 0.3385, "step": 5996 }, { "epoch": 1.1821766561514195, "grad_norm": 0.5178105405608192, "learning_rate": 1.600675538304405e-05, "loss": 0.3198, "step": 5997 }, { "epoch": 1.1823738170347002, "grad_norm": 0.49033729293788897, "learning_rate": 1.600551607612516e-05, "loss": 0.3074, "step": 5998 }, { "epoch": 1.1825709779179812, "grad_norm": 0.5216261093702694, "learning_rate": 1.6004276624918906e-05, "loss": 0.3524, "step": 5999 }, { "epoch": 1.1827681388012619, "grad_norm": 0.48871872582944514, "learning_rate": 1.600303702945507e-05, "loss": 0.3333, "step": 6000 }, { "epoch": 1.1829652996845426, "grad_norm": 0.5425430954135214, "learning_rate": 1.600179728976344e-05, "loss": 0.3351, "step": 6001 }, { "epoch": 1.1831624605678233, "grad_norm": 0.4997582084309298, "learning_rate": 1.6000557405873793e-05, "loss": 0.3281, "step": 6002 }, { "epoch": 1.1833596214511042, "grad_norm": 0.5468290480793452, "learning_rate": 1.5999317377815927e-05, "loss": 0.3504, "step": 6003 }, { "epoch": 1.1835567823343849, "grad_norm": 0.5176645046959065, "learning_rate": 1.5998077205619625e-05, "loss": 0.3362, "step": 6004 }, { "epoch": 1.1837539432176656, "grad_norm": 0.5013566738277708, "learning_rate": 1.599683688931469e-05, "loss": 0.3276, "step": 6005 }, { "epoch": 1.1839511041009463, "grad_norm": 0.5694727653608741, "learning_rate": 1.599559642893092e-05, "loss": 0.346, "step": 6006 }, { "epoch": 1.1841482649842272, "grad_norm": 0.49787309953663916, "learning_rate": 1.5994355824498118e-05, "loss": 0.3237, "step": 6007 }, { "epoch": 1.184345425867508, "grad_norm": 0.5217262967923424, "learning_rate": 1.5993115076046085e-05, "loss": 0.3272, "step": 6008 }, { "epoch": 1.1845425867507886, "grad_norm": 0.5171039380183391, "learning_rate": 1.5991874183604638e-05, "loss": 0.3409, "step": 6009 }, { "epoch": 1.1847397476340693, "grad_norm": 0.5173593200967992, "learning_rate": 1.5990633147203595e-05, "loss": 0.3439, "step": 6010 }, { "epoch": 1.1849369085173502, "grad_norm": 0.5600380235112447, "learning_rate": 1.598939196687276e-05, "loss": 0.3539, "step": 6011 }, { "epoch": 1.185134069400631, "grad_norm": 0.47322318486676124, "learning_rate": 1.5988150642641963e-05, "loss": 0.3241, "step": 6012 }, { "epoch": 1.1853312302839116, "grad_norm": 0.509209862620946, "learning_rate": 1.598690917454102e-05, "loss": 0.3384, "step": 6013 }, { "epoch": 1.1855283911671926, "grad_norm": 0.5071917580583539, "learning_rate": 1.598566756259977e-05, "loss": 0.3306, "step": 6014 }, { "epoch": 1.1857255520504733, "grad_norm": 0.5285407284823473, "learning_rate": 1.598442580684803e-05, "loss": 0.353, "step": 6015 }, { "epoch": 1.185922712933754, "grad_norm": 0.5370829605446722, "learning_rate": 1.598318390731564e-05, "loss": 0.3423, "step": 6016 }, { "epoch": 1.1861198738170347, "grad_norm": 0.4874835855626257, "learning_rate": 1.5981941864032444e-05, "loss": 0.3312, "step": 6017 }, { "epoch": 1.1863170347003154, "grad_norm": 0.5441187518003243, "learning_rate": 1.5980699677028276e-05, "loss": 0.3475, "step": 6018 }, { "epoch": 1.1865141955835963, "grad_norm": 0.5291193191006429, "learning_rate": 1.597945734633298e-05, "loss": 0.3461, "step": 6019 }, { "epoch": 1.186711356466877, "grad_norm": 0.5254879886979995, "learning_rate": 1.5978214871976408e-05, "loss": 0.3455, "step": 6020 }, { "epoch": 1.1869085173501577, "grad_norm": 0.5377765608865902, "learning_rate": 1.597697225398841e-05, "loss": 0.3352, "step": 6021 }, { "epoch": 1.1871056782334386, "grad_norm": 0.5072825222986831, "learning_rate": 1.5975729492398836e-05, "loss": 0.3208, "step": 6022 }, { "epoch": 1.1873028391167193, "grad_norm": 0.47380472273260626, "learning_rate": 1.5974486587237554e-05, "loss": 0.3119, "step": 6023 }, { "epoch": 1.1875, "grad_norm": 0.5284173062047662, "learning_rate": 1.5973243538534416e-05, "loss": 0.348, "step": 6024 }, { "epoch": 1.1876971608832807, "grad_norm": 0.532090130405166, "learning_rate": 1.5972000346319296e-05, "loss": 0.3584, "step": 6025 }, { "epoch": 1.1878943217665614, "grad_norm": 0.5219678872457383, "learning_rate": 1.5970757010622056e-05, "loss": 0.3243, "step": 6026 }, { "epoch": 1.1880914826498423, "grad_norm": 0.49984607033393075, "learning_rate": 1.596951353147257e-05, "loss": 0.3215, "step": 6027 }, { "epoch": 1.188288643533123, "grad_norm": 0.49207605863384773, "learning_rate": 1.5968269908900714e-05, "loss": 0.2912, "step": 6028 }, { "epoch": 1.1884858044164037, "grad_norm": 0.5613782227268997, "learning_rate": 1.596702614293637e-05, "loss": 0.3553, "step": 6029 }, { "epoch": 1.1886829652996846, "grad_norm": 0.5250585921793455, "learning_rate": 1.5965782233609416e-05, "loss": 0.3738, "step": 6030 }, { "epoch": 1.1888801261829653, "grad_norm": 0.5008108248618431, "learning_rate": 1.5964538180949738e-05, "loss": 0.3319, "step": 6031 }, { "epoch": 1.189077287066246, "grad_norm": 0.5245356584082338, "learning_rate": 1.596329398498723e-05, "loss": 0.3239, "step": 6032 }, { "epoch": 1.1892744479495267, "grad_norm": 0.5355385935858578, "learning_rate": 1.5962049645751778e-05, "loss": 0.339, "step": 6033 }, { "epoch": 1.1894716088328074, "grad_norm": 0.6132239805769646, "learning_rate": 1.5960805163273287e-05, "loss": 0.365, "step": 6034 }, { "epoch": 1.1896687697160884, "grad_norm": 9.838847324850864, "learning_rate": 1.5959560537581646e-05, "loss": 0.343, "step": 6035 }, { "epoch": 1.189865930599369, "grad_norm": 0.5645786328771389, "learning_rate": 1.5958315768706767e-05, "loss": 0.3648, "step": 6036 }, { "epoch": 1.1900630914826498, "grad_norm": 0.48867840037373045, "learning_rate": 1.5957070856678553e-05, "loss": 0.3233, "step": 6037 }, { "epoch": 1.1902602523659307, "grad_norm": 0.529456568702109, "learning_rate": 1.5955825801526918e-05, "loss": 0.327, "step": 6038 }, { "epoch": 1.1904574132492114, "grad_norm": 1.176277930833411, "learning_rate": 1.5954580603281768e-05, "loss": 0.3511, "step": 6039 }, { "epoch": 1.190654574132492, "grad_norm": 0.5138247235703703, "learning_rate": 1.5953335261973024e-05, "loss": 0.3266, "step": 6040 }, { "epoch": 1.1908517350157728, "grad_norm": 0.504680208228039, "learning_rate": 1.5952089777630604e-05, "loss": 0.3379, "step": 6041 }, { "epoch": 1.1910488958990537, "grad_norm": 0.516549288250157, "learning_rate": 1.5950844150284438e-05, "loss": 0.3391, "step": 6042 }, { "epoch": 1.1912460567823344, "grad_norm": 0.524867491432488, "learning_rate": 1.5949598379964447e-05, "loss": 0.3345, "step": 6043 }, { "epoch": 1.1914432176656151, "grad_norm": 0.5311974639533232, "learning_rate": 1.594835246670056e-05, "loss": 0.3475, "step": 6044 }, { "epoch": 1.1916403785488958, "grad_norm": 0.4916105988107756, "learning_rate": 1.5947106410522722e-05, "loss": 0.318, "step": 6045 }, { "epoch": 1.1918375394321767, "grad_norm": 0.5324212670370398, "learning_rate": 1.594586021146086e-05, "loss": 0.3236, "step": 6046 }, { "epoch": 1.1920347003154574, "grad_norm": 0.4964775871993211, "learning_rate": 1.594461386954492e-05, "loss": 0.3247, "step": 6047 }, { "epoch": 1.1922318611987381, "grad_norm": 0.5471553857731185, "learning_rate": 1.5943367384804842e-05, "loss": 0.3463, "step": 6048 }, { "epoch": 1.1924290220820188, "grad_norm": 0.7831995811222979, "learning_rate": 1.5942120757270578e-05, "loss": 0.3071, "step": 6049 }, { "epoch": 1.1926261829652998, "grad_norm": 0.5010080635510805, "learning_rate": 1.5940873986972078e-05, "loss": 0.3082, "step": 6050 }, { "epoch": 1.1928233438485805, "grad_norm": 0.49204619020050877, "learning_rate": 1.5939627073939298e-05, "loss": 0.325, "step": 6051 }, { "epoch": 1.1930205047318612, "grad_norm": 0.5694016315454971, "learning_rate": 1.593838001820219e-05, "loss": 0.355, "step": 6052 }, { "epoch": 1.1932176656151419, "grad_norm": 0.5103103854682324, "learning_rate": 1.5937132819790722e-05, "loss": 0.336, "step": 6053 }, { "epoch": 1.1934148264984228, "grad_norm": 0.5971611394525508, "learning_rate": 1.593588547873486e-05, "loss": 0.2987, "step": 6054 }, { "epoch": 1.1936119873817035, "grad_norm": 0.5324309908139868, "learning_rate": 1.593463799506456e-05, "loss": 0.3381, "step": 6055 }, { "epoch": 1.1938091482649842, "grad_norm": 0.5513616850381257, "learning_rate": 1.593339036880981e-05, "loss": 0.3436, "step": 6056 }, { "epoch": 1.1940063091482649, "grad_norm": 0.48370871862770526, "learning_rate": 1.5932142600000577e-05, "loss": 0.3131, "step": 6057 }, { "epoch": 1.1942034700315458, "grad_norm": 0.5723593799005773, "learning_rate": 1.5930894688666843e-05, "loss": 0.3526, "step": 6058 }, { "epoch": 1.1944006309148265, "grad_norm": 0.5244334828806275, "learning_rate": 1.5929646634838583e-05, "loss": 0.3277, "step": 6059 }, { "epoch": 1.1945977917981072, "grad_norm": 0.5360911703031235, "learning_rate": 1.5928398438545792e-05, "loss": 0.3359, "step": 6060 }, { "epoch": 1.194794952681388, "grad_norm": 0.5103692154019288, "learning_rate": 1.5927150099818454e-05, "loss": 0.3397, "step": 6061 }, { "epoch": 1.1949921135646688, "grad_norm": 0.4920913803322767, "learning_rate": 1.592590161868656e-05, "loss": 0.2967, "step": 6062 }, { "epoch": 1.1951892744479495, "grad_norm": 0.5338396158035961, "learning_rate": 1.5924652995180106e-05, "loss": 0.3377, "step": 6063 }, { "epoch": 1.1953864353312302, "grad_norm": 0.4978817940429984, "learning_rate": 1.5923404229329097e-05, "loss": 0.3148, "step": 6064 }, { "epoch": 1.1955835962145112, "grad_norm": 0.5108248246371543, "learning_rate": 1.5922155321163528e-05, "loss": 0.344, "step": 6065 }, { "epoch": 1.1957807570977919, "grad_norm": 0.4962965845967901, "learning_rate": 1.592090627071341e-05, "loss": 0.32, "step": 6066 }, { "epoch": 1.1959779179810726, "grad_norm": 0.5153462208898505, "learning_rate": 1.591965707800875e-05, "loss": 0.3368, "step": 6067 }, { "epoch": 1.1961750788643533, "grad_norm": 0.5214640418478306, "learning_rate": 1.5918407743079564e-05, "loss": 0.3297, "step": 6068 }, { "epoch": 1.196372239747634, "grad_norm": 0.5677250498709867, "learning_rate": 1.5917158265955863e-05, "loss": 0.3743, "step": 6069 }, { "epoch": 1.1965694006309149, "grad_norm": 0.5268238165290028, "learning_rate": 1.591590864666767e-05, "loss": 0.3654, "step": 6070 }, { "epoch": 1.1967665615141956, "grad_norm": 0.4961093876386208, "learning_rate": 1.5914658885245006e-05, "loss": 0.3319, "step": 6071 }, { "epoch": 1.1969637223974763, "grad_norm": 0.5169884357870341, "learning_rate": 1.5913408981717902e-05, "loss": 0.355, "step": 6072 }, { "epoch": 1.1971608832807572, "grad_norm": 0.4863348861419478, "learning_rate": 1.5912158936116383e-05, "loss": 0.3118, "step": 6073 }, { "epoch": 1.197358044164038, "grad_norm": 0.49535517239582816, "learning_rate": 1.5910908748470485e-05, "loss": 0.2871, "step": 6074 }, { "epoch": 1.1975552050473186, "grad_norm": 0.5012578788415777, "learning_rate": 1.5909658418810246e-05, "loss": 0.3124, "step": 6075 }, { "epoch": 1.1977523659305993, "grad_norm": 6.581609483160163, "learning_rate": 1.5908407947165704e-05, "loss": 0.3924, "step": 6076 }, { "epoch": 1.19794952681388, "grad_norm": 0.5416411980138617, "learning_rate": 1.59071573335669e-05, "loss": 0.3265, "step": 6077 }, { "epoch": 1.198146687697161, "grad_norm": 5.137867502750501, "learning_rate": 1.5905906578043892e-05, "loss": 0.4025, "step": 6078 }, { "epoch": 1.1983438485804416, "grad_norm": 0.498758781908228, "learning_rate": 1.5904655680626712e-05, "loss": 0.3361, "step": 6079 }, { "epoch": 1.1985410094637223, "grad_norm": 0.5129283262777451, "learning_rate": 1.590340464134543e-05, "loss": 0.3302, "step": 6080 }, { "epoch": 1.1987381703470033, "grad_norm": 0.7337508073690954, "learning_rate": 1.5902153460230097e-05, "loss": 0.3584, "step": 6081 }, { "epoch": 1.198935331230284, "grad_norm": 0.4927136722041788, "learning_rate": 1.5900902137310777e-05, "loss": 0.315, "step": 6082 }, { "epoch": 1.1991324921135647, "grad_norm": 0.4999312067571768, "learning_rate": 1.5899650672617526e-05, "loss": 0.3218, "step": 6083 }, { "epoch": 1.1993296529968454, "grad_norm": 0.5108041586913794, "learning_rate": 1.589839906618042e-05, "loss": 0.3266, "step": 6084 }, { "epoch": 1.1995268138801263, "grad_norm": 0.5344740208260175, "learning_rate": 1.5897147318029524e-05, "loss": 0.3587, "step": 6085 }, { "epoch": 1.199723974763407, "grad_norm": 0.5888513085931902, "learning_rate": 1.5895895428194915e-05, "loss": 0.3455, "step": 6086 }, { "epoch": 1.1999211356466877, "grad_norm": 0.4926690283535484, "learning_rate": 1.5894643396706674e-05, "loss": 0.3181, "step": 6087 }, { "epoch": 1.2001182965299684, "grad_norm": 0.5047531232097675, "learning_rate": 1.5893391223594873e-05, "loss": 0.3396, "step": 6088 }, { "epoch": 1.2003154574132493, "grad_norm": 0.5194620816609076, "learning_rate": 1.5892138908889606e-05, "loss": 0.3498, "step": 6089 }, { "epoch": 1.20051261829653, "grad_norm": 0.49106900692891253, "learning_rate": 1.589088645262096e-05, "loss": 0.3156, "step": 6090 }, { "epoch": 1.2007097791798107, "grad_norm": 0.4816616653634188, "learning_rate": 1.5889633854819014e-05, "loss": 0.3269, "step": 6091 }, { "epoch": 1.2009069400630914, "grad_norm": 0.5018051104959026, "learning_rate": 1.5888381115513878e-05, "loss": 0.3261, "step": 6092 }, { "epoch": 1.2011041009463723, "grad_norm": 0.5243143133854937, "learning_rate": 1.5887128234735638e-05, "loss": 0.3356, "step": 6093 }, { "epoch": 1.201301261829653, "grad_norm": 0.5816921886358895, "learning_rate": 1.5885875212514408e-05, "loss": 0.3673, "step": 6094 }, { "epoch": 1.2014984227129337, "grad_norm": 0.4886736205149519, "learning_rate": 1.5884622048880283e-05, "loss": 0.348, "step": 6095 }, { "epoch": 1.2016955835962144, "grad_norm": 0.5352170470566625, "learning_rate": 1.5883368743863376e-05, "loss": 0.3531, "step": 6096 }, { "epoch": 1.2018927444794953, "grad_norm": 0.5070353571291337, "learning_rate": 1.5882115297493793e-05, "loss": 0.3074, "step": 6097 }, { "epoch": 1.202089905362776, "grad_norm": 0.5299404136107287, "learning_rate": 1.588086170980166e-05, "loss": 0.3548, "step": 6098 }, { "epoch": 1.2022870662460567, "grad_norm": 0.5010029776145218, "learning_rate": 1.5879607980817084e-05, "loss": 0.3322, "step": 6099 }, { "epoch": 1.2024842271293374, "grad_norm": 0.5443027119113688, "learning_rate": 1.5878354110570188e-05, "loss": 0.3445, "step": 6100 }, { "epoch": 1.2026813880126184, "grad_norm": 0.5116498162874465, "learning_rate": 1.5877100099091106e-05, "loss": 0.35, "step": 6101 }, { "epoch": 1.202878548895899, "grad_norm": 0.4990846323105568, "learning_rate": 1.587584594640996e-05, "loss": 0.303, "step": 6102 }, { "epoch": 1.2030757097791798, "grad_norm": 0.6083340219487477, "learning_rate": 1.5874591652556887e-05, "loss": 0.3466, "step": 6103 }, { "epoch": 1.2032728706624605, "grad_norm": 0.5699359859289749, "learning_rate": 1.5873337217562012e-05, "loss": 0.3492, "step": 6104 }, { "epoch": 1.2034700315457414, "grad_norm": 0.5305845146338529, "learning_rate": 1.5872082641455484e-05, "loss": 0.3311, "step": 6105 }, { "epoch": 1.203667192429022, "grad_norm": 0.4987574811623951, "learning_rate": 1.5870827924267442e-05, "loss": 0.3127, "step": 6106 }, { "epoch": 1.2038643533123028, "grad_norm": 0.5542197601383188, "learning_rate": 1.586957306602803e-05, "loss": 0.3711, "step": 6107 }, { "epoch": 1.2040615141955837, "grad_norm": 0.5296385507003382, "learning_rate": 1.58683180667674e-05, "loss": 0.3322, "step": 6108 }, { "epoch": 1.2042586750788644, "grad_norm": 0.510771246311118, "learning_rate": 1.5867062926515702e-05, "loss": 0.3478, "step": 6109 }, { "epoch": 1.2044558359621451, "grad_norm": 0.5083219009195116, "learning_rate": 1.586580764530309e-05, "loss": 0.3283, "step": 6110 }, { "epoch": 1.2046529968454258, "grad_norm": 0.4984543211011751, "learning_rate": 1.586455222315973e-05, "loss": 0.3161, "step": 6111 }, { "epoch": 1.2048501577287065, "grad_norm": 0.48449825361455534, "learning_rate": 1.5863296660115778e-05, "loss": 0.3235, "step": 6112 }, { "epoch": 1.2050473186119874, "grad_norm": 0.4880911697740563, "learning_rate": 1.58620409562014e-05, "loss": 0.3221, "step": 6113 }, { "epoch": 1.2052444794952681, "grad_norm": 0.6718849269017811, "learning_rate": 1.586078511144677e-05, "loss": 0.3891, "step": 6114 }, { "epoch": 1.2054416403785488, "grad_norm": 0.536419239014414, "learning_rate": 1.5859529125882058e-05, "loss": 0.3603, "step": 6115 }, { "epoch": 1.2056388012618298, "grad_norm": 0.5329763858491461, "learning_rate": 1.585827299953744e-05, "loss": 0.3488, "step": 6116 }, { "epoch": 1.2058359621451105, "grad_norm": 1.3571017677348047, "learning_rate": 1.5857016732443096e-05, "loss": 0.331, "step": 6117 }, { "epoch": 1.2060331230283912, "grad_norm": 0.5105153466144287, "learning_rate": 1.5855760324629204e-05, "loss": 0.324, "step": 6118 }, { "epoch": 1.2062302839116719, "grad_norm": 0.4980780054716261, "learning_rate": 1.585450377612596e-05, "loss": 0.3108, "step": 6119 }, { "epoch": 1.2064274447949526, "grad_norm": 0.969040129147523, "learning_rate": 1.5853247086963546e-05, "loss": 0.3388, "step": 6120 }, { "epoch": 1.2066246056782335, "grad_norm": 0.5442833624434943, "learning_rate": 1.585199025717216e-05, "loss": 0.3501, "step": 6121 }, { "epoch": 1.2068217665615142, "grad_norm": 0.50130789731431, "learning_rate": 1.585073328678199e-05, "loss": 0.33, "step": 6122 }, { "epoch": 1.2070189274447949, "grad_norm": 0.5068279641638377, "learning_rate": 1.5849476175823242e-05, "loss": 0.3371, "step": 6123 }, { "epoch": 1.2072160883280758, "grad_norm": 0.567218532987645, "learning_rate": 1.584821892432612e-05, "loss": 0.3313, "step": 6124 }, { "epoch": 1.2074132492113565, "grad_norm": 0.5501597914721346, "learning_rate": 1.5846961532320833e-05, "loss": 0.3363, "step": 6125 }, { "epoch": 1.2076104100946372, "grad_norm": 0.5130749844399497, "learning_rate": 1.584570399983758e-05, "loss": 0.3331, "step": 6126 }, { "epoch": 1.207807570977918, "grad_norm": 0.7105268873576728, "learning_rate": 1.5844446326906585e-05, "loss": 0.3289, "step": 6127 }, { "epoch": 1.2080047318611988, "grad_norm": 0.49590774482060557, "learning_rate": 1.5843188513558056e-05, "loss": 0.3159, "step": 6128 }, { "epoch": 1.2082018927444795, "grad_norm": 0.522580528542394, "learning_rate": 1.5841930559822222e-05, "loss": 0.3258, "step": 6129 }, { "epoch": 1.2083990536277602, "grad_norm": 0.5232730029770233, "learning_rate": 1.58406724657293e-05, "loss": 0.3242, "step": 6130 }, { "epoch": 1.208596214511041, "grad_norm": 0.5564685376916361, "learning_rate": 1.583941423130952e-05, "loss": 0.3414, "step": 6131 }, { "epoch": 1.2087933753943219, "grad_norm": 0.5155603754900965, "learning_rate": 1.583815585659311e-05, "loss": 0.3329, "step": 6132 }, { "epoch": 1.2089905362776026, "grad_norm": 0.49929691738974885, "learning_rate": 1.58368973416103e-05, "loss": 0.3302, "step": 6133 }, { "epoch": 1.2091876971608833, "grad_norm": 0.5307058011919017, "learning_rate": 1.5835638686391338e-05, "loss": 0.3232, "step": 6134 }, { "epoch": 1.209384858044164, "grad_norm": 0.5116249657384987, "learning_rate": 1.583437989096645e-05, "loss": 0.3149, "step": 6135 }, { "epoch": 1.2095820189274449, "grad_norm": 0.4958052917998363, "learning_rate": 1.5833120955365894e-05, "loss": 0.345, "step": 6136 }, { "epoch": 1.2097791798107256, "grad_norm": 0.5772963968620416, "learning_rate": 1.5831861879619904e-05, "loss": 0.3658, "step": 6137 }, { "epoch": 1.2099763406940063, "grad_norm": 0.5176663312841757, "learning_rate": 1.5830602663758737e-05, "loss": 0.3321, "step": 6138 }, { "epoch": 1.210173501577287, "grad_norm": 0.5572969524001481, "learning_rate": 1.582934330781265e-05, "loss": 0.3646, "step": 6139 }, { "epoch": 1.210370662460568, "grad_norm": 0.5149376440705687, "learning_rate": 1.582808381181189e-05, "loss": 0.3233, "step": 6140 }, { "epoch": 1.2105678233438486, "grad_norm": 0.5060377765127014, "learning_rate": 1.5826824175786724e-05, "loss": 0.3241, "step": 6141 }, { "epoch": 1.2107649842271293, "grad_norm": 0.5075915990283572, "learning_rate": 1.5825564399767416e-05, "loss": 0.3315, "step": 6142 }, { "epoch": 1.21096214511041, "grad_norm": 0.48101366074112933, "learning_rate": 1.5824304483784234e-05, "loss": 0.3271, "step": 6143 }, { "epoch": 1.211159305993691, "grad_norm": 0.5006150490244125, "learning_rate": 1.5823044427867446e-05, "loss": 0.3208, "step": 6144 }, { "epoch": 1.2113564668769716, "grad_norm": 0.5083292731196384, "learning_rate": 1.582178423204732e-05, "loss": 0.3332, "step": 6145 }, { "epoch": 1.2115536277602523, "grad_norm": 0.5431538616132132, "learning_rate": 1.5820523896354146e-05, "loss": 0.3188, "step": 6146 }, { "epoch": 1.211750788643533, "grad_norm": 0.4949725843395628, "learning_rate": 1.5819263420818198e-05, "loss": 0.3279, "step": 6147 }, { "epoch": 1.211947949526814, "grad_norm": 0.5283861058082052, "learning_rate": 1.5818002805469758e-05, "loss": 0.3466, "step": 6148 }, { "epoch": 1.2121451104100947, "grad_norm": 0.49009428195008536, "learning_rate": 1.581674205033912e-05, "loss": 0.316, "step": 6149 }, { "epoch": 1.2123422712933754, "grad_norm": 0.527558337367723, "learning_rate": 1.5815481155456566e-05, "loss": 0.3217, "step": 6150 }, { "epoch": 1.2125394321766563, "grad_norm": 0.46986929303135894, "learning_rate": 1.581422012085239e-05, "loss": 0.3353, "step": 6151 }, { "epoch": 1.212736593059937, "grad_norm": 0.5248790360545358, "learning_rate": 1.5812958946556897e-05, "loss": 0.3471, "step": 6152 }, { "epoch": 1.2129337539432177, "grad_norm": 0.4914294810645838, "learning_rate": 1.581169763260039e-05, "loss": 0.3302, "step": 6153 }, { "epoch": 1.2131309148264984, "grad_norm": 0.4763550637519196, "learning_rate": 1.5810436179013158e-05, "loss": 0.3128, "step": 6154 }, { "epoch": 1.213328075709779, "grad_norm": 0.5141786955182022, "learning_rate": 1.5809174585825523e-05, "loss": 0.3347, "step": 6155 }, { "epoch": 1.21352523659306, "grad_norm": 0.5181612279002368, "learning_rate": 1.5807912853067787e-05, "loss": 0.3764, "step": 6156 }, { "epoch": 1.2137223974763407, "grad_norm": 0.5063765486686969, "learning_rate": 1.5806650980770273e-05, "loss": 0.3455, "step": 6157 }, { "epoch": 1.2139195583596214, "grad_norm": 0.5010394541454587, "learning_rate": 1.5805388968963286e-05, "loss": 0.3283, "step": 6158 }, { "epoch": 1.2141167192429023, "grad_norm": 0.4758924748836648, "learning_rate": 1.5804126817677158e-05, "loss": 0.31, "step": 6159 }, { "epoch": 1.214313880126183, "grad_norm": 0.5025450657121269, "learning_rate": 1.580286452694221e-05, "loss": 0.3045, "step": 6160 }, { "epoch": 1.2145110410094637, "grad_norm": 0.5146546499338246, "learning_rate": 1.5801602096788768e-05, "loss": 0.3355, "step": 6161 }, { "epoch": 1.2147082018927444, "grad_norm": 0.5384444969117445, "learning_rate": 1.5800339527247163e-05, "loss": 0.3499, "step": 6162 }, { "epoch": 1.2149053627760251, "grad_norm": 0.4940594458584491, "learning_rate": 1.579907681834773e-05, "loss": 0.3296, "step": 6163 }, { "epoch": 1.215102523659306, "grad_norm": 0.5343762136162648, "learning_rate": 1.579781397012081e-05, "loss": 0.3352, "step": 6164 }, { "epoch": 1.2152996845425867, "grad_norm": 0.48752315188028517, "learning_rate": 1.5796550982596732e-05, "loss": 0.3282, "step": 6165 }, { "epoch": 1.2154968454258674, "grad_norm": 0.5547331237218581, "learning_rate": 1.5795287855805853e-05, "loss": 0.3829, "step": 6166 }, { "epoch": 1.2156940063091484, "grad_norm": 0.5004995021041893, "learning_rate": 1.5794024589778518e-05, "loss": 0.328, "step": 6167 }, { "epoch": 1.215891167192429, "grad_norm": 0.5175275340376262, "learning_rate": 1.5792761184545076e-05, "loss": 0.3307, "step": 6168 }, { "epoch": 1.2160883280757098, "grad_norm": 0.5027473643529223, "learning_rate": 1.579149764013588e-05, "loss": 0.3395, "step": 6169 }, { "epoch": 1.2162854889589905, "grad_norm": 0.4817141906134851, "learning_rate": 1.579023395658129e-05, "loss": 0.3249, "step": 6170 }, { "epoch": 1.2164826498422712, "grad_norm": 0.46188472835340966, "learning_rate": 1.578897013391167e-05, "loss": 0.3147, "step": 6171 }, { "epoch": 1.216679810725552, "grad_norm": 0.4988989117087451, "learning_rate": 1.5787706172157374e-05, "loss": 0.33, "step": 6172 }, { "epoch": 1.2168769716088328, "grad_norm": 0.49979162808926375, "learning_rate": 1.578644207134878e-05, "loss": 0.335, "step": 6173 }, { "epoch": 1.2170741324921135, "grad_norm": 9.87115802371154, "learning_rate": 1.5785177831516255e-05, "loss": 0.5038, "step": 6174 }, { "epoch": 1.2172712933753944, "grad_norm": 0.49379194281460687, "learning_rate": 1.5783913452690174e-05, "loss": 0.3263, "step": 6175 }, { "epoch": 1.2174684542586751, "grad_norm": 0.7283072054648132, "learning_rate": 1.5782648934900915e-05, "loss": 0.3288, "step": 6176 }, { "epoch": 1.2176656151419558, "grad_norm": 0.5063345599140339, "learning_rate": 1.5781384278178858e-05, "loss": 0.3322, "step": 6177 }, { "epoch": 1.2178627760252365, "grad_norm": 0.49525686855704687, "learning_rate": 1.578011948255439e-05, "loss": 0.3412, "step": 6178 }, { "epoch": 1.2180599369085174, "grad_norm": 0.4937155808164859, "learning_rate": 1.5778854548057893e-05, "loss": 0.3404, "step": 6179 }, { "epoch": 1.2182570977917981, "grad_norm": 0.5601096943034672, "learning_rate": 1.5777589474719764e-05, "loss": 0.3301, "step": 6180 }, { "epoch": 1.2184542586750788, "grad_norm": 0.5134943557442895, "learning_rate": 1.5776324262570394e-05, "loss": 0.3036, "step": 6181 }, { "epoch": 1.2186514195583595, "grad_norm": 0.5234618403194057, "learning_rate": 1.577505891164018e-05, "loss": 0.3487, "step": 6182 }, { "epoch": 1.2188485804416405, "grad_norm": 0.5060468910052316, "learning_rate": 1.5773793421959528e-05, "loss": 0.3045, "step": 6183 }, { "epoch": 1.2190457413249212, "grad_norm": 0.5385344505438445, "learning_rate": 1.577252779355884e-05, "loss": 0.3471, "step": 6184 }, { "epoch": 1.2192429022082019, "grad_norm": 0.47439335012121125, "learning_rate": 1.577126202646852e-05, "loss": 0.3137, "step": 6185 }, { "epoch": 1.2194400630914826, "grad_norm": 0.5634484011687884, "learning_rate": 1.5769996120718985e-05, "loss": 0.3504, "step": 6186 }, { "epoch": 1.2196372239747635, "grad_norm": 0.4783159791866919, "learning_rate": 1.5768730076340646e-05, "loss": 0.3238, "step": 6187 }, { "epoch": 1.2198343848580442, "grad_norm": 0.5282172252080681, "learning_rate": 1.5767463893363925e-05, "loss": 0.324, "step": 6188 }, { "epoch": 1.2200315457413249, "grad_norm": 0.508650741550587, "learning_rate": 1.5766197571819234e-05, "loss": 0.3108, "step": 6189 }, { "epoch": 1.2202287066246056, "grad_norm": 0.47948864235134103, "learning_rate": 1.5764931111737005e-05, "loss": 0.335, "step": 6190 }, { "epoch": 1.2204258675078865, "grad_norm": 0.5276091720414406, "learning_rate": 1.576366451314766e-05, "loss": 0.3546, "step": 6191 }, { "epoch": 1.2206230283911672, "grad_norm": 0.48975202677127483, "learning_rate": 1.576239777608164e-05, "loss": 0.3195, "step": 6192 }, { "epoch": 1.220820189274448, "grad_norm": 0.5115397169665277, "learning_rate": 1.576113090056937e-05, "loss": 0.3552, "step": 6193 }, { "epoch": 1.2210173501577288, "grad_norm": 0.5544373081682192, "learning_rate": 1.575986388664129e-05, "loss": 0.3388, "step": 6194 }, { "epoch": 1.2212145110410095, "grad_norm": 0.4789487893265251, "learning_rate": 1.5758596734327842e-05, "loss": 0.3049, "step": 6195 }, { "epoch": 1.2214116719242902, "grad_norm": 0.5085580816350456, "learning_rate": 1.5757329443659468e-05, "loss": 0.3337, "step": 6196 }, { "epoch": 1.221608832807571, "grad_norm": 0.5018198548590921, "learning_rate": 1.5756062014666622e-05, "loss": 0.3602, "step": 6197 }, { "epoch": 1.2218059936908516, "grad_norm": 0.5459993872154436, "learning_rate": 1.5754794447379747e-05, "loss": 0.3734, "step": 6198 }, { "epoch": 1.2220031545741326, "grad_norm": 0.5069803136116705, "learning_rate": 1.5753526741829302e-05, "loss": 0.3292, "step": 6199 }, { "epoch": 1.2222003154574133, "grad_norm": 0.5349842354961301, "learning_rate": 1.5752258898045747e-05, "loss": 0.3591, "step": 6200 }, { "epoch": 1.222397476340694, "grad_norm": 0.5113741033129703, "learning_rate": 1.5750990916059537e-05, "loss": 0.3354, "step": 6201 }, { "epoch": 1.2225946372239749, "grad_norm": 0.4764417943474872, "learning_rate": 1.5749722795901142e-05, "loss": 0.2776, "step": 6202 }, { "epoch": 1.2227917981072556, "grad_norm": 0.4852541319749403, "learning_rate": 1.574845453760102e-05, "loss": 0.2877, "step": 6203 }, { "epoch": 1.2229889589905363, "grad_norm": 0.5011535867185861, "learning_rate": 1.5747186141189654e-05, "loss": 0.3188, "step": 6204 }, { "epoch": 1.223186119873817, "grad_norm": 0.5585191080381091, "learning_rate": 1.574591760669751e-05, "loss": 0.3278, "step": 6205 }, { "epoch": 1.2233832807570977, "grad_norm": 0.5095163688965774, "learning_rate": 1.574464893415507e-05, "loss": 0.3388, "step": 6206 }, { "epoch": 1.2235804416403786, "grad_norm": 0.489944289171812, "learning_rate": 1.5743380123592815e-05, "loss": 0.3298, "step": 6207 }, { "epoch": 1.2237776025236593, "grad_norm": 0.56552053195053, "learning_rate": 1.5742111175041222e-05, "loss": 0.3271, "step": 6208 }, { "epoch": 1.22397476340694, "grad_norm": 0.4975929434887488, "learning_rate": 1.5740842088530788e-05, "loss": 0.3171, "step": 6209 }, { "epoch": 1.224171924290221, "grad_norm": 0.5635718664701436, "learning_rate": 1.5739572864091995e-05, "loss": 0.333, "step": 6210 }, { "epoch": 1.2243690851735016, "grad_norm": 0.5466750621448705, "learning_rate": 1.573830350175535e-05, "loss": 0.3226, "step": 6211 }, { "epoch": 1.2245662460567823, "grad_norm": 0.5179765647986949, "learning_rate": 1.5737034001551336e-05, "loss": 0.3451, "step": 6212 }, { "epoch": 1.224763406940063, "grad_norm": 0.505297026337151, "learning_rate": 1.573576436351046e-05, "loss": 0.3263, "step": 6213 }, { "epoch": 1.2249605678233437, "grad_norm": 0.5323927892854735, "learning_rate": 1.573449458766323e-05, "loss": 0.3348, "step": 6214 }, { "epoch": 1.2251577287066246, "grad_norm": 0.4585385421095369, "learning_rate": 1.573322467404015e-05, "loss": 0.2889, "step": 6215 }, { "epoch": 1.2253548895899053, "grad_norm": 0.5133915738969325, "learning_rate": 1.573195462267173e-05, "loss": 0.3354, "step": 6216 }, { "epoch": 1.225552050473186, "grad_norm": 0.4976376614042412, "learning_rate": 1.573068443358848e-05, "loss": 0.3261, "step": 6217 }, { "epoch": 1.225749211356467, "grad_norm": 0.5204545809416458, "learning_rate": 1.572941410682092e-05, "loss": 0.3152, "step": 6218 }, { "epoch": 1.2259463722397477, "grad_norm": 0.46008522086558107, "learning_rate": 1.572814364239958e-05, "loss": 0.3049, "step": 6219 }, { "epoch": 1.2261435331230284, "grad_norm": 0.5085240798538043, "learning_rate": 1.572687304035497e-05, "loss": 0.3392, "step": 6220 }, { "epoch": 1.226340694006309, "grad_norm": 0.49075420527666574, "learning_rate": 1.5725602300717628e-05, "loss": 0.3238, "step": 6221 }, { "epoch": 1.22653785488959, "grad_norm": 0.5009588499072523, "learning_rate": 1.5724331423518076e-05, "loss": 0.3588, "step": 6222 }, { "epoch": 1.2267350157728707, "grad_norm": 0.5004985108490898, "learning_rate": 1.572306040878685e-05, "loss": 0.3413, "step": 6223 }, { "epoch": 1.2269321766561514, "grad_norm": 0.5036837263941046, "learning_rate": 1.5721789256554495e-05, "loss": 0.3472, "step": 6224 }, { "epoch": 1.227129337539432, "grad_norm": 0.7266521472928784, "learning_rate": 1.5720517966851544e-05, "loss": 0.3553, "step": 6225 }, { "epoch": 1.227326498422713, "grad_norm": 0.5013043340465552, "learning_rate": 1.5719246539708536e-05, "loss": 0.3436, "step": 6226 }, { "epoch": 1.2275236593059937, "grad_norm": 0.4940987339117502, "learning_rate": 1.571797497515603e-05, "loss": 0.3127, "step": 6227 }, { "epoch": 1.2277208201892744, "grad_norm": 0.5237341159455894, "learning_rate": 1.5716703273224568e-05, "loss": 0.3173, "step": 6228 }, { "epoch": 1.2279179810725551, "grad_norm": 0.5284887213486391, "learning_rate": 1.5715431433944706e-05, "loss": 0.3326, "step": 6229 }, { "epoch": 1.228115141955836, "grad_norm": 0.5224757623724461, "learning_rate": 1.5714159457347007e-05, "loss": 0.3254, "step": 6230 }, { "epoch": 1.2283123028391167, "grad_norm": 0.49876895032485197, "learning_rate": 1.571288734346202e-05, "loss": 0.3423, "step": 6231 }, { "epoch": 1.2285094637223974, "grad_norm": 0.512553101690978, "learning_rate": 1.5711615092320315e-05, "loss": 0.3414, "step": 6232 }, { "epoch": 1.2287066246056781, "grad_norm": 0.5014118944270054, "learning_rate": 1.571034270395246e-05, "loss": 0.3183, "step": 6233 }, { "epoch": 1.228903785488959, "grad_norm": 1.4423480845194543, "learning_rate": 1.570907017838902e-05, "loss": 0.3328, "step": 6234 }, { "epoch": 1.2291009463722398, "grad_norm": 0.4737177745880879, "learning_rate": 1.5707797515660574e-05, "loss": 0.3193, "step": 6235 }, { "epoch": 1.2292981072555205, "grad_norm": 0.5036458325808413, "learning_rate": 1.5706524715797693e-05, "loss": 0.3438, "step": 6236 }, { "epoch": 1.2294952681388012, "grad_norm": 0.5174813077151698, "learning_rate": 1.5705251778830962e-05, "loss": 0.3515, "step": 6237 }, { "epoch": 1.229692429022082, "grad_norm": 0.4799244634840848, "learning_rate": 1.5703978704790962e-05, "loss": 0.3149, "step": 6238 }, { "epoch": 1.2298895899053628, "grad_norm": 0.5581993797302612, "learning_rate": 1.5702705493708283e-05, "loss": 0.3553, "step": 6239 }, { "epoch": 1.2300867507886435, "grad_norm": 0.5064827009129059, "learning_rate": 1.5701432145613508e-05, "loss": 0.3266, "step": 6240 }, { "epoch": 1.2302839116719242, "grad_norm": 0.6670134919739141, "learning_rate": 1.5700158660537235e-05, "loss": 0.3515, "step": 6241 }, { "epoch": 1.2304810725552051, "grad_norm": 0.510108854937946, "learning_rate": 1.569888503851006e-05, "loss": 0.3321, "step": 6242 }, { "epoch": 1.2306782334384858, "grad_norm": 0.5046667999599455, "learning_rate": 1.5697611279562584e-05, "loss": 0.3339, "step": 6243 }, { "epoch": 1.2308753943217665, "grad_norm": 0.49224188852519396, "learning_rate": 1.5696337383725412e-05, "loss": 0.3119, "step": 6244 }, { "epoch": 1.2310725552050474, "grad_norm": 0.5172616601262686, "learning_rate": 1.569506335102914e-05, "loss": 0.3443, "step": 6245 }, { "epoch": 1.2312697160883281, "grad_norm": 0.5191142722710178, "learning_rate": 1.569378918150439e-05, "loss": 0.356, "step": 6246 }, { "epoch": 1.2314668769716088, "grad_norm": 0.5412878379897871, "learning_rate": 1.5692514875181767e-05, "loss": 0.3716, "step": 6247 }, { "epoch": 1.2316640378548895, "grad_norm": 0.5465050490209235, "learning_rate": 1.5691240432091892e-05, "loss": 0.3478, "step": 6248 }, { "epoch": 1.2318611987381702, "grad_norm": 0.4939130381768039, "learning_rate": 1.5689965852265383e-05, "loss": 0.334, "step": 6249 }, { "epoch": 1.2320583596214512, "grad_norm": 0.5058870629567691, "learning_rate": 1.568869113573286e-05, "loss": 0.336, "step": 6250 }, { "epoch": 1.2322555205047319, "grad_norm": 0.48700312555624475, "learning_rate": 1.568741628252495e-05, "loss": 0.329, "step": 6251 }, { "epoch": 1.2324526813880126, "grad_norm": 0.5093770682043642, "learning_rate": 1.5686141292672287e-05, "loss": 0.3498, "step": 6252 }, { "epoch": 1.2326498422712935, "grad_norm": 0.49687858919236133, "learning_rate": 1.56848661662055e-05, "loss": 0.3263, "step": 6253 }, { "epoch": 1.2328470031545742, "grad_norm": 0.5077643833980352, "learning_rate": 1.5683590903155222e-05, "loss": 0.3385, "step": 6254 }, { "epoch": 1.2330441640378549, "grad_norm": 0.5185258515611184, "learning_rate": 1.56823155035521e-05, "loss": 0.3456, "step": 6255 }, { "epoch": 1.2332413249211356, "grad_norm": 0.5278963932263402, "learning_rate": 1.5681039967426773e-05, "loss": 0.3555, "step": 6256 }, { "epoch": 1.2334384858044163, "grad_norm": 0.4941475259778851, "learning_rate": 1.5679764294809882e-05, "loss": 0.3451, "step": 6257 }, { "epoch": 1.2336356466876972, "grad_norm": 0.4917220521203785, "learning_rate": 1.567848848573208e-05, "loss": 0.3115, "step": 6258 }, { "epoch": 1.233832807570978, "grad_norm": 0.4949766275099023, "learning_rate": 1.567721254022402e-05, "loss": 0.3181, "step": 6259 }, { "epoch": 1.2340299684542586, "grad_norm": 0.5273568098480352, "learning_rate": 1.5675936458316357e-05, "loss": 0.3399, "step": 6260 }, { "epoch": 1.2342271293375395, "grad_norm": 0.4886485870140973, "learning_rate": 1.567466024003975e-05, "loss": 0.3564, "step": 6261 }, { "epoch": 1.2344242902208202, "grad_norm": 0.5030459426717067, "learning_rate": 1.567338388542486e-05, "loss": 0.3335, "step": 6262 }, { "epoch": 1.234621451104101, "grad_norm": 0.4845019722082114, "learning_rate": 1.5672107394502357e-05, "loss": 0.3083, "step": 6263 }, { "epoch": 1.2348186119873816, "grad_norm": 0.49034401686565704, "learning_rate": 1.56708307673029e-05, "loss": 0.3406, "step": 6264 }, { "epoch": 1.2350157728706626, "grad_norm": 0.8790381187259672, "learning_rate": 1.5669554003857172e-05, "loss": 0.3443, "step": 6265 }, { "epoch": 1.2352129337539433, "grad_norm": 0.4998894425749621, "learning_rate": 1.566827710419584e-05, "loss": 0.335, "step": 6266 }, { "epoch": 1.235410094637224, "grad_norm": 0.532021790282657, "learning_rate": 1.566700006834959e-05, "loss": 0.3623, "step": 6267 }, { "epoch": 1.2356072555205047, "grad_norm": 0.5388159940953764, "learning_rate": 1.5665722896349098e-05, "loss": 0.3463, "step": 6268 }, { "epoch": 1.2358044164037856, "grad_norm": 0.5157145162260288, "learning_rate": 1.566444558822505e-05, "loss": 0.3421, "step": 6269 }, { "epoch": 1.2360015772870663, "grad_norm": 0.5605990079950705, "learning_rate": 1.5663168144008136e-05, "loss": 0.3561, "step": 6270 }, { "epoch": 1.236198738170347, "grad_norm": 0.4715933715273965, "learning_rate": 1.5661890563729045e-05, "loss": 0.3115, "step": 6271 }, { "epoch": 1.2363958990536277, "grad_norm": 0.5133180282336455, "learning_rate": 1.5660612847418476e-05, "loss": 0.3462, "step": 6272 }, { "epoch": 1.2365930599369086, "grad_norm": 0.5181709969632247, "learning_rate": 1.5659334995107124e-05, "loss": 0.349, "step": 6273 }, { "epoch": 1.2367902208201893, "grad_norm": 0.5196223017058347, "learning_rate": 1.565805700682569e-05, "loss": 0.3526, "step": 6274 }, { "epoch": 1.23698738170347, "grad_norm": 0.5375416524081914, "learning_rate": 1.565677888260488e-05, "loss": 0.3509, "step": 6275 }, { "epoch": 1.2371845425867507, "grad_norm": 0.4661403315888071, "learning_rate": 1.5655500622475405e-05, "loss": 0.2998, "step": 6276 }, { "epoch": 1.2373817034700316, "grad_norm": 0.5026678272938364, "learning_rate": 1.565422222646797e-05, "loss": 0.316, "step": 6277 }, { "epoch": 1.2375788643533123, "grad_norm": 0.5370007180074455, "learning_rate": 1.5652943694613293e-05, "loss": 0.3638, "step": 6278 }, { "epoch": 1.237776025236593, "grad_norm": 0.5161124298507306, "learning_rate": 1.5651665026942094e-05, "loss": 0.3346, "step": 6279 }, { "epoch": 1.2379731861198737, "grad_norm": 0.5652741095921482, "learning_rate": 1.565038622348509e-05, "loss": 0.3288, "step": 6280 }, { "epoch": 1.2381703470031546, "grad_norm": 0.9272850647527554, "learning_rate": 1.5649107284273007e-05, "loss": 0.3186, "step": 6281 }, { "epoch": 1.2383675078864353, "grad_norm": 0.5570913267865101, "learning_rate": 1.5647828209336572e-05, "loss": 0.3537, "step": 6282 }, { "epoch": 1.238564668769716, "grad_norm": 0.5082360498168839, "learning_rate": 1.5646548998706514e-05, "loss": 0.3099, "step": 6283 }, { "epoch": 1.2387618296529967, "grad_norm": 0.5205218960702347, "learning_rate": 1.5645269652413574e-05, "loss": 0.306, "step": 6284 }, { "epoch": 1.2389589905362777, "grad_norm": 0.504848365836123, "learning_rate": 1.564399017048848e-05, "loss": 0.3249, "step": 6285 }, { "epoch": 1.2391561514195584, "grad_norm": 0.5448082615998162, "learning_rate": 1.5642710552961982e-05, "loss": 0.3271, "step": 6286 }, { "epoch": 1.239353312302839, "grad_norm": 0.5213986756373774, "learning_rate": 1.564143079986481e-05, "loss": 0.3245, "step": 6287 }, { "epoch": 1.23955047318612, "grad_norm": 0.5069938937165306, "learning_rate": 1.564015091122773e-05, "loss": 0.3336, "step": 6288 }, { "epoch": 1.2397476340694007, "grad_norm": 0.5053084261882574, "learning_rate": 1.5638870887081476e-05, "loss": 0.3217, "step": 6289 }, { "epoch": 1.2399447949526814, "grad_norm": 0.521104683492062, "learning_rate": 1.5637590727456808e-05, "loss": 0.3168, "step": 6290 }, { "epoch": 1.240141955835962, "grad_norm": 0.5467959912507718, "learning_rate": 1.5636310432384487e-05, "loss": 0.3569, "step": 6291 }, { "epoch": 1.2403391167192428, "grad_norm": 0.5102807325723169, "learning_rate": 1.5635030001895267e-05, "loss": 0.3149, "step": 6292 }, { "epoch": 1.2405362776025237, "grad_norm": 1.3045505295401632, "learning_rate": 1.5633749436019913e-05, "loss": 0.3757, "step": 6293 }, { "epoch": 1.2407334384858044, "grad_norm": 0.5444503336415735, "learning_rate": 1.5632468734789192e-05, "loss": 0.3235, "step": 6294 }, { "epoch": 1.2409305993690851, "grad_norm": 0.5048607240462756, "learning_rate": 1.563118789823387e-05, "loss": 0.3353, "step": 6295 }, { "epoch": 1.241127760252366, "grad_norm": 0.5667148895534923, "learning_rate": 1.562990692638473e-05, "loss": 0.3819, "step": 6296 }, { "epoch": 1.2413249211356467, "grad_norm": 0.5446964393578572, "learning_rate": 1.562862581927254e-05, "loss": 0.3701, "step": 6297 }, { "epoch": 1.2415220820189274, "grad_norm": 0.5257025409305759, "learning_rate": 1.5627344576928085e-05, "loss": 0.3344, "step": 6298 }, { "epoch": 1.2417192429022081, "grad_norm": 0.5256923271112383, "learning_rate": 1.5626063199382138e-05, "loss": 0.3574, "step": 6299 }, { "epoch": 1.2419164037854888, "grad_norm": 0.5316599865695444, "learning_rate": 1.5624781686665498e-05, "loss": 0.3574, "step": 6300 }, { "epoch": 1.2421135646687698, "grad_norm": 0.5313857364398521, "learning_rate": 1.5623500038808946e-05, "loss": 0.3563, "step": 6301 }, { "epoch": 1.2423107255520505, "grad_norm": 0.5039351482023608, "learning_rate": 1.5622218255843276e-05, "loss": 0.3031, "step": 6302 }, { "epoch": 1.2425078864353312, "grad_norm": 0.5072016335647022, "learning_rate": 1.5620936337799287e-05, "loss": 0.3515, "step": 6303 }, { "epoch": 1.242705047318612, "grad_norm": 0.5672328567093293, "learning_rate": 1.5619654284707773e-05, "loss": 0.314, "step": 6304 }, { "epoch": 1.2429022082018928, "grad_norm": 0.5399431684544427, "learning_rate": 1.5618372096599547e-05, "loss": 0.3492, "step": 6305 }, { "epoch": 1.2430993690851735, "grad_norm": 0.49668681766651646, "learning_rate": 1.56170897735054e-05, "loss": 0.3435, "step": 6306 }, { "epoch": 1.2432965299684542, "grad_norm": 0.5134554384784643, "learning_rate": 1.561580731545615e-05, "loss": 0.3542, "step": 6307 }, { "epoch": 1.2434936908517351, "grad_norm": 0.507277410037156, "learning_rate": 1.5614524722482604e-05, "loss": 0.3333, "step": 6308 }, { "epoch": 1.2436908517350158, "grad_norm": 0.5301659849549519, "learning_rate": 1.561324199461558e-05, "loss": 0.3351, "step": 6309 }, { "epoch": 1.2438880126182965, "grad_norm": 0.4915261347852841, "learning_rate": 1.56119591318859e-05, "loss": 0.3319, "step": 6310 }, { "epoch": 1.2440851735015772, "grad_norm": 0.5361265111719739, "learning_rate": 1.561067613432438e-05, "loss": 0.3189, "step": 6311 }, { "epoch": 1.2442823343848581, "grad_norm": 0.5259903970751432, "learning_rate": 1.560939300196185e-05, "loss": 0.3497, "step": 6312 }, { "epoch": 1.2444794952681388, "grad_norm": 0.5066141181133125, "learning_rate": 1.5608109734829134e-05, "loss": 0.3335, "step": 6313 }, { "epoch": 1.2446766561514195, "grad_norm": 0.5640167051107872, "learning_rate": 1.5606826332957066e-05, "loss": 0.3371, "step": 6314 }, { "epoch": 1.2448738170347002, "grad_norm": 0.4993378182654541, "learning_rate": 1.560554279637648e-05, "loss": 0.3393, "step": 6315 }, { "epoch": 1.2450709779179812, "grad_norm": 0.4864313789939513, "learning_rate": 1.560425912511822e-05, "loss": 0.3251, "step": 6316 }, { "epoch": 1.2452681388012619, "grad_norm": 0.4908528558657437, "learning_rate": 1.5602975319213115e-05, "loss": 0.3377, "step": 6317 }, { "epoch": 1.2454652996845426, "grad_norm": 0.5386582667468415, "learning_rate": 1.5601691378692014e-05, "loss": 0.3517, "step": 6318 }, { "epoch": 1.2456624605678233, "grad_norm": 0.5109315855247596, "learning_rate": 1.5600407303585773e-05, "loss": 0.3195, "step": 6319 }, { "epoch": 1.2458596214511042, "grad_norm": 0.5286420872063581, "learning_rate": 1.559912309392523e-05, "loss": 0.3531, "step": 6320 }, { "epoch": 1.2460567823343849, "grad_norm": 0.5019801643055133, "learning_rate": 1.559783874974125e-05, "loss": 0.3223, "step": 6321 }, { "epoch": 1.2462539432176656, "grad_norm": 0.5106613080428457, "learning_rate": 1.559655427106468e-05, "loss": 0.3308, "step": 6322 }, { "epoch": 1.2464511041009463, "grad_norm": 0.5460596574557645, "learning_rate": 1.5595269657926396e-05, "loss": 0.3453, "step": 6323 }, { "epoch": 1.2466482649842272, "grad_norm": 0.5260622334928508, "learning_rate": 1.559398491035725e-05, "loss": 0.3253, "step": 6324 }, { "epoch": 1.246845425867508, "grad_norm": 0.4969801726967249, "learning_rate": 1.5592700028388107e-05, "loss": 0.3253, "step": 6325 }, { "epoch": 1.2470425867507886, "grad_norm": 0.4980689481986874, "learning_rate": 1.5591415012049846e-05, "loss": 0.3236, "step": 6326 }, { "epoch": 1.2472397476340693, "grad_norm": 0.49763176731095426, "learning_rate": 1.5590129861373335e-05, "loss": 0.3378, "step": 6327 }, { "epoch": 1.2474369085173502, "grad_norm": 0.5169444490816916, "learning_rate": 1.5588844576389454e-05, "loss": 0.3312, "step": 6328 }, { "epoch": 1.247634069400631, "grad_norm": 0.5109026656737135, "learning_rate": 1.5587559157129078e-05, "loss": 0.333, "step": 6329 }, { "epoch": 1.2478312302839116, "grad_norm": 0.5055755741491358, "learning_rate": 1.5586273603623098e-05, "loss": 0.3435, "step": 6330 }, { "epoch": 1.2480283911671926, "grad_norm": 0.5005392793433827, "learning_rate": 1.5584987915902393e-05, "loss": 0.3399, "step": 6331 }, { "epoch": 1.2482255520504733, "grad_norm": 0.4977560846580881, "learning_rate": 1.5583702093997855e-05, "loss": 0.3299, "step": 6332 }, { "epoch": 1.248422712933754, "grad_norm": 0.49705232631811735, "learning_rate": 1.558241613794038e-05, "loss": 0.3119, "step": 6333 }, { "epoch": 1.2486198738170347, "grad_norm": 0.5240628686982806, "learning_rate": 1.5581130047760865e-05, "loss": 0.3357, "step": 6334 }, { "epoch": 1.2488170347003154, "grad_norm": 0.5317216595261723, "learning_rate": 1.55798438234902e-05, "loss": 0.3368, "step": 6335 }, { "epoch": 1.2490141955835963, "grad_norm": 0.5388823858020947, "learning_rate": 1.5578557465159296e-05, "loss": 0.3451, "step": 6336 }, { "epoch": 1.249211356466877, "grad_norm": 0.5020216428038514, "learning_rate": 1.5577270972799056e-05, "loss": 0.3266, "step": 6337 }, { "epoch": 1.2494085173501577, "grad_norm": 0.5251732240656299, "learning_rate": 1.5575984346440393e-05, "loss": 0.3255, "step": 6338 }, { "epoch": 1.2496056782334386, "grad_norm": 0.5065709033953211, "learning_rate": 1.5574697586114213e-05, "loss": 0.3441, "step": 6339 }, { "epoch": 1.2498028391167193, "grad_norm": 0.49243721616902913, "learning_rate": 1.5573410691851432e-05, "loss": 0.3347, "step": 6340 }, { "epoch": 1.2498028391167193, "eval_loss": 0.43939557671546936, "eval_runtime": 344.8004, "eval_samples_per_second": 23.579, "eval_steps_per_second": 1.476, "step": 6340 }, { "epoch": 1.25, "grad_norm": 0.5101061503167154, "learning_rate": 1.5572123663682975e-05, "loss": 0.3102, "step": 6341 }, { "epoch": 1.2501971608832807, "grad_norm": 0.4808666726594073, "learning_rate": 1.5570836501639754e-05, "loss": 0.3435, "step": 6342 }, { "epoch": 1.2503943217665614, "grad_norm": 0.4999834817361975, "learning_rate": 1.5569549205752707e-05, "loss": 0.3381, "step": 6343 }, { "epoch": 1.2505914826498423, "grad_norm": 0.49204094898528117, "learning_rate": 1.556826177605275e-05, "loss": 0.3274, "step": 6344 }, { "epoch": 1.250788643533123, "grad_norm": 0.4949146499447072, "learning_rate": 1.556697421257082e-05, "loss": 0.3592, "step": 6345 }, { "epoch": 1.2509858044164037, "grad_norm": 0.47762676196393705, "learning_rate": 1.556568651533785e-05, "loss": 0.322, "step": 6346 }, { "epoch": 1.2511829652996846, "grad_norm": 0.5191724256820243, "learning_rate": 1.5564398684384787e-05, "loss": 0.3442, "step": 6347 }, { "epoch": 1.2513801261829653, "grad_norm": 0.4787762214934278, "learning_rate": 1.5563110719742558e-05, "loss": 0.3285, "step": 6348 }, { "epoch": 1.251577287066246, "grad_norm": 0.48201229976584514, "learning_rate": 1.5561822621442114e-05, "loss": 0.326, "step": 6349 }, { "epoch": 1.2517744479495267, "grad_norm": 0.46392186774839445, "learning_rate": 1.5560534389514407e-05, "loss": 0.3004, "step": 6350 }, { "epoch": 1.2519716088328074, "grad_norm": 0.4815291317559283, "learning_rate": 1.555924602399038e-05, "loss": 0.307, "step": 6351 }, { "epoch": 1.2521687697160884, "grad_norm": 0.8785203455669748, "learning_rate": 1.5557957524900993e-05, "loss": 0.3367, "step": 6352 }, { "epoch": 1.252365930599369, "grad_norm": 0.48203047338077637, "learning_rate": 1.5556668892277197e-05, "loss": 0.3262, "step": 6353 }, { "epoch": 1.2525630914826498, "grad_norm": 0.5133633911448251, "learning_rate": 1.555538012614996e-05, "loss": 0.3317, "step": 6354 }, { "epoch": 1.2527602523659307, "grad_norm": 0.5341940509642474, "learning_rate": 1.555409122655024e-05, "loss": 0.3572, "step": 6355 }, { "epoch": 1.2529574132492114, "grad_norm": 0.482176622994259, "learning_rate": 1.5552802193509003e-05, "loss": 0.3147, "step": 6356 }, { "epoch": 1.253154574132492, "grad_norm": 0.49374641797023383, "learning_rate": 1.5551513027057225e-05, "loss": 0.3481, "step": 6357 }, { "epoch": 1.2533517350157728, "grad_norm": 0.46859298112273634, "learning_rate": 1.5550223727225875e-05, "loss": 0.3099, "step": 6358 }, { "epoch": 1.2535488958990535, "grad_norm": 0.4986948734759226, "learning_rate": 1.554893429404593e-05, "loss": 0.3068, "step": 6359 }, { "epoch": 1.2537460567823344, "grad_norm": 0.49945204691498807, "learning_rate": 1.5547644727548373e-05, "loss": 0.3256, "step": 6360 }, { "epoch": 1.2539432176656151, "grad_norm": 0.5162525662741562, "learning_rate": 1.554635502776418e-05, "loss": 0.3326, "step": 6361 }, { "epoch": 1.2541403785488958, "grad_norm": 0.534638355896996, "learning_rate": 1.554506519472434e-05, "loss": 0.3336, "step": 6362 }, { "epoch": 1.2543375394321767, "grad_norm": 0.5029286598031089, "learning_rate": 1.5543775228459846e-05, "loss": 0.3516, "step": 6363 }, { "epoch": 1.2545347003154574, "grad_norm": 0.5261189540493615, "learning_rate": 1.554248512900169e-05, "loss": 0.328, "step": 6364 }, { "epoch": 1.2547318611987381, "grad_norm": 0.8234311010835397, "learning_rate": 1.5541194896380863e-05, "loss": 0.3409, "step": 6365 }, { "epoch": 1.254929022082019, "grad_norm": 0.5154543512907336, "learning_rate": 1.5539904530628365e-05, "loss": 0.3431, "step": 6366 }, { "epoch": 1.2551261829652998, "grad_norm": 0.478219086021865, "learning_rate": 1.55386140317752e-05, "loss": 0.3156, "step": 6367 }, { "epoch": 1.2553233438485805, "grad_norm": 0.49818756625431365, "learning_rate": 1.5537323399852373e-05, "loss": 0.3157, "step": 6368 }, { "epoch": 1.2555205047318612, "grad_norm": 0.47906698592620167, "learning_rate": 1.5536032634890892e-05, "loss": 0.3299, "step": 6369 }, { "epoch": 1.2557176656151419, "grad_norm": 0.5062243330914792, "learning_rate": 1.553474173692177e-05, "loss": 0.3484, "step": 6370 }, { "epoch": 1.2559148264984228, "grad_norm": 0.5307599529911612, "learning_rate": 1.5533450705976018e-05, "loss": 0.3385, "step": 6371 }, { "epoch": 1.2561119873817035, "grad_norm": 0.573572643755291, "learning_rate": 1.553215954208466e-05, "loss": 0.357, "step": 6372 }, { "epoch": 1.2563091482649842, "grad_norm": 0.5968608411513199, "learning_rate": 1.5530868245278708e-05, "loss": 0.3489, "step": 6373 }, { "epoch": 1.256506309148265, "grad_norm": 0.58115296279192, "learning_rate": 1.55295768155892e-05, "loss": 0.3522, "step": 6374 }, { "epoch": 1.2567034700315458, "grad_norm": 0.49820315714499325, "learning_rate": 1.5528285253047153e-05, "loss": 0.3303, "step": 6375 }, { "epoch": 1.2569006309148265, "grad_norm": 0.5089032372495764, "learning_rate": 1.55269935576836e-05, "loss": 0.344, "step": 6376 }, { "epoch": 1.2570977917981072, "grad_norm": 0.5371768444531454, "learning_rate": 1.5525701729529578e-05, "loss": 0.3675, "step": 6377 }, { "epoch": 1.257294952681388, "grad_norm": 0.5002102537095656, "learning_rate": 1.552440976861612e-05, "loss": 0.3248, "step": 6378 }, { "epoch": 1.2574921135646688, "grad_norm": 0.5018100903078823, "learning_rate": 1.5523117674974267e-05, "loss": 0.3217, "step": 6379 }, { "epoch": 1.2576892744479495, "grad_norm": 0.5336700648969083, "learning_rate": 1.5521825448635066e-05, "loss": 0.3441, "step": 6380 }, { "epoch": 1.2578864353312302, "grad_norm": 0.48938739876612136, "learning_rate": 1.5520533089629562e-05, "loss": 0.328, "step": 6381 }, { "epoch": 1.2580835962145112, "grad_norm": 0.49246018639601746, "learning_rate": 1.5519240597988806e-05, "loss": 0.3418, "step": 6382 }, { "epoch": 1.2582807570977919, "grad_norm": 0.49124354097385314, "learning_rate": 1.551794797374385e-05, "loss": 0.3294, "step": 6383 }, { "epoch": 1.2584779179810726, "grad_norm": 0.5583036731814169, "learning_rate": 1.5516655216925748e-05, "loss": 0.3576, "step": 6384 }, { "epoch": 1.2586750788643533, "grad_norm": 0.5097818791868225, "learning_rate": 1.5515362327565564e-05, "loss": 0.3372, "step": 6385 }, { "epoch": 1.258872239747634, "grad_norm": 0.5176295462370966, "learning_rate": 1.5514069305694356e-05, "loss": 0.3537, "step": 6386 }, { "epoch": 1.2590694006309149, "grad_norm": 0.5797974728770534, "learning_rate": 1.5512776151343198e-05, "loss": 0.3595, "step": 6387 }, { "epoch": 1.2592665615141956, "grad_norm": 0.6262421558004105, "learning_rate": 1.5511482864543147e-05, "loss": 0.3253, "step": 6388 }, { "epoch": 1.2594637223974763, "grad_norm": 0.5273042130470622, "learning_rate": 1.5510189445325284e-05, "loss": 0.3221, "step": 6389 }, { "epoch": 1.2596608832807572, "grad_norm": 0.5732125441797801, "learning_rate": 1.5508895893720685e-05, "loss": 0.3532, "step": 6390 }, { "epoch": 1.259858044164038, "grad_norm": 0.5275542533558232, "learning_rate": 1.550760220976042e-05, "loss": 0.356, "step": 6391 }, { "epoch": 1.2600552050473186, "grad_norm": 0.4857065591782998, "learning_rate": 1.5506308393475582e-05, "loss": 0.3308, "step": 6392 }, { "epoch": 1.2602523659305993, "grad_norm": 0.5099413329942116, "learning_rate": 1.550501444489725e-05, "loss": 0.3362, "step": 6393 }, { "epoch": 1.26044952681388, "grad_norm": 0.5056333274989698, "learning_rate": 1.5503720364056512e-05, "loss": 0.3489, "step": 6394 }, { "epoch": 1.260646687697161, "grad_norm": 0.4774413513695629, "learning_rate": 1.550242615098446e-05, "loss": 0.3304, "step": 6395 }, { "epoch": 1.2608438485804416, "grad_norm": 0.4843542395552344, "learning_rate": 1.5501131805712188e-05, "loss": 0.3047, "step": 6396 }, { "epoch": 1.2610410094637223, "grad_norm": 0.4843638404403003, "learning_rate": 1.549983732827079e-05, "loss": 0.319, "step": 6397 }, { "epoch": 1.2612381703470033, "grad_norm": 0.4783957926301324, "learning_rate": 1.5498542718691378e-05, "loss": 0.2986, "step": 6398 }, { "epoch": 1.261435331230284, "grad_norm": 0.5171370998231292, "learning_rate": 1.5497247977005047e-05, "loss": 0.3288, "step": 6399 }, { "epoch": 1.2616324921135647, "grad_norm": 0.5587928424642727, "learning_rate": 1.5495953103242908e-05, "loss": 0.3409, "step": 6400 }, { "epoch": 1.2618296529968454, "grad_norm": 0.5069107019620349, "learning_rate": 1.549465809743607e-05, "loss": 0.3489, "step": 6401 }, { "epoch": 1.262026813880126, "grad_norm": 0.5048904621722605, "learning_rate": 1.5493362959615646e-05, "loss": 0.3363, "step": 6402 }, { "epoch": 1.262223974763407, "grad_norm": 0.4918922172890412, "learning_rate": 1.549206768981275e-05, "loss": 0.3215, "step": 6403 }, { "epoch": 1.2624211356466877, "grad_norm": 0.5373254295839985, "learning_rate": 1.5490772288058508e-05, "loss": 0.3351, "step": 6404 }, { "epoch": 1.2626182965299684, "grad_norm": 0.5194441534825306, "learning_rate": 1.5489476754384035e-05, "loss": 0.3395, "step": 6405 }, { "epoch": 1.2628154574132493, "grad_norm": 0.49136904473331555, "learning_rate": 1.5488181088820468e-05, "loss": 0.3144, "step": 6406 }, { "epoch": 1.26301261829653, "grad_norm": 0.5815486766369652, "learning_rate": 1.548688529139893e-05, "loss": 0.3718, "step": 6407 }, { "epoch": 1.2632097791798107, "grad_norm": 0.49216042299624607, "learning_rate": 1.5485589362150552e-05, "loss": 0.3384, "step": 6408 }, { "epoch": 1.2634069400630916, "grad_norm": 0.49257478050805037, "learning_rate": 1.5484293301106475e-05, "loss": 0.3161, "step": 6409 }, { "epoch": 1.2636041009463723, "grad_norm": 0.5111418677883345, "learning_rate": 1.5482997108297834e-05, "loss": 0.3159, "step": 6410 }, { "epoch": 1.263801261829653, "grad_norm": 0.48719719419588625, "learning_rate": 1.5481700783755772e-05, "loss": 0.31, "step": 6411 }, { "epoch": 1.2639984227129337, "grad_norm": 0.6071472420270517, "learning_rate": 1.548040432751143e-05, "loss": 0.3906, "step": 6412 }, { "epoch": 1.2641955835962144, "grad_norm": 0.5109344409789744, "learning_rate": 1.5479107739595967e-05, "loss": 0.3505, "step": 6413 }, { "epoch": 1.2643927444794953, "grad_norm": 0.5163073576362812, "learning_rate": 1.5477811020040525e-05, "loss": 0.3396, "step": 6414 }, { "epoch": 1.264589905362776, "grad_norm": 0.5159856675009115, "learning_rate": 1.5476514168876264e-05, "loss": 0.3458, "step": 6415 }, { "epoch": 1.2647870662460567, "grad_norm": 0.5208996581565274, "learning_rate": 1.5475217186134335e-05, "loss": 0.3575, "step": 6416 }, { "epoch": 1.2649842271293377, "grad_norm": 0.6501488219759475, "learning_rate": 1.5473920071845906e-05, "loss": 0.3406, "step": 6417 }, { "epoch": 1.2651813880126184, "grad_norm": 0.5045419985244192, "learning_rate": 1.5472622826042144e-05, "loss": 0.3216, "step": 6418 }, { "epoch": 1.265378548895899, "grad_norm": 0.5083528685308321, "learning_rate": 1.5471325448754207e-05, "loss": 0.3485, "step": 6419 }, { "epoch": 1.2655757097791798, "grad_norm": 0.4756731932644996, "learning_rate": 1.547002794001327e-05, "loss": 0.3249, "step": 6420 }, { "epoch": 1.2657728706624605, "grad_norm": 0.5179143417797316, "learning_rate": 1.546873029985051e-05, "loss": 0.3634, "step": 6421 }, { "epoch": 1.2659700315457414, "grad_norm": 0.5165453574064824, "learning_rate": 1.54674325282971e-05, "loss": 0.3135, "step": 6422 }, { "epoch": 1.266167192429022, "grad_norm": 0.4897247296838715, "learning_rate": 1.5466134625384216e-05, "loss": 0.3222, "step": 6423 }, { "epoch": 1.2663643533123028, "grad_norm": 0.5067798904225712, "learning_rate": 1.546483659114305e-05, "loss": 0.3574, "step": 6424 }, { "epoch": 1.2665615141955837, "grad_norm": 0.4586524801850368, "learning_rate": 1.5463538425604782e-05, "loss": 0.3044, "step": 6425 }, { "epoch": 1.2667586750788644, "grad_norm": 0.48557837904062384, "learning_rate": 1.5462240128800604e-05, "loss": 0.3395, "step": 6426 }, { "epoch": 1.2669558359621451, "grad_norm": 0.4999102108308707, "learning_rate": 1.5460941700761706e-05, "loss": 0.3213, "step": 6427 }, { "epoch": 1.2671529968454258, "grad_norm": 0.5132500026273736, "learning_rate": 1.545964314151929e-05, "loss": 0.3422, "step": 6428 }, { "epoch": 1.2673501577287065, "grad_norm": 0.4913356576984297, "learning_rate": 1.545834445110455e-05, "loss": 0.3383, "step": 6429 }, { "epoch": 1.2675473186119874, "grad_norm": 0.5025975016108518, "learning_rate": 1.5457045629548687e-05, "loss": 0.3397, "step": 6430 }, { "epoch": 1.2677444794952681, "grad_norm": 0.4826877165006056, "learning_rate": 1.545574667688291e-05, "loss": 0.3151, "step": 6431 }, { "epoch": 1.2679416403785488, "grad_norm": 0.4772272594998397, "learning_rate": 1.5454447593138424e-05, "loss": 0.3349, "step": 6432 }, { "epoch": 1.2681388012618298, "grad_norm": 2.53849935570129, "learning_rate": 1.5453148378346444e-05, "loss": 0.3515, "step": 6433 }, { "epoch": 1.2683359621451105, "grad_norm": 0.5128594643909096, "learning_rate": 1.5451849032538185e-05, "loss": 0.3419, "step": 6434 }, { "epoch": 1.2685331230283912, "grad_norm": 0.493655568948996, "learning_rate": 1.5450549555744857e-05, "loss": 0.3182, "step": 6435 }, { "epoch": 1.2687302839116719, "grad_norm": 0.4969357386584924, "learning_rate": 1.5449249947997687e-05, "loss": 0.3429, "step": 6436 }, { "epoch": 1.2689274447949526, "grad_norm": 0.5468340570267272, "learning_rate": 1.5447950209327905e-05, "loss": 0.3493, "step": 6437 }, { "epoch": 1.2691246056782335, "grad_norm": 0.5418093845032692, "learning_rate": 1.5446650339766723e-05, "loss": 0.3476, "step": 6438 }, { "epoch": 1.2693217665615142, "grad_norm": 0.6775784568233131, "learning_rate": 1.544535033934539e-05, "loss": 0.3739, "step": 6439 }, { "epoch": 1.2695189274447949, "grad_norm": 0.48385467083683753, "learning_rate": 1.5444050208095124e-05, "loss": 0.3195, "step": 6440 }, { "epoch": 1.2697160883280758, "grad_norm": 0.493918698352294, "learning_rate": 1.544274994604717e-05, "loss": 0.3302, "step": 6441 }, { "epoch": 1.2699132492113565, "grad_norm": 0.5530138835909129, "learning_rate": 1.5441449553232764e-05, "loss": 0.3495, "step": 6442 }, { "epoch": 1.2701104100946372, "grad_norm": 0.49621433371123225, "learning_rate": 1.544014902968315e-05, "loss": 0.3333, "step": 6443 }, { "epoch": 1.270307570977918, "grad_norm": 0.49721837630990706, "learning_rate": 1.5438848375429576e-05, "loss": 0.3283, "step": 6444 }, { "epoch": 1.2705047318611986, "grad_norm": 0.5533506511752488, "learning_rate": 1.5437547590503288e-05, "loss": 0.385, "step": 6445 }, { "epoch": 1.2707018927444795, "grad_norm": 0.5283600379328754, "learning_rate": 1.5436246674935543e-05, "loss": 0.3434, "step": 6446 }, { "epoch": 1.2708990536277602, "grad_norm": 0.5013523816913494, "learning_rate": 1.5434945628757595e-05, "loss": 0.338, "step": 6447 }, { "epoch": 1.271096214511041, "grad_norm": 0.4905233287935434, "learning_rate": 1.54336444520007e-05, "loss": 0.3287, "step": 6448 }, { "epoch": 1.2712933753943219, "grad_norm": 0.5437498180898698, "learning_rate": 1.5432343144696117e-05, "loss": 0.3708, "step": 6449 }, { "epoch": 1.2714905362776026, "grad_norm": 0.48421826633903103, "learning_rate": 1.543104170687512e-05, "loss": 0.3125, "step": 6450 }, { "epoch": 1.2716876971608833, "grad_norm": 0.4793437101259274, "learning_rate": 1.542974013856897e-05, "loss": 0.3341, "step": 6451 }, { "epoch": 1.271884858044164, "grad_norm": 19.93851850988607, "learning_rate": 1.5428438439808942e-05, "loss": 0.6517, "step": 6452 }, { "epoch": 1.2720820189274447, "grad_norm": 0.5087943803471097, "learning_rate": 1.542713661062631e-05, "loss": 0.3086, "step": 6453 }, { "epoch": 1.2722791798107256, "grad_norm": 0.47671813979964683, "learning_rate": 1.542583465105235e-05, "loss": 0.3208, "step": 6454 }, { "epoch": 1.2724763406940063, "grad_norm": 0.5236254976524284, "learning_rate": 1.542453256111834e-05, "loss": 0.3477, "step": 6455 }, { "epoch": 1.272673501577287, "grad_norm": 0.49008247823279333, "learning_rate": 1.5423230340855572e-05, "loss": 0.3354, "step": 6456 }, { "epoch": 1.272870662460568, "grad_norm": 0.48169835947954537, "learning_rate": 1.5421927990295325e-05, "loss": 0.3369, "step": 6457 }, { "epoch": 1.2730678233438486, "grad_norm": 0.5245321000278956, "learning_rate": 1.5420625509468892e-05, "loss": 0.3466, "step": 6458 }, { "epoch": 1.2732649842271293, "grad_norm": 0.6897513827097659, "learning_rate": 1.5419322898407562e-05, "loss": 0.3384, "step": 6459 }, { "epoch": 1.2734621451104102, "grad_norm": 0.5001285523422395, "learning_rate": 1.541802015714264e-05, "loss": 0.3512, "step": 6460 }, { "epoch": 1.273659305993691, "grad_norm": 0.5336358617640525, "learning_rate": 1.5416717285705417e-05, "loss": 0.3448, "step": 6461 }, { "epoch": 1.2738564668769716, "grad_norm": 0.5466334074547992, "learning_rate": 1.5415414284127207e-05, "loss": 0.3506, "step": 6462 }, { "epoch": 1.2740536277602523, "grad_norm": 0.526166079775761, "learning_rate": 1.54141111524393e-05, "loss": 0.3268, "step": 6463 }, { "epoch": 1.274250788643533, "grad_norm": 0.47399584750409807, "learning_rate": 1.5412807890673015e-05, "loss": 0.3079, "step": 6464 }, { "epoch": 1.274447949526814, "grad_norm": 0.505741406230065, "learning_rate": 1.541150449885966e-05, "loss": 0.3296, "step": 6465 }, { "epoch": 1.2746451104100947, "grad_norm": 0.5386050119365994, "learning_rate": 1.5410200977030553e-05, "loss": 0.3652, "step": 6466 }, { "epoch": 1.2748422712933754, "grad_norm": 0.47587895585088, "learning_rate": 1.5408897325217012e-05, "loss": 0.3239, "step": 6467 }, { "epoch": 1.2750394321766563, "grad_norm": 0.504126225675362, "learning_rate": 1.5407593543450358e-05, "loss": 0.3362, "step": 6468 }, { "epoch": 1.275236593059937, "grad_norm": 0.485501366242255, "learning_rate": 1.540628963176191e-05, "loss": 0.3153, "step": 6469 }, { "epoch": 1.2754337539432177, "grad_norm": 0.49475043227065524, "learning_rate": 1.5404985590183e-05, "loss": 0.3079, "step": 6470 }, { "epoch": 1.2756309148264984, "grad_norm": 0.4753091395271322, "learning_rate": 1.5403681418744962e-05, "loss": 0.3128, "step": 6471 }, { "epoch": 1.275828075709779, "grad_norm": 0.5027088286339224, "learning_rate": 1.5402377117479127e-05, "loss": 0.3438, "step": 6472 }, { "epoch": 1.27602523659306, "grad_norm": 0.4918844457862931, "learning_rate": 1.5401072686416826e-05, "loss": 0.3433, "step": 6473 }, { "epoch": 1.2762223974763407, "grad_norm": 0.5104026241302757, "learning_rate": 1.539976812558941e-05, "loss": 0.3238, "step": 6474 }, { "epoch": 1.2764195583596214, "grad_norm": 0.5129479397583762, "learning_rate": 1.539846343502821e-05, "loss": 0.3521, "step": 6475 }, { "epoch": 1.2766167192429023, "grad_norm": 0.5917550936700113, "learning_rate": 1.5397158614764584e-05, "loss": 0.3882, "step": 6476 }, { "epoch": 1.276813880126183, "grad_norm": 0.5595087268305293, "learning_rate": 1.5395853664829876e-05, "loss": 0.3397, "step": 6477 }, { "epoch": 1.2770110410094637, "grad_norm": 0.4600650571043877, "learning_rate": 1.5394548585255437e-05, "loss": 0.3025, "step": 6478 }, { "epoch": 1.2772082018927444, "grad_norm": 0.49890829885179133, "learning_rate": 1.5393243376072625e-05, "loss": 0.3205, "step": 6479 }, { "epoch": 1.2774053627760251, "grad_norm": 0.4890558758581133, "learning_rate": 1.5391938037312795e-05, "loss": 0.3315, "step": 6480 }, { "epoch": 1.277602523659306, "grad_norm": 0.7150885015408796, "learning_rate": 1.5390632569007314e-05, "loss": 0.3116, "step": 6481 }, { "epoch": 1.2777996845425867, "grad_norm": 0.5271705955690947, "learning_rate": 1.5389326971187543e-05, "loss": 0.3453, "step": 6482 }, { "epoch": 1.2779968454258674, "grad_norm": 0.508576676488475, "learning_rate": 1.538802124388485e-05, "loss": 0.3424, "step": 6483 }, { "epoch": 1.2781940063091484, "grad_norm": 0.5597504943231402, "learning_rate": 1.538671538713061e-05, "loss": 0.3741, "step": 6484 }, { "epoch": 1.278391167192429, "grad_norm": 0.5144168371604326, "learning_rate": 1.5385409400956196e-05, "loss": 0.3434, "step": 6485 }, { "epoch": 1.2785883280757098, "grad_norm": 0.5022134043659201, "learning_rate": 1.538410328539298e-05, "loss": 0.3229, "step": 6486 }, { "epoch": 1.2787854889589905, "grad_norm": 0.5129040379585406, "learning_rate": 1.5382797040472352e-05, "loss": 0.326, "step": 6487 }, { "epoch": 1.2789826498422712, "grad_norm": 0.49329203335900706, "learning_rate": 1.538149066622569e-05, "loss": 0.3395, "step": 6488 }, { "epoch": 1.279179810725552, "grad_norm": 0.5055724718310393, "learning_rate": 1.538018416268438e-05, "loss": 0.3472, "step": 6489 }, { "epoch": 1.2793769716088328, "grad_norm": 0.5258954355075676, "learning_rate": 1.537887752987981e-05, "loss": 0.3538, "step": 6490 }, { "epoch": 1.2795741324921135, "grad_norm": 0.542150664111908, "learning_rate": 1.5377570767843377e-05, "loss": 0.3396, "step": 6491 }, { "epoch": 1.2797712933753944, "grad_norm": 0.47748206124296466, "learning_rate": 1.5376263876606475e-05, "loss": 0.3288, "step": 6492 }, { "epoch": 1.2799684542586751, "grad_norm": 3.411379492061442, "learning_rate": 1.5374956856200504e-05, "loss": 0.33, "step": 6493 }, { "epoch": 1.2801656151419558, "grad_norm": 0.5132738135149726, "learning_rate": 1.537364970665687e-05, "loss": 0.3298, "step": 6494 }, { "epoch": 1.2803627760252365, "grad_norm": 1.1343123502825927, "learning_rate": 1.537234242800697e-05, "loss": 0.336, "step": 6495 }, { "epoch": 1.2805599369085172, "grad_norm": 0.518109994618307, "learning_rate": 1.537103502028222e-05, "loss": 0.3338, "step": 6496 }, { "epoch": 1.2807570977917981, "grad_norm": 0.5142434953565715, "learning_rate": 1.5369727483514026e-05, "loss": 0.3498, "step": 6497 }, { "epoch": 1.2809542586750788, "grad_norm": 0.4857243954146504, "learning_rate": 1.536841981773381e-05, "loss": 0.3212, "step": 6498 }, { "epoch": 1.2811514195583595, "grad_norm": 0.5583875828356074, "learning_rate": 1.5367112022972977e-05, "loss": 0.3449, "step": 6499 }, { "epoch": 1.2813485804416405, "grad_norm": 0.7616833629256355, "learning_rate": 1.536580409926296e-05, "loss": 0.3047, "step": 6500 }, { "epoch": 1.2815457413249212, "grad_norm": 0.5422560861681756, "learning_rate": 1.5364496046635175e-05, "loss": 0.3336, "step": 6501 }, { "epoch": 1.2817429022082019, "grad_norm": 0.5234629592906149, "learning_rate": 1.5363187865121058e-05, "loss": 0.3207, "step": 6502 }, { "epoch": 1.2819400630914828, "grad_norm": 0.5788807821840986, "learning_rate": 1.5361879554752027e-05, "loss": 0.3518, "step": 6503 }, { "epoch": 1.2821372239747635, "grad_norm": 0.560152361635895, "learning_rate": 1.536057111555953e-05, "loss": 0.3671, "step": 6504 }, { "epoch": 1.2823343848580442, "grad_norm": 0.573760389051277, "learning_rate": 1.5359262547574986e-05, "loss": 0.3296, "step": 6505 }, { "epoch": 1.2825315457413249, "grad_norm": 0.639852449462523, "learning_rate": 1.535795385082985e-05, "loss": 0.3648, "step": 6506 }, { "epoch": 1.2827287066246056, "grad_norm": 0.5684843971461204, "learning_rate": 1.5356645025355556e-05, "loss": 0.3628, "step": 6507 }, { "epoch": 1.2829258675078865, "grad_norm": 0.48085580385372134, "learning_rate": 1.535533607118355e-05, "loss": 0.3272, "step": 6508 }, { "epoch": 1.2831230283911672, "grad_norm": 0.5728460211410128, "learning_rate": 1.5354026988345284e-05, "loss": 0.36, "step": 6509 }, { "epoch": 1.283320189274448, "grad_norm": 0.5181581225517601, "learning_rate": 1.5352717776872208e-05, "loss": 0.3219, "step": 6510 }, { "epoch": 1.2835173501577288, "grad_norm": 0.5081886354156057, "learning_rate": 1.5351408436795777e-05, "loss": 0.3369, "step": 6511 }, { "epoch": 1.2837145110410095, "grad_norm": 0.5230355727747609, "learning_rate": 1.535009896814745e-05, "loss": 0.3244, "step": 6512 }, { "epoch": 1.2839116719242902, "grad_norm": 0.5375516631231971, "learning_rate": 1.5348789370958687e-05, "loss": 0.3419, "step": 6513 }, { "epoch": 1.284108832807571, "grad_norm": 0.5618743671777963, "learning_rate": 1.534747964526095e-05, "loss": 0.3321, "step": 6514 }, { "epoch": 1.2843059936908516, "grad_norm": 0.5648952032025825, "learning_rate": 1.5346169791085707e-05, "loss": 0.3586, "step": 6515 }, { "epoch": 1.2845031545741326, "grad_norm": 0.4704499047660257, "learning_rate": 1.534485980846443e-05, "loss": 0.2988, "step": 6516 }, { "epoch": 1.2847003154574133, "grad_norm": 0.5115491062943802, "learning_rate": 1.5343549697428596e-05, "loss": 0.3401, "step": 6517 }, { "epoch": 1.284897476340694, "grad_norm": 0.5183202372840393, "learning_rate": 1.5342239458009675e-05, "loss": 0.3253, "step": 6518 }, { "epoch": 1.2850946372239749, "grad_norm": 0.48683435924195767, "learning_rate": 1.5340929090239146e-05, "loss": 0.3006, "step": 6519 }, { "epoch": 1.2852917981072556, "grad_norm": 0.5138313161943242, "learning_rate": 1.5339618594148497e-05, "loss": 0.3063, "step": 6520 }, { "epoch": 1.2854889589905363, "grad_norm": 0.5017471245997583, "learning_rate": 1.533830796976921e-05, "loss": 0.3297, "step": 6521 }, { "epoch": 1.285686119873817, "grad_norm": 0.5123547738441006, "learning_rate": 1.5336997217132777e-05, "loss": 0.3525, "step": 6522 }, { "epoch": 1.2858832807570977, "grad_norm": 0.5338495423729164, "learning_rate": 1.533568633627069e-05, "loss": 0.3295, "step": 6523 }, { "epoch": 1.2860804416403786, "grad_norm": 0.5013934896990097, "learning_rate": 1.5334375327214437e-05, "loss": 0.3459, "step": 6524 }, { "epoch": 1.2862776025236593, "grad_norm": 0.4948643141041335, "learning_rate": 1.5333064189995523e-05, "loss": 0.3553, "step": 6525 }, { "epoch": 1.28647476340694, "grad_norm": 0.46557296850936236, "learning_rate": 1.5331752924645448e-05, "loss": 0.3048, "step": 6526 }, { "epoch": 1.286671924290221, "grad_norm": 0.48584258034002087, "learning_rate": 1.5330441531195714e-05, "loss": 0.3127, "step": 6527 }, { "epoch": 1.2868690851735016, "grad_norm": 0.5002880962180022, "learning_rate": 1.532913000967783e-05, "loss": 0.3415, "step": 6528 }, { "epoch": 1.2870662460567823, "grad_norm": 0.5028923628602395, "learning_rate": 1.5327818360123307e-05, "loss": 0.3148, "step": 6529 }, { "epoch": 1.287263406940063, "grad_norm": 0.5195591290608911, "learning_rate": 1.532650658256366e-05, "loss": 0.3567, "step": 6530 }, { "epoch": 1.2874605678233437, "grad_norm": 0.5004401636204268, "learning_rate": 1.5325194677030396e-05, "loss": 0.3399, "step": 6531 }, { "epoch": 1.2876577287066246, "grad_norm": 0.4867335475638141, "learning_rate": 1.5323882643555045e-05, "loss": 0.3228, "step": 6532 }, { "epoch": 1.2878548895899053, "grad_norm": 0.501274068173939, "learning_rate": 1.5322570482169127e-05, "loss": 0.3034, "step": 6533 }, { "epoch": 1.288052050473186, "grad_norm": 0.502185959674175, "learning_rate": 1.5321258192904165e-05, "loss": 0.3134, "step": 6534 }, { "epoch": 1.288249211356467, "grad_norm": 0.51496969492382, "learning_rate": 1.531994577579169e-05, "loss": 0.3469, "step": 6535 }, { "epoch": 1.2884463722397477, "grad_norm": 0.4872788476858814, "learning_rate": 1.5318633230863237e-05, "loss": 0.3189, "step": 6536 }, { "epoch": 1.2886435331230284, "grad_norm": 0.5229951752622122, "learning_rate": 1.5317320558150336e-05, "loss": 0.3392, "step": 6537 }, { "epoch": 1.288840694006309, "grad_norm": 0.4772007251776008, "learning_rate": 1.5316007757684523e-05, "loss": 0.3089, "step": 6538 }, { "epoch": 1.2890378548895898, "grad_norm": 0.5003182201377973, "learning_rate": 1.5314694829497344e-05, "loss": 0.3164, "step": 6539 }, { "epoch": 1.2892350157728707, "grad_norm": 0.5065544170909857, "learning_rate": 1.5313381773620344e-05, "loss": 0.3202, "step": 6540 }, { "epoch": 1.2894321766561514, "grad_norm": 0.5077740483878833, "learning_rate": 1.5312068590085067e-05, "loss": 0.343, "step": 6541 }, { "epoch": 1.289629337539432, "grad_norm": 0.5577531098567273, "learning_rate": 1.5310755278923067e-05, "loss": 0.3386, "step": 6542 }, { "epoch": 1.289826498422713, "grad_norm": 10.547858415373208, "learning_rate": 1.530944184016589e-05, "loss": 0.3561, "step": 6543 }, { "epoch": 1.2900236593059937, "grad_norm": 0.5345161964355895, "learning_rate": 1.53081282738451e-05, "loss": 0.3251, "step": 6544 }, { "epoch": 1.2902208201892744, "grad_norm": 0.5104260494177608, "learning_rate": 1.5306814579992254e-05, "loss": 0.3355, "step": 6545 }, { "epoch": 1.2904179810725553, "grad_norm": 0.46675498748982025, "learning_rate": 1.530550075863891e-05, "loss": 0.3111, "step": 6546 }, { "epoch": 1.290615141955836, "grad_norm": 0.5293919854328544, "learning_rate": 1.5304186809816644e-05, "loss": 0.3247, "step": 6547 }, { "epoch": 1.2908123028391167, "grad_norm": 0.4883341570443513, "learning_rate": 1.5302872733557013e-05, "loss": 0.316, "step": 6548 }, { "epoch": 1.2910094637223974, "grad_norm": 0.5129224662462487, "learning_rate": 1.53015585298916e-05, "loss": 0.3536, "step": 6549 }, { "epoch": 1.2912066246056781, "grad_norm": 0.4954761184679624, "learning_rate": 1.5300244198851965e-05, "loss": 0.3269, "step": 6550 }, { "epoch": 1.291403785488959, "grad_norm": 0.5132713779944943, "learning_rate": 1.5298929740469707e-05, "loss": 0.34, "step": 6551 }, { "epoch": 1.2916009463722398, "grad_norm": 0.5025532639181867, "learning_rate": 1.5297615154776384e-05, "loss": 0.3253, "step": 6552 }, { "epoch": 1.2917981072555205, "grad_norm": 0.5058178309161923, "learning_rate": 1.5296300441803594e-05, "loss": 0.3464, "step": 6553 }, { "epoch": 1.2919952681388014, "grad_norm": 0.5248801782387191, "learning_rate": 1.5294985601582922e-05, "loss": 0.3535, "step": 6554 }, { "epoch": 1.292192429022082, "grad_norm": 0.4613729865477732, "learning_rate": 1.5293670634145955e-05, "loss": 0.2844, "step": 6555 }, { "epoch": 1.2923895899053628, "grad_norm": 0.548846735245273, "learning_rate": 1.529235553952429e-05, "loss": 0.3593, "step": 6556 }, { "epoch": 1.2925867507886435, "grad_norm": 0.48961397369557463, "learning_rate": 1.5291040317749522e-05, "loss": 0.3248, "step": 6557 }, { "epoch": 1.2927839116719242, "grad_norm": 0.5242798395592415, "learning_rate": 1.528972496885325e-05, "loss": 0.3569, "step": 6558 }, { "epoch": 1.2929810725552051, "grad_norm": 0.5183526269970178, "learning_rate": 1.5288409492867075e-05, "loss": 0.3333, "step": 6559 }, { "epoch": 1.2931782334384858, "grad_norm": 0.6706130182709, "learning_rate": 1.52870938898226e-05, "loss": 0.3381, "step": 6560 }, { "epoch": 1.2933753943217665, "grad_norm": 0.5081309223342054, "learning_rate": 1.528577815975144e-05, "loss": 0.3075, "step": 6561 }, { "epoch": 1.2935725552050474, "grad_norm": 0.5513296961101606, "learning_rate": 1.5284462302685203e-05, "loss": 0.3199, "step": 6562 }, { "epoch": 1.2937697160883281, "grad_norm": 0.49894267015353083, "learning_rate": 1.52831463186555e-05, "loss": 0.3275, "step": 6563 }, { "epoch": 1.2939668769716088, "grad_norm": 0.5327112678308222, "learning_rate": 1.5281830207693955e-05, "loss": 0.3446, "step": 6564 }, { "epoch": 1.2941640378548895, "grad_norm": 0.5119589336578062, "learning_rate": 1.5280513969832185e-05, "loss": 0.3238, "step": 6565 }, { "epoch": 1.2943611987381702, "grad_norm": 0.5181686124023328, "learning_rate": 1.5279197605101814e-05, "loss": 0.327, "step": 6566 }, { "epoch": 1.2945583596214512, "grad_norm": 0.5017388093095246, "learning_rate": 1.527788111353447e-05, "loss": 0.3306, "step": 6567 }, { "epoch": 1.2947555205047319, "grad_norm": 0.5001792806641484, "learning_rate": 1.5276564495161787e-05, "loss": 0.3197, "step": 6568 }, { "epoch": 1.2949526813880126, "grad_norm": 1.4540496805328393, "learning_rate": 1.5275247750015383e-05, "loss": 0.3723, "step": 6569 }, { "epoch": 1.2951498422712935, "grad_norm": 1.8774355679525034, "learning_rate": 1.5273930878126912e-05, "loss": 0.3558, "step": 6570 }, { "epoch": 1.2953470031545742, "grad_norm": 0.6034476548951655, "learning_rate": 1.5272613879528e-05, "loss": 0.3327, "step": 6571 }, { "epoch": 1.2955441640378549, "grad_norm": 0.6074067971559209, "learning_rate": 1.5271296754250296e-05, "loss": 0.3475, "step": 6572 }, { "epoch": 1.2957413249211356, "grad_norm": 2.176593999697865, "learning_rate": 1.526997950232544e-05, "loss": 0.388, "step": 6573 }, { "epoch": 1.2959384858044163, "grad_norm": 0.6536726906942081, "learning_rate": 1.5268662123785084e-05, "loss": 0.3288, "step": 6574 }, { "epoch": 1.2961356466876972, "grad_norm": 0.5254118788592566, "learning_rate": 1.5267344618660876e-05, "loss": 0.3427, "step": 6575 }, { "epoch": 1.296332807570978, "grad_norm": 0.504643420107229, "learning_rate": 1.526602698698447e-05, "loss": 0.3277, "step": 6576 }, { "epoch": 1.2965299684542586, "grad_norm": 0.5190602422795867, "learning_rate": 1.5264709228787534e-05, "loss": 0.3404, "step": 6577 }, { "epoch": 1.2967271293375395, "grad_norm": 0.525866200522297, "learning_rate": 1.5263391344101713e-05, "loss": 0.351, "step": 6578 }, { "epoch": 1.2969242902208202, "grad_norm": 0.5358394825870619, "learning_rate": 1.5262073332958677e-05, "loss": 0.3478, "step": 6579 }, { "epoch": 1.297121451104101, "grad_norm": 0.5301335549732624, "learning_rate": 1.526075519539009e-05, "loss": 0.3253, "step": 6580 }, { "epoch": 1.2973186119873816, "grad_norm": 0.5239831490523945, "learning_rate": 1.5259436931427624e-05, "loss": 0.3308, "step": 6581 }, { "epoch": 1.2975157728706623, "grad_norm": 0.5013826442278009, "learning_rate": 1.525811854110295e-05, "loss": 0.3492, "step": 6582 }, { "epoch": 1.2977129337539433, "grad_norm": 0.5146221951802675, "learning_rate": 1.5256800024447744e-05, "loss": 0.3288, "step": 6583 }, { "epoch": 1.297910094637224, "grad_norm": 0.5480449934066463, "learning_rate": 1.5255481381493686e-05, "loss": 0.3557, "step": 6584 }, { "epoch": 1.2981072555205047, "grad_norm": 0.5334935940934137, "learning_rate": 1.5254162612272451e-05, "loss": 0.3467, "step": 6585 }, { "epoch": 1.2983044164037856, "grad_norm": 0.5358329007175838, "learning_rate": 1.5252843716815733e-05, "loss": 0.3814, "step": 6586 }, { "epoch": 1.2985015772870663, "grad_norm": 0.49941905926425306, "learning_rate": 1.5251524695155214e-05, "loss": 0.3412, "step": 6587 }, { "epoch": 1.298698738170347, "grad_norm": 0.5883299960109943, "learning_rate": 1.525020554732258e-05, "loss": 0.3469, "step": 6588 }, { "epoch": 1.2988958990536277, "grad_norm": 0.4752126077673626, "learning_rate": 1.5248886273349537e-05, "loss": 0.3147, "step": 6589 }, { "epoch": 1.2990930599369084, "grad_norm": 0.5009771354117454, "learning_rate": 1.524756687326777e-05, "loss": 0.3461, "step": 6590 }, { "epoch": 1.2992902208201893, "grad_norm": 0.5678355964259746, "learning_rate": 1.5246247347108984e-05, "loss": 0.3774, "step": 6591 }, { "epoch": 1.29948738170347, "grad_norm": 0.4622894199811546, "learning_rate": 1.524492769490488e-05, "loss": 0.3359, "step": 6592 }, { "epoch": 1.2996845425867507, "grad_norm": 0.5169518693086831, "learning_rate": 1.5243607916687167e-05, "loss": 0.3513, "step": 6593 }, { "epoch": 1.2998817034700316, "grad_norm": 0.49140954893508976, "learning_rate": 1.524228801248755e-05, "loss": 0.3222, "step": 6594 }, { "epoch": 1.3000788643533123, "grad_norm": 0.5198162659011691, "learning_rate": 1.5240967982337738e-05, "loss": 0.3309, "step": 6595 }, { "epoch": 1.300276025236593, "grad_norm": 0.5186960022124844, "learning_rate": 1.5239647826269455e-05, "loss": 0.3305, "step": 6596 }, { "epoch": 1.300473186119874, "grad_norm": 0.5377070970944036, "learning_rate": 1.5238327544314409e-05, "loss": 0.3584, "step": 6597 }, { "epoch": 1.3006703470031546, "grad_norm": 0.5046133132932712, "learning_rate": 1.5237007136504329e-05, "loss": 0.3403, "step": 6598 }, { "epoch": 1.3008675078864353, "grad_norm": 0.5454800240840171, "learning_rate": 1.5235686602870932e-05, "loss": 0.377, "step": 6599 }, { "epoch": 1.301064668769716, "grad_norm": 0.5025344180773049, "learning_rate": 1.5234365943445953e-05, "loss": 0.3432, "step": 6600 }, { "epoch": 1.3012618296529967, "grad_norm": 0.5276225149052198, "learning_rate": 1.523304515826111e-05, "loss": 0.3527, "step": 6601 }, { "epoch": 1.3014589905362777, "grad_norm": 0.47724462165996756, "learning_rate": 1.5231724247348148e-05, "loss": 0.3126, "step": 6602 }, { "epoch": 1.3016561514195584, "grad_norm": 0.5567156598290902, "learning_rate": 1.5230403210738796e-05, "loss": 0.3633, "step": 6603 }, { "epoch": 1.301853312302839, "grad_norm": 0.48550253059271353, "learning_rate": 1.5229082048464796e-05, "loss": 0.3226, "step": 6604 }, { "epoch": 1.30205047318612, "grad_norm": 0.5343562201937075, "learning_rate": 1.5227760760557887e-05, "loss": 0.3421, "step": 6605 }, { "epoch": 1.3022476340694007, "grad_norm": 0.5152095426808443, "learning_rate": 1.522643934704982e-05, "loss": 0.3454, "step": 6606 }, { "epoch": 1.3024447949526814, "grad_norm": 0.5411901174169776, "learning_rate": 1.5225117807972334e-05, "loss": 0.3494, "step": 6607 }, { "epoch": 1.302641955835962, "grad_norm": 0.5079018703629262, "learning_rate": 1.5223796143357188e-05, "loss": 0.3364, "step": 6608 }, { "epoch": 1.3028391167192428, "grad_norm": 0.5422309368847433, "learning_rate": 1.522247435323613e-05, "loss": 0.332, "step": 6609 }, { "epoch": 1.3030362776025237, "grad_norm": 1.8926296586911842, "learning_rate": 1.5221152437640922e-05, "loss": 0.321, "step": 6610 }, { "epoch": 1.3032334384858044, "grad_norm": 0.5764351786305123, "learning_rate": 1.5219830396603321e-05, "loss": 0.3629, "step": 6611 }, { "epoch": 1.3034305993690851, "grad_norm": 0.5013264006168145, "learning_rate": 1.5218508230155093e-05, "loss": 0.3306, "step": 6612 }, { "epoch": 1.303627760252366, "grad_norm": 0.5367421957046887, "learning_rate": 1.5217185938328003e-05, "loss": 0.3283, "step": 6613 }, { "epoch": 1.3038249211356467, "grad_norm": 0.6051753693199979, "learning_rate": 1.5215863521153817e-05, "loss": 0.3365, "step": 6614 }, { "epoch": 1.3040220820189274, "grad_norm": 0.5121455196545641, "learning_rate": 1.521454097866431e-05, "loss": 0.3465, "step": 6615 }, { "epoch": 1.3042192429022081, "grad_norm": 0.5204250557148496, "learning_rate": 1.5213218310891256e-05, "loss": 0.343, "step": 6616 }, { "epoch": 1.3044164037854888, "grad_norm": 0.5312764234356772, "learning_rate": 1.5211895517866437e-05, "loss": 0.3386, "step": 6617 }, { "epoch": 1.3046135646687698, "grad_norm": 0.5071761949128407, "learning_rate": 1.5210572599621626e-05, "loss": 0.3328, "step": 6618 }, { "epoch": 1.3048107255520505, "grad_norm": 0.5433017323113483, "learning_rate": 1.5209249556188619e-05, "loss": 0.3632, "step": 6619 }, { "epoch": 1.3050078864353312, "grad_norm": 0.568002908855061, "learning_rate": 1.520792638759919e-05, "loss": 0.3341, "step": 6620 }, { "epoch": 1.305205047318612, "grad_norm": 0.4905457370725845, "learning_rate": 1.520660309388514e-05, "loss": 0.3361, "step": 6621 }, { "epoch": 1.3054022082018928, "grad_norm": 0.5619377123694016, "learning_rate": 1.5205279675078255e-05, "loss": 0.3574, "step": 6622 }, { "epoch": 1.3055993690851735, "grad_norm": 0.5451812237029258, "learning_rate": 1.5203956131210333e-05, "loss": 0.3632, "step": 6623 }, { "epoch": 1.3057965299684542, "grad_norm": 0.541134699754437, "learning_rate": 1.5202632462313178e-05, "loss": 0.35, "step": 6624 }, { "epoch": 1.305993690851735, "grad_norm": 0.5465872753368111, "learning_rate": 1.5201308668418588e-05, "loss": 0.3686, "step": 6625 }, { "epoch": 1.3061908517350158, "grad_norm": 0.48991274685024205, "learning_rate": 1.5199984749558367e-05, "loss": 0.3211, "step": 6626 }, { "epoch": 1.3063880126182965, "grad_norm": 0.4938660731028014, "learning_rate": 1.5198660705764326e-05, "loss": 0.3393, "step": 6627 }, { "epoch": 1.3065851735015772, "grad_norm": 0.5656182414746819, "learning_rate": 1.5197336537068275e-05, "loss": 0.3565, "step": 6628 }, { "epoch": 1.3067823343848581, "grad_norm": 0.48654324405735194, "learning_rate": 1.5196012243502027e-05, "loss": 0.3061, "step": 6629 }, { "epoch": 1.3069794952681388, "grad_norm": 0.5696072738061521, "learning_rate": 1.5194687825097401e-05, "loss": 0.3624, "step": 6630 }, { "epoch": 1.3071766561514195, "grad_norm": 0.4992301751223368, "learning_rate": 1.5193363281886217e-05, "loss": 0.3332, "step": 6631 }, { "epoch": 1.3073738170347002, "grad_norm": 0.47480549774075814, "learning_rate": 1.5192038613900297e-05, "loss": 0.33, "step": 6632 }, { "epoch": 1.307570977917981, "grad_norm": 0.621679280561631, "learning_rate": 1.519071382117147e-05, "loss": 0.3277, "step": 6633 }, { "epoch": 1.3077681388012619, "grad_norm": 0.5098237493871496, "learning_rate": 1.5189388903731562e-05, "loss": 0.347, "step": 6634 }, { "epoch": 1.3079652996845426, "grad_norm": 0.4863659684564988, "learning_rate": 1.5188063861612405e-05, "loss": 0.3305, "step": 6635 }, { "epoch": 1.3081624605678233, "grad_norm": 0.5124706812541354, "learning_rate": 1.518673869484584e-05, "loss": 0.3275, "step": 6636 }, { "epoch": 1.3083596214511042, "grad_norm": 0.5190003537835063, "learning_rate": 1.5185413403463698e-05, "loss": 0.3355, "step": 6637 }, { "epoch": 1.3085567823343849, "grad_norm": 1.1096517702373996, "learning_rate": 1.5184087987497824e-05, "loss": 0.3525, "step": 6638 }, { "epoch": 1.3087539432176656, "grad_norm": 0.531753230713042, "learning_rate": 1.5182762446980061e-05, "loss": 0.3487, "step": 6639 }, { "epoch": 1.3089511041009465, "grad_norm": 0.5226931181499541, "learning_rate": 1.5181436781942258e-05, "loss": 0.336, "step": 6640 }, { "epoch": 1.3091482649842272, "grad_norm": 0.49468338224537034, "learning_rate": 1.5180110992416262e-05, "loss": 0.3089, "step": 6641 }, { "epoch": 1.309345425867508, "grad_norm": 0.47660945364337853, "learning_rate": 1.5178785078433928e-05, "loss": 0.3207, "step": 6642 }, { "epoch": 1.3095425867507886, "grad_norm": 0.5172184894772541, "learning_rate": 1.5177459040027114e-05, "loss": 0.3366, "step": 6643 }, { "epoch": 1.3097397476340693, "grad_norm": 0.570608404247494, "learning_rate": 1.5176132877227674e-05, "loss": 0.3571, "step": 6644 }, { "epoch": 1.3099369085173502, "grad_norm": 0.5231899305828399, "learning_rate": 1.5174806590067475e-05, "loss": 0.3496, "step": 6645 }, { "epoch": 1.310134069400631, "grad_norm": 0.499514906187556, "learning_rate": 1.517348017857838e-05, "loss": 0.3125, "step": 6646 }, { "epoch": 1.3103312302839116, "grad_norm": 0.49103990204237496, "learning_rate": 1.517215364279226e-05, "loss": 0.3127, "step": 6647 }, { "epoch": 1.3105283911671926, "grad_norm": 0.47212331175692046, "learning_rate": 1.517082698274098e-05, "loss": 0.3166, "step": 6648 }, { "epoch": 1.3107255520504733, "grad_norm": 0.5661729386565737, "learning_rate": 1.5169500198456417e-05, "loss": 0.376, "step": 6649 }, { "epoch": 1.310922712933754, "grad_norm": 0.5266195177462345, "learning_rate": 1.5168173289970453e-05, "loss": 0.3619, "step": 6650 }, { "epoch": 1.3111198738170347, "grad_norm": 0.4968849869857472, "learning_rate": 1.5166846257314961e-05, "loss": 0.3527, "step": 6651 }, { "epoch": 1.3113170347003154, "grad_norm": 0.5036423318919169, "learning_rate": 1.5165519100521828e-05, "loss": 0.339, "step": 6652 }, { "epoch": 1.3115141955835963, "grad_norm": 0.8033202698895086, "learning_rate": 1.5164191819622937e-05, "loss": 0.3572, "step": 6653 }, { "epoch": 1.311711356466877, "grad_norm": 0.5025110332228436, "learning_rate": 1.516286441465018e-05, "loss": 0.335, "step": 6654 }, { "epoch": 1.3119085173501577, "grad_norm": 0.5067929824788047, "learning_rate": 1.5161536885635451e-05, "loss": 0.3255, "step": 6655 }, { "epoch": 1.3121056782334386, "grad_norm": 0.6200278707751725, "learning_rate": 1.5160209232610637e-05, "loss": 0.3556, "step": 6656 }, { "epoch": 1.3123028391167193, "grad_norm": 0.5316032164983252, "learning_rate": 1.5158881455607643e-05, "loss": 0.3387, "step": 6657 }, { "epoch": 1.3125, "grad_norm": 0.5213969466823882, "learning_rate": 1.5157553554658367e-05, "loss": 0.3442, "step": 6658 }, { "epoch": 1.3126971608832807, "grad_norm": 0.5401001295546158, "learning_rate": 1.5156225529794713e-05, "loss": 0.3456, "step": 6659 }, { "epoch": 1.3128943217665614, "grad_norm": 0.5668921110797275, "learning_rate": 1.5154897381048588e-05, "loss": 0.3473, "step": 6660 }, { "epoch": 1.3130914826498423, "grad_norm": 0.5275183994588266, "learning_rate": 1.5153569108451905e-05, "loss": 0.3245, "step": 6661 }, { "epoch": 1.313288643533123, "grad_norm": 0.5126597475316212, "learning_rate": 1.5152240712036573e-05, "loss": 0.3375, "step": 6662 }, { "epoch": 1.3134858044164037, "grad_norm": 0.49094805836830735, "learning_rate": 1.5150912191834504e-05, "loss": 0.3237, "step": 6663 }, { "epoch": 1.3136829652996846, "grad_norm": 0.5276645393537767, "learning_rate": 1.5149583547877629e-05, "loss": 0.3586, "step": 6664 }, { "epoch": 1.3138801261829653, "grad_norm": 0.5293512949651924, "learning_rate": 1.5148254780197856e-05, "loss": 0.3064, "step": 6665 }, { "epoch": 1.314077287066246, "grad_norm": 0.5109488720261187, "learning_rate": 1.514692588882712e-05, "loss": 0.3164, "step": 6666 }, { "epoch": 1.3142744479495267, "grad_norm": 0.49826567908107544, "learning_rate": 1.5145596873797342e-05, "loss": 0.3158, "step": 6667 }, { "epoch": 1.3144716088328074, "grad_norm": 0.5250546099243462, "learning_rate": 1.5144267735140459e-05, "loss": 0.3415, "step": 6668 }, { "epoch": 1.3146687697160884, "grad_norm": 0.4974244732092123, "learning_rate": 1.5142938472888395e-05, "loss": 0.3153, "step": 6669 }, { "epoch": 1.314865930599369, "grad_norm": 0.5030333271298699, "learning_rate": 1.5141609087073099e-05, "loss": 0.3212, "step": 6670 }, { "epoch": 1.3150630914826498, "grad_norm": 0.49516039944032675, "learning_rate": 1.51402795777265e-05, "loss": 0.3373, "step": 6671 }, { "epoch": 1.3152602523659307, "grad_norm": 0.5251022630682348, "learning_rate": 1.5138949944880547e-05, "loss": 0.3594, "step": 6672 }, { "epoch": 1.3154574132492114, "grad_norm": 0.5080937428606618, "learning_rate": 1.5137620188567183e-05, "loss": 0.2982, "step": 6673 }, { "epoch": 1.315654574132492, "grad_norm": 0.5762031753271423, "learning_rate": 1.5136290308818355e-05, "loss": 0.365, "step": 6674 }, { "epoch": 1.3158517350157728, "grad_norm": 0.492333953850378, "learning_rate": 1.5134960305666017e-05, "loss": 0.3197, "step": 6675 }, { "epoch": 1.3160488958990535, "grad_norm": 0.5353009146536675, "learning_rate": 1.5133630179142124e-05, "loss": 0.3206, "step": 6676 }, { "epoch": 1.3162460567823344, "grad_norm": 0.5193275884525621, "learning_rate": 1.5132299929278631e-05, "loss": 0.3569, "step": 6677 }, { "epoch": 1.3164432176656151, "grad_norm": 0.5170594306368025, "learning_rate": 1.5130969556107498e-05, "loss": 0.3609, "step": 6678 }, { "epoch": 1.3166403785488958, "grad_norm": 0.5376643454719803, "learning_rate": 1.512963905966069e-05, "loss": 0.332, "step": 6679 }, { "epoch": 1.3168375394321767, "grad_norm": 0.5133927845944496, "learning_rate": 1.5128308439970174e-05, "loss": 0.3372, "step": 6680 }, { "epoch": 1.3170347003154574, "grad_norm": 0.5399002043459604, "learning_rate": 1.5126977697067915e-05, "loss": 0.363, "step": 6681 }, { "epoch": 1.3172318611987381, "grad_norm": 0.49610768530525307, "learning_rate": 1.5125646830985892e-05, "loss": 0.3136, "step": 6682 }, { "epoch": 1.317429022082019, "grad_norm": 0.5473955010074304, "learning_rate": 1.5124315841756072e-05, "loss": 0.355, "step": 6683 }, { "epoch": 1.3176261829652998, "grad_norm": 0.578461631477163, "learning_rate": 1.5122984729410437e-05, "loss": 0.3719, "step": 6684 }, { "epoch": 1.3178233438485805, "grad_norm": 0.5463357652437992, "learning_rate": 1.5121653493980973e-05, "loss": 0.3432, "step": 6685 }, { "epoch": 1.3180205047318612, "grad_norm": 0.4772451928962519, "learning_rate": 1.5120322135499654e-05, "loss": 0.3071, "step": 6686 }, { "epoch": 1.3182176656151419, "grad_norm": 0.5767867603367968, "learning_rate": 1.511899065399848e-05, "loss": 0.3512, "step": 6687 }, { "epoch": 1.3184148264984228, "grad_norm": 0.5041565865933484, "learning_rate": 1.5117659049509425e-05, "loss": 0.3437, "step": 6688 }, { "epoch": 1.3186119873817035, "grad_norm": 0.5042883160416222, "learning_rate": 1.5116327322064497e-05, "loss": 0.3271, "step": 6689 }, { "epoch": 1.3188091482649842, "grad_norm": 0.4844327881363672, "learning_rate": 1.5114995471695679e-05, "loss": 0.2951, "step": 6690 }, { "epoch": 1.319006309148265, "grad_norm": 0.5160775384323053, "learning_rate": 1.5113663498434979e-05, "loss": 0.3308, "step": 6691 }, { "epoch": 1.3192034700315458, "grad_norm": 0.5182153228216186, "learning_rate": 1.5112331402314393e-05, "loss": 0.3311, "step": 6692 }, { "epoch": 1.3194006309148265, "grad_norm": 0.5509813748947842, "learning_rate": 1.5110999183365933e-05, "loss": 0.3603, "step": 6693 }, { "epoch": 1.3195977917981072, "grad_norm": 0.4984381584603614, "learning_rate": 1.5109666841621597e-05, "loss": 0.3316, "step": 6694 }, { "epoch": 1.319794952681388, "grad_norm": 0.5204935976874747, "learning_rate": 1.51083343771134e-05, "loss": 0.3161, "step": 6695 }, { "epoch": 1.3199921135646688, "grad_norm": 0.536865132939174, "learning_rate": 1.510700178987336e-05, "loss": 0.3332, "step": 6696 }, { "epoch": 1.3201892744479495, "grad_norm": 0.520088029730267, "learning_rate": 1.5105669079933486e-05, "loss": 0.336, "step": 6697 }, { "epoch": 1.3203864353312302, "grad_norm": 0.898006104693384, "learning_rate": 1.5104336247325803e-05, "loss": 0.3888, "step": 6698 }, { "epoch": 1.3205835962145112, "grad_norm": 0.49085502589272256, "learning_rate": 1.510300329208233e-05, "loss": 0.3267, "step": 6699 }, { "epoch": 1.3207807570977919, "grad_norm": 0.5621876865017514, "learning_rate": 1.5101670214235094e-05, "loss": 0.3831, "step": 6700 }, { "epoch": 1.3209779179810726, "grad_norm": 0.5512732326331895, "learning_rate": 1.5100337013816122e-05, "loss": 0.3478, "step": 6701 }, { "epoch": 1.3211750788643533, "grad_norm": 0.7789605507183492, "learning_rate": 1.5099003690857448e-05, "loss": 0.3209, "step": 6702 }, { "epoch": 1.321372239747634, "grad_norm": 0.5419622274274699, "learning_rate": 1.50976702453911e-05, "loss": 0.3443, "step": 6703 }, { "epoch": 1.3215694006309149, "grad_norm": 0.5100055997526229, "learning_rate": 1.5096336677449125e-05, "loss": 0.3405, "step": 6704 }, { "epoch": 1.3217665615141956, "grad_norm": 0.4976601575607139, "learning_rate": 1.5095002987063549e-05, "loss": 0.3209, "step": 6705 }, { "epoch": 1.3219637223974763, "grad_norm": 0.5566156732462322, "learning_rate": 1.509366917426643e-05, "loss": 0.3416, "step": 6706 }, { "epoch": 1.3221608832807572, "grad_norm": 0.5069545882513634, "learning_rate": 1.5092335239089803e-05, "loss": 0.3394, "step": 6707 }, { "epoch": 1.322358044164038, "grad_norm": 0.5033024334874232, "learning_rate": 1.5091001181565725e-05, "loss": 0.3424, "step": 6708 }, { "epoch": 1.3225552050473186, "grad_norm": 0.49579326452908207, "learning_rate": 1.5089667001726243e-05, "loss": 0.3147, "step": 6709 }, { "epoch": 1.3227523659305993, "grad_norm": 0.5271145089705676, "learning_rate": 1.5088332699603412e-05, "loss": 0.3593, "step": 6710 }, { "epoch": 1.32294952681388, "grad_norm": 0.5320580774064494, "learning_rate": 1.508699827522929e-05, "loss": 0.3364, "step": 6711 }, { "epoch": 1.323146687697161, "grad_norm": 0.5104256370446169, "learning_rate": 1.5085663728635935e-05, "loss": 0.333, "step": 6712 }, { "epoch": 1.3233438485804416, "grad_norm": 0.49891079466541427, "learning_rate": 1.5084329059855419e-05, "loss": 0.3334, "step": 6713 }, { "epoch": 1.3235410094637223, "grad_norm": 0.5733949567223368, "learning_rate": 1.5082994268919798e-05, "loss": 0.3157, "step": 6714 }, { "epoch": 1.3237381703470033, "grad_norm": 0.5435038605202684, "learning_rate": 1.508165935586115e-05, "loss": 0.3233, "step": 6715 }, { "epoch": 1.323935331230284, "grad_norm": 0.5673725835719423, "learning_rate": 1.5080324320711542e-05, "loss": 0.3399, "step": 6716 }, { "epoch": 1.3241324921135647, "grad_norm": 0.46434225808628016, "learning_rate": 1.507898916350305e-05, "loss": 0.315, "step": 6717 }, { "epoch": 1.3243296529968454, "grad_norm": 0.5335227211463376, "learning_rate": 1.5077653884267753e-05, "loss": 0.3509, "step": 6718 }, { "epoch": 1.324526813880126, "grad_norm": 0.5194316243511954, "learning_rate": 1.5076318483037736e-05, "loss": 0.3524, "step": 6719 }, { "epoch": 1.324723974763407, "grad_norm": 0.5237285016193873, "learning_rate": 1.5074982959845077e-05, "loss": 0.3842, "step": 6720 }, { "epoch": 1.3249211356466877, "grad_norm": 0.512608848459545, "learning_rate": 1.5073647314721867e-05, "loss": 0.3554, "step": 6721 }, { "epoch": 1.3251182965299684, "grad_norm": 22.837779278990155, "learning_rate": 1.5072311547700194e-05, "loss": 0.3395, "step": 6722 }, { "epoch": 1.3253154574132493, "grad_norm": 0.5584335319441984, "learning_rate": 1.507097565881215e-05, "loss": 0.3746, "step": 6723 }, { "epoch": 1.32551261829653, "grad_norm": 0.4725105964351633, "learning_rate": 1.5069639648089833e-05, "loss": 0.3079, "step": 6724 }, { "epoch": 1.3257097791798107, "grad_norm": 0.48649112789508087, "learning_rate": 1.506830351556534e-05, "loss": 0.3243, "step": 6725 }, { "epoch": 1.3259069400630916, "grad_norm": 0.5047703578036561, "learning_rate": 1.5066967261270775e-05, "loss": 0.352, "step": 6726 }, { "epoch": 1.3261041009463723, "grad_norm": 0.5240622470462101, "learning_rate": 1.506563088523824e-05, "loss": 0.3676, "step": 6727 }, { "epoch": 1.326301261829653, "grad_norm": 0.5168279521689652, "learning_rate": 1.5064294387499844e-05, "loss": 0.3542, "step": 6728 }, { "epoch": 1.3264984227129337, "grad_norm": 0.4816304700981381, "learning_rate": 1.5062957768087698e-05, "loss": 0.3053, "step": 6729 }, { "epoch": 1.3266955835962144, "grad_norm": 0.606385343673032, "learning_rate": 1.5061621027033914e-05, "loss": 0.3315, "step": 6730 }, { "epoch": 1.3268927444794953, "grad_norm": 0.49813137918087436, "learning_rate": 1.5060284164370606e-05, "loss": 0.3501, "step": 6731 }, { "epoch": 1.327089905362776, "grad_norm": 0.5003849761342135, "learning_rate": 1.5058947180129902e-05, "loss": 0.3308, "step": 6732 }, { "epoch": 1.3272870662460567, "grad_norm": 0.5092842617270525, "learning_rate": 1.5057610074343911e-05, "loss": 0.3215, "step": 6733 }, { "epoch": 1.3274842271293377, "grad_norm": 0.5426283767618892, "learning_rate": 1.505627284704477e-05, "loss": 0.3455, "step": 6734 }, { "epoch": 1.3276813880126184, "grad_norm": 0.5093218204427572, "learning_rate": 1.50549354982646e-05, "loss": 0.341, "step": 6735 }, { "epoch": 1.327878548895899, "grad_norm": 0.47254370160551584, "learning_rate": 1.5053598028035534e-05, "loss": 0.3253, "step": 6736 }, { "epoch": 1.3280757097791798, "grad_norm": 0.5063195573320013, "learning_rate": 1.5052260436389708e-05, "loss": 0.3158, "step": 6737 }, { "epoch": 1.3282728706624605, "grad_norm": 0.5253016137313395, "learning_rate": 1.5050922723359254e-05, "loss": 0.3387, "step": 6738 }, { "epoch": 1.3284700315457414, "grad_norm": 0.4894300336691798, "learning_rate": 1.5049584888976311e-05, "loss": 0.3385, "step": 6739 }, { "epoch": 1.328667192429022, "grad_norm": 0.5590119104561938, "learning_rate": 1.504824693327303e-05, "loss": 0.3397, "step": 6740 }, { "epoch": 1.3288643533123028, "grad_norm": 0.5350527773958544, "learning_rate": 1.504690885628155e-05, "loss": 0.3326, "step": 6741 }, { "epoch": 1.3290615141955837, "grad_norm": 0.5138921218572929, "learning_rate": 1.5045570658034022e-05, "loss": 0.3447, "step": 6742 }, { "epoch": 1.3292586750788644, "grad_norm": 1.730892234036529, "learning_rate": 1.504423233856259e-05, "loss": 0.373, "step": 6743 }, { "epoch": 1.3294558359621451, "grad_norm": 0.5491132539355397, "learning_rate": 1.5042893897899417e-05, "loss": 0.3453, "step": 6744 }, { "epoch": 1.3296529968454258, "grad_norm": 0.539322312311082, "learning_rate": 1.5041555336076661e-05, "loss": 0.3468, "step": 6745 }, { "epoch": 1.3298501577287065, "grad_norm": 0.5125742645152108, "learning_rate": 1.5040216653126471e-05, "loss": 0.3266, "step": 6746 }, { "epoch": 1.3300473186119874, "grad_norm": 0.5010768816584499, "learning_rate": 1.5038877849081023e-05, "loss": 0.325, "step": 6747 }, { "epoch": 1.3302444794952681, "grad_norm": 0.5237464923768348, "learning_rate": 1.5037538923972474e-05, "loss": 0.3392, "step": 6748 }, { "epoch": 1.3304416403785488, "grad_norm": 0.5259667674407368, "learning_rate": 1.5036199877832997e-05, "loss": 0.3405, "step": 6749 }, { "epoch": 1.3306388012618298, "grad_norm": 0.5653193705799677, "learning_rate": 1.503486071069476e-05, "loss": 0.3687, "step": 6750 }, { "epoch": 1.3308359621451105, "grad_norm": 0.5180820594611166, "learning_rate": 1.5033521422589943e-05, "loss": 0.3785, "step": 6751 }, { "epoch": 1.3310331230283912, "grad_norm": 0.5264455832857787, "learning_rate": 1.5032182013550719e-05, "loss": 0.349, "step": 6752 }, { "epoch": 1.3312302839116719, "grad_norm": 0.5131681170313048, "learning_rate": 1.5030842483609268e-05, "loss": 0.3309, "step": 6753 }, { "epoch": 1.3314274447949526, "grad_norm": 0.5006781906744829, "learning_rate": 1.5029502832797775e-05, "loss": 0.3106, "step": 6754 }, { "epoch": 1.3316246056782335, "grad_norm": 0.4732747904769726, "learning_rate": 1.5028163061148432e-05, "loss": 0.3355, "step": 6755 }, { "epoch": 1.3318217665615142, "grad_norm": 0.4995838593733347, "learning_rate": 1.5026823168693414e-05, "loss": 0.3322, "step": 6756 }, { "epoch": 1.3320189274447949, "grad_norm": 0.478174941952485, "learning_rate": 1.5025483155464926e-05, "loss": 0.3073, "step": 6757 }, { "epoch": 1.3322160883280758, "grad_norm": 0.484122961464807, "learning_rate": 1.5024143021495157e-05, "loss": 0.3413, "step": 6758 }, { "epoch": 1.3324132492113565, "grad_norm": 0.5951585544576415, "learning_rate": 1.5022802766816306e-05, "loss": 0.3635, "step": 6759 }, { "epoch": 1.3326104100946372, "grad_norm": 0.5340216464990926, "learning_rate": 1.5021462391460576e-05, "loss": 0.3552, "step": 6760 }, { "epoch": 1.332807570977918, "grad_norm": 0.5645843694433802, "learning_rate": 1.5020121895460165e-05, "loss": 0.3593, "step": 6761 }, { "epoch": 1.3330047318611986, "grad_norm": 0.5728974673232404, "learning_rate": 1.5018781278847286e-05, "loss": 0.3529, "step": 6762 }, { "epoch": 1.3332018927444795, "grad_norm": 0.5256692890594539, "learning_rate": 1.501744054165414e-05, "loss": 0.3387, "step": 6763 }, { "epoch": 1.3333990536277602, "grad_norm": 0.5674819000439734, "learning_rate": 1.501609968391295e-05, "loss": 0.3509, "step": 6764 }, { "epoch": 1.333596214511041, "grad_norm": 0.4986446571718649, "learning_rate": 1.5014758705655922e-05, "loss": 0.3239, "step": 6765 }, { "epoch": 1.3337933753943219, "grad_norm": 0.4896630219969831, "learning_rate": 1.5013417606915279e-05, "loss": 0.3204, "step": 6766 }, { "epoch": 1.3339905362776026, "grad_norm": 0.5177912836591282, "learning_rate": 1.501207638772324e-05, "loss": 0.3386, "step": 6767 }, { "epoch": 1.3341876971608833, "grad_norm": 0.48153562676597106, "learning_rate": 1.5010735048112031e-05, "loss": 0.3056, "step": 6768 }, { "epoch": 1.334384858044164, "grad_norm": 0.499695810551327, "learning_rate": 1.5009393588113876e-05, "loss": 0.2962, "step": 6769 }, { "epoch": 1.3345820189274447, "grad_norm": 0.48456873034066944, "learning_rate": 1.5008052007761009e-05, "loss": 0.3056, "step": 6770 }, { "epoch": 1.3347791798107256, "grad_norm": 0.5138152037859967, "learning_rate": 1.5006710307085656e-05, "loss": 0.3473, "step": 6771 }, { "epoch": 1.3349763406940063, "grad_norm": 0.4989689701123164, "learning_rate": 1.5005368486120058e-05, "loss": 0.3527, "step": 6772 }, { "epoch": 1.335173501577287, "grad_norm": 0.5240843932016397, "learning_rate": 1.5004026544896448e-05, "loss": 0.3289, "step": 6773 }, { "epoch": 1.335370662460568, "grad_norm": 0.51061194084439, "learning_rate": 1.5002684483447074e-05, "loss": 0.3582, "step": 6774 }, { "epoch": 1.3355678233438486, "grad_norm": 0.5005446884388874, "learning_rate": 1.5001342301804176e-05, "loss": 0.3458, "step": 6775 }, { "epoch": 1.3357649842271293, "grad_norm": 0.5140258358304517, "learning_rate": 1.5000000000000002e-05, "loss": 0.3606, "step": 6776 }, { "epoch": 1.3359621451104102, "grad_norm": 0.49447960825964604, "learning_rate": 1.49986575780668e-05, "loss": 0.3337, "step": 6777 }, { "epoch": 1.336159305993691, "grad_norm": 0.4734352743858005, "learning_rate": 1.4997315036036826e-05, "loss": 0.3288, "step": 6778 }, { "epoch": 1.3363564668769716, "grad_norm": 0.5153787348270493, "learning_rate": 1.4995972373942334e-05, "loss": 0.3512, "step": 6779 }, { "epoch": 1.3365536277602523, "grad_norm": 0.5614500712850554, "learning_rate": 1.4994629591815579e-05, "loss": 0.3581, "step": 6780 }, { "epoch": 1.336750788643533, "grad_norm": 0.5086243455682851, "learning_rate": 1.4993286689688831e-05, "loss": 0.3368, "step": 6781 }, { "epoch": 1.336947949526814, "grad_norm": 0.5109131318818986, "learning_rate": 1.4991943667594344e-05, "loss": 0.3473, "step": 6782 }, { "epoch": 1.3371451104100947, "grad_norm": 0.48373369562864504, "learning_rate": 1.4990600525564394e-05, "loss": 0.3171, "step": 6783 }, { "epoch": 1.3373422712933754, "grad_norm": 0.48077824595792773, "learning_rate": 1.4989257263631246e-05, "loss": 0.3112, "step": 6784 }, { "epoch": 1.3375394321766563, "grad_norm": 0.5403123400327008, "learning_rate": 1.4987913881827177e-05, "loss": 0.3509, "step": 6785 }, { "epoch": 1.337736593059937, "grad_norm": 0.5283987542462503, "learning_rate": 1.4986570380184454e-05, "loss": 0.3369, "step": 6786 }, { "epoch": 1.3379337539432177, "grad_norm": 0.5135808090920458, "learning_rate": 1.4985226758735368e-05, "loss": 0.3436, "step": 6787 }, { "epoch": 1.3381309148264984, "grad_norm": 0.5241808952900016, "learning_rate": 1.498388301751219e-05, "loss": 0.329, "step": 6788 }, { "epoch": 1.338328075709779, "grad_norm": 0.539221498149948, "learning_rate": 1.4982539156547214e-05, "loss": 0.3565, "step": 6789 }, { "epoch": 1.33852523659306, "grad_norm": 0.5312950545419894, "learning_rate": 1.4981195175872718e-05, "loss": 0.3392, "step": 6790 }, { "epoch": 1.3387223974763407, "grad_norm": 0.5526638195715327, "learning_rate": 1.4979851075521e-05, "loss": 0.3518, "step": 6791 }, { "epoch": 1.3389195583596214, "grad_norm": 0.5416794319715723, "learning_rate": 1.4978506855524348e-05, "loss": 0.3632, "step": 6792 }, { "epoch": 1.3391167192429023, "grad_norm": 0.5235504870365736, "learning_rate": 1.497716251591506e-05, "loss": 0.316, "step": 6793 }, { "epoch": 1.339313880126183, "grad_norm": 0.5043048732799276, "learning_rate": 1.4975818056725433e-05, "loss": 0.3434, "step": 6794 }, { "epoch": 1.3395110410094637, "grad_norm": 0.49798925055448356, "learning_rate": 1.4974473477987771e-05, "loss": 0.3184, "step": 6795 }, { "epoch": 1.3397082018927444, "grad_norm": 0.4979629803712915, "learning_rate": 1.4973128779734381e-05, "loss": 0.321, "step": 6796 }, { "epoch": 1.3399053627760251, "grad_norm": 0.5040873484662636, "learning_rate": 1.4971783961997561e-05, "loss": 0.3426, "step": 6797 }, { "epoch": 1.340102523659306, "grad_norm": 0.4974631337369193, "learning_rate": 1.4970439024809634e-05, "loss": 0.3271, "step": 6798 }, { "epoch": 1.3402996845425867, "grad_norm": 0.5323248192377603, "learning_rate": 1.49690939682029e-05, "loss": 0.3537, "step": 6799 }, { "epoch": 1.3404968454258674, "grad_norm": 0.48171766055599236, "learning_rate": 1.4967748792209689e-05, "loss": 0.3173, "step": 6800 }, { "epoch": 1.3406940063091484, "grad_norm": 0.510686613417193, "learning_rate": 1.4966403496862304e-05, "loss": 0.3195, "step": 6801 }, { "epoch": 1.340891167192429, "grad_norm": 0.48429649890064824, "learning_rate": 1.4965058082193084e-05, "loss": 0.3141, "step": 6802 }, { "epoch": 1.3410883280757098, "grad_norm": 0.49594807930521567, "learning_rate": 1.496371254823434e-05, "loss": 0.321, "step": 6803 }, { "epoch": 1.3412854889589905, "grad_norm": 0.526673612495566, "learning_rate": 1.496236689501841e-05, "loss": 0.3415, "step": 6804 }, { "epoch": 1.3414826498422712, "grad_norm": 0.45404096769055313, "learning_rate": 1.4961021122577613e-05, "loss": 0.2986, "step": 6805 }, { "epoch": 1.341679810725552, "grad_norm": 0.5806061745177113, "learning_rate": 1.495967523094429e-05, "loss": 0.3928, "step": 6806 }, { "epoch": 1.3418769716088328, "grad_norm": 0.470337630001986, "learning_rate": 1.4958329220150778e-05, "loss": 0.3148, "step": 6807 }, { "epoch": 1.3420741324921135, "grad_norm": 0.47772699425478293, "learning_rate": 1.4956983090229413e-05, "loss": 0.3203, "step": 6808 }, { "epoch": 1.3422712933753944, "grad_norm": 0.4886302060088961, "learning_rate": 1.4955636841212538e-05, "loss": 0.3518, "step": 6809 }, { "epoch": 1.3424684542586751, "grad_norm": 0.48102229230186133, "learning_rate": 1.4954290473132495e-05, "loss": 0.3198, "step": 6810 }, { "epoch": 1.3426656151419558, "grad_norm": 0.48930648987210024, "learning_rate": 1.4952943986021633e-05, "loss": 0.3305, "step": 6811 }, { "epoch": 1.3428627760252365, "grad_norm": 0.540876771967004, "learning_rate": 1.4951597379912306e-05, "loss": 0.3504, "step": 6812 }, { "epoch": 1.3430599369085172, "grad_norm": 0.5164837203011601, "learning_rate": 1.4950250654836862e-05, "loss": 0.3657, "step": 6813 }, { "epoch": 1.3432570977917981, "grad_norm": 0.4476636874195238, "learning_rate": 1.4948903810827662e-05, "loss": 0.295, "step": 6814 }, { "epoch": 1.3434542586750788, "grad_norm": 0.5015700179009981, "learning_rate": 1.4947556847917062e-05, "loss": 0.3495, "step": 6815 }, { "epoch": 1.3436514195583595, "grad_norm": 0.4883735796558397, "learning_rate": 1.4946209766137422e-05, "loss": 0.3262, "step": 6816 }, { "epoch": 1.3438485804416405, "grad_norm": 0.4960996354859811, "learning_rate": 1.4944862565521113e-05, "loss": 0.3266, "step": 6817 }, { "epoch": 1.3440457413249212, "grad_norm": 0.5111574948370762, "learning_rate": 1.4943515246100498e-05, "loss": 0.3534, "step": 6818 }, { "epoch": 1.3442429022082019, "grad_norm": 0.4806601835647626, "learning_rate": 1.4942167807907945e-05, "loss": 0.3156, "step": 6819 }, { "epoch": 1.3444400630914828, "grad_norm": 0.4771804770811557, "learning_rate": 1.494082025097583e-05, "loss": 0.3411, "step": 6820 }, { "epoch": 1.3446372239747635, "grad_norm": 0.5010047370612712, "learning_rate": 1.4939472575336535e-05, "loss": 0.3332, "step": 6821 }, { "epoch": 1.3448343848580442, "grad_norm": 0.5128977303636474, "learning_rate": 1.4938124781022429e-05, "loss": 0.3502, "step": 6822 }, { "epoch": 1.3450315457413249, "grad_norm": 0.5115752389432223, "learning_rate": 1.4936776868065904e-05, "loss": 0.3251, "step": 6823 }, { "epoch": 1.3452287066246056, "grad_norm": 0.5115575705230232, "learning_rate": 1.4935428836499333e-05, "loss": 0.3251, "step": 6824 }, { "epoch": 1.3454258675078865, "grad_norm": 0.45496251393031, "learning_rate": 1.4934080686355112e-05, "loss": 0.3212, "step": 6825 }, { "epoch": 1.3456230283911672, "grad_norm": 0.4683704737332005, "learning_rate": 1.4932732417665627e-05, "loss": 0.314, "step": 6826 }, { "epoch": 1.345820189274448, "grad_norm": 0.5058243078911916, "learning_rate": 1.4931384030463276e-05, "loss": 0.3547, "step": 6827 }, { "epoch": 1.3460173501577288, "grad_norm": 0.49135394576091385, "learning_rate": 1.4930035524780455e-05, "loss": 0.3351, "step": 6828 }, { "epoch": 1.3462145110410095, "grad_norm": 0.49176472785254427, "learning_rate": 1.4928686900649557e-05, "loss": 0.3467, "step": 6829 }, { "epoch": 1.3464116719242902, "grad_norm": 0.4877950490063829, "learning_rate": 1.4927338158102988e-05, "loss": 0.32, "step": 6830 }, { "epoch": 1.346608832807571, "grad_norm": 0.5644001631439873, "learning_rate": 1.4925989297173148e-05, "loss": 0.3492, "step": 6831 }, { "epoch": 1.3468059936908516, "grad_norm": 0.5182639423714361, "learning_rate": 1.4924640317892457e-05, "loss": 0.3616, "step": 6832 }, { "epoch": 1.3470031545741326, "grad_norm": 0.4911569508385157, "learning_rate": 1.4923291220293307e-05, "loss": 0.3187, "step": 6833 }, { "epoch": 1.3472003154574133, "grad_norm": 0.4939068383078795, "learning_rate": 1.4921942004408126e-05, "loss": 0.3352, "step": 6834 }, { "epoch": 1.347397476340694, "grad_norm": 0.48064147010076375, "learning_rate": 1.4920592670269323e-05, "loss": 0.3334, "step": 6835 }, { "epoch": 1.3475946372239749, "grad_norm": 0.5150465597535828, "learning_rate": 1.4919243217909318e-05, "loss": 0.3525, "step": 6836 }, { "epoch": 1.3477917981072556, "grad_norm": 0.48198241690638255, "learning_rate": 1.4917893647360538e-05, "loss": 0.3215, "step": 6837 }, { "epoch": 1.3479889589905363, "grad_norm": 0.49192236847221166, "learning_rate": 1.4916543958655396e-05, "loss": 0.3446, "step": 6838 }, { "epoch": 1.348186119873817, "grad_norm": 0.5380105418206355, "learning_rate": 1.491519415182633e-05, "loss": 0.3455, "step": 6839 }, { "epoch": 1.3483832807570977, "grad_norm": 0.5360035265768491, "learning_rate": 1.4913844226905767e-05, "loss": 0.3578, "step": 6840 }, { "epoch": 1.3485804416403786, "grad_norm": 0.6845647174250372, "learning_rate": 1.4912494183926139e-05, "loss": 0.3396, "step": 6841 }, { "epoch": 1.3487776025236593, "grad_norm": 0.5269738365614332, "learning_rate": 1.4911144022919879e-05, "loss": 0.3559, "step": 6842 }, { "epoch": 1.34897476340694, "grad_norm": 0.4813494392754745, "learning_rate": 1.4909793743919432e-05, "loss": 0.3228, "step": 6843 }, { "epoch": 1.349171924290221, "grad_norm": 0.4816948637640496, "learning_rate": 1.4908443346957235e-05, "loss": 0.3154, "step": 6844 }, { "epoch": 1.3493690851735016, "grad_norm": 0.45620872496423787, "learning_rate": 1.4907092832065734e-05, "loss": 0.3105, "step": 6845 }, { "epoch": 1.3495662460567823, "grad_norm": 0.5146288021545241, "learning_rate": 1.4905742199277376e-05, "loss": 0.3563, "step": 6846 }, { "epoch": 1.349763406940063, "grad_norm": 0.508730043660222, "learning_rate": 1.4904391448624612e-05, "loss": 0.3473, "step": 6847 }, { "epoch": 1.3499605678233437, "grad_norm": 0.5656947915688717, "learning_rate": 1.4903040580139891e-05, "loss": 0.3603, "step": 6848 }, { "epoch": 1.3501577287066246, "grad_norm": 0.48859833009143055, "learning_rate": 1.4901689593855677e-05, "loss": 0.3243, "step": 6849 }, { "epoch": 1.3503548895899053, "grad_norm": 0.5710007684390376, "learning_rate": 1.4900338489804418e-05, "loss": 0.3194, "step": 6850 }, { "epoch": 1.350552050473186, "grad_norm": 0.5249828286967833, "learning_rate": 1.4898987268018586e-05, "loss": 0.3633, "step": 6851 }, { "epoch": 1.350749211356467, "grad_norm": 0.5414339406348158, "learning_rate": 1.4897635928530634e-05, "loss": 0.3512, "step": 6852 }, { "epoch": 1.3509463722397477, "grad_norm": 0.44873869097621316, "learning_rate": 1.4896284471373038e-05, "loss": 0.3052, "step": 6853 }, { "epoch": 1.3511435331230284, "grad_norm": 0.4672888541955871, "learning_rate": 1.4894932896578262e-05, "loss": 0.3198, "step": 6854 }, { "epoch": 1.351340694006309, "grad_norm": 0.5477287513630034, "learning_rate": 1.4893581204178785e-05, "loss": 0.3337, "step": 6855 }, { "epoch": 1.3515378548895898, "grad_norm": 0.47013217105100197, "learning_rate": 1.4892229394207076e-05, "loss": 0.3484, "step": 6856 }, { "epoch": 1.3517350157728707, "grad_norm": 0.47570179900580484, "learning_rate": 1.4890877466695617e-05, "loss": 0.3142, "step": 6857 }, { "epoch": 1.3519321766561514, "grad_norm": 0.5040215090763913, "learning_rate": 1.488952542167689e-05, "loss": 0.3213, "step": 6858 }, { "epoch": 1.352129337539432, "grad_norm": 0.5374781827924892, "learning_rate": 1.4888173259183375e-05, "loss": 0.3559, "step": 6859 }, { "epoch": 1.352326498422713, "grad_norm": 0.4893306035975573, "learning_rate": 1.4886820979247561e-05, "loss": 0.3199, "step": 6860 }, { "epoch": 1.3525236593059937, "grad_norm": 0.47307143181191913, "learning_rate": 1.4885468581901939e-05, "loss": 0.3081, "step": 6861 }, { "epoch": 1.3527208201892744, "grad_norm": 0.49226139486267406, "learning_rate": 1.4884116067178997e-05, "loss": 0.3359, "step": 6862 }, { "epoch": 1.3529179810725553, "grad_norm": 2.9054703966108226, "learning_rate": 1.4882763435111236e-05, "loss": 0.3751, "step": 6863 }, { "epoch": 1.353115141955836, "grad_norm": 0.5783559792239383, "learning_rate": 1.4881410685731152e-05, "loss": 0.341, "step": 6864 }, { "epoch": 1.3533123028391167, "grad_norm": 0.5284375900313468, "learning_rate": 1.4880057819071244e-05, "loss": 0.3455, "step": 6865 }, { "epoch": 1.3535094637223974, "grad_norm": 0.47608340152473533, "learning_rate": 1.4878704835164018e-05, "loss": 0.2983, "step": 6866 }, { "epoch": 1.3537066246056781, "grad_norm": 0.506845850224768, "learning_rate": 1.4877351734041976e-05, "loss": 0.3592, "step": 6867 }, { "epoch": 1.353903785488959, "grad_norm": 0.4935101468881354, "learning_rate": 1.4875998515737635e-05, "loss": 0.3066, "step": 6868 }, { "epoch": 1.3541009463722398, "grad_norm": 0.5059246656166723, "learning_rate": 1.48746451802835e-05, "loss": 0.3418, "step": 6869 }, { "epoch": 1.3542981072555205, "grad_norm": 0.46928991796641456, "learning_rate": 1.4873291727712094e-05, "loss": 0.3037, "step": 6870 }, { "epoch": 1.3544952681388014, "grad_norm": 0.5073410603661683, "learning_rate": 1.4871938158055926e-05, "loss": 0.3429, "step": 6871 }, { "epoch": 1.354692429022082, "grad_norm": 0.5676528958205695, "learning_rate": 1.487058447134752e-05, "loss": 0.386, "step": 6872 }, { "epoch": 1.3548895899053628, "grad_norm": 0.5449613542540548, "learning_rate": 1.4869230667619399e-05, "loss": 0.3391, "step": 6873 }, { "epoch": 1.3550867507886435, "grad_norm": 0.5309796727926993, "learning_rate": 1.4867876746904093e-05, "loss": 0.3191, "step": 6874 }, { "epoch": 1.3552839116719242, "grad_norm": 0.48139917834477924, "learning_rate": 1.4866522709234125e-05, "loss": 0.3207, "step": 6875 }, { "epoch": 1.3554810725552051, "grad_norm": 0.4741924876595785, "learning_rate": 1.4865168554642033e-05, "loss": 0.3279, "step": 6876 }, { "epoch": 1.3556782334384858, "grad_norm": 0.48900065703542145, "learning_rate": 1.4863814283160348e-05, "loss": 0.3399, "step": 6877 }, { "epoch": 1.3558753943217665, "grad_norm": 0.5735243814672437, "learning_rate": 1.4862459894821606e-05, "loss": 0.3556, "step": 6878 }, { "epoch": 1.3560725552050474, "grad_norm": 0.5398399252571137, "learning_rate": 1.486110538965835e-05, "loss": 0.384, "step": 6879 }, { "epoch": 1.3562697160883281, "grad_norm": 0.5117636275917532, "learning_rate": 1.4859750767703122e-05, "loss": 0.3438, "step": 6880 }, { "epoch": 1.3564668769716088, "grad_norm": 0.5051003334328982, "learning_rate": 1.4858396028988472e-05, "loss": 0.3428, "step": 6881 }, { "epoch": 1.3566640378548895, "grad_norm": 0.5441154566116012, "learning_rate": 1.4857041173546941e-05, "loss": 0.3412, "step": 6882 }, { "epoch": 1.3568611987381702, "grad_norm": 0.4798420407498326, "learning_rate": 1.4855686201411086e-05, "loss": 0.3448, "step": 6883 }, { "epoch": 1.3570583596214512, "grad_norm": 0.5141849025940272, "learning_rate": 1.485433111261346e-05, "loss": 0.3407, "step": 6884 }, { "epoch": 1.3572555205047319, "grad_norm": 0.534717445086196, "learning_rate": 1.4852975907186618e-05, "loss": 0.3731, "step": 6885 }, { "epoch": 1.3574526813880126, "grad_norm": 0.5435535748807395, "learning_rate": 1.4851620585163123e-05, "loss": 0.3712, "step": 6886 }, { "epoch": 1.3576498422712935, "grad_norm": 0.4963966514211321, "learning_rate": 1.4850265146575535e-05, "loss": 0.3203, "step": 6887 }, { "epoch": 1.3578470031545742, "grad_norm": 0.5097541899694029, "learning_rate": 1.4848909591456421e-05, "loss": 0.3511, "step": 6888 }, { "epoch": 1.3580441640378549, "grad_norm": 0.5022870460738562, "learning_rate": 1.4847553919838353e-05, "loss": 0.3459, "step": 6889 }, { "epoch": 1.3582413249211356, "grad_norm": 0.5010019389495004, "learning_rate": 1.4846198131753894e-05, "loss": 0.349, "step": 6890 }, { "epoch": 1.3584384858044163, "grad_norm": 0.4838676445079221, "learning_rate": 1.4844842227235628e-05, "loss": 0.3237, "step": 6891 }, { "epoch": 1.3586356466876972, "grad_norm": 0.49788296424059325, "learning_rate": 1.4843486206316122e-05, "loss": 0.3319, "step": 6892 }, { "epoch": 1.358832807570978, "grad_norm": 0.5331292005137582, "learning_rate": 1.4842130069027957e-05, "loss": 0.3446, "step": 6893 }, { "epoch": 1.3590299684542586, "grad_norm": 0.5009763516051616, "learning_rate": 1.4840773815403722e-05, "loss": 0.3255, "step": 6894 }, { "epoch": 1.3592271293375395, "grad_norm": 0.5154972070799596, "learning_rate": 1.4839417445475995e-05, "loss": 0.3382, "step": 6895 }, { "epoch": 1.3594242902208202, "grad_norm": 0.522383735377928, "learning_rate": 1.483806095927737e-05, "loss": 0.3481, "step": 6896 }, { "epoch": 1.359621451104101, "grad_norm": 0.5040201789415579, "learning_rate": 1.4836704356840428e-05, "loss": 0.3329, "step": 6897 }, { "epoch": 1.3598186119873816, "grad_norm": 0.525037747965827, "learning_rate": 1.4835347638197777e-05, "loss": 0.3359, "step": 6898 }, { "epoch": 1.3600157728706623, "grad_norm": 0.5051300920201072, "learning_rate": 1.4833990803381997e-05, "loss": 0.3467, "step": 6899 }, { "epoch": 1.3602129337539433, "grad_norm": 0.4999739246930199, "learning_rate": 1.4832633852425702e-05, "loss": 0.361, "step": 6900 }, { "epoch": 1.360410094637224, "grad_norm": 0.5107030410105877, "learning_rate": 1.4831276785361484e-05, "loss": 0.3556, "step": 6901 }, { "epoch": 1.3606072555205047, "grad_norm": 0.4974505550953817, "learning_rate": 1.4829919602221949e-05, "loss": 0.3282, "step": 6902 }, { "epoch": 1.3608044164037856, "grad_norm": 0.48064179340063234, "learning_rate": 1.4828562303039708e-05, "loss": 0.3416, "step": 6903 }, { "epoch": 1.3610015772870663, "grad_norm": 0.5127276668204793, "learning_rate": 1.4827204887847369e-05, "loss": 0.3347, "step": 6904 }, { "epoch": 1.361198738170347, "grad_norm": 0.4780273243210127, "learning_rate": 1.4825847356677546e-05, "loss": 0.304, "step": 6905 }, { "epoch": 1.3613958990536277, "grad_norm": 0.4887295391580163, "learning_rate": 1.4824489709562854e-05, "loss": 0.3327, "step": 6906 }, { "epoch": 1.3615930599369084, "grad_norm": 0.48670211288915743, "learning_rate": 1.4823131946535912e-05, "loss": 0.3355, "step": 6907 }, { "epoch": 1.3617902208201893, "grad_norm": 0.5294737580364534, "learning_rate": 1.4821774067629338e-05, "loss": 0.3631, "step": 6908 }, { "epoch": 1.36198738170347, "grad_norm": 0.4803323354930984, "learning_rate": 1.482041607287576e-05, "loss": 0.3238, "step": 6909 }, { "epoch": 1.3621845425867507, "grad_norm": 0.5172113196727169, "learning_rate": 1.4819057962307805e-05, "loss": 0.3276, "step": 6910 }, { "epoch": 1.3623817034700316, "grad_norm": 0.5122817965816258, "learning_rate": 1.4817699735958103e-05, "loss": 0.34, "step": 6911 }, { "epoch": 1.3625788643533123, "grad_norm": 0.500184273669358, "learning_rate": 1.4816341393859283e-05, "loss": 0.3579, "step": 6912 }, { "epoch": 1.362776025236593, "grad_norm": 0.49730830810220844, "learning_rate": 1.4814982936043984e-05, "loss": 0.3338, "step": 6913 }, { "epoch": 1.362973186119874, "grad_norm": 0.4523508533879372, "learning_rate": 1.481362436254484e-05, "loss": 0.3122, "step": 6914 }, { "epoch": 1.3631703470031546, "grad_norm": 0.5318999450446498, "learning_rate": 1.4812265673394496e-05, "loss": 0.3442, "step": 6915 }, { "epoch": 1.3633675078864353, "grad_norm": 0.4991165752969515, "learning_rate": 1.4810906868625595e-05, "loss": 0.3403, "step": 6916 }, { "epoch": 1.363564668769716, "grad_norm": 0.5238776570531024, "learning_rate": 1.4809547948270782e-05, "loss": 0.358, "step": 6917 }, { "epoch": 1.3637618296529967, "grad_norm": 0.4971815188764086, "learning_rate": 1.4808188912362705e-05, "loss": 0.3149, "step": 6918 }, { "epoch": 1.3639589905362777, "grad_norm": 0.5124620131345615, "learning_rate": 1.4806829760934018e-05, "loss": 0.3232, "step": 6919 }, { "epoch": 1.3641561514195584, "grad_norm": 0.5178627263514879, "learning_rate": 1.4805470494017373e-05, "loss": 0.3513, "step": 6920 }, { "epoch": 1.364353312302839, "grad_norm": 0.52380799206788, "learning_rate": 1.4804111111645434e-05, "loss": 0.3487, "step": 6921 }, { "epoch": 1.36455047318612, "grad_norm": 0.48203134434437056, "learning_rate": 1.4802751613850853e-05, "loss": 0.3213, "step": 6922 }, { "epoch": 1.3647476340694007, "grad_norm": 0.5102651067337406, "learning_rate": 1.4801392000666297e-05, "loss": 0.3266, "step": 6923 }, { "epoch": 1.3649447949526814, "grad_norm": 0.5414007491135933, "learning_rate": 1.4800032272124432e-05, "loss": 0.3818, "step": 6924 }, { "epoch": 1.365141955835962, "grad_norm": 0.5237634060391902, "learning_rate": 1.4798672428257928e-05, "loss": 0.3349, "step": 6925 }, { "epoch": 1.3653391167192428, "grad_norm": 0.5011403516477672, "learning_rate": 1.4797312469099454e-05, "loss": 0.3213, "step": 6926 }, { "epoch": 1.3655362776025237, "grad_norm": 0.5571042867990981, "learning_rate": 1.4795952394681682e-05, "loss": 0.3691, "step": 6927 }, { "epoch": 1.3657334384858044, "grad_norm": 0.48652049099214323, "learning_rate": 1.4794592205037295e-05, "loss": 0.3387, "step": 6928 }, { "epoch": 1.3659305993690851, "grad_norm": 0.4823341354472324, "learning_rate": 1.4793231900198968e-05, "loss": 0.3235, "step": 6929 }, { "epoch": 1.366127760252366, "grad_norm": 0.5152844218945443, "learning_rate": 1.4791871480199385e-05, "loss": 0.3405, "step": 6930 }, { "epoch": 1.3663249211356467, "grad_norm": 0.4901566411635077, "learning_rate": 1.479051094507123e-05, "loss": 0.328, "step": 6931 }, { "epoch": 1.3665220820189274, "grad_norm": 0.5243962870706206, "learning_rate": 1.4789150294847192e-05, "loss": 0.3436, "step": 6932 }, { "epoch": 1.3667192429022081, "grad_norm": 0.5194816070602105, "learning_rate": 1.4787789529559961e-05, "loss": 0.3309, "step": 6933 }, { "epoch": 1.3669164037854888, "grad_norm": 0.5123859611810547, "learning_rate": 1.4786428649242232e-05, "loss": 0.3367, "step": 6934 }, { "epoch": 1.3671135646687698, "grad_norm": 0.4896479386075604, "learning_rate": 1.4785067653926701e-05, "loss": 0.3204, "step": 6935 }, { "epoch": 1.3673107255520505, "grad_norm": 0.49790804021992413, "learning_rate": 1.4783706543646066e-05, "loss": 0.3368, "step": 6936 }, { "epoch": 1.3675078864353312, "grad_norm": 0.5019017261167511, "learning_rate": 1.4782345318433025e-05, "loss": 0.3278, "step": 6937 }, { "epoch": 1.367705047318612, "grad_norm": 0.4904028947701245, "learning_rate": 1.478098397832029e-05, "loss": 0.3129, "step": 6938 }, { "epoch": 1.3679022082018928, "grad_norm": 0.5321594647922132, "learning_rate": 1.4779622523340562e-05, "loss": 0.3425, "step": 6939 }, { "epoch": 1.3680993690851735, "grad_norm": 0.4986035776438812, "learning_rate": 1.477826095352656e-05, "loss": 0.3367, "step": 6940 }, { "epoch": 1.3682965299684542, "grad_norm": 0.5246630991127984, "learning_rate": 1.4776899268910985e-05, "loss": 0.3542, "step": 6941 }, { "epoch": 1.368493690851735, "grad_norm": 0.4648451353245271, "learning_rate": 1.477553746952656e-05, "loss": 0.3196, "step": 6942 }, { "epoch": 1.3686908517350158, "grad_norm": 0.546723814002127, "learning_rate": 1.4774175555406e-05, "loss": 0.3466, "step": 6943 }, { "epoch": 1.3688880126182965, "grad_norm": 0.5018774391569213, "learning_rate": 1.477281352658203e-05, "loss": 0.3239, "step": 6944 }, { "epoch": 1.3690851735015772, "grad_norm": 0.4926341199579351, "learning_rate": 1.4771451383087373e-05, "loss": 0.3356, "step": 6945 }, { "epoch": 1.3692823343848581, "grad_norm": 0.5338603435373444, "learning_rate": 1.477008912495475e-05, "loss": 0.3629, "step": 6946 }, { "epoch": 1.3694794952681388, "grad_norm": 0.46986959001400663, "learning_rate": 1.4768726752216898e-05, "loss": 0.3107, "step": 6947 }, { "epoch": 1.3696766561514195, "grad_norm": 0.5335668009350745, "learning_rate": 1.4767364264906542e-05, "loss": 0.3481, "step": 6948 }, { "epoch": 1.3698738170347002, "grad_norm": 0.4653687936969232, "learning_rate": 1.4766001663056422e-05, "loss": 0.3155, "step": 6949 }, { "epoch": 1.370070977917981, "grad_norm": 0.5286992819313316, "learning_rate": 1.4764638946699275e-05, "loss": 0.3449, "step": 6950 }, { "epoch": 1.3702681388012619, "grad_norm": 0.5307937528972434, "learning_rate": 1.476327611586784e-05, "loss": 0.3482, "step": 6951 }, { "epoch": 1.3704652996845426, "grad_norm": 0.47920732199670424, "learning_rate": 1.4761913170594859e-05, "loss": 0.3334, "step": 6952 }, { "epoch": 1.3706624605678233, "grad_norm": 0.4986071058412949, "learning_rate": 1.4760550110913081e-05, "loss": 0.3575, "step": 6953 }, { "epoch": 1.3708596214511042, "grad_norm": 0.48580007339999265, "learning_rate": 1.4759186936855253e-05, "loss": 0.3166, "step": 6954 }, { "epoch": 1.3710567823343849, "grad_norm": 0.4847134680546892, "learning_rate": 1.4757823648454124e-05, "loss": 0.3355, "step": 6955 }, { "epoch": 1.3712539432176656, "grad_norm": 0.49892520436181714, "learning_rate": 1.475646024574245e-05, "loss": 0.3437, "step": 6956 }, { "epoch": 1.3714511041009465, "grad_norm": 0.46579178489145134, "learning_rate": 1.4755096728752992e-05, "loss": 0.3251, "step": 6957 }, { "epoch": 1.3716482649842272, "grad_norm": 0.4854313491900763, "learning_rate": 1.4753733097518503e-05, "loss": 0.3197, "step": 6958 }, { "epoch": 1.371845425867508, "grad_norm": 0.5084781508505444, "learning_rate": 1.475236935207175e-05, "loss": 0.3447, "step": 6959 }, { "epoch": 1.3720425867507886, "grad_norm": 0.5154369206311532, "learning_rate": 1.4751005492445496e-05, "loss": 0.3022, "step": 6960 }, { "epoch": 1.3722397476340693, "grad_norm": 0.5072733874919334, "learning_rate": 1.4749641518672508e-05, "loss": 0.3478, "step": 6961 }, { "epoch": 1.3724369085173502, "grad_norm": 0.5276500649999918, "learning_rate": 1.4748277430785557e-05, "loss": 0.3433, "step": 6962 }, { "epoch": 1.372634069400631, "grad_norm": 0.5068275820907489, "learning_rate": 1.4746913228817416e-05, "loss": 0.3324, "step": 6963 }, { "epoch": 1.3728312302839116, "grad_norm": 0.5588764584629632, "learning_rate": 1.4745548912800867e-05, "loss": 0.381, "step": 6964 }, { "epoch": 1.3730283911671926, "grad_norm": 0.48397923052902175, "learning_rate": 1.4744184482768678e-05, "loss": 0.3241, "step": 6965 }, { "epoch": 1.3732255520504733, "grad_norm": 0.5009095018317993, "learning_rate": 1.4742819938753641e-05, "loss": 0.3374, "step": 6966 }, { "epoch": 1.373422712933754, "grad_norm": 0.5214836401414918, "learning_rate": 1.4741455280788533e-05, "loss": 0.3338, "step": 6967 }, { "epoch": 1.3736198738170347, "grad_norm": 0.4969774753340322, "learning_rate": 1.4740090508906147e-05, "loss": 0.3243, "step": 6968 }, { "epoch": 1.3738170347003154, "grad_norm": 0.5559922169108119, "learning_rate": 1.4738725623139263e-05, "loss": 0.3567, "step": 6969 }, { "epoch": 1.3740141955835963, "grad_norm": 0.5165083267867453, "learning_rate": 1.4737360623520684e-05, "loss": 0.3324, "step": 6970 }, { "epoch": 1.374211356466877, "grad_norm": 0.5186856386291233, "learning_rate": 1.47359955100832e-05, "loss": 0.3333, "step": 6971 }, { "epoch": 1.3744085173501577, "grad_norm": 0.5238837322487031, "learning_rate": 1.473463028285961e-05, "loss": 0.3484, "step": 6972 }, { "epoch": 1.3746056782334386, "grad_norm": 2.292582007349146, "learning_rate": 1.4733264941882714e-05, "loss": 0.3815, "step": 6973 }, { "epoch": 1.3748028391167193, "grad_norm": 0.5643806541463167, "learning_rate": 1.4731899487185319e-05, "loss": 0.3442, "step": 6974 }, { "epoch": 1.375, "grad_norm": 0.47507958786208415, "learning_rate": 1.4730533918800227e-05, "loss": 0.3075, "step": 6975 }, { "epoch": 1.3751971608832807, "grad_norm": 0.4912700593505576, "learning_rate": 1.4729168236760248e-05, "loss": 0.3229, "step": 6976 }, { "epoch": 1.3753943217665614, "grad_norm": 0.5116118374339484, "learning_rate": 1.4727802441098193e-05, "loss": 0.3345, "step": 6977 }, { "epoch": 1.3755914826498423, "grad_norm": 0.5865259693172717, "learning_rate": 1.4726436531846877e-05, "loss": 0.3333, "step": 6978 }, { "epoch": 1.375788643533123, "grad_norm": 0.49366649497140386, "learning_rate": 1.4725070509039117e-05, "loss": 0.3337, "step": 6979 }, { "epoch": 1.3759858044164037, "grad_norm": 0.5372885703885392, "learning_rate": 1.4723704372707734e-05, "loss": 0.3287, "step": 6980 }, { "epoch": 1.3761829652996846, "grad_norm": 0.5043319157124423, "learning_rate": 1.4722338122885548e-05, "loss": 0.3396, "step": 6981 }, { "epoch": 1.3763801261829653, "grad_norm": 0.6345501977793165, "learning_rate": 1.4720971759605387e-05, "loss": 0.3289, "step": 6982 }, { "epoch": 1.376577287066246, "grad_norm": 0.5375836546232604, "learning_rate": 1.4719605282900077e-05, "loss": 0.3757, "step": 6983 }, { "epoch": 1.3767744479495267, "grad_norm": 0.5178077248278057, "learning_rate": 1.4718238692802449e-05, "loss": 0.3594, "step": 6984 }, { "epoch": 1.3769716088328074, "grad_norm": 0.5147912712127123, "learning_rate": 1.4716871989345338e-05, "loss": 0.3474, "step": 6985 }, { "epoch": 1.3771687697160884, "grad_norm": 0.5189874060272969, "learning_rate": 1.4715505172561577e-05, "loss": 0.3247, "step": 6986 }, { "epoch": 1.377365930599369, "grad_norm": 0.5281430394013332, "learning_rate": 1.471413824248401e-05, "loss": 0.3568, "step": 6987 }, { "epoch": 1.3775630914826498, "grad_norm": 0.5147078126304419, "learning_rate": 1.4712771199145472e-05, "loss": 0.3559, "step": 6988 }, { "epoch": 1.3777602523659307, "grad_norm": 0.49074990134353796, "learning_rate": 1.4711404042578814e-05, "loss": 0.3416, "step": 6989 }, { "epoch": 1.3779574132492114, "grad_norm": 0.4841551234965781, "learning_rate": 1.4710036772816877e-05, "loss": 0.3476, "step": 6990 }, { "epoch": 1.378154574132492, "grad_norm": 0.5076124986937911, "learning_rate": 1.4708669389892514e-05, "loss": 0.3451, "step": 6991 }, { "epoch": 1.3783517350157728, "grad_norm": 0.5101395115062465, "learning_rate": 1.4707301893838578e-05, "loss": 0.337, "step": 6992 }, { "epoch": 1.3785488958990535, "grad_norm": 0.47965592126549544, "learning_rate": 1.4705934284687923e-05, "loss": 0.3242, "step": 6993 }, { "epoch": 1.3787460567823344, "grad_norm": 0.5219389338261724, "learning_rate": 1.4704566562473408e-05, "loss": 0.3683, "step": 6994 }, { "epoch": 1.3789432176656151, "grad_norm": 0.5205227382874077, "learning_rate": 1.4703198727227892e-05, "loss": 0.3318, "step": 6995 }, { "epoch": 1.3791403785488958, "grad_norm": 0.48222107531876607, "learning_rate": 1.4701830778984239e-05, "loss": 0.3253, "step": 6996 }, { "epoch": 1.3793375394321767, "grad_norm": 0.49353311997235216, "learning_rate": 1.4700462717775317e-05, "loss": 0.3219, "step": 6997 }, { "epoch": 1.3795347003154574, "grad_norm": 0.5345425712355901, "learning_rate": 1.4699094543633989e-05, "loss": 0.3624, "step": 6998 }, { "epoch": 1.3797318611987381, "grad_norm": 0.5050606461841181, "learning_rate": 1.4697726256593132e-05, "loss": 0.3441, "step": 6999 }, { "epoch": 1.379929022082019, "grad_norm": 0.47478110398234236, "learning_rate": 1.469635785668562e-05, "loss": 0.3204, "step": 7000 }, { "epoch": 1.3801261829652998, "grad_norm": 0.503768054032645, "learning_rate": 1.4694989343944327e-05, "loss": 0.3508, "step": 7001 }, { "epoch": 1.3803233438485805, "grad_norm": 0.6492703807376875, "learning_rate": 1.4693620718402137e-05, "loss": 0.3212, "step": 7002 }, { "epoch": 1.3805205047318612, "grad_norm": 0.4805328225116543, "learning_rate": 1.4692251980091927e-05, "loss": 0.3254, "step": 7003 }, { "epoch": 1.3807176656151419, "grad_norm": 0.4842656353572685, "learning_rate": 1.4690883129046585e-05, "loss": 0.3372, "step": 7004 }, { "epoch": 1.3809148264984228, "grad_norm": 0.5211693690974576, "learning_rate": 1.4689514165298995e-05, "loss": 0.3491, "step": 7005 }, { "epoch": 1.3811119873817035, "grad_norm": 0.5885371549295725, "learning_rate": 1.4688145088882056e-05, "loss": 0.3444, "step": 7006 }, { "epoch": 1.3813091482649842, "grad_norm": 0.4754832288654815, "learning_rate": 1.4686775899828651e-05, "loss": 0.3224, "step": 7007 }, { "epoch": 1.381506309148265, "grad_norm": 0.4780150559237149, "learning_rate": 1.4685406598171686e-05, "loss": 0.3052, "step": 7008 }, { "epoch": 1.3817034700315458, "grad_norm": 0.49765867952072046, "learning_rate": 1.4684037183944051e-05, "loss": 0.3434, "step": 7009 }, { "epoch": 1.3819006309148265, "grad_norm": 0.47835652909606524, "learning_rate": 1.4682667657178653e-05, "loss": 0.3396, "step": 7010 }, { "epoch": 1.3820977917981072, "grad_norm": 0.46329597154012175, "learning_rate": 1.4681298017908391e-05, "loss": 0.3226, "step": 7011 }, { "epoch": 1.382294952681388, "grad_norm": 0.49817719336350075, "learning_rate": 1.4679928266166175e-05, "loss": 0.3357, "step": 7012 }, { "epoch": 1.3824921135646688, "grad_norm": 0.8663152836331668, "learning_rate": 1.4678558401984915e-05, "loss": 0.3431, "step": 7013 }, { "epoch": 1.3826892744479495, "grad_norm": 0.5197376334684173, "learning_rate": 1.467718842539752e-05, "loss": 0.3367, "step": 7014 }, { "epoch": 1.3828864353312302, "grad_norm": 0.5621592048068038, "learning_rate": 1.467581833643691e-05, "loss": 0.3152, "step": 7015 }, { "epoch": 1.3830835962145112, "grad_norm": 0.5392174642427607, "learning_rate": 1.4674448135135993e-05, "loss": 0.348, "step": 7016 }, { "epoch": 1.3832807570977919, "grad_norm": 0.4767809373705618, "learning_rate": 1.46730778215277e-05, "loss": 0.3133, "step": 7017 }, { "epoch": 1.3834779179810726, "grad_norm": 1.1165470822976469, "learning_rate": 1.4671707395644946e-05, "loss": 0.3479, "step": 7018 }, { "epoch": 1.3836750788643533, "grad_norm": 0.5264241691242134, "learning_rate": 1.4670336857520661e-05, "loss": 0.3528, "step": 7019 }, { "epoch": 1.383872239747634, "grad_norm": 0.4677220107524557, "learning_rate": 1.4668966207187774e-05, "loss": 0.3322, "step": 7020 }, { "epoch": 1.3840694006309149, "grad_norm": 0.4729142635567184, "learning_rate": 1.4667595444679212e-05, "loss": 0.3265, "step": 7021 }, { "epoch": 1.3842665615141956, "grad_norm": 0.5851446109658627, "learning_rate": 1.466622457002791e-05, "loss": 0.3491, "step": 7022 }, { "epoch": 1.3844637223974763, "grad_norm": 0.5195972584107764, "learning_rate": 1.4664853583266807e-05, "loss": 0.3408, "step": 7023 }, { "epoch": 1.3846608832807572, "grad_norm": 0.48331307744775936, "learning_rate": 1.4663482484428839e-05, "loss": 0.3438, "step": 7024 }, { "epoch": 1.384858044164038, "grad_norm": 0.4872542154414702, "learning_rate": 1.4662111273546949e-05, "loss": 0.3116, "step": 7025 }, { "epoch": 1.3850552050473186, "grad_norm": 0.5209465468531163, "learning_rate": 1.4660739950654081e-05, "loss": 0.3713, "step": 7026 }, { "epoch": 1.3852523659305993, "grad_norm": 0.4946017796024822, "learning_rate": 1.4659368515783183e-05, "loss": 0.3238, "step": 7027 }, { "epoch": 1.38544952681388, "grad_norm": 0.5092592005569798, "learning_rate": 1.4657996968967202e-05, "loss": 0.3456, "step": 7028 }, { "epoch": 1.385646687697161, "grad_norm": 0.47093221898089077, "learning_rate": 1.4656625310239095e-05, "loss": 0.3068, "step": 7029 }, { "epoch": 1.3858438485804416, "grad_norm": 0.5092749341911277, "learning_rate": 1.4655253539631816e-05, "loss": 0.3501, "step": 7030 }, { "epoch": 1.3860410094637223, "grad_norm": 0.49688273138146666, "learning_rate": 1.4653881657178317e-05, "loss": 0.329, "step": 7031 }, { "epoch": 1.3862381703470033, "grad_norm": 0.5210667444751383, "learning_rate": 1.465250966291157e-05, "loss": 0.3308, "step": 7032 }, { "epoch": 1.386435331230284, "grad_norm": 0.5051826303648079, "learning_rate": 1.4651137556864526e-05, "loss": 0.3246, "step": 7033 }, { "epoch": 1.3866324921135647, "grad_norm": 0.5131056295583875, "learning_rate": 1.4649765339070161e-05, "loss": 0.3472, "step": 7034 }, { "epoch": 1.3868296529968454, "grad_norm": 0.5045828989240048, "learning_rate": 1.4648393009561434e-05, "loss": 0.35, "step": 7035 }, { "epoch": 1.387026813880126, "grad_norm": 2.3607682081415566, "learning_rate": 1.4647020568371329e-05, "loss": 0.3819, "step": 7036 }, { "epoch": 1.387223974763407, "grad_norm": 0.7063638677947639, "learning_rate": 1.4645648015532806e-05, "loss": 0.3035, "step": 7037 }, { "epoch": 1.3874211356466877, "grad_norm": 0.5347651370289318, "learning_rate": 1.464427535107885e-05, "loss": 0.3415, "step": 7038 }, { "epoch": 1.3876182965299684, "grad_norm": 0.5063799018311032, "learning_rate": 1.4642902575042439e-05, "loss": 0.3682, "step": 7039 }, { "epoch": 1.3878154574132493, "grad_norm": 0.5418753232993281, "learning_rate": 1.4641529687456558e-05, "loss": 0.3801, "step": 7040 }, { "epoch": 1.38801261829653, "grad_norm": 0.510285067684664, "learning_rate": 1.4640156688354183e-05, "loss": 0.3417, "step": 7041 }, { "epoch": 1.3882097791798107, "grad_norm": 0.5074567362081557, "learning_rate": 1.4638783577768312e-05, "loss": 0.3426, "step": 7042 }, { "epoch": 1.3884069400630916, "grad_norm": 0.4986711709526979, "learning_rate": 1.4637410355731927e-05, "loss": 0.3517, "step": 7043 }, { "epoch": 1.3886041009463723, "grad_norm": 0.4834879827214969, "learning_rate": 1.4636037022278022e-05, "loss": 0.325, "step": 7044 }, { "epoch": 1.388801261829653, "grad_norm": 0.4725810540995025, "learning_rate": 1.4634663577439598e-05, "loss": 0.3329, "step": 7045 }, { "epoch": 1.3889984227129337, "grad_norm": 0.499687944712635, "learning_rate": 1.4633290021249646e-05, "loss": 0.3261, "step": 7046 }, { "epoch": 1.3891955835962144, "grad_norm": 0.49600002260141485, "learning_rate": 1.4631916353741174e-05, "loss": 0.3625, "step": 7047 }, { "epoch": 1.3893927444794953, "grad_norm": 0.5362771118840135, "learning_rate": 1.4630542574947177e-05, "loss": 0.3511, "step": 7048 }, { "epoch": 1.389589905362776, "grad_norm": 0.49021230005905664, "learning_rate": 1.462916868490067e-05, "loss": 0.3459, "step": 7049 }, { "epoch": 1.3897870662460567, "grad_norm": 0.5226203596978446, "learning_rate": 1.4627794683634655e-05, "loss": 0.3359, "step": 7050 }, { "epoch": 1.3899842271293377, "grad_norm": 0.4892949036036693, "learning_rate": 1.4626420571182146e-05, "loss": 0.3354, "step": 7051 }, { "epoch": 1.3901813880126184, "grad_norm": 0.49319072511760836, "learning_rate": 1.4625046347576155e-05, "loss": 0.3213, "step": 7052 }, { "epoch": 1.390378548895899, "grad_norm": 0.48026160014473557, "learning_rate": 1.4623672012849705e-05, "loss": 0.3328, "step": 7053 }, { "epoch": 1.3905757097791798, "grad_norm": 0.519599979648436, "learning_rate": 1.462229756703581e-05, "loss": 0.3768, "step": 7054 }, { "epoch": 1.3907728706624605, "grad_norm": 0.5158052836985166, "learning_rate": 1.4620923010167496e-05, "loss": 0.3472, "step": 7055 }, { "epoch": 1.3909700315457414, "grad_norm": 0.49003976941752064, "learning_rate": 1.461954834227778e-05, "loss": 0.3197, "step": 7056 }, { "epoch": 1.391167192429022, "grad_norm": 0.47468028680465263, "learning_rate": 1.46181735633997e-05, "loss": 0.3163, "step": 7057 }, { "epoch": 1.3913643533123028, "grad_norm": 0.4951911330562906, "learning_rate": 1.4616798673566276e-05, "loss": 0.3479, "step": 7058 }, { "epoch": 1.3915615141955837, "grad_norm": 0.4785035909764805, "learning_rate": 1.4615423672810549e-05, "loss": 0.3244, "step": 7059 }, { "epoch": 1.3917586750788644, "grad_norm": 0.46972733617436485, "learning_rate": 1.4614048561165552e-05, "loss": 0.3264, "step": 7060 }, { "epoch": 1.3919558359621451, "grad_norm": 0.5045776545128406, "learning_rate": 1.4612673338664322e-05, "loss": 0.3555, "step": 7061 }, { "epoch": 1.3921529968454258, "grad_norm": 0.4873523714181044, "learning_rate": 1.46112980053399e-05, "loss": 0.3357, "step": 7062 }, { "epoch": 1.3923501577287065, "grad_norm": 0.5568338246839404, "learning_rate": 1.460992256122533e-05, "loss": 0.3606, "step": 7063 }, { "epoch": 1.3925473186119874, "grad_norm": 0.5056504428860361, "learning_rate": 1.4608547006353661e-05, "loss": 0.3515, "step": 7064 }, { "epoch": 1.3927444794952681, "grad_norm": 0.5637743705182227, "learning_rate": 1.4607171340757935e-05, "loss": 0.387, "step": 7065 }, { "epoch": 1.3929416403785488, "grad_norm": 0.5012198726529813, "learning_rate": 1.460579556447121e-05, "loss": 0.3339, "step": 7066 }, { "epoch": 1.3931388012618298, "grad_norm": 0.5255783486737358, "learning_rate": 1.4604419677526536e-05, "loss": 0.34, "step": 7067 }, { "epoch": 1.3933359621451105, "grad_norm": 0.49793007788388866, "learning_rate": 1.4603043679956972e-05, "loss": 0.3364, "step": 7068 }, { "epoch": 1.3935331230283912, "grad_norm": 0.5065921955646553, "learning_rate": 1.4601667571795577e-05, "loss": 0.3439, "step": 7069 }, { "epoch": 1.3937302839116719, "grad_norm": 0.46887161071080763, "learning_rate": 1.4600291353075413e-05, "loss": 0.3174, "step": 7070 }, { "epoch": 1.3939274447949526, "grad_norm": 0.5473282371408335, "learning_rate": 1.4598915023829543e-05, "loss": 0.3446, "step": 7071 }, { "epoch": 1.3941246056782335, "grad_norm": 0.5483532718507378, "learning_rate": 1.4597538584091038e-05, "loss": 0.356, "step": 7072 }, { "epoch": 1.3943217665615142, "grad_norm": 0.5316150064778802, "learning_rate": 1.4596162033892962e-05, "loss": 0.3371, "step": 7073 }, { "epoch": 1.3945189274447949, "grad_norm": 0.5159110622307619, "learning_rate": 1.4594785373268399e-05, "loss": 0.3488, "step": 7074 }, { "epoch": 1.3947160883280758, "grad_norm": 0.48598790594632707, "learning_rate": 1.4593408602250412e-05, "loss": 0.3235, "step": 7075 }, { "epoch": 1.3949132492113565, "grad_norm": 0.4724557712538457, "learning_rate": 1.4592031720872086e-05, "loss": 0.3286, "step": 7076 }, { "epoch": 1.3951104100946372, "grad_norm": 0.48240302693285425, "learning_rate": 1.45906547291665e-05, "loss": 0.3269, "step": 7077 }, { "epoch": 1.395307570977918, "grad_norm": 0.5090930573384124, "learning_rate": 1.4589277627166738e-05, "loss": 0.3376, "step": 7078 }, { "epoch": 1.3955047318611986, "grad_norm": 0.49297802997817497, "learning_rate": 1.4587900414905884e-05, "loss": 0.341, "step": 7079 }, { "epoch": 1.3957018927444795, "grad_norm": 0.4997166046095957, "learning_rate": 1.4586523092417023e-05, "loss": 0.3496, "step": 7080 }, { "epoch": 1.3958990536277602, "grad_norm": 0.5090637102230009, "learning_rate": 1.4585145659733261e-05, "loss": 0.3348, "step": 7081 }, { "epoch": 1.396096214511041, "grad_norm": 0.5273133534088523, "learning_rate": 1.4583768116887675e-05, "loss": 0.3552, "step": 7082 }, { "epoch": 1.3962933753943219, "grad_norm": 0.49036343465671867, "learning_rate": 1.4582390463913374e-05, "loss": 0.3291, "step": 7083 }, { "epoch": 1.3964905362776026, "grad_norm": 0.5038344983388285, "learning_rate": 1.4581012700843447e-05, "loss": 0.333, "step": 7084 }, { "epoch": 1.3966876971608833, "grad_norm": 0.4900487095066304, "learning_rate": 1.4579634827711004e-05, "loss": 0.3483, "step": 7085 }, { "epoch": 1.396884858044164, "grad_norm": 0.5031078803878702, "learning_rate": 1.4578256844549144e-05, "loss": 0.3157, "step": 7086 }, { "epoch": 1.3970820189274447, "grad_norm": 0.6475198113556349, "learning_rate": 1.4576878751390977e-05, "loss": 0.3807, "step": 7087 }, { "epoch": 1.3972791798107256, "grad_norm": 0.5187308668664504, "learning_rate": 1.4575500548269612e-05, "loss": 0.3417, "step": 7088 }, { "epoch": 1.3974763406940063, "grad_norm": 0.4702071372526483, "learning_rate": 1.4574122235218165e-05, "loss": 0.3175, "step": 7089 }, { "epoch": 1.397673501577287, "grad_norm": 0.5261987476073774, "learning_rate": 1.4572743812269742e-05, "loss": 0.3641, "step": 7090 }, { "epoch": 1.397870662460568, "grad_norm": 0.5232677422093927, "learning_rate": 1.457136527945747e-05, "loss": 0.3366, "step": 7091 }, { "epoch": 1.3980678233438486, "grad_norm": 0.5413611690474402, "learning_rate": 1.4569986636814467e-05, "loss": 0.3387, "step": 7092 }, { "epoch": 1.3982649842271293, "grad_norm": 0.504814762397322, "learning_rate": 1.4568607884373853e-05, "loss": 0.3559, "step": 7093 }, { "epoch": 1.3984621451104102, "grad_norm": 0.5127986043906394, "learning_rate": 1.4567229022168756e-05, "loss": 0.3346, "step": 7094 }, { "epoch": 1.398659305993691, "grad_norm": 0.5194678556274392, "learning_rate": 1.4565850050232303e-05, "loss": 0.3264, "step": 7095 }, { "epoch": 1.3988564668769716, "grad_norm": 1.36315390724496, "learning_rate": 1.4564470968597629e-05, "loss": 0.3374, "step": 7096 }, { "epoch": 1.3990536277602523, "grad_norm": 0.5377737743898511, "learning_rate": 1.456309177729786e-05, "loss": 0.3548, "step": 7097 }, { "epoch": 1.399250788643533, "grad_norm": 0.5141750880497071, "learning_rate": 1.4561712476366138e-05, "loss": 0.3328, "step": 7098 }, { "epoch": 1.399447949526814, "grad_norm": 0.48903622056427165, "learning_rate": 1.4560333065835597e-05, "loss": 0.3118, "step": 7099 }, { "epoch": 1.3996451104100947, "grad_norm": 0.5462273751987472, "learning_rate": 1.4558953545739386e-05, "loss": 0.3234, "step": 7100 }, { "epoch": 1.3998422712933754, "grad_norm": 0.4955309077143716, "learning_rate": 1.4557573916110643e-05, "loss": 0.3108, "step": 7101 }, { "epoch": 1.4000394321766563, "grad_norm": 0.5069308732041122, "learning_rate": 1.4556194176982521e-05, "loss": 0.3231, "step": 7102 }, { "epoch": 1.400236593059937, "grad_norm": 0.48081025706983266, "learning_rate": 1.4554814328388158e-05, "loss": 0.311, "step": 7103 }, { "epoch": 1.4004337539432177, "grad_norm": 0.5215098010308659, "learning_rate": 1.4553434370360718e-05, "loss": 0.3516, "step": 7104 }, { "epoch": 1.4006309148264984, "grad_norm": 0.4849700152957339, "learning_rate": 1.4552054302933344e-05, "loss": 0.3482, "step": 7105 }, { "epoch": 1.400828075709779, "grad_norm": 0.5043874074673499, "learning_rate": 1.4550674126139206e-05, "loss": 0.3562, "step": 7106 }, { "epoch": 1.40102523659306, "grad_norm": 0.4972388280378365, "learning_rate": 1.4549293840011453e-05, "loss": 0.3371, "step": 7107 }, { "epoch": 1.4012223974763407, "grad_norm": 0.501745399798661, "learning_rate": 1.4547913444583254e-05, "loss": 0.3435, "step": 7108 }, { "epoch": 1.4014195583596214, "grad_norm": 0.5121863899401545, "learning_rate": 1.4546532939887775e-05, "loss": 0.3126, "step": 7109 }, { "epoch": 1.4016167192429023, "grad_norm": 0.5122597557427135, "learning_rate": 1.4545152325958176e-05, "loss": 0.3241, "step": 7110 }, { "epoch": 1.401813880126183, "grad_norm": 0.5356092124198039, "learning_rate": 1.4543771602827635e-05, "loss": 0.3352, "step": 7111 }, { "epoch": 1.4020110410094637, "grad_norm": 0.5046731418348064, "learning_rate": 1.454239077052932e-05, "loss": 0.3255, "step": 7112 }, { "epoch": 1.4022082018927444, "grad_norm": 0.5329137134815505, "learning_rate": 1.4541009829096411e-05, "loss": 0.3668, "step": 7113 }, { "epoch": 1.4024053627760251, "grad_norm": 0.5089935811054527, "learning_rate": 1.4539628778562082e-05, "loss": 0.3246, "step": 7114 }, { "epoch": 1.402602523659306, "grad_norm": 0.5703923513720676, "learning_rate": 1.4538247618959519e-05, "loss": 0.3691, "step": 7115 }, { "epoch": 1.4027996845425867, "grad_norm": 0.48847066057622457, "learning_rate": 1.4536866350321899e-05, "loss": 0.3268, "step": 7116 }, { "epoch": 1.4029968454258674, "grad_norm": 0.5073745379809114, "learning_rate": 1.4535484972682412e-05, "loss": 0.3149, "step": 7117 }, { "epoch": 1.4031940063091484, "grad_norm": 0.6462283290424533, "learning_rate": 1.4534103486074246e-05, "loss": 0.3336, "step": 7118 }, { "epoch": 1.403391167192429, "grad_norm": 0.515565073309167, "learning_rate": 1.4532721890530594e-05, "loss": 0.3456, "step": 7119 }, { "epoch": 1.4035883280757098, "grad_norm": 0.4990119046287184, "learning_rate": 1.4531340186084647e-05, "loss": 0.3122, "step": 7120 }, { "epoch": 1.4037854889589905, "grad_norm": 0.510752738342485, "learning_rate": 1.4529958372769603e-05, "loss": 0.3478, "step": 7121 }, { "epoch": 1.4039826498422712, "grad_norm": 0.533223751953392, "learning_rate": 1.4528576450618661e-05, "loss": 0.3166, "step": 7122 }, { "epoch": 1.404179810725552, "grad_norm": 0.5250407115618655, "learning_rate": 1.4527194419665027e-05, "loss": 0.3405, "step": 7123 }, { "epoch": 1.4043769716088328, "grad_norm": 0.5492621619572747, "learning_rate": 1.4525812279941896e-05, "loss": 0.3307, "step": 7124 }, { "epoch": 1.4045741324921135, "grad_norm": 0.5808645619860187, "learning_rate": 1.4524430031482483e-05, "loss": 0.3236, "step": 7125 }, { "epoch": 1.4047712933753944, "grad_norm": 0.516938354436358, "learning_rate": 1.4523047674319992e-05, "loss": 0.3397, "step": 7126 }, { "epoch": 1.4049684542586751, "grad_norm": 0.5446571797503498, "learning_rate": 1.452166520848764e-05, "loss": 0.3739, "step": 7127 }, { "epoch": 1.4051656151419558, "grad_norm": 0.5247398532256848, "learning_rate": 1.4520282634018642e-05, "loss": 0.33, "step": 7128 }, { "epoch": 1.4053627760252365, "grad_norm": 0.5174783252477193, "learning_rate": 1.451889995094621e-05, "loss": 0.3291, "step": 7129 }, { "epoch": 1.4055599369085172, "grad_norm": 0.45618206010019524, "learning_rate": 1.4517517159303573e-05, "loss": 0.3154, "step": 7130 }, { "epoch": 1.4057570977917981, "grad_norm": 0.5302269877465452, "learning_rate": 1.4516134259123944e-05, "loss": 0.3305, "step": 7131 }, { "epoch": 1.4059542586750788, "grad_norm": 0.49691950071685087, "learning_rate": 1.4514751250440556e-05, "loss": 0.3465, "step": 7132 }, { "epoch": 1.4061514195583595, "grad_norm": 0.5204660479559051, "learning_rate": 1.4513368133286628e-05, "loss": 0.3484, "step": 7133 }, { "epoch": 1.4063485804416405, "grad_norm": 0.5209911663466499, "learning_rate": 1.45119849076954e-05, "loss": 0.3324, "step": 7134 }, { "epoch": 1.4065457413249212, "grad_norm": 0.52870920201794, "learning_rate": 1.4510601573700098e-05, "loss": 0.3464, "step": 7135 }, { "epoch": 1.4067429022082019, "grad_norm": 0.5337821985731359, "learning_rate": 1.4509218131333964e-05, "loss": 0.3547, "step": 7136 }, { "epoch": 1.4069400630914828, "grad_norm": 0.4726297001749275, "learning_rate": 1.4507834580630231e-05, "loss": 0.3287, "step": 7137 }, { "epoch": 1.4071372239747635, "grad_norm": 0.5411170706465964, "learning_rate": 1.4506450921622144e-05, "loss": 0.3249, "step": 7138 }, { "epoch": 1.4073343848580442, "grad_norm": 0.5280250037967184, "learning_rate": 1.4505067154342944e-05, "loss": 0.3481, "step": 7139 }, { "epoch": 1.4075315457413249, "grad_norm": 0.5254200953659547, "learning_rate": 1.4503683278825877e-05, "loss": 0.3102, "step": 7140 }, { "epoch": 1.4077287066246056, "grad_norm": 0.5167455740268537, "learning_rate": 1.4502299295104194e-05, "loss": 0.3626, "step": 7141 }, { "epoch": 1.4079258675078865, "grad_norm": 0.5231802604371326, "learning_rate": 1.4500915203211144e-05, "loss": 0.3298, "step": 7142 }, { "epoch": 1.4081230283911672, "grad_norm": 0.5273278352996336, "learning_rate": 1.449953100317998e-05, "loss": 0.3262, "step": 7143 }, { "epoch": 1.408320189274448, "grad_norm": 0.467342150805326, "learning_rate": 1.4498146695043963e-05, "loss": 0.3136, "step": 7144 }, { "epoch": 1.4085173501577288, "grad_norm": 0.6192865687389703, "learning_rate": 1.4496762278836347e-05, "loss": 0.3684, "step": 7145 }, { "epoch": 1.4087145110410095, "grad_norm": 0.5257953374768632, "learning_rate": 1.4495377754590396e-05, "loss": 0.3528, "step": 7146 }, { "epoch": 1.4089116719242902, "grad_norm": 0.5430432195240142, "learning_rate": 1.4493993122339375e-05, "loss": 0.3398, "step": 7147 }, { "epoch": 1.409108832807571, "grad_norm": 0.5071243838122111, "learning_rate": 1.4492608382116548e-05, "loss": 0.3306, "step": 7148 }, { "epoch": 1.4093059936908516, "grad_norm": 0.591861854929956, "learning_rate": 1.4491223533955191e-05, "loss": 0.3736, "step": 7149 }, { "epoch": 1.4095031545741326, "grad_norm": 0.5388065248530421, "learning_rate": 1.4489838577888569e-05, "loss": 0.3513, "step": 7150 }, { "epoch": 1.4097003154574133, "grad_norm": 0.517788510937729, "learning_rate": 1.4488453513949963e-05, "loss": 0.3479, "step": 7151 }, { "epoch": 1.409897476340694, "grad_norm": 0.5610229696228063, "learning_rate": 1.4487068342172642e-05, "loss": 0.3469, "step": 7152 }, { "epoch": 1.4100946372239749, "grad_norm": 0.4712793595470132, "learning_rate": 1.4485683062589895e-05, "loss": 0.3182, "step": 7153 }, { "epoch": 1.4102917981072556, "grad_norm": 0.5668593758772468, "learning_rate": 1.4484297675234995e-05, "loss": 0.3809, "step": 7154 }, { "epoch": 1.4104889589905363, "grad_norm": 0.4925454763252612, "learning_rate": 1.4482912180141236e-05, "loss": 0.3475, "step": 7155 }, { "epoch": 1.410686119873817, "grad_norm": 0.5510156258206566, "learning_rate": 1.44815265773419e-05, "loss": 0.3579, "step": 7156 }, { "epoch": 1.4108832807570977, "grad_norm": 0.5405436736712104, "learning_rate": 1.4480140866870281e-05, "loss": 0.3426, "step": 7157 }, { "epoch": 1.4110804416403786, "grad_norm": 0.5237178960169243, "learning_rate": 1.4478755048759668e-05, "loss": 0.3632, "step": 7158 }, { "epoch": 1.4112776025236593, "grad_norm": 0.5175312490148417, "learning_rate": 1.4477369123043358e-05, "loss": 0.3236, "step": 7159 }, { "epoch": 1.41147476340694, "grad_norm": 0.4913933173819947, "learning_rate": 1.447598308975465e-05, "loss": 0.324, "step": 7160 }, { "epoch": 1.411671924290221, "grad_norm": 0.4826321503807621, "learning_rate": 1.4474596948926844e-05, "loss": 0.326, "step": 7161 }, { "epoch": 1.4118690851735016, "grad_norm": 0.4915727914760078, "learning_rate": 1.4473210700593242e-05, "loss": 0.33, "step": 7162 }, { "epoch": 1.4120662460567823, "grad_norm": 0.5060788214654445, "learning_rate": 1.4471824344787153e-05, "loss": 0.314, "step": 7163 }, { "epoch": 1.412263406940063, "grad_norm": 0.491241758641821, "learning_rate": 1.4470437881541882e-05, "loss": 0.3421, "step": 7164 }, { "epoch": 1.4124605678233437, "grad_norm": 0.4942260983754986, "learning_rate": 1.446905131089074e-05, "loss": 0.3491, "step": 7165 }, { "epoch": 1.4126577287066246, "grad_norm": 0.48529396730532887, "learning_rate": 1.4467664632867042e-05, "loss": 0.332, "step": 7166 }, { "epoch": 1.4128548895899053, "grad_norm": 0.5474445100151097, "learning_rate": 1.44662778475041e-05, "loss": 0.3503, "step": 7167 }, { "epoch": 1.413052050473186, "grad_norm": 0.49293254269828907, "learning_rate": 1.4464890954835242e-05, "loss": 0.326, "step": 7168 }, { "epoch": 1.413249211356467, "grad_norm": 0.5017859566166725, "learning_rate": 1.4463503954893778e-05, "loss": 0.3244, "step": 7169 }, { "epoch": 1.4134463722397477, "grad_norm": 0.5240485239593689, "learning_rate": 1.446211684771304e-05, "loss": 0.3708, "step": 7170 }, { "epoch": 1.4136435331230284, "grad_norm": 0.4870947916252577, "learning_rate": 1.4460729633326351e-05, "loss": 0.3162, "step": 7171 }, { "epoch": 1.413840694006309, "grad_norm": 0.4593378993420768, "learning_rate": 1.4459342311767041e-05, "loss": 0.3221, "step": 7172 }, { "epoch": 1.4140378548895898, "grad_norm": 0.48340606797854385, "learning_rate": 1.445795488306844e-05, "loss": 0.3228, "step": 7173 }, { "epoch": 1.4142350157728707, "grad_norm": 0.4713202093530444, "learning_rate": 1.445656734726388e-05, "loss": 0.3172, "step": 7174 }, { "epoch": 1.4144321766561514, "grad_norm": 0.4780513871592405, "learning_rate": 1.4455179704386706e-05, "loss": 0.3551, "step": 7175 }, { "epoch": 1.414629337539432, "grad_norm": 0.48058366057305824, "learning_rate": 1.4453791954470248e-05, "loss": 0.3257, "step": 7176 }, { "epoch": 1.414826498422713, "grad_norm": 0.49793083984186, "learning_rate": 1.4452404097547855e-05, "loss": 0.3592, "step": 7177 }, { "epoch": 1.4150236593059937, "grad_norm": 0.472865401917883, "learning_rate": 1.4451016133652864e-05, "loss": 0.3218, "step": 7178 }, { "epoch": 1.4152208201892744, "grad_norm": 0.46575544454257445, "learning_rate": 1.4449628062818628e-05, "loss": 0.3077, "step": 7179 }, { "epoch": 1.4154179810725553, "grad_norm": 0.48121486650008277, "learning_rate": 1.4448239885078494e-05, "loss": 0.3326, "step": 7180 }, { "epoch": 1.415615141955836, "grad_norm": 0.473468751152917, "learning_rate": 1.4446851600465817e-05, "loss": 0.3376, "step": 7181 }, { "epoch": 1.4158123028391167, "grad_norm": 0.5137837266489765, "learning_rate": 1.4445463209013948e-05, "loss": 0.3401, "step": 7182 }, { "epoch": 1.4160094637223974, "grad_norm": 0.497315989831655, "learning_rate": 1.4444074710756244e-05, "loss": 0.3391, "step": 7183 }, { "epoch": 1.4162066246056781, "grad_norm": 0.5170085322910686, "learning_rate": 1.4442686105726066e-05, "loss": 0.3612, "step": 7184 }, { "epoch": 1.416403785488959, "grad_norm": 0.5055898381412588, "learning_rate": 1.4441297393956779e-05, "loss": 0.3368, "step": 7185 }, { "epoch": 1.4166009463722398, "grad_norm": 0.4588445495464938, "learning_rate": 1.4439908575481744e-05, "loss": 0.3215, "step": 7186 }, { "epoch": 1.4167981072555205, "grad_norm": 0.4940514211342652, "learning_rate": 1.443851965033433e-05, "loss": 0.3229, "step": 7187 }, { "epoch": 1.4169952681388014, "grad_norm": 0.5237805317857336, "learning_rate": 1.4437130618547905e-05, "loss": 0.3746, "step": 7188 }, { "epoch": 1.417192429022082, "grad_norm": 0.46802246011483617, "learning_rate": 1.443574148015585e-05, "loss": 0.3247, "step": 7189 }, { "epoch": 1.4173895899053628, "grad_norm": 0.4760962059204285, "learning_rate": 1.4434352235191526e-05, "loss": 0.3145, "step": 7190 }, { "epoch": 1.4175867507886435, "grad_norm": 0.46609718235500464, "learning_rate": 1.4432962883688327e-05, "loss": 0.3309, "step": 7191 }, { "epoch": 1.4177839116719242, "grad_norm": 0.5063549571401547, "learning_rate": 1.443157342567962e-05, "loss": 0.3656, "step": 7192 }, { "epoch": 1.4179810725552051, "grad_norm": 0.5161414428496777, "learning_rate": 1.4430183861198792e-05, "loss": 0.3811, "step": 7193 }, { "epoch": 1.4181782334384858, "grad_norm": 0.46919572179206825, "learning_rate": 1.4428794190279231e-05, "loss": 0.3344, "step": 7194 }, { "epoch": 1.4183753943217665, "grad_norm": 0.5020263012424411, "learning_rate": 1.442740441295432e-05, "loss": 0.3387, "step": 7195 }, { "epoch": 1.4185725552050474, "grad_norm": 0.4593456839363708, "learning_rate": 1.4426014529257457e-05, "loss": 0.3161, "step": 7196 }, { "epoch": 1.4187697160883281, "grad_norm": 0.50798569973899, "learning_rate": 1.4424624539222028e-05, "loss": 0.3444, "step": 7197 }, { "epoch": 1.4189668769716088, "grad_norm": 0.4790042993417883, "learning_rate": 1.4423234442881433e-05, "loss": 0.3375, "step": 7198 }, { "epoch": 1.4191640378548895, "grad_norm": 0.5051516627231682, "learning_rate": 1.4421844240269064e-05, "loss": 0.3502, "step": 7199 }, { "epoch": 1.4193611987381702, "grad_norm": 0.49347178341054193, "learning_rate": 1.4420453931418332e-05, "loss": 0.3503, "step": 7200 }, { "epoch": 1.4195583596214512, "grad_norm": 0.4764569323870533, "learning_rate": 1.4419063516362633e-05, "loss": 0.3142, "step": 7201 }, { "epoch": 1.4197555205047319, "grad_norm": 0.48966644810736204, "learning_rate": 1.4417672995135372e-05, "loss": 0.3348, "step": 7202 }, { "epoch": 1.4199526813880126, "grad_norm": 0.4897579050663058, "learning_rate": 1.4416282367769961e-05, "loss": 0.3284, "step": 7203 }, { "epoch": 1.4201498422712935, "grad_norm": 0.4709690024262115, "learning_rate": 1.441489163429981e-05, "loss": 0.3158, "step": 7204 }, { "epoch": 1.4203470031545742, "grad_norm": 0.5083654874998398, "learning_rate": 1.4413500794758333e-05, "loss": 0.3569, "step": 7205 }, { "epoch": 1.4205441640378549, "grad_norm": 0.4977170017599856, "learning_rate": 1.4412109849178944e-05, "loss": 0.3207, "step": 7206 }, { "epoch": 1.4207413249211356, "grad_norm": 0.49785411016808323, "learning_rate": 1.4410718797595063e-05, "loss": 0.3385, "step": 7207 }, { "epoch": 1.4209384858044163, "grad_norm": 0.4985331494542473, "learning_rate": 1.4409327640040111e-05, "loss": 0.3256, "step": 7208 }, { "epoch": 1.4211356466876972, "grad_norm": 12.637841434778313, "learning_rate": 1.440793637654751e-05, "loss": 0.4233, "step": 7209 }, { "epoch": 1.421332807570978, "grad_norm": 0.5318624726951542, "learning_rate": 1.4406545007150693e-05, "loss": 0.3293, "step": 7210 }, { "epoch": 1.4215299684542586, "grad_norm": 0.5031296805718974, "learning_rate": 1.440515353188308e-05, "loss": 0.3301, "step": 7211 }, { "epoch": 1.4217271293375395, "grad_norm": 0.5445707070467908, "learning_rate": 1.4403761950778106e-05, "loss": 0.3248, "step": 7212 }, { "epoch": 1.4219242902208202, "grad_norm": 0.4659455732252377, "learning_rate": 1.4402370263869205e-05, "loss": 0.3216, "step": 7213 }, { "epoch": 1.422121451104101, "grad_norm": 0.5288243979500802, "learning_rate": 1.4400978471189812e-05, "loss": 0.3693, "step": 7214 }, { "epoch": 1.4223186119873816, "grad_norm": 0.528449022273337, "learning_rate": 1.439958657277337e-05, "loss": 0.3402, "step": 7215 }, { "epoch": 1.4225157728706623, "grad_norm": 0.5119944243955784, "learning_rate": 1.4398194568653313e-05, "loss": 0.3494, "step": 7216 }, { "epoch": 1.4227129337539433, "grad_norm": 0.4916670845158821, "learning_rate": 1.4396802458863095e-05, "loss": 0.3217, "step": 7217 }, { "epoch": 1.422910094637224, "grad_norm": 0.48113900294298484, "learning_rate": 1.4395410243436153e-05, "loss": 0.3314, "step": 7218 }, { "epoch": 1.4231072555205047, "grad_norm": 0.47058896620118573, "learning_rate": 1.4394017922405943e-05, "loss": 0.3159, "step": 7219 }, { "epoch": 1.4233044164037856, "grad_norm": 0.4849453680724847, "learning_rate": 1.4392625495805913e-05, "loss": 0.3395, "step": 7220 }, { "epoch": 1.4235015772870663, "grad_norm": 0.5373072375771156, "learning_rate": 1.439123296366952e-05, "loss": 0.3288, "step": 7221 }, { "epoch": 1.423698738170347, "grad_norm": 0.4693872279830995, "learning_rate": 1.4389840326030213e-05, "loss": 0.3302, "step": 7222 }, { "epoch": 1.4238958990536277, "grad_norm": 0.5319368606240615, "learning_rate": 1.438844758292146e-05, "loss": 0.3312, "step": 7223 }, { "epoch": 1.4240930599369084, "grad_norm": 0.5817496986395633, "learning_rate": 1.4387054734376722e-05, "loss": 0.3249, "step": 7224 }, { "epoch": 1.4242902208201893, "grad_norm": 0.48934565333610924, "learning_rate": 1.4385661780429461e-05, "loss": 0.334, "step": 7225 }, { "epoch": 1.42448738170347, "grad_norm": 0.49779432157118675, "learning_rate": 1.438426872111314e-05, "loss": 0.3428, "step": 7226 }, { "epoch": 1.4246845425867507, "grad_norm": 0.4940338187493709, "learning_rate": 1.4382875556461238e-05, "loss": 0.3252, "step": 7227 }, { "epoch": 1.4248817034700316, "grad_norm": 0.4815774455413699, "learning_rate": 1.4381482286507216e-05, "loss": 0.3158, "step": 7228 }, { "epoch": 1.4250788643533123, "grad_norm": 0.4826791184071724, "learning_rate": 1.4380088911284557e-05, "loss": 0.3388, "step": 7229 }, { "epoch": 1.425276025236593, "grad_norm": 0.5434358688212808, "learning_rate": 1.4378695430826732e-05, "loss": 0.3457, "step": 7230 }, { "epoch": 1.425473186119874, "grad_norm": 0.496948379843039, "learning_rate": 1.4377301845167227e-05, "loss": 0.3431, "step": 7231 }, { "epoch": 1.4256703470031546, "grad_norm": 0.4980057220909191, "learning_rate": 1.4375908154339517e-05, "loss": 0.3411, "step": 7232 }, { "epoch": 1.4258675078864353, "grad_norm": 0.5381831341243695, "learning_rate": 1.437451435837709e-05, "loss": 0.3364, "step": 7233 }, { "epoch": 1.426064668769716, "grad_norm": 0.5286642410721416, "learning_rate": 1.4373120457313435e-05, "loss": 0.3471, "step": 7234 }, { "epoch": 1.4262618296529967, "grad_norm": 0.7724157082129258, "learning_rate": 1.4371726451182038e-05, "loss": 0.349, "step": 7235 }, { "epoch": 1.4264589905362777, "grad_norm": 0.5265279436116093, "learning_rate": 1.437033234001639e-05, "loss": 0.3557, "step": 7236 }, { "epoch": 1.4266561514195584, "grad_norm": 1.479067454447673, "learning_rate": 1.436893812384999e-05, "loss": 0.3668, "step": 7237 }, { "epoch": 1.426853312302839, "grad_norm": 0.5212610233545351, "learning_rate": 1.4367543802716334e-05, "loss": 0.3513, "step": 7238 }, { "epoch": 1.42705047318612, "grad_norm": 0.5107655933355794, "learning_rate": 1.436614937664892e-05, "loss": 0.3375, "step": 7239 }, { "epoch": 1.4272476340694007, "grad_norm": 0.5255564965953446, "learning_rate": 1.4364754845681253e-05, "loss": 0.344, "step": 7240 }, { "epoch": 1.4274447949526814, "grad_norm": 0.5736229739284229, "learning_rate": 1.4363360209846833e-05, "loss": 0.357, "step": 7241 }, { "epoch": 1.427641955835962, "grad_norm": 0.500621320395277, "learning_rate": 1.4361965469179173e-05, "loss": 0.3204, "step": 7242 }, { "epoch": 1.4278391167192428, "grad_norm": 0.49627772986305124, "learning_rate": 1.4360570623711778e-05, "loss": 0.3196, "step": 7243 }, { "epoch": 1.4280362776025237, "grad_norm": 0.5137643214444078, "learning_rate": 1.4359175673478163e-05, "loss": 0.3455, "step": 7244 }, { "epoch": 1.4282334384858044, "grad_norm": 0.5139351480323677, "learning_rate": 1.435778061851184e-05, "loss": 0.3494, "step": 7245 }, { "epoch": 1.4284305993690851, "grad_norm": 0.5154850389726875, "learning_rate": 1.435638545884633e-05, "loss": 0.3538, "step": 7246 }, { "epoch": 1.428627760252366, "grad_norm": 0.49555806148409276, "learning_rate": 1.4354990194515155e-05, "loss": 0.345, "step": 7247 }, { "epoch": 1.4288249211356467, "grad_norm": 0.5168230858439605, "learning_rate": 1.4353594825551827e-05, "loss": 0.3459, "step": 7248 }, { "epoch": 1.4290220820189274, "grad_norm": 0.5471309658606096, "learning_rate": 1.4352199351989881e-05, "loss": 0.3419, "step": 7249 }, { "epoch": 1.4292192429022081, "grad_norm": 0.47810122148847417, "learning_rate": 1.4350803773862841e-05, "loss": 0.3093, "step": 7250 }, { "epoch": 1.4294164037854888, "grad_norm": 0.4924167515842062, "learning_rate": 1.4349408091204234e-05, "loss": 0.3169, "step": 7251 }, { "epoch": 1.4296135646687698, "grad_norm": 0.48173871551750974, "learning_rate": 1.4348012304047596e-05, "loss": 0.3336, "step": 7252 }, { "epoch": 1.4298107255520505, "grad_norm": 0.49482226426234466, "learning_rate": 1.4346616412426464e-05, "loss": 0.3356, "step": 7253 }, { "epoch": 1.4300078864353312, "grad_norm": 0.505868389409233, "learning_rate": 1.434522041637437e-05, "loss": 0.3555, "step": 7254 }, { "epoch": 1.430205047318612, "grad_norm": 0.46830331430343086, "learning_rate": 1.4343824315924855e-05, "loss": 0.3163, "step": 7255 }, { "epoch": 1.4304022082018928, "grad_norm": 0.4854035751744796, "learning_rate": 1.4342428111111461e-05, "loss": 0.3245, "step": 7256 }, { "epoch": 1.4305993690851735, "grad_norm": 0.4948071078265386, "learning_rate": 1.4341031801967742e-05, "loss": 0.3319, "step": 7257 }, { "epoch": 1.4307965299684542, "grad_norm": 0.5325267099042197, "learning_rate": 1.4339635388527231e-05, "loss": 0.3412, "step": 7258 }, { "epoch": 1.430993690851735, "grad_norm": 0.6445122208735229, "learning_rate": 1.433823887082349e-05, "loss": 0.3409, "step": 7259 }, { "epoch": 1.4311908517350158, "grad_norm": 0.5057312476219762, "learning_rate": 1.4336842248890065e-05, "loss": 0.3433, "step": 7260 }, { "epoch": 1.4313880126182965, "grad_norm": 0.5013879369726504, "learning_rate": 1.4335445522760512e-05, "loss": 0.3373, "step": 7261 }, { "epoch": 1.4315851735015772, "grad_norm": 0.5361669119631592, "learning_rate": 1.433404869246839e-05, "loss": 0.3599, "step": 7262 }, { "epoch": 1.4317823343848581, "grad_norm": 0.5096029349380119, "learning_rate": 1.4332651758047254e-05, "loss": 0.3446, "step": 7263 }, { "epoch": 1.4319794952681388, "grad_norm": 0.5265991854311413, "learning_rate": 1.4331254719530676e-05, "loss": 0.3717, "step": 7264 }, { "epoch": 1.4321766561514195, "grad_norm": 0.4825505433718403, "learning_rate": 1.4329857576952212e-05, "loss": 0.3206, "step": 7265 }, { "epoch": 1.4323738170347002, "grad_norm": 0.509655525792269, "learning_rate": 1.4328460330345434e-05, "loss": 0.3277, "step": 7266 }, { "epoch": 1.432570977917981, "grad_norm": 0.4686753391448895, "learning_rate": 1.432706297974391e-05, "loss": 0.3028, "step": 7267 }, { "epoch": 1.4327681388012619, "grad_norm": 0.5074945700987945, "learning_rate": 1.4325665525181213e-05, "loss": 0.3524, "step": 7268 }, { "epoch": 1.4329652996845426, "grad_norm": 0.4994918673135275, "learning_rate": 1.4324267966690919e-05, "loss": 0.3373, "step": 7269 }, { "epoch": 1.4331624605678233, "grad_norm": 0.4840679030796876, "learning_rate": 1.4322870304306604e-05, "loss": 0.3397, "step": 7270 }, { "epoch": 1.4333596214511042, "grad_norm": 0.4620784380959485, "learning_rate": 1.4321472538061852e-05, "loss": 0.3314, "step": 7271 }, { "epoch": 1.4335567823343849, "grad_norm": 0.4551298101484555, "learning_rate": 1.4320074667990237e-05, "loss": 0.3147, "step": 7272 }, { "epoch": 1.4337539432176656, "grad_norm": 0.48354743821185125, "learning_rate": 1.4318676694125353e-05, "loss": 0.3441, "step": 7273 }, { "epoch": 1.4339511041009465, "grad_norm": 0.44671416102776385, "learning_rate": 1.4317278616500785e-05, "loss": 0.3095, "step": 7274 }, { "epoch": 1.4341482649842272, "grad_norm": 0.48747299202730604, "learning_rate": 1.4315880435150119e-05, "loss": 0.3416, "step": 7275 }, { "epoch": 1.434345425867508, "grad_norm": 0.4931741101936371, "learning_rate": 1.431448215010695e-05, "loss": 0.3516, "step": 7276 }, { "epoch": 1.4345425867507886, "grad_norm": 0.49240050671361324, "learning_rate": 1.4313083761404874e-05, "loss": 0.3476, "step": 7277 }, { "epoch": 1.4347397476340693, "grad_norm": 0.4827511773398998, "learning_rate": 1.4311685269077484e-05, "loss": 0.3185, "step": 7278 }, { "epoch": 1.4349369085173502, "grad_norm": 0.4971900913026981, "learning_rate": 1.4310286673158387e-05, "loss": 0.3614, "step": 7279 }, { "epoch": 1.435134069400631, "grad_norm": 0.4857373079412297, "learning_rate": 1.430888797368118e-05, "loss": 0.3184, "step": 7280 }, { "epoch": 1.4353312302839116, "grad_norm": 0.5759392585568717, "learning_rate": 1.430748917067947e-05, "loss": 0.3404, "step": 7281 }, { "epoch": 1.4355283911671926, "grad_norm": 0.48435593637452573, "learning_rate": 1.4306090264186863e-05, "loss": 0.3346, "step": 7282 }, { "epoch": 1.4357255520504733, "grad_norm": 0.5183943537528527, "learning_rate": 1.430469125423697e-05, "loss": 0.3635, "step": 7283 }, { "epoch": 1.435922712933754, "grad_norm": 0.4874012749572438, "learning_rate": 1.4303292140863402e-05, "loss": 0.3417, "step": 7284 }, { "epoch": 1.4361198738170347, "grad_norm": 0.47674893494615467, "learning_rate": 1.4301892924099778e-05, "loss": 0.326, "step": 7285 }, { "epoch": 1.4363170347003154, "grad_norm": 0.4975791293304716, "learning_rate": 1.430049360397971e-05, "loss": 0.3426, "step": 7286 }, { "epoch": 1.4365141955835963, "grad_norm": 0.47977676406998915, "learning_rate": 1.4299094180536821e-05, "loss": 0.3338, "step": 7287 }, { "epoch": 1.436711356466877, "grad_norm": 0.4849615911471788, "learning_rate": 1.429769465380473e-05, "loss": 0.3365, "step": 7288 }, { "epoch": 1.4369085173501577, "grad_norm": 0.50892477699892, "learning_rate": 1.4296295023817068e-05, "loss": 0.3194, "step": 7289 }, { "epoch": 1.4371056782334386, "grad_norm": 0.5145007377570271, "learning_rate": 1.4294895290607454e-05, "loss": 0.3497, "step": 7290 }, { "epoch": 1.4373028391167193, "grad_norm": 0.5513464913968162, "learning_rate": 1.4293495454209525e-05, "loss": 0.3693, "step": 7291 }, { "epoch": 1.4375, "grad_norm": 0.5079800533958371, "learning_rate": 1.4292095514656907e-05, "loss": 0.3527, "step": 7292 }, { "epoch": 1.4376971608832807, "grad_norm": 0.48977789204818567, "learning_rate": 1.4290695471983243e-05, "loss": 0.3207, "step": 7293 }, { "epoch": 1.4378943217665614, "grad_norm": 0.6229406915573323, "learning_rate": 1.4289295326222161e-05, "loss": 0.3519, "step": 7294 }, { "epoch": 1.4380914826498423, "grad_norm": 0.5705871966971622, "learning_rate": 1.4287895077407306e-05, "loss": 0.3602, "step": 7295 }, { "epoch": 1.438288643533123, "grad_norm": 0.47501369073246813, "learning_rate": 1.4286494725572317e-05, "loss": 0.3289, "step": 7296 }, { "epoch": 1.4384858044164037, "grad_norm": 0.4981376094438177, "learning_rate": 1.4285094270750843e-05, "loss": 0.3381, "step": 7297 }, { "epoch": 1.4386829652996846, "grad_norm": 0.5229391942882657, "learning_rate": 1.4283693712976527e-05, "loss": 0.3332, "step": 7298 }, { "epoch": 1.4388801261829653, "grad_norm": 0.5054756208664721, "learning_rate": 1.4282293052283019e-05, "loss": 0.352, "step": 7299 }, { "epoch": 1.439077287066246, "grad_norm": 0.4951109600486945, "learning_rate": 1.4280892288703974e-05, "loss": 0.331, "step": 7300 }, { "epoch": 1.4392744479495267, "grad_norm": 0.46364365237898975, "learning_rate": 1.4279491422273043e-05, "loss": 0.318, "step": 7301 }, { "epoch": 1.4394716088328074, "grad_norm": 0.4947030225121084, "learning_rate": 1.4278090453023885e-05, "loss": 0.3423, "step": 7302 }, { "epoch": 1.4396687697160884, "grad_norm": 0.4751068461401763, "learning_rate": 1.4276689380990156e-05, "loss": 0.3291, "step": 7303 }, { "epoch": 1.439865930599369, "grad_norm": 0.5071969159877269, "learning_rate": 1.4275288206205525e-05, "loss": 0.3428, "step": 7304 }, { "epoch": 1.4400630914826498, "grad_norm": 0.4964103377634554, "learning_rate": 1.4273886928703648e-05, "loss": 0.3302, "step": 7305 }, { "epoch": 1.4402602523659307, "grad_norm": 0.4786434613936568, "learning_rate": 1.4272485548518198e-05, "loss": 0.3171, "step": 7306 }, { "epoch": 1.4404574132492114, "grad_norm": 0.4821496603078477, "learning_rate": 1.427108406568284e-05, "loss": 0.3286, "step": 7307 }, { "epoch": 1.440654574132492, "grad_norm": 0.4899274116908069, "learning_rate": 1.4269682480231253e-05, "loss": 0.3384, "step": 7308 }, { "epoch": 1.4408517350157728, "grad_norm": 0.496373158989974, "learning_rate": 1.42682807921971e-05, "loss": 0.3355, "step": 7309 }, { "epoch": 1.4410488958990535, "grad_norm": 0.509985991828358, "learning_rate": 1.4266879001614067e-05, "loss": 0.3311, "step": 7310 }, { "epoch": 1.4412460567823344, "grad_norm": 0.4956371703782259, "learning_rate": 1.4265477108515828e-05, "loss": 0.3319, "step": 7311 }, { "epoch": 1.4414432176656151, "grad_norm": 0.490193768475847, "learning_rate": 1.426407511293607e-05, "loss": 0.3395, "step": 7312 }, { "epoch": 1.4416403785488958, "grad_norm": 0.4955718459701483, "learning_rate": 1.4262673014908472e-05, "loss": 0.3417, "step": 7313 }, { "epoch": 1.4418375394321767, "grad_norm": 0.5089810792820509, "learning_rate": 1.4261270814466719e-05, "loss": 0.3719, "step": 7314 }, { "epoch": 1.4420347003154574, "grad_norm": 0.5373269993400187, "learning_rate": 1.4259868511644508e-05, "loss": 0.3366, "step": 7315 }, { "epoch": 1.4422318611987381, "grad_norm": 0.5191745083925651, "learning_rate": 1.4258466106475522e-05, "loss": 0.3412, "step": 7316 }, { "epoch": 1.442429022082019, "grad_norm": 0.49462707282721535, "learning_rate": 1.4257063598993458e-05, "loss": 0.3463, "step": 7317 }, { "epoch": 1.4426261829652998, "grad_norm": 0.4818656214076428, "learning_rate": 1.4255660989232014e-05, "loss": 0.321, "step": 7318 }, { "epoch": 1.4428233438485805, "grad_norm": 0.5247053748007937, "learning_rate": 1.4254258277224888e-05, "loss": 0.3496, "step": 7319 }, { "epoch": 1.4430205047318612, "grad_norm": 0.4934106851982142, "learning_rate": 1.4252855463005782e-05, "loss": 0.3463, "step": 7320 }, { "epoch": 1.4432176656151419, "grad_norm": 0.4941318734425298, "learning_rate": 1.4251452546608397e-05, "loss": 0.3258, "step": 7321 }, { "epoch": 1.4434148264984228, "grad_norm": 0.4740782133591454, "learning_rate": 1.4250049528066441e-05, "loss": 0.3002, "step": 7322 }, { "epoch": 1.4436119873817035, "grad_norm": 0.528973383583317, "learning_rate": 1.4248646407413622e-05, "loss": 0.3539, "step": 7323 }, { "epoch": 1.4438091482649842, "grad_norm": 0.49936440814980976, "learning_rate": 1.424724318468365e-05, "loss": 0.3563, "step": 7324 }, { "epoch": 1.444006309148265, "grad_norm": 0.5037879267205015, "learning_rate": 1.4245839859910247e-05, "loss": 0.3528, "step": 7325 }, { "epoch": 1.4442034700315458, "grad_norm": 0.5022670283442432, "learning_rate": 1.4244436433127118e-05, "loss": 0.3321, "step": 7326 }, { "epoch": 1.4444006309148265, "grad_norm": 0.49067613703910734, "learning_rate": 1.4243032904367984e-05, "loss": 0.3502, "step": 7327 }, { "epoch": 1.4445977917981072, "grad_norm": 0.49245160744368666, "learning_rate": 1.424162927366657e-05, "loss": 0.3459, "step": 7328 }, { "epoch": 1.444794952681388, "grad_norm": 0.4882492569918211, "learning_rate": 1.4240225541056596e-05, "loss": 0.3348, "step": 7329 }, { "epoch": 1.4449921135646688, "grad_norm": 0.5009716845668267, "learning_rate": 1.423882170657179e-05, "loss": 0.3276, "step": 7330 }, { "epoch": 1.4451892744479495, "grad_norm": 0.46911913556435136, "learning_rate": 1.4237417770245877e-05, "loss": 0.3144, "step": 7331 }, { "epoch": 1.4453864353312302, "grad_norm": 0.5206696861698887, "learning_rate": 1.423601373211259e-05, "loss": 0.3623, "step": 7332 }, { "epoch": 1.4455835962145112, "grad_norm": 0.4906485716791266, "learning_rate": 1.4234609592205662e-05, "loss": 0.3442, "step": 7333 }, { "epoch": 1.4457807570977919, "grad_norm": 0.5090573234966967, "learning_rate": 1.423320535055883e-05, "loss": 0.3448, "step": 7334 }, { "epoch": 1.4459779179810726, "grad_norm": 0.4712940869922216, "learning_rate": 1.4231801007205827e-05, "loss": 0.3334, "step": 7335 }, { "epoch": 1.4461750788643533, "grad_norm": 0.5092882547025841, "learning_rate": 1.4230396562180401e-05, "loss": 0.3217, "step": 7336 }, { "epoch": 1.446372239747634, "grad_norm": 0.498188825420168, "learning_rate": 1.4228992015516287e-05, "loss": 0.3587, "step": 7337 }, { "epoch": 1.4465694006309149, "grad_norm": 0.49312249691916427, "learning_rate": 1.4227587367247238e-05, "loss": 0.3465, "step": 7338 }, { "epoch": 1.4467665615141956, "grad_norm": 0.5293802799153761, "learning_rate": 1.4226182617406996e-05, "loss": 0.3549, "step": 7339 }, { "epoch": 1.4469637223974763, "grad_norm": 0.45293571133841104, "learning_rate": 1.4224777766029311e-05, "loss": 0.3161, "step": 7340 }, { "epoch": 1.4471608832807572, "grad_norm": 0.46222311219300893, "learning_rate": 1.4223372813147942e-05, "loss": 0.3056, "step": 7341 }, { "epoch": 1.447358044164038, "grad_norm": 0.5057284253651947, "learning_rate": 1.422196775879664e-05, "loss": 0.3432, "step": 7342 }, { "epoch": 1.4475552050473186, "grad_norm": 0.48672729425679273, "learning_rate": 1.422056260300916e-05, "loss": 0.3328, "step": 7343 }, { "epoch": 1.4477523659305993, "grad_norm": 0.4625625209082356, "learning_rate": 1.4219157345819268e-05, "loss": 0.3003, "step": 7344 }, { "epoch": 1.44794952681388, "grad_norm": 0.5101487287795781, "learning_rate": 1.421775198726072e-05, "loss": 0.3384, "step": 7345 }, { "epoch": 1.448146687697161, "grad_norm": 0.4940574773463017, "learning_rate": 1.4216346527367284e-05, "loss": 0.3219, "step": 7346 }, { "epoch": 1.4483438485804416, "grad_norm": 0.4876148749981708, "learning_rate": 1.421494096617273e-05, "loss": 0.3428, "step": 7347 }, { "epoch": 1.4485410094637223, "grad_norm": 0.47062050672221883, "learning_rate": 1.4213535303710822e-05, "loss": 0.3042, "step": 7348 }, { "epoch": 1.4487381703470033, "grad_norm": 0.48615991071089154, "learning_rate": 1.4212129540015339e-05, "loss": 0.3421, "step": 7349 }, { "epoch": 1.448935331230284, "grad_norm": 0.45576427389164237, "learning_rate": 1.4210723675120049e-05, "loss": 0.3115, "step": 7350 }, { "epoch": 1.4491324921135647, "grad_norm": 0.5118194739469882, "learning_rate": 1.420931770905873e-05, "loss": 0.3674, "step": 7351 }, { "epoch": 1.4493296529968454, "grad_norm": 0.5135213363178154, "learning_rate": 1.4207911641865164e-05, "loss": 0.349, "step": 7352 }, { "epoch": 1.449526813880126, "grad_norm": 0.5010570752976321, "learning_rate": 1.4206505473573135e-05, "loss": 0.3526, "step": 7353 }, { "epoch": 1.449723974763407, "grad_norm": 0.48809906237733963, "learning_rate": 1.4205099204216421e-05, "loss": 0.3339, "step": 7354 }, { "epoch": 1.4499211356466877, "grad_norm": 0.48260002387765977, "learning_rate": 1.4203692833828817e-05, "loss": 0.315, "step": 7355 }, { "epoch": 1.4501182965299684, "grad_norm": 0.4752614271985504, "learning_rate": 1.4202286362444105e-05, "loss": 0.3108, "step": 7356 }, { "epoch": 1.4503154574132493, "grad_norm": 0.48050012591845676, "learning_rate": 1.4200879790096078e-05, "loss": 0.3118, "step": 7357 }, { "epoch": 1.45051261829653, "grad_norm": 0.49612980028187825, "learning_rate": 1.419947311681853e-05, "loss": 0.3452, "step": 7358 }, { "epoch": 1.4507097791798107, "grad_norm": 6.6692877091280165, "learning_rate": 1.4198066342645262e-05, "loss": 0.3363, "step": 7359 }, { "epoch": 1.4509069400630916, "grad_norm": 0.5452461743710111, "learning_rate": 1.4196659467610068e-05, "loss": 0.3699, "step": 7360 }, { "epoch": 1.4511041009463723, "grad_norm": 0.5175861598786076, "learning_rate": 1.419525249174675e-05, "loss": 0.3358, "step": 7361 }, { "epoch": 1.451301261829653, "grad_norm": 0.5004638206871369, "learning_rate": 1.4193845415089113e-05, "loss": 0.3465, "step": 7362 }, { "epoch": 1.4514984227129337, "grad_norm": 0.5018482035564551, "learning_rate": 1.4192438237670962e-05, "loss": 0.3553, "step": 7363 }, { "epoch": 1.4516955835962144, "grad_norm": 0.6472551081778795, "learning_rate": 1.4191030959526106e-05, "loss": 0.3311, "step": 7364 }, { "epoch": 1.4518927444794953, "grad_norm": 0.48615491438153513, "learning_rate": 1.4189623580688358e-05, "loss": 0.3209, "step": 7365 }, { "epoch": 1.452089905362776, "grad_norm": 0.4996419475157942, "learning_rate": 1.418821610119153e-05, "loss": 0.3374, "step": 7366 }, { "epoch": 1.4522870662460567, "grad_norm": 0.503614653074225, "learning_rate": 1.4186808521069436e-05, "loss": 0.3366, "step": 7367 }, { "epoch": 1.4524842271293377, "grad_norm": 0.6183156108882925, "learning_rate": 1.4185400840355895e-05, "loss": 0.3291, "step": 7368 }, { "epoch": 1.4526813880126184, "grad_norm": 0.4913587909312157, "learning_rate": 1.4183993059084728e-05, "loss": 0.3561, "step": 7369 }, { "epoch": 1.452878548895899, "grad_norm": 0.5001590541017276, "learning_rate": 1.418258517728976e-05, "loss": 0.3561, "step": 7370 }, { "epoch": 1.4530757097791798, "grad_norm": 0.4642898097782909, "learning_rate": 1.4181177195004814e-05, "loss": 0.3192, "step": 7371 }, { "epoch": 1.4532728706624605, "grad_norm": 0.5378165455257576, "learning_rate": 1.4179769112263719e-05, "loss": 0.3453, "step": 7372 }, { "epoch": 1.4534700315457414, "grad_norm": 0.4847498894723453, "learning_rate": 1.4178360929100303e-05, "loss": 0.3151, "step": 7373 }, { "epoch": 1.453667192429022, "grad_norm": 0.5067406612634322, "learning_rate": 1.4176952645548406e-05, "loss": 0.3362, "step": 7374 }, { "epoch": 1.4538643533123028, "grad_norm": 0.4918446350110439, "learning_rate": 1.4175544261641854e-05, "loss": 0.3319, "step": 7375 }, { "epoch": 1.4540615141955837, "grad_norm": 0.49347879868509636, "learning_rate": 1.417413577741449e-05, "loss": 0.342, "step": 7376 }, { "epoch": 1.4542586750788644, "grad_norm": 0.4783914406259114, "learning_rate": 1.417272719290015e-05, "loss": 0.3255, "step": 7377 }, { "epoch": 1.4544558359621451, "grad_norm": 0.4941216512113406, "learning_rate": 1.4171318508132683e-05, "loss": 0.3196, "step": 7378 }, { "epoch": 1.4546529968454258, "grad_norm": 0.49485434742875883, "learning_rate": 1.416990972314593e-05, "loss": 0.3293, "step": 7379 }, { "epoch": 1.4548501577287065, "grad_norm": 0.4788373125318391, "learning_rate": 1.4168500837973733e-05, "loss": 0.3059, "step": 7380 }, { "epoch": 1.4550473186119874, "grad_norm": 0.4834085395567045, "learning_rate": 1.416709185264995e-05, "loss": 0.3363, "step": 7381 }, { "epoch": 1.4552444794952681, "grad_norm": 0.5134730730417215, "learning_rate": 1.4165682767208426e-05, "loss": 0.3416, "step": 7382 }, { "epoch": 1.4554416403785488, "grad_norm": 0.5044382136084319, "learning_rate": 1.4164273581683023e-05, "loss": 0.3297, "step": 7383 }, { "epoch": 1.4556388012618298, "grad_norm": 0.5714557060481008, "learning_rate": 1.4162864296107593e-05, "loss": 0.3529, "step": 7384 }, { "epoch": 1.4558359621451105, "grad_norm": 0.5054705611777254, "learning_rate": 1.4161454910515997e-05, "loss": 0.3306, "step": 7385 }, { "epoch": 1.4560331230283912, "grad_norm": 0.47824473710019705, "learning_rate": 1.416004542494209e-05, "loss": 0.3287, "step": 7386 }, { "epoch": 1.4562302839116719, "grad_norm": 0.4835230550928211, "learning_rate": 1.4158635839419745e-05, "loss": 0.3193, "step": 7387 }, { "epoch": 1.4564274447949526, "grad_norm": 0.5966821280918011, "learning_rate": 1.4157226153982826e-05, "loss": 0.3512, "step": 7388 }, { "epoch": 1.4566246056782335, "grad_norm": 0.487708074097899, "learning_rate": 1.4155816368665201e-05, "loss": 0.3356, "step": 7389 }, { "epoch": 1.4568217665615142, "grad_norm": 0.47577454166544947, "learning_rate": 1.415440648350074e-05, "loss": 0.3373, "step": 7390 }, { "epoch": 1.4570189274447949, "grad_norm": 0.4872308941864148, "learning_rate": 1.4152996498523317e-05, "loss": 0.3309, "step": 7391 }, { "epoch": 1.4572160883280758, "grad_norm": 0.5225407802789235, "learning_rate": 1.4151586413766811e-05, "loss": 0.3419, "step": 7392 }, { "epoch": 1.4574132492113565, "grad_norm": 0.4998933279734873, "learning_rate": 1.4150176229265096e-05, "loss": 0.3668, "step": 7393 }, { "epoch": 1.4576104100946372, "grad_norm": 0.5050373414529986, "learning_rate": 1.4148765945052056e-05, "loss": 0.3389, "step": 7394 }, { "epoch": 1.457807570977918, "grad_norm": 0.5064131922922323, "learning_rate": 1.4147355561161574e-05, "loss": 0.3561, "step": 7395 }, { "epoch": 1.4580047318611986, "grad_norm": 0.4948700452474314, "learning_rate": 1.4145945077627531e-05, "loss": 0.3395, "step": 7396 }, { "epoch": 1.4582018927444795, "grad_norm": 0.5239164208524786, "learning_rate": 1.4144534494483824e-05, "loss": 0.3643, "step": 7397 }, { "epoch": 1.4583990536277602, "grad_norm": 0.5345022125344085, "learning_rate": 1.4143123811764335e-05, "loss": 0.3509, "step": 7398 }, { "epoch": 1.458596214511041, "grad_norm": 0.49581376523712395, "learning_rate": 1.414171302950296e-05, "loss": 0.3298, "step": 7399 }, { "epoch": 1.4587933753943219, "grad_norm": 0.4921239829922169, "learning_rate": 1.4140302147733596e-05, "loss": 0.3322, "step": 7400 }, { "epoch": 1.4589905362776026, "grad_norm": 0.5592026245468484, "learning_rate": 1.4138891166490135e-05, "loss": 0.3293, "step": 7401 }, { "epoch": 1.4591876971608833, "grad_norm": 0.5442208251383976, "learning_rate": 1.4137480085806486e-05, "loss": 0.3361, "step": 7402 }, { "epoch": 1.459384858044164, "grad_norm": 0.5170050108654181, "learning_rate": 1.413606890571654e-05, "loss": 0.3538, "step": 7403 }, { "epoch": 1.4595820189274447, "grad_norm": 0.510647100212481, "learning_rate": 1.4134657626254214e-05, "loss": 0.337, "step": 7404 }, { "epoch": 1.4597791798107256, "grad_norm": 0.5306321326123213, "learning_rate": 1.4133246247453403e-05, "loss": 0.3282, "step": 7405 }, { "epoch": 1.4599763406940063, "grad_norm": 0.5133341234040529, "learning_rate": 1.4131834769348026e-05, "loss": 0.3366, "step": 7406 }, { "epoch": 1.460173501577287, "grad_norm": 0.5395277590006632, "learning_rate": 1.4130423191971992e-05, "loss": 0.3333, "step": 7407 }, { "epoch": 1.460370662460568, "grad_norm": 0.47995198018193463, "learning_rate": 1.4129011515359212e-05, "loss": 0.3362, "step": 7408 }, { "epoch": 1.4605678233438486, "grad_norm": 0.4987772919977686, "learning_rate": 1.4127599739543606e-05, "loss": 0.3347, "step": 7409 }, { "epoch": 1.4607649842271293, "grad_norm": 0.5453925939490976, "learning_rate": 1.4126187864559094e-05, "loss": 0.3689, "step": 7410 }, { "epoch": 1.4609621451104102, "grad_norm": 0.4959830516006325, "learning_rate": 1.4124775890439595e-05, "loss": 0.3154, "step": 7411 }, { "epoch": 1.461159305993691, "grad_norm": 0.5124401018607515, "learning_rate": 1.4123363817219034e-05, "loss": 0.321, "step": 7412 }, { "epoch": 1.4613564668769716, "grad_norm": 1.2270349005748566, "learning_rate": 1.4121951644931336e-05, "loss": 0.3614, "step": 7413 }, { "epoch": 1.4615536277602523, "grad_norm": 0.5138791690617409, "learning_rate": 1.4120539373610429e-05, "loss": 0.3237, "step": 7414 }, { "epoch": 1.461750788643533, "grad_norm": 0.4521273911127816, "learning_rate": 1.4119127003290248e-05, "loss": 0.3186, "step": 7415 }, { "epoch": 1.461947949526814, "grad_norm": 0.49874325861714064, "learning_rate": 1.411771453400472e-05, "loss": 0.3281, "step": 7416 }, { "epoch": 1.4621451104100947, "grad_norm": 0.5021452602032279, "learning_rate": 1.4116301965787786e-05, "loss": 0.3404, "step": 7417 }, { "epoch": 1.4623422712933754, "grad_norm": 0.4973741088361569, "learning_rate": 1.4114889298673383e-05, "loss": 0.3309, "step": 7418 }, { "epoch": 1.4625394321766563, "grad_norm": 0.516601612052012, "learning_rate": 1.4113476532695452e-05, "loss": 0.3076, "step": 7419 }, { "epoch": 1.462736593059937, "grad_norm": 0.5526911269279787, "learning_rate": 1.4112063667887932e-05, "loss": 0.3539, "step": 7420 }, { "epoch": 1.4629337539432177, "grad_norm": 0.5562535752274906, "learning_rate": 1.4110650704284773e-05, "loss": 0.3569, "step": 7421 }, { "epoch": 1.4631309148264984, "grad_norm": 0.5488213791263362, "learning_rate": 1.410923764191992e-05, "loss": 0.357, "step": 7422 }, { "epoch": 1.463328075709779, "grad_norm": 0.5107788895192845, "learning_rate": 1.4107824480827324e-05, "loss": 0.3328, "step": 7423 }, { "epoch": 1.46352523659306, "grad_norm": 0.5171838627063462, "learning_rate": 1.4106411221040935e-05, "loss": 0.3217, "step": 7424 }, { "epoch": 1.4637223974763407, "grad_norm": 0.4922378526729013, "learning_rate": 1.4104997862594711e-05, "loss": 0.3343, "step": 7425 }, { "epoch": 1.4639195583596214, "grad_norm": 0.5037026945262372, "learning_rate": 1.4103584405522605e-05, "loss": 0.3284, "step": 7426 }, { "epoch": 1.4641167192429023, "grad_norm": 0.49326551249285355, "learning_rate": 1.4102170849858583e-05, "loss": 0.3443, "step": 7427 }, { "epoch": 1.464313880126183, "grad_norm": 0.546820130602552, "learning_rate": 1.41007571956366e-05, "loss": 0.3602, "step": 7428 }, { "epoch": 1.4645110410094637, "grad_norm": 0.4837797980660489, "learning_rate": 1.4099343442890624e-05, "loss": 0.3333, "step": 7429 }, { "epoch": 1.4647082018927444, "grad_norm": 0.5126857949066812, "learning_rate": 1.4097929591654621e-05, "loss": 0.3383, "step": 7430 }, { "epoch": 1.4649053627760251, "grad_norm": 0.46279281738729217, "learning_rate": 1.4096515641962563e-05, "loss": 0.3435, "step": 7431 }, { "epoch": 1.465102523659306, "grad_norm": 2.09692903793275, "learning_rate": 1.4095101593848415e-05, "loss": 0.3672, "step": 7432 }, { "epoch": 1.4652996845425867, "grad_norm": 0.5501540341610509, "learning_rate": 1.4093687447346151e-05, "loss": 0.3876, "step": 7433 }, { "epoch": 1.4654968454258674, "grad_norm": 0.47609912782149055, "learning_rate": 1.409227320248975e-05, "loss": 0.3153, "step": 7434 }, { "epoch": 1.4656940063091484, "grad_norm": 0.525537823437008, "learning_rate": 1.4090858859313193e-05, "loss": 0.3046, "step": 7435 }, { "epoch": 1.465891167192429, "grad_norm": 0.5028179572182535, "learning_rate": 1.4089444417850455e-05, "loss": 0.3421, "step": 7436 }, { "epoch": 1.4660883280757098, "grad_norm": 0.5163272751032175, "learning_rate": 1.408802987813552e-05, "loss": 0.3294, "step": 7437 }, { "epoch": 1.4662854889589905, "grad_norm": 0.5132796172325643, "learning_rate": 1.408661524020238e-05, "loss": 0.3264, "step": 7438 }, { "epoch": 1.4664826498422712, "grad_norm": 0.5108505262340117, "learning_rate": 1.4085200504085013e-05, "loss": 0.3491, "step": 7439 }, { "epoch": 1.466679810725552, "grad_norm": 0.48158384519210207, "learning_rate": 1.4083785669817417e-05, "loss": 0.3258, "step": 7440 }, { "epoch": 1.4668769716088328, "grad_norm": 0.6121936078703637, "learning_rate": 1.408237073743358e-05, "loss": 0.3299, "step": 7441 }, { "epoch": 1.4670741324921135, "grad_norm": 0.5452188941885944, "learning_rate": 1.4080955706967501e-05, "loss": 0.3629, "step": 7442 }, { "epoch": 1.4672712933753944, "grad_norm": 0.467809566032474, "learning_rate": 1.407954057845317e-05, "loss": 0.3111, "step": 7443 }, { "epoch": 1.4674684542586751, "grad_norm": 0.5136940118705474, "learning_rate": 1.4078125351924597e-05, "loss": 0.3638, "step": 7444 }, { "epoch": 1.4676656151419558, "grad_norm": 0.511820180671828, "learning_rate": 1.4076710027415776e-05, "loss": 0.357, "step": 7445 }, { "epoch": 1.4678627760252365, "grad_norm": 0.490168596701873, "learning_rate": 1.4075294604960715e-05, "loss": 0.3198, "step": 7446 }, { "epoch": 1.4680599369085172, "grad_norm": 0.5058740263507188, "learning_rate": 1.4073879084593416e-05, "loss": 0.3195, "step": 7447 }, { "epoch": 1.4682570977917981, "grad_norm": 0.4674312373171209, "learning_rate": 1.4072463466347892e-05, "loss": 0.3201, "step": 7448 }, { "epoch": 1.4684542586750788, "grad_norm": 2.1568178975708383, "learning_rate": 1.4071047750258156e-05, "loss": 0.3337, "step": 7449 }, { "epoch": 1.4686514195583595, "grad_norm": 0.5546196457316546, "learning_rate": 1.4069631936358214e-05, "loss": 0.3781, "step": 7450 }, { "epoch": 1.4688485804416405, "grad_norm": 0.5357099924800648, "learning_rate": 1.4068216024682095e-05, "loss": 0.3631, "step": 7451 }, { "epoch": 1.4690457413249212, "grad_norm": 0.507066065309645, "learning_rate": 1.4066800015263807e-05, "loss": 0.3288, "step": 7452 }, { "epoch": 1.4692429022082019, "grad_norm": 0.4874279119555861, "learning_rate": 1.4065383908137373e-05, "loss": 0.3218, "step": 7453 }, { "epoch": 1.4694400630914828, "grad_norm": 0.4762728451473061, "learning_rate": 1.4063967703336814e-05, "loss": 0.3198, "step": 7454 }, { "epoch": 1.4696372239747635, "grad_norm": 0.4854069150886455, "learning_rate": 1.4062551400896163e-05, "loss": 0.3287, "step": 7455 }, { "epoch": 1.4698343848580442, "grad_norm": 0.4715608643695716, "learning_rate": 1.406113500084944e-05, "loss": 0.2892, "step": 7456 }, { "epoch": 1.4700315457413249, "grad_norm": 0.5002411384785533, "learning_rate": 1.405971850323068e-05, "loss": 0.3071, "step": 7457 }, { "epoch": 1.4702287066246056, "grad_norm": 0.5388064136656453, "learning_rate": 1.4058301908073912e-05, "loss": 0.3851, "step": 7458 }, { "epoch": 1.4704258675078865, "grad_norm": 0.49296401636550136, "learning_rate": 1.4056885215413174e-05, "loss": 0.3298, "step": 7459 }, { "epoch": 1.4706230283911672, "grad_norm": 0.5355211557498206, "learning_rate": 1.4055468425282502e-05, "loss": 0.338, "step": 7460 }, { "epoch": 1.470820189274448, "grad_norm": 0.48977540245567913, "learning_rate": 1.4054051537715933e-05, "loss": 0.317, "step": 7461 }, { "epoch": 1.4710173501577288, "grad_norm": 0.5326327177590915, "learning_rate": 1.4052634552747512e-05, "loss": 0.346, "step": 7462 }, { "epoch": 1.4712145110410095, "grad_norm": 0.526993033584732, "learning_rate": 1.4051217470411284e-05, "loss": 0.3519, "step": 7463 }, { "epoch": 1.4714116719242902, "grad_norm": 0.5258328026865123, "learning_rate": 1.4049800290741293e-05, "loss": 0.3475, "step": 7464 }, { "epoch": 1.471608832807571, "grad_norm": 0.5203038982752369, "learning_rate": 1.4048383013771588e-05, "loss": 0.3321, "step": 7465 }, { "epoch": 1.4718059936908516, "grad_norm": 0.5113074927167442, "learning_rate": 1.404696563953622e-05, "loss": 0.3354, "step": 7466 }, { "epoch": 1.4720031545741326, "grad_norm": 0.49919882528097426, "learning_rate": 1.4045548168069246e-05, "loss": 0.3539, "step": 7467 }, { "epoch": 1.4722003154574133, "grad_norm": 0.5016200068739187, "learning_rate": 1.4044130599404717e-05, "loss": 0.325, "step": 7468 }, { "epoch": 1.472397476340694, "grad_norm": 0.5564339720636347, "learning_rate": 1.4042712933576694e-05, "loss": 0.3908, "step": 7469 }, { "epoch": 1.4725946372239749, "grad_norm": 0.47101819777928333, "learning_rate": 1.4041295170619241e-05, "loss": 0.3266, "step": 7470 }, { "epoch": 1.4727917981072556, "grad_norm": 0.4968101062786291, "learning_rate": 1.403987731056641e-05, "loss": 0.3045, "step": 7471 }, { "epoch": 1.4729889589905363, "grad_norm": 0.5063663744372873, "learning_rate": 1.403845935345228e-05, "loss": 0.323, "step": 7472 }, { "epoch": 1.473186119873817, "grad_norm": 0.4892904205195065, "learning_rate": 1.4037041299310908e-05, "loss": 0.3245, "step": 7473 }, { "epoch": 1.4733832807570977, "grad_norm": 0.5159500037402428, "learning_rate": 1.4035623148176369e-05, "loss": 0.3428, "step": 7474 }, { "epoch": 1.4735804416403786, "grad_norm": 0.48891170305663467, "learning_rate": 1.4034204900082734e-05, "loss": 0.3305, "step": 7475 }, { "epoch": 1.4737776025236593, "grad_norm": 0.557799558291397, "learning_rate": 1.4032786555064077e-05, "loss": 0.3915, "step": 7476 }, { "epoch": 1.47397476340694, "grad_norm": 0.49527625272519055, "learning_rate": 1.4031368113154478e-05, "loss": 0.323, "step": 7477 }, { "epoch": 1.474171924290221, "grad_norm": 0.5316789203889967, "learning_rate": 1.4029949574388009e-05, "loss": 0.3531, "step": 7478 }, { "epoch": 1.4743690851735016, "grad_norm": 0.47098548601018697, "learning_rate": 1.4028530938798759e-05, "loss": 0.3212, "step": 7479 }, { "epoch": 1.4745662460567823, "grad_norm": 0.490959684927149, "learning_rate": 1.402711220642081e-05, "loss": 0.328, "step": 7480 }, { "epoch": 1.474763406940063, "grad_norm": 0.5168859380738673, "learning_rate": 1.4025693377288246e-05, "loss": 0.3489, "step": 7481 }, { "epoch": 1.4749605678233437, "grad_norm": 0.49571446645437645, "learning_rate": 1.4024274451435157e-05, "loss": 0.3375, "step": 7482 }, { "epoch": 1.4751577287066246, "grad_norm": 0.5197521991945561, "learning_rate": 1.4022855428895632e-05, "loss": 0.3343, "step": 7483 }, { "epoch": 1.4753548895899053, "grad_norm": 0.48589615662818103, "learning_rate": 1.4021436309703766e-05, "loss": 0.3201, "step": 7484 }, { "epoch": 1.475552050473186, "grad_norm": 0.4995993722216193, "learning_rate": 1.4020017093893656e-05, "loss": 0.3371, "step": 7485 }, { "epoch": 1.475749211356467, "grad_norm": 0.5036745093017024, "learning_rate": 1.4018597781499399e-05, "loss": 0.3505, "step": 7486 }, { "epoch": 1.4759463722397477, "grad_norm": 0.4878243600702588, "learning_rate": 1.4017178372555092e-05, "loss": 0.3085, "step": 7487 }, { "epoch": 1.4761435331230284, "grad_norm": 0.508433396912967, "learning_rate": 1.4015758867094837e-05, "loss": 0.3382, "step": 7488 }, { "epoch": 1.476340694006309, "grad_norm": 0.5127101875396283, "learning_rate": 1.4014339265152748e-05, "loss": 0.3524, "step": 7489 }, { "epoch": 1.4765378548895898, "grad_norm": 0.4724562924935433, "learning_rate": 1.401291956676292e-05, "loss": 0.3308, "step": 7490 }, { "epoch": 1.4767350157728707, "grad_norm": 0.49909821615758254, "learning_rate": 1.4011499771959469e-05, "loss": 0.3278, "step": 7491 }, { "epoch": 1.4769321766561514, "grad_norm": 0.500929277106934, "learning_rate": 1.4010079880776505e-05, "loss": 0.3428, "step": 7492 }, { "epoch": 1.477129337539432, "grad_norm": 0.4578591298461616, "learning_rate": 1.4008659893248147e-05, "loss": 0.3263, "step": 7493 }, { "epoch": 1.477326498422713, "grad_norm": 0.4829431282504663, "learning_rate": 1.40072398094085e-05, "loss": 0.3338, "step": 7494 }, { "epoch": 1.4775236593059937, "grad_norm": 0.48641913532636866, "learning_rate": 1.4005819629291692e-05, "loss": 0.3367, "step": 7495 }, { "epoch": 1.4777208201892744, "grad_norm": 0.5087064304556296, "learning_rate": 1.4004399352931846e-05, "loss": 0.3482, "step": 7496 }, { "epoch": 1.4779179810725553, "grad_norm": 0.4930581875396616, "learning_rate": 1.4002978980363075e-05, "loss": 0.3192, "step": 7497 }, { "epoch": 1.478115141955836, "grad_norm": 0.4957220120205278, "learning_rate": 1.4001558511619515e-05, "loss": 0.3341, "step": 7498 }, { "epoch": 1.4783123028391167, "grad_norm": 0.514560592615134, "learning_rate": 1.4000137946735284e-05, "loss": 0.3032, "step": 7499 }, { "epoch": 1.4785094637223974, "grad_norm": 0.5043148636879016, "learning_rate": 1.3998717285744524e-05, "loss": 0.3387, "step": 7500 }, { "epoch": 1.4787066246056781, "grad_norm": 0.5094476950875343, "learning_rate": 1.3997296528681355e-05, "loss": 0.3536, "step": 7501 }, { "epoch": 1.478903785488959, "grad_norm": 0.509271590468565, "learning_rate": 1.3995875675579922e-05, "loss": 0.3335, "step": 7502 }, { "epoch": 1.4791009463722398, "grad_norm": 0.475098165612926, "learning_rate": 1.3994454726474355e-05, "loss": 0.3232, "step": 7503 }, { "epoch": 1.4792981072555205, "grad_norm": 0.49409393273104263, "learning_rate": 1.3993033681398797e-05, "loss": 0.336, "step": 7504 }, { "epoch": 1.4794952681388014, "grad_norm": 0.5025889163503731, "learning_rate": 1.399161254038739e-05, "loss": 0.3479, "step": 7505 }, { "epoch": 1.479692429022082, "grad_norm": 0.5213687127491038, "learning_rate": 1.399019130347428e-05, "loss": 0.3591, "step": 7506 }, { "epoch": 1.4798895899053628, "grad_norm": 0.49494600597377825, "learning_rate": 1.3988769970693607e-05, "loss": 0.3501, "step": 7507 }, { "epoch": 1.4800867507886435, "grad_norm": 0.5047526802460451, "learning_rate": 1.3987348542079526e-05, "loss": 0.355, "step": 7508 }, { "epoch": 1.4802839116719242, "grad_norm": 0.48534628796917784, "learning_rate": 1.3985927017666183e-05, "loss": 0.3306, "step": 7509 }, { "epoch": 1.4804810725552051, "grad_norm": 0.5163215036754177, "learning_rate": 1.3984505397487736e-05, "loss": 0.3277, "step": 7510 }, { "epoch": 1.4806782334384858, "grad_norm": 0.48771109504055543, "learning_rate": 1.3983083681578336e-05, "loss": 0.3142, "step": 7511 }, { "epoch": 1.4808753943217665, "grad_norm": 0.49100954381314643, "learning_rate": 1.3981661869972143e-05, "loss": 0.3508, "step": 7512 }, { "epoch": 1.4810725552050474, "grad_norm": 0.4964769898882689, "learning_rate": 1.3980239962703316e-05, "loss": 0.3195, "step": 7513 }, { "epoch": 1.4812697160883281, "grad_norm": 0.4933646357348641, "learning_rate": 1.3978817959806022e-05, "loss": 0.3346, "step": 7514 }, { "epoch": 1.4814668769716088, "grad_norm": 0.5237222580406612, "learning_rate": 1.397739586131442e-05, "loss": 0.3534, "step": 7515 }, { "epoch": 1.4816640378548895, "grad_norm": 0.5499841344032339, "learning_rate": 1.3975973667262678e-05, "loss": 0.3752, "step": 7516 }, { "epoch": 1.4818611987381702, "grad_norm": 0.5281225252481041, "learning_rate": 1.397455137768497e-05, "loss": 0.3612, "step": 7517 }, { "epoch": 1.4820583596214512, "grad_norm": 0.4754396639001578, "learning_rate": 1.3973128992615461e-05, "loss": 0.3294, "step": 7518 }, { "epoch": 1.4822555205047319, "grad_norm": 0.5106352743028668, "learning_rate": 1.3971706512088334e-05, "loss": 0.3489, "step": 7519 }, { "epoch": 1.4824526813880126, "grad_norm": 0.4899386431725224, "learning_rate": 1.3970283936137755e-05, "loss": 0.326, "step": 7520 }, { "epoch": 1.4826498422712935, "grad_norm": 0.49818208559529764, "learning_rate": 1.3968861264797911e-05, "loss": 0.3261, "step": 7521 }, { "epoch": 1.4828470031545742, "grad_norm": 0.49436959103979455, "learning_rate": 1.3967438498102971e-05, "loss": 0.3511, "step": 7522 }, { "epoch": 1.4830441640378549, "grad_norm": 0.5033269127969884, "learning_rate": 1.3966015636087133e-05, "loss": 0.3574, "step": 7523 }, { "epoch": 1.4832413249211356, "grad_norm": 0.4906719889384929, "learning_rate": 1.3964592678784574e-05, "loss": 0.3317, "step": 7524 }, { "epoch": 1.4834384858044163, "grad_norm": 0.516805390516583, "learning_rate": 1.3963169626229485e-05, "loss": 0.3533, "step": 7525 }, { "epoch": 1.4836356466876972, "grad_norm": 0.4704857874190068, "learning_rate": 1.396174647845605e-05, "loss": 0.3244, "step": 7526 }, { "epoch": 1.483832807570978, "grad_norm": 0.8886639274014726, "learning_rate": 1.396032323549847e-05, "loss": 0.36, "step": 7527 }, { "epoch": 1.4840299684542586, "grad_norm": 0.5070409296139642, "learning_rate": 1.3958899897390935e-05, "loss": 0.3479, "step": 7528 }, { "epoch": 1.4842271293375395, "grad_norm": 0.46156188122605435, "learning_rate": 1.3957476464167639e-05, "loss": 0.317, "step": 7529 }, { "epoch": 1.4844242902208202, "grad_norm": 0.4798500007246751, "learning_rate": 1.3956052935862782e-05, "loss": 0.3307, "step": 7530 }, { "epoch": 1.484621451104101, "grad_norm": 0.4699403853866003, "learning_rate": 1.3954629312510573e-05, "loss": 0.3395, "step": 7531 }, { "epoch": 1.4848186119873816, "grad_norm": 0.5019467243741862, "learning_rate": 1.3953205594145207e-05, "loss": 0.3489, "step": 7532 }, { "epoch": 1.4850157728706623, "grad_norm": 0.49998290248687316, "learning_rate": 1.3951781780800892e-05, "loss": 0.3493, "step": 7533 }, { "epoch": 1.4852129337539433, "grad_norm": 0.4952129809254002, "learning_rate": 1.3950357872511839e-05, "loss": 0.3442, "step": 7534 }, { "epoch": 1.485410094637224, "grad_norm": 0.5444354708755205, "learning_rate": 1.3948933869312258e-05, "loss": 0.3442, "step": 7535 }, { "epoch": 1.4856072555205047, "grad_norm": 0.4968530516895542, "learning_rate": 1.3947509771236361e-05, "loss": 0.3205, "step": 7536 }, { "epoch": 1.4858044164037856, "grad_norm": 0.6219558232573202, "learning_rate": 1.3946085578318358e-05, "loss": 0.3735, "step": 7537 }, { "epoch": 1.4860015772870663, "grad_norm": 0.48319973377602, "learning_rate": 1.3944661290592476e-05, "loss": 0.3647, "step": 7538 }, { "epoch": 1.486198738170347, "grad_norm": 0.4717457826676787, "learning_rate": 1.3943236908092926e-05, "loss": 0.3193, "step": 7539 }, { "epoch": 1.4863958990536277, "grad_norm": 0.544058207471016, "learning_rate": 1.3941812430853938e-05, "loss": 0.3496, "step": 7540 }, { "epoch": 1.4865930599369084, "grad_norm": 0.5298496126711097, "learning_rate": 1.394038785890973e-05, "loss": 0.3244, "step": 7541 }, { "epoch": 1.4867902208201893, "grad_norm": 0.4946581847477271, "learning_rate": 1.3938963192294533e-05, "loss": 0.355, "step": 7542 }, { "epoch": 1.48698738170347, "grad_norm": 0.46001185450442234, "learning_rate": 1.3937538431042567e-05, "loss": 0.3039, "step": 7543 }, { "epoch": 1.4871845425867507, "grad_norm": 0.6003857261401706, "learning_rate": 1.3936113575188074e-05, "loss": 0.3356, "step": 7544 }, { "epoch": 1.4873817034700316, "grad_norm": 0.48039873611738615, "learning_rate": 1.3934688624765282e-05, "loss": 0.3188, "step": 7545 }, { "epoch": 1.4875788643533123, "grad_norm": 0.458905469054827, "learning_rate": 1.3933263579808426e-05, "loss": 0.3186, "step": 7546 }, { "epoch": 1.487776025236593, "grad_norm": 0.5281694152795835, "learning_rate": 1.3931838440351748e-05, "loss": 0.3689, "step": 7547 }, { "epoch": 1.487973186119874, "grad_norm": 0.48164249983072344, "learning_rate": 1.3930413206429483e-05, "loss": 0.3264, "step": 7548 }, { "epoch": 1.4881703470031546, "grad_norm": 0.4698429076871976, "learning_rate": 1.3928987878075874e-05, "loss": 0.3312, "step": 7549 }, { "epoch": 1.4883675078864353, "grad_norm": 0.4930590803919952, "learning_rate": 1.392756245532517e-05, "loss": 0.3312, "step": 7550 }, { "epoch": 1.488564668769716, "grad_norm": 0.4898819634829872, "learning_rate": 1.3926136938211615e-05, "loss": 0.3485, "step": 7551 }, { "epoch": 1.4887618296529967, "grad_norm": 0.48401145600289563, "learning_rate": 1.3924711326769457e-05, "loss": 0.3347, "step": 7552 }, { "epoch": 1.4889589905362777, "grad_norm": 0.47592844367724896, "learning_rate": 1.392328562103295e-05, "loss": 0.3189, "step": 7553 }, { "epoch": 1.4891561514195584, "grad_norm": 0.47313537818185825, "learning_rate": 1.3921859821036345e-05, "loss": 0.3195, "step": 7554 }, { "epoch": 1.489353312302839, "grad_norm": 0.4828381063664307, "learning_rate": 1.3920433926813901e-05, "loss": 0.348, "step": 7555 }, { "epoch": 1.48955047318612, "grad_norm": 0.5075800480469936, "learning_rate": 1.3919007938399873e-05, "loss": 0.353, "step": 7556 }, { "epoch": 1.4897476340694007, "grad_norm": 0.4605077583110866, "learning_rate": 1.3917581855828526e-05, "loss": 0.3158, "step": 7557 }, { "epoch": 1.4899447949526814, "grad_norm": 0.523185615476558, "learning_rate": 1.3916155679134118e-05, "loss": 0.3629, "step": 7558 }, { "epoch": 1.490141955835962, "grad_norm": 4.614531591833265, "learning_rate": 1.3914729408350918e-05, "loss": 0.3198, "step": 7559 }, { "epoch": 1.4903391167192428, "grad_norm": 0.6690308387896126, "learning_rate": 1.3913303043513188e-05, "loss": 0.3414, "step": 7560 }, { "epoch": 1.4905362776025237, "grad_norm": 0.497169553741172, "learning_rate": 1.3911876584655206e-05, "loss": 0.3556, "step": 7561 }, { "epoch": 1.4907334384858044, "grad_norm": 0.47652650301196414, "learning_rate": 1.3910450031811235e-05, "loss": 0.3258, "step": 7562 }, { "epoch": 1.4909305993690851, "grad_norm": 0.500693022752151, "learning_rate": 1.3909023385015551e-05, "loss": 0.3462, "step": 7563 }, { "epoch": 1.491127760252366, "grad_norm": 0.5068560385443583, "learning_rate": 1.390759664430244e-05, "loss": 0.3506, "step": 7564 }, { "epoch": 1.4913249211356467, "grad_norm": 0.5227101584388434, "learning_rate": 1.3906169809706165e-05, "loss": 0.3384, "step": 7565 }, { "epoch": 1.4915220820189274, "grad_norm": 0.49830512467585986, "learning_rate": 1.390474288126102e-05, "loss": 0.3357, "step": 7566 }, { "epoch": 1.4917192429022081, "grad_norm": 0.5135981865878078, "learning_rate": 1.3903315859001278e-05, "loss": 0.3515, "step": 7567 }, { "epoch": 1.4919164037854888, "grad_norm": 0.5072032920741137, "learning_rate": 1.3901888742961233e-05, "loss": 0.3636, "step": 7568 }, { "epoch": 1.4921135646687698, "grad_norm": 0.46652743790054163, "learning_rate": 1.3900461533175167e-05, "loss": 0.3255, "step": 7569 }, { "epoch": 1.4923107255520505, "grad_norm": 0.4559665210107085, "learning_rate": 1.3899034229677373e-05, "loss": 0.3187, "step": 7570 }, { "epoch": 1.4925078864353312, "grad_norm": 0.4955869955690288, "learning_rate": 1.389760683250214e-05, "loss": 0.3473, "step": 7571 }, { "epoch": 1.492705047318612, "grad_norm": 0.532343767493093, "learning_rate": 1.3896179341683763e-05, "loss": 0.3666, "step": 7572 }, { "epoch": 1.4929022082018928, "grad_norm": 0.49706921835508877, "learning_rate": 1.3894751757256544e-05, "loss": 0.3403, "step": 7573 }, { "epoch": 1.4930993690851735, "grad_norm": 0.5079181565121896, "learning_rate": 1.3893324079254776e-05, "loss": 0.3687, "step": 7574 }, { "epoch": 1.4932965299684542, "grad_norm": 0.4908701711814409, "learning_rate": 1.389189630771276e-05, "loss": 0.3468, "step": 7575 }, { "epoch": 1.493493690851735, "grad_norm": 0.4992855170423717, "learning_rate": 1.3890468442664801e-05, "loss": 0.3304, "step": 7576 }, { "epoch": 1.4936908517350158, "grad_norm": 0.5074458565945166, "learning_rate": 1.3889040484145206e-05, "loss": 0.349, "step": 7577 }, { "epoch": 1.4938880126182965, "grad_norm": 0.4776291728277284, "learning_rate": 1.3887612432188282e-05, "loss": 0.3391, "step": 7578 }, { "epoch": 1.4940851735015772, "grad_norm": 0.47517474086457523, "learning_rate": 1.388618428682834e-05, "loss": 0.3163, "step": 7579 }, { "epoch": 1.4942823343848581, "grad_norm": 0.4757824327904514, "learning_rate": 1.3884756048099688e-05, "loss": 0.3221, "step": 7580 }, { "epoch": 1.4944794952681388, "grad_norm": 0.5268640892228607, "learning_rate": 1.3883327716036643e-05, "loss": 0.3628, "step": 7581 }, { "epoch": 1.4946766561514195, "grad_norm": 0.4902773690445549, "learning_rate": 1.3881899290673526e-05, "loss": 0.3641, "step": 7582 }, { "epoch": 1.4948738170347002, "grad_norm": 0.4598374139054406, "learning_rate": 1.388047077204465e-05, "loss": 0.3223, "step": 7583 }, { "epoch": 1.495070977917981, "grad_norm": 0.4847424035881873, "learning_rate": 1.3879042160184337e-05, "loss": 0.3355, "step": 7584 }, { "epoch": 1.4952681388012619, "grad_norm": 0.5130377501847655, "learning_rate": 1.3877613455126918e-05, "loss": 0.3306, "step": 7585 }, { "epoch": 1.4954652996845426, "grad_norm": 0.4886195265619356, "learning_rate": 1.3876184656906706e-05, "loss": 0.3358, "step": 7586 }, { "epoch": 1.4956624605678233, "grad_norm": 0.48902748634085175, "learning_rate": 1.387475576555804e-05, "loss": 0.339, "step": 7587 }, { "epoch": 1.4958596214511042, "grad_norm": 0.47601101485961045, "learning_rate": 1.3873326781115247e-05, "loss": 0.3206, "step": 7588 }, { "epoch": 1.4960567823343849, "grad_norm": 0.4942695337736247, "learning_rate": 1.3871897703612658e-05, "loss": 0.3439, "step": 7589 }, { "epoch": 1.4962539432176656, "grad_norm": 0.4695829997055428, "learning_rate": 1.3870468533084606e-05, "loss": 0.3288, "step": 7590 }, { "epoch": 1.4964511041009465, "grad_norm": 0.5104474370022767, "learning_rate": 1.3869039269565434e-05, "loss": 0.3388, "step": 7591 }, { "epoch": 1.4966482649842272, "grad_norm": 0.508559710861152, "learning_rate": 1.3867609913089476e-05, "loss": 0.3353, "step": 7592 }, { "epoch": 1.496845425867508, "grad_norm": 0.5039315719219875, "learning_rate": 1.3866180463691077e-05, "loss": 0.3208, "step": 7593 }, { "epoch": 1.4970425867507886, "grad_norm": 0.5191040517201374, "learning_rate": 1.386475092140458e-05, "loss": 0.3468, "step": 7594 }, { "epoch": 1.4972397476340693, "grad_norm": 0.4731795958829586, "learning_rate": 1.3863321286264326e-05, "loss": 0.2784, "step": 7595 }, { "epoch": 1.4974369085173502, "grad_norm": 0.5740007403730863, "learning_rate": 1.386189155830467e-05, "loss": 0.3461, "step": 7596 }, { "epoch": 1.497634069400631, "grad_norm": 0.5117098865886438, "learning_rate": 1.3860461737559958e-05, "loss": 0.3335, "step": 7597 }, { "epoch": 1.4978312302839116, "grad_norm": 0.4963354358560534, "learning_rate": 1.3859031824064543e-05, "loss": 0.3213, "step": 7598 }, { "epoch": 1.4980283911671926, "grad_norm": 0.5362349288534428, "learning_rate": 1.3857601817852785e-05, "loss": 0.3503, "step": 7599 }, { "epoch": 1.4982255520504733, "grad_norm": 0.46854667444421344, "learning_rate": 1.3856171718959033e-05, "loss": 0.3113, "step": 7600 }, { "epoch": 1.498422712933754, "grad_norm": 0.4976815417371681, "learning_rate": 1.385474152741765e-05, "loss": 0.3528, "step": 7601 }, { "epoch": 1.4986198738170347, "grad_norm": 0.5129716538025957, "learning_rate": 1.3853311243262999e-05, "loss": 0.3332, "step": 7602 }, { "epoch": 1.4988170347003154, "grad_norm": 0.4604531892193082, "learning_rate": 1.3851880866529444e-05, "loss": 0.3217, "step": 7603 }, { "epoch": 1.4990141955835963, "grad_norm": 0.6867111001620063, "learning_rate": 1.3850450397251344e-05, "loss": 0.3352, "step": 7604 }, { "epoch": 1.499211356466877, "grad_norm": 0.49556889703029594, "learning_rate": 1.3849019835463076e-05, "loss": 0.3441, "step": 7605 }, { "epoch": 1.4994085173501577, "grad_norm": 0.6455148770924305, "learning_rate": 1.3847589181199009e-05, "loss": 0.3688, "step": 7606 }, { "epoch": 1.4996056782334386, "grad_norm": 0.49503800910765083, "learning_rate": 1.3846158434493507e-05, "loss": 0.3615, "step": 7607 }, { "epoch": 1.4998028391167193, "grad_norm": 0.5347889365402132, "learning_rate": 1.3844727595380958e-05, "loss": 0.3554, "step": 7608 }, { "epoch": 1.4998028391167193, "eval_loss": 0.43272438645362854, "eval_runtime": 344.4401, "eval_samples_per_second": 23.604, "eval_steps_per_second": 1.478, "step": 7608 }, { "epoch": 1.5, "grad_norm": 11.409221739880106, "learning_rate": 1.3843296663895726e-05, "loss": 0.421, "step": 7609 }, { "epoch": 1.500197160883281, "grad_norm": 0.5293429847721727, "learning_rate": 1.3841865640072203e-05, "loss": 0.3249, "step": 7610 }, { "epoch": 1.5003943217665614, "grad_norm": 0.5879361449704711, "learning_rate": 1.3840434523944759e-05, "loss": 0.3574, "step": 7611 }, { "epoch": 1.5005914826498423, "grad_norm": 0.507948237489469, "learning_rate": 1.3839003315547785e-05, "loss": 0.3477, "step": 7612 }, { "epoch": 1.500788643533123, "grad_norm": 0.5174406330344764, "learning_rate": 1.3837572014915669e-05, "loss": 0.3464, "step": 7613 }, { "epoch": 1.5009858044164037, "grad_norm": 0.507543096114278, "learning_rate": 1.3836140622082788e-05, "loss": 0.3351, "step": 7614 }, { "epoch": 1.5011829652996846, "grad_norm": 0.5397406725670891, "learning_rate": 1.3834709137083544e-05, "loss": 0.3506, "step": 7615 }, { "epoch": 1.5013801261829653, "grad_norm": 0.5232479139909186, "learning_rate": 1.3833277559952323e-05, "loss": 0.3521, "step": 7616 }, { "epoch": 1.501577287066246, "grad_norm": 0.4913998494274104, "learning_rate": 1.3831845890723523e-05, "loss": 0.351, "step": 7617 }, { "epoch": 1.501774447949527, "grad_norm": 0.5928180766843834, "learning_rate": 1.3830414129431538e-05, "loss": 0.3554, "step": 7618 }, { "epoch": 1.5019716088328074, "grad_norm": 0.47768284134792766, "learning_rate": 1.3828982276110767e-05, "loss": 0.323, "step": 7619 }, { "epoch": 1.5021687697160884, "grad_norm": 0.4784865791461987, "learning_rate": 1.3827550330795618e-05, "loss": 0.2985, "step": 7620 }, { "epoch": 1.502365930599369, "grad_norm": 0.5109879323464163, "learning_rate": 1.3826118293520488e-05, "loss": 0.3387, "step": 7621 }, { "epoch": 1.5025630914826498, "grad_norm": 0.4942414542354045, "learning_rate": 1.3824686164319782e-05, "loss": 0.3306, "step": 7622 }, { "epoch": 1.5027602523659307, "grad_norm": 0.5035209979496461, "learning_rate": 1.3823253943227916e-05, "loss": 0.3313, "step": 7623 }, { "epoch": 1.5029574132492114, "grad_norm": 0.5438482479157711, "learning_rate": 1.382182163027929e-05, "loss": 0.3251, "step": 7624 }, { "epoch": 1.503154574132492, "grad_norm": 0.48477404769438337, "learning_rate": 1.3820389225508327e-05, "loss": 0.3358, "step": 7625 }, { "epoch": 1.503351735015773, "grad_norm": 0.49598419627564255, "learning_rate": 1.3818956728949432e-05, "loss": 0.3176, "step": 7626 }, { "epoch": 1.5035488958990535, "grad_norm": 0.6022310575721551, "learning_rate": 1.3817524140637029e-05, "loss": 0.3363, "step": 7627 }, { "epoch": 1.5037460567823344, "grad_norm": 0.4892461050892043, "learning_rate": 1.3816091460605534e-05, "loss": 0.3255, "step": 7628 }, { "epoch": 1.5039432176656151, "grad_norm": 0.552459885881935, "learning_rate": 1.3814658688889369e-05, "loss": 0.3272, "step": 7629 }, { "epoch": 1.5041403785488958, "grad_norm": 0.5011577635235227, "learning_rate": 1.3813225825522954e-05, "loss": 0.3263, "step": 7630 }, { "epoch": 1.5043375394321767, "grad_norm": 0.5071339854073511, "learning_rate": 1.3811792870540717e-05, "loss": 0.3306, "step": 7631 }, { "epoch": 1.5045347003154574, "grad_norm": 0.4690112256108807, "learning_rate": 1.3810359823977094e-05, "loss": 0.3142, "step": 7632 }, { "epoch": 1.5047318611987381, "grad_norm": 0.507389006042168, "learning_rate": 1.38089266858665e-05, "loss": 0.3595, "step": 7633 }, { "epoch": 1.504929022082019, "grad_norm": 0.5136892463140303, "learning_rate": 1.380749345624338e-05, "loss": 0.3552, "step": 7634 }, { "epoch": 1.5051261829652995, "grad_norm": 0.5230467446626003, "learning_rate": 1.3806060135142159e-05, "loss": 0.3361, "step": 7635 }, { "epoch": 1.5053233438485805, "grad_norm": 0.47365615277402534, "learning_rate": 1.3804626722597283e-05, "loss": 0.3181, "step": 7636 }, { "epoch": 1.5055205047318612, "grad_norm": 0.5263364289856198, "learning_rate": 1.3803193218643181e-05, "loss": 0.3324, "step": 7637 }, { "epoch": 1.5057176656151419, "grad_norm": 0.5134844859050957, "learning_rate": 1.3801759623314302e-05, "loss": 0.3429, "step": 7638 }, { "epoch": 1.5059148264984228, "grad_norm": 0.4801245661280436, "learning_rate": 1.3800325936645087e-05, "loss": 0.2968, "step": 7639 }, { "epoch": 1.5061119873817035, "grad_norm": 0.4700963225645677, "learning_rate": 1.379889215866998e-05, "loss": 0.3203, "step": 7640 }, { "epoch": 1.5063091482649842, "grad_norm": 0.4710374994497814, "learning_rate": 1.3797458289423431e-05, "loss": 0.3241, "step": 7641 }, { "epoch": 1.506506309148265, "grad_norm": 0.4957756081340281, "learning_rate": 1.3796024328939887e-05, "loss": 0.3512, "step": 7642 }, { "epoch": 1.5067034700315456, "grad_norm": 0.4761451630851939, "learning_rate": 1.3794590277253803e-05, "loss": 0.334, "step": 7643 }, { "epoch": 1.5069006309148265, "grad_norm": 0.4814926896127318, "learning_rate": 1.3793156134399633e-05, "loss": 0.3246, "step": 7644 }, { "epoch": 1.5070977917981072, "grad_norm": 0.4828137836425216, "learning_rate": 1.379172190041183e-05, "loss": 0.3414, "step": 7645 }, { "epoch": 1.507294952681388, "grad_norm": 0.5194210689710018, "learning_rate": 1.3790287575324854e-05, "loss": 0.3533, "step": 7646 }, { "epoch": 1.5074921135646688, "grad_norm": 0.4962183001455974, "learning_rate": 1.3788853159173169e-05, "loss": 0.3336, "step": 7647 }, { "epoch": 1.5076892744479495, "grad_norm": 0.4950050985316144, "learning_rate": 1.3787418651991233e-05, "loss": 0.3565, "step": 7648 }, { "epoch": 1.5078864353312302, "grad_norm": 0.49021770978943213, "learning_rate": 1.3785984053813517e-05, "loss": 0.3356, "step": 7649 }, { "epoch": 1.5080835962145112, "grad_norm": 0.47294234808211555, "learning_rate": 1.3784549364674485e-05, "loss": 0.3198, "step": 7650 }, { "epoch": 1.5082807570977916, "grad_norm": 0.5189806281206693, "learning_rate": 1.3783114584608605e-05, "loss": 0.3294, "step": 7651 }, { "epoch": 1.5084779179810726, "grad_norm": 0.5185822245810467, "learning_rate": 1.3781679713650349e-05, "loss": 0.3446, "step": 7652 }, { "epoch": 1.5086750788643533, "grad_norm": 0.4825839494350987, "learning_rate": 1.3780244751834197e-05, "loss": 0.3422, "step": 7653 }, { "epoch": 1.508872239747634, "grad_norm": 0.4796252236159735, "learning_rate": 1.3778809699194616e-05, "loss": 0.3431, "step": 7654 }, { "epoch": 1.5090694006309149, "grad_norm": 0.5021963528108637, "learning_rate": 1.3777374555766093e-05, "loss": 0.3401, "step": 7655 }, { "epoch": 1.5092665615141956, "grad_norm": 0.5751823831914286, "learning_rate": 1.37759393215831e-05, "loss": 0.3532, "step": 7656 }, { "epoch": 1.5094637223974763, "grad_norm": 0.4777026387116764, "learning_rate": 1.3774503996680128e-05, "loss": 0.3461, "step": 7657 }, { "epoch": 1.5096608832807572, "grad_norm": 0.47046319672734455, "learning_rate": 1.3773068581091655e-05, "loss": 0.3252, "step": 7658 }, { "epoch": 1.509858044164038, "grad_norm": 0.5047250594337946, "learning_rate": 1.3771633074852173e-05, "loss": 0.3539, "step": 7659 }, { "epoch": 1.5100552050473186, "grad_norm": 0.5141111962164151, "learning_rate": 1.3770197477996168e-05, "loss": 0.3272, "step": 7660 }, { "epoch": 1.5102523659305995, "grad_norm": 0.4811169395000833, "learning_rate": 1.3768761790558134e-05, "loss": 0.3245, "step": 7661 }, { "epoch": 1.51044952681388, "grad_norm": 0.5244771681249779, "learning_rate": 1.3767326012572561e-05, "loss": 0.3566, "step": 7662 }, { "epoch": 1.510646687697161, "grad_norm": 0.4704331253968268, "learning_rate": 1.376589014407395e-05, "loss": 0.3154, "step": 7663 }, { "epoch": 1.5108438485804416, "grad_norm": 0.49390026411452725, "learning_rate": 1.3764454185096792e-05, "loss": 0.3377, "step": 7664 }, { "epoch": 1.5110410094637223, "grad_norm": 0.45874452901953416, "learning_rate": 1.3763018135675592e-05, "loss": 0.3388, "step": 7665 }, { "epoch": 1.5112381703470033, "grad_norm": 0.4768940389299864, "learning_rate": 1.3761581995844852e-05, "loss": 0.3355, "step": 7666 }, { "epoch": 1.511435331230284, "grad_norm": 0.511936222268746, "learning_rate": 1.3760145765639075e-05, "loss": 0.3519, "step": 7667 }, { "epoch": 1.5116324921135647, "grad_norm": 0.7261444629938607, "learning_rate": 1.3758709445092767e-05, "loss": 0.3503, "step": 7668 }, { "epoch": 1.5118296529968456, "grad_norm": 0.47879946943440466, "learning_rate": 1.3757273034240437e-05, "loss": 0.3403, "step": 7669 }, { "epoch": 1.512026813880126, "grad_norm": 0.5050574898314102, "learning_rate": 1.3755836533116597e-05, "loss": 0.3364, "step": 7670 }, { "epoch": 1.512223974763407, "grad_norm": 0.49140249592667906, "learning_rate": 1.3754399941755763e-05, "loss": 0.3237, "step": 7671 }, { "epoch": 1.5124211356466877, "grad_norm": 0.5364739642991352, "learning_rate": 1.3752963260192442e-05, "loss": 0.3594, "step": 7672 }, { "epoch": 1.5126182965299684, "grad_norm": 0.5184206685342432, "learning_rate": 1.3751526488461158e-05, "loss": 0.3346, "step": 7673 }, { "epoch": 1.5128154574132493, "grad_norm": 0.5054659832613505, "learning_rate": 1.375008962659643e-05, "loss": 0.3309, "step": 7674 }, { "epoch": 1.51301261829653, "grad_norm": 0.5555312582768909, "learning_rate": 1.3748652674632779e-05, "loss": 0.359, "step": 7675 }, { "epoch": 1.5132097791798107, "grad_norm": 0.5258696149826486, "learning_rate": 1.374721563260473e-05, "loss": 0.3485, "step": 7676 }, { "epoch": 1.5134069400630916, "grad_norm": 7.007449812708166, "learning_rate": 1.3745778500546805e-05, "loss": 0.326, "step": 7677 }, { "epoch": 1.513604100946372, "grad_norm": 0.5503065900084199, "learning_rate": 1.3744341278493535e-05, "loss": 0.333, "step": 7678 }, { "epoch": 1.513801261829653, "grad_norm": 0.46540872566930663, "learning_rate": 1.374290396647945e-05, "loss": 0.308, "step": 7679 }, { "epoch": 1.5139984227129337, "grad_norm": 0.48222335793406707, "learning_rate": 1.3741466564539085e-05, "loss": 0.3374, "step": 7680 }, { "epoch": 1.5141955835962144, "grad_norm": 0.5340584301274428, "learning_rate": 1.3740029072706975e-05, "loss": 0.3339, "step": 7681 }, { "epoch": 1.5143927444794953, "grad_norm": 0.4798870462171238, "learning_rate": 1.373859149101765e-05, "loss": 0.327, "step": 7682 }, { "epoch": 1.514589905362776, "grad_norm": 0.5198175974996748, "learning_rate": 1.3737153819505658e-05, "loss": 0.3557, "step": 7683 }, { "epoch": 1.5147870662460567, "grad_norm": 0.5400239741407015, "learning_rate": 1.3735716058205533e-05, "loss": 0.3415, "step": 7684 }, { "epoch": 1.5149842271293377, "grad_norm": 0.740156822222281, "learning_rate": 1.3734278207151824e-05, "loss": 0.353, "step": 7685 }, { "epoch": 1.5151813880126181, "grad_norm": 0.5022300864303025, "learning_rate": 1.3732840266379071e-05, "loss": 0.3161, "step": 7686 }, { "epoch": 1.515378548895899, "grad_norm": 0.4932017068142421, "learning_rate": 1.3731402235921824e-05, "loss": 0.3324, "step": 7687 }, { "epoch": 1.5155757097791798, "grad_norm": 0.5065348898290931, "learning_rate": 1.3729964115814636e-05, "loss": 0.3333, "step": 7688 }, { "epoch": 1.5157728706624605, "grad_norm": 0.5417643838253485, "learning_rate": 1.3728525906092056e-05, "loss": 0.3521, "step": 7689 }, { "epoch": 1.5159700315457414, "grad_norm": 0.4907116291133275, "learning_rate": 1.3727087606788639e-05, "loss": 0.3293, "step": 7690 }, { "epoch": 1.516167192429022, "grad_norm": 0.47069234491091616, "learning_rate": 1.3725649217938938e-05, "loss": 0.328, "step": 7691 }, { "epoch": 1.5163643533123028, "grad_norm": 0.6967452358004222, "learning_rate": 1.3724210739577516e-05, "loss": 0.3358, "step": 7692 }, { "epoch": 1.5165615141955837, "grad_norm": 0.5071542817692918, "learning_rate": 1.3722772171738932e-05, "loss": 0.3593, "step": 7693 }, { "epoch": 1.5167586750788642, "grad_norm": 0.48669388997244545, "learning_rate": 1.3721333514457748e-05, "loss": 0.3457, "step": 7694 }, { "epoch": 1.5169558359621451, "grad_norm": 0.49829646210372097, "learning_rate": 1.3719894767768532e-05, "loss": 0.3418, "step": 7695 }, { "epoch": 1.5171529968454258, "grad_norm": 0.5104164374213708, "learning_rate": 1.3718455931705845e-05, "loss": 0.3544, "step": 7696 }, { "epoch": 1.5173501577287065, "grad_norm": 0.5017546986684652, "learning_rate": 1.371701700630426e-05, "loss": 0.355, "step": 7697 }, { "epoch": 1.5175473186119874, "grad_norm": 0.5926037563656988, "learning_rate": 1.3715577991598352e-05, "loss": 0.3789, "step": 7698 }, { "epoch": 1.5177444794952681, "grad_norm": 0.4963131869513678, "learning_rate": 1.3714138887622685e-05, "loss": 0.3448, "step": 7699 }, { "epoch": 1.5179416403785488, "grad_norm": 0.5147685242406805, "learning_rate": 1.3712699694411846e-05, "loss": 0.3254, "step": 7700 }, { "epoch": 1.5181388012618298, "grad_norm": 0.5075260322768886, "learning_rate": 1.3711260412000403e-05, "loss": 0.3396, "step": 7701 }, { "epoch": 1.5183359621451105, "grad_norm": 0.4938057819218849, "learning_rate": 1.3709821040422944e-05, "loss": 0.3434, "step": 7702 }, { "epoch": 1.5185331230283912, "grad_norm": 0.48016800863389886, "learning_rate": 1.370838157971404e-05, "loss": 0.3161, "step": 7703 }, { "epoch": 1.518730283911672, "grad_norm": 0.4947174895321279, "learning_rate": 1.370694202990829e-05, "loss": 0.3636, "step": 7704 }, { "epoch": 1.5189274447949526, "grad_norm": 0.4617872910169582, "learning_rate": 1.3705502391040266e-05, "loss": 0.3193, "step": 7705 }, { "epoch": 1.5191246056782335, "grad_norm": 0.49049250891250884, "learning_rate": 1.3704062663144569e-05, "loss": 0.3336, "step": 7706 }, { "epoch": 1.5193217665615142, "grad_norm": 0.4457461204930738, "learning_rate": 1.370262284625578e-05, "loss": 0.2909, "step": 7707 }, { "epoch": 1.5195189274447949, "grad_norm": 0.5403694722432415, "learning_rate": 1.3701182940408495e-05, "loss": 0.3567, "step": 7708 }, { "epoch": 1.5197160883280758, "grad_norm": 0.4816431559603944, "learning_rate": 1.3699742945637312e-05, "loss": 0.3163, "step": 7709 }, { "epoch": 1.5199132492113565, "grad_norm": 0.5071851732106253, "learning_rate": 1.3698302861976822e-05, "loss": 0.3197, "step": 7710 }, { "epoch": 1.5201104100946372, "grad_norm": 0.5036103402901476, "learning_rate": 1.369686268946163e-05, "loss": 0.3594, "step": 7711 }, { "epoch": 1.5203075709779181, "grad_norm": 0.5082597299605005, "learning_rate": 1.3695422428126335e-05, "loss": 0.3452, "step": 7712 }, { "epoch": 1.5205047318611986, "grad_norm": 0.5034774870521724, "learning_rate": 1.3693982078005538e-05, "loss": 0.3415, "step": 7713 }, { "epoch": 1.5207018927444795, "grad_norm": 0.49017821736880307, "learning_rate": 1.3692541639133849e-05, "loss": 0.3503, "step": 7714 }, { "epoch": 1.5208990536277602, "grad_norm": 0.5033328297669176, "learning_rate": 1.3691101111545873e-05, "loss": 0.3306, "step": 7715 }, { "epoch": 1.521096214511041, "grad_norm": 0.5089744822677265, "learning_rate": 1.368966049527622e-05, "loss": 0.3579, "step": 7716 }, { "epoch": 1.5212933753943219, "grad_norm": 0.5390487095974562, "learning_rate": 1.3688219790359503e-05, "loss": 0.369, "step": 7717 }, { "epoch": 1.5214905362776026, "grad_norm": 0.4996281857585761, "learning_rate": 1.3686778996830335e-05, "loss": 0.317, "step": 7718 }, { "epoch": 1.5216876971608833, "grad_norm": 0.5360343337780741, "learning_rate": 1.3685338114723331e-05, "loss": 0.3536, "step": 7719 }, { "epoch": 1.5218848580441642, "grad_norm": 0.48695308863803854, "learning_rate": 1.3683897144073111e-05, "loss": 0.3319, "step": 7720 }, { "epoch": 1.5220820189274447, "grad_norm": 0.5111947489161915, "learning_rate": 1.36824560849143e-05, "loss": 0.3506, "step": 7721 }, { "epoch": 1.5222791798107256, "grad_norm": 0.515635778087092, "learning_rate": 1.3681014937281509e-05, "loss": 0.2918, "step": 7722 }, { "epoch": 1.5224763406940063, "grad_norm": 0.7712050067029159, "learning_rate": 1.3679573701209376e-05, "loss": 0.3375, "step": 7723 }, { "epoch": 1.522673501577287, "grad_norm": 0.5100802873571525, "learning_rate": 1.3678132376732518e-05, "loss": 0.3686, "step": 7724 }, { "epoch": 1.522870662460568, "grad_norm": 0.4995731991498597, "learning_rate": 1.367669096388557e-05, "loss": 0.3459, "step": 7725 }, { "epoch": 1.5230678233438486, "grad_norm": 0.49021205630646913, "learning_rate": 1.367524946270316e-05, "loss": 0.3568, "step": 7726 }, { "epoch": 1.5232649842271293, "grad_norm": 0.5069952766234989, "learning_rate": 1.3673807873219921e-05, "loss": 0.3652, "step": 7727 }, { "epoch": 1.5234621451104102, "grad_norm": 0.5044160352391255, "learning_rate": 1.367236619547049e-05, "loss": 0.3359, "step": 7728 }, { "epoch": 1.5236593059936907, "grad_norm": 0.5003901713256061, "learning_rate": 1.3670924429489505e-05, "loss": 0.345, "step": 7729 }, { "epoch": 1.5238564668769716, "grad_norm": 0.4700435713643861, "learning_rate": 1.3669482575311604e-05, "loss": 0.3138, "step": 7730 }, { "epoch": 1.5240536277602523, "grad_norm": 0.489703666578115, "learning_rate": 1.366804063297143e-05, "loss": 0.3327, "step": 7731 }, { "epoch": 1.524250788643533, "grad_norm": 0.5862569684725426, "learning_rate": 1.3666598602503622e-05, "loss": 0.3267, "step": 7732 }, { "epoch": 1.524447949526814, "grad_norm": 0.4842290548690208, "learning_rate": 1.3665156483942834e-05, "loss": 0.3538, "step": 7733 }, { "epoch": 1.5246451104100947, "grad_norm": 0.5540354619028578, "learning_rate": 1.3663714277323707e-05, "loss": 0.373, "step": 7734 }, { "epoch": 1.5248422712933754, "grad_norm": 0.49143503772872, "learning_rate": 1.3662271982680895e-05, "loss": 0.3312, "step": 7735 }, { "epoch": 1.5250394321766563, "grad_norm": 0.52098501675389, "learning_rate": 1.366082960004905e-05, "loss": 0.3509, "step": 7736 }, { "epoch": 1.5252365930599368, "grad_norm": 0.4553592686652839, "learning_rate": 1.3659387129462826e-05, "loss": 0.3139, "step": 7737 }, { "epoch": 1.5254337539432177, "grad_norm": 0.519004995842705, "learning_rate": 1.365794457095688e-05, "loss": 0.3671, "step": 7738 }, { "epoch": 1.5256309148264984, "grad_norm": 0.5279828914728227, "learning_rate": 1.3656501924565867e-05, "loss": 0.3654, "step": 7739 }, { "epoch": 1.525828075709779, "grad_norm": 0.47358808531757013, "learning_rate": 1.3655059190324453e-05, "loss": 0.2977, "step": 7740 }, { "epoch": 1.52602523659306, "grad_norm": 0.5047664002702137, "learning_rate": 1.3653616368267297e-05, "loss": 0.3231, "step": 7741 }, { "epoch": 1.5262223974763407, "grad_norm": 0.49780512154971746, "learning_rate": 1.3652173458429068e-05, "loss": 0.3339, "step": 7742 }, { "epoch": 1.5264195583596214, "grad_norm": 0.49842257988288025, "learning_rate": 1.3650730460844428e-05, "loss": 0.3231, "step": 7743 }, { "epoch": 1.5266167192429023, "grad_norm": 0.5112605303580525, "learning_rate": 1.3649287375548052e-05, "loss": 0.3416, "step": 7744 }, { "epoch": 1.526813880126183, "grad_norm": 0.49859797442001647, "learning_rate": 1.3647844202574603e-05, "loss": 0.3594, "step": 7745 }, { "epoch": 1.5270110410094637, "grad_norm": 0.5055274577950344, "learning_rate": 1.3646400941958766e-05, "loss": 0.3519, "step": 7746 }, { "epoch": 1.5272082018927446, "grad_norm": 0.5512290570751095, "learning_rate": 1.3644957593735206e-05, "loss": 0.3529, "step": 7747 }, { "epoch": 1.5274053627760251, "grad_norm": 0.47973030402608713, "learning_rate": 1.3643514157938603e-05, "loss": 0.3028, "step": 7748 }, { "epoch": 1.527602523659306, "grad_norm": 0.5198514112156248, "learning_rate": 1.364207063460364e-05, "loss": 0.3197, "step": 7749 }, { "epoch": 1.5277996845425867, "grad_norm": 0.720977186938447, "learning_rate": 1.3640627023764998e-05, "loss": 0.3224, "step": 7750 }, { "epoch": 1.5279968454258674, "grad_norm": 0.4966494106627751, "learning_rate": 1.363918332545736e-05, "loss": 0.3673, "step": 7751 }, { "epoch": 1.5281940063091484, "grad_norm": 0.4806489488052334, "learning_rate": 1.363773953971541e-05, "loss": 0.3246, "step": 7752 }, { "epoch": 1.528391167192429, "grad_norm": 0.5090026005116808, "learning_rate": 1.3636295666573841e-05, "loss": 0.3489, "step": 7753 }, { "epoch": 1.5285883280757098, "grad_norm": 0.5410349262919377, "learning_rate": 1.3634851706067335e-05, "loss": 0.3381, "step": 7754 }, { "epoch": 1.5287854889589907, "grad_norm": 0.49058400059922475, "learning_rate": 1.3633407658230596e-05, "loss": 0.3363, "step": 7755 }, { "epoch": 1.5289826498422712, "grad_norm": 0.45828128853910793, "learning_rate": 1.3631963523098308e-05, "loss": 0.334, "step": 7756 }, { "epoch": 1.529179810725552, "grad_norm": 0.4813482445211936, "learning_rate": 1.3630519300705171e-05, "loss": 0.3347, "step": 7757 }, { "epoch": 1.5293769716088328, "grad_norm": 0.484990899943496, "learning_rate": 1.3629074991085886e-05, "loss": 0.3298, "step": 7758 }, { "epoch": 1.5295741324921135, "grad_norm": 0.49602894149932886, "learning_rate": 1.3627630594275151e-05, "loss": 0.3363, "step": 7759 }, { "epoch": 1.5297712933753944, "grad_norm": 0.46335979722560355, "learning_rate": 1.3626186110307673e-05, "loss": 0.3197, "step": 7760 }, { "epoch": 1.5299684542586751, "grad_norm": 0.4907689809200294, "learning_rate": 1.3624741539218151e-05, "loss": 0.3183, "step": 7761 }, { "epoch": 1.5301656151419558, "grad_norm": 0.48534404933301745, "learning_rate": 1.3623296881041294e-05, "loss": 0.3298, "step": 7762 }, { "epoch": 1.5303627760252367, "grad_norm": 0.44875294362215107, "learning_rate": 1.3621852135811812e-05, "loss": 0.2968, "step": 7763 }, { "epoch": 1.5305599369085172, "grad_norm": 0.5797019681078132, "learning_rate": 1.3620407303564416e-05, "loss": 0.353, "step": 7764 }, { "epoch": 1.5307570977917981, "grad_norm": 0.5028625687192622, "learning_rate": 1.3618962384333818e-05, "loss": 0.319, "step": 7765 }, { "epoch": 1.5309542586750788, "grad_norm": 0.5099435059525019, "learning_rate": 1.3617517378154737e-05, "loss": 0.3495, "step": 7766 }, { "epoch": 1.5311514195583595, "grad_norm": 0.502597011605929, "learning_rate": 1.3616072285061886e-05, "loss": 0.3425, "step": 7767 }, { "epoch": 1.5313485804416405, "grad_norm": 0.48907353787315166, "learning_rate": 1.3614627105089986e-05, "loss": 0.3266, "step": 7768 }, { "epoch": 1.5315457413249212, "grad_norm": 0.519646837650501, "learning_rate": 1.3613181838273758e-05, "loss": 0.3502, "step": 7769 }, { "epoch": 1.5317429022082019, "grad_norm": 0.51805181382411, "learning_rate": 1.3611736484647928e-05, "loss": 0.3447, "step": 7770 }, { "epoch": 1.5319400630914828, "grad_norm": 0.5093499251255805, "learning_rate": 1.3610291044247218e-05, "loss": 0.3474, "step": 7771 }, { "epoch": 1.5321372239747633, "grad_norm": 0.5281367810739265, "learning_rate": 1.3608845517106364e-05, "loss": 0.3572, "step": 7772 }, { "epoch": 1.5323343848580442, "grad_norm": 0.4742137497487593, "learning_rate": 1.3607399903260085e-05, "loss": 0.3142, "step": 7773 }, { "epoch": 1.5325315457413249, "grad_norm": 0.506813178653113, "learning_rate": 1.3605954202743118e-05, "loss": 0.3509, "step": 7774 }, { "epoch": 1.5327287066246056, "grad_norm": 0.4904880273353688, "learning_rate": 1.36045084155902e-05, "loss": 0.3103, "step": 7775 }, { "epoch": 1.5329258675078865, "grad_norm": 0.49044062116786236, "learning_rate": 1.3603062541836068e-05, "loss": 0.3517, "step": 7776 }, { "epoch": 1.5331230283911672, "grad_norm": 0.4634871291754897, "learning_rate": 1.3601616581515451e-05, "loss": 0.328, "step": 7777 }, { "epoch": 1.533320189274448, "grad_norm": 0.47077009351413396, "learning_rate": 1.3600170534663097e-05, "loss": 0.3169, "step": 7778 }, { "epoch": 1.5335173501577288, "grad_norm": 0.4899373883327769, "learning_rate": 1.3598724401313748e-05, "loss": 0.3151, "step": 7779 }, { "epoch": 1.5337145110410093, "grad_norm": 0.519208902268232, "learning_rate": 1.3597278181502146e-05, "loss": 0.3416, "step": 7780 }, { "epoch": 1.5339116719242902, "grad_norm": 0.504751981494511, "learning_rate": 1.3595831875263038e-05, "loss": 0.354, "step": 7781 }, { "epoch": 1.534108832807571, "grad_norm": 0.5020543827284227, "learning_rate": 1.3594385482631176e-05, "loss": 0.3447, "step": 7782 }, { "epoch": 1.5343059936908516, "grad_norm": 0.47429364811655356, "learning_rate": 1.3592939003641308e-05, "loss": 0.3392, "step": 7783 }, { "epoch": 1.5345031545741326, "grad_norm": 6.77906927343203, "learning_rate": 1.3591492438328185e-05, "loss": 0.3508, "step": 7784 }, { "epoch": 1.5347003154574133, "grad_norm": 0.5933469489722475, "learning_rate": 1.3590045786726565e-05, "loss": 0.3581, "step": 7785 }, { "epoch": 1.534897476340694, "grad_norm": 0.4826236402550896, "learning_rate": 1.3588599048871202e-05, "loss": 0.3339, "step": 7786 }, { "epoch": 1.5350946372239749, "grad_norm": 0.4830828768348468, "learning_rate": 1.358715222479686e-05, "loss": 0.3166, "step": 7787 }, { "epoch": 1.5352917981072554, "grad_norm": 0.5336248468657963, "learning_rate": 1.3585705314538293e-05, "loss": 0.3423, "step": 7788 }, { "epoch": 1.5354889589905363, "grad_norm": 0.5287306065268992, "learning_rate": 1.3584258318130274e-05, "loss": 0.3569, "step": 7789 }, { "epoch": 1.535686119873817, "grad_norm": 0.5579283549576503, "learning_rate": 1.3582811235607559e-05, "loss": 0.3587, "step": 7790 }, { "epoch": 1.5358832807570977, "grad_norm": 0.5054259962581725, "learning_rate": 1.358136406700492e-05, "loss": 0.3343, "step": 7791 }, { "epoch": 1.5360804416403786, "grad_norm": 0.49852211546975295, "learning_rate": 1.3579916812357123e-05, "loss": 0.3143, "step": 7792 }, { "epoch": 1.5362776025236593, "grad_norm": 0.4885451610000915, "learning_rate": 1.3578469471698946e-05, "loss": 0.3347, "step": 7793 }, { "epoch": 1.53647476340694, "grad_norm": 1.097411098465949, "learning_rate": 1.3577022045065154e-05, "loss": 0.3622, "step": 7794 }, { "epoch": 1.536671924290221, "grad_norm": 0.48941610871328045, "learning_rate": 1.3575574532490528e-05, "loss": 0.3244, "step": 7795 }, { "epoch": 1.5368690851735016, "grad_norm": 0.5230416955206131, "learning_rate": 1.3574126934009843e-05, "loss": 0.3488, "step": 7796 }, { "epoch": 1.5370662460567823, "grad_norm": 0.47356891013201374, "learning_rate": 1.3572679249657883e-05, "loss": 0.3262, "step": 7797 }, { "epoch": 1.5372634069400632, "grad_norm": 0.5633856591758994, "learning_rate": 1.3571231479469428e-05, "loss": 0.3206, "step": 7798 }, { "epoch": 1.5374605678233437, "grad_norm": 0.48005492038383385, "learning_rate": 1.3569783623479259e-05, "loss": 0.3251, "step": 7799 }, { "epoch": 1.5376577287066246, "grad_norm": 0.44407837297296465, "learning_rate": 1.3568335681722165e-05, "loss": 0.3067, "step": 7800 }, { "epoch": 1.5378548895899053, "grad_norm": 0.49462114423942827, "learning_rate": 1.3566887654232927e-05, "loss": 0.3368, "step": 7801 }, { "epoch": 1.538052050473186, "grad_norm": 0.5323311830491543, "learning_rate": 1.3565439541046346e-05, "loss": 0.3671, "step": 7802 }, { "epoch": 1.538249211356467, "grad_norm": 0.4877716449979154, "learning_rate": 1.3563991342197207e-05, "loss": 0.3255, "step": 7803 }, { "epoch": 1.5384463722397477, "grad_norm": 0.5199206793938796, "learning_rate": 1.3562543057720308e-05, "loss": 0.3473, "step": 7804 }, { "epoch": 1.5386435331230284, "grad_norm": 0.48858101161536593, "learning_rate": 1.356109468765044e-05, "loss": 0.3512, "step": 7805 }, { "epoch": 1.5388406940063093, "grad_norm": 0.6302130832925654, "learning_rate": 1.3559646232022408e-05, "loss": 0.3233, "step": 7806 }, { "epoch": 1.5390378548895898, "grad_norm": 0.5264654250829359, "learning_rate": 1.3558197690871004e-05, "loss": 0.3619, "step": 7807 }, { "epoch": 1.5392350157728707, "grad_norm": 1.142333806645509, "learning_rate": 1.3556749064231038e-05, "loss": 0.3357, "step": 7808 }, { "epoch": 1.5394321766561514, "grad_norm": 0.5172634182082058, "learning_rate": 1.3555300352137311e-05, "loss": 0.3492, "step": 7809 }, { "epoch": 1.539629337539432, "grad_norm": 0.521433933106068, "learning_rate": 1.3553851554624631e-05, "loss": 0.374, "step": 7810 }, { "epoch": 1.539826498422713, "grad_norm": 0.46871626887951895, "learning_rate": 1.3552402671727805e-05, "loss": 0.325, "step": 7811 }, { "epoch": 1.5400236593059937, "grad_norm": 0.46267385384546883, "learning_rate": 1.3550953703481645e-05, "loss": 0.307, "step": 7812 }, { "epoch": 1.5402208201892744, "grad_norm": 0.5082065142815442, "learning_rate": 1.3549504649920961e-05, "loss": 0.3179, "step": 7813 }, { "epoch": 1.5404179810725553, "grad_norm": 0.4835751507338688, "learning_rate": 1.3548055511080568e-05, "loss": 0.3464, "step": 7814 }, { "epoch": 1.5406151419558358, "grad_norm": 0.5054295099711004, "learning_rate": 1.3546606286995288e-05, "loss": 0.3411, "step": 7815 }, { "epoch": 1.5408123028391167, "grad_norm": 0.5124576121953441, "learning_rate": 1.3545156977699931e-05, "loss": 0.3484, "step": 7816 }, { "epoch": 1.5410094637223974, "grad_norm": 0.5937412421888819, "learning_rate": 1.3543707583229328e-05, "loss": 0.347, "step": 7817 }, { "epoch": 1.5412066246056781, "grad_norm": 0.47414570200167167, "learning_rate": 1.3542258103618293e-05, "loss": 0.333, "step": 7818 }, { "epoch": 1.541403785488959, "grad_norm": 0.4826001472020254, "learning_rate": 1.3540808538901658e-05, "loss": 0.3439, "step": 7819 }, { "epoch": 1.5416009463722398, "grad_norm": 0.46449722652318304, "learning_rate": 1.353935888911424e-05, "loss": 0.3237, "step": 7820 }, { "epoch": 1.5417981072555205, "grad_norm": 0.48229434160473766, "learning_rate": 1.3537909154290883e-05, "loss": 0.3343, "step": 7821 }, { "epoch": 1.5419952681388014, "grad_norm": 0.48229421460382765, "learning_rate": 1.3536459334466403e-05, "loss": 0.3375, "step": 7822 }, { "epoch": 1.5421924290220819, "grad_norm": 0.5116523420598664, "learning_rate": 1.3535009429675641e-05, "loss": 0.3473, "step": 7823 }, { "epoch": 1.5423895899053628, "grad_norm": 0.5776036825506357, "learning_rate": 1.3533559439953429e-05, "loss": 0.3598, "step": 7824 }, { "epoch": 1.5425867507886435, "grad_norm": 0.4835560536957904, "learning_rate": 1.3532109365334609e-05, "loss": 0.3359, "step": 7825 }, { "epoch": 1.5427839116719242, "grad_norm": 0.4967123924164706, "learning_rate": 1.3530659205854018e-05, "loss": 0.3343, "step": 7826 }, { "epoch": 1.5429810725552051, "grad_norm": 0.5275219762814375, "learning_rate": 1.3529208961546494e-05, "loss": 0.357, "step": 7827 }, { "epoch": 1.5431782334384858, "grad_norm": 0.5090525455291536, "learning_rate": 1.3527758632446884e-05, "loss": 0.3525, "step": 7828 }, { "epoch": 1.5433753943217665, "grad_norm": 0.4942938895394185, "learning_rate": 1.3526308218590032e-05, "loss": 0.3466, "step": 7829 }, { "epoch": 1.5435725552050474, "grad_norm": 0.485652733290798, "learning_rate": 1.3524857720010784e-05, "loss": 0.3549, "step": 7830 }, { "epoch": 1.543769716088328, "grad_norm": 0.49030437167938445, "learning_rate": 1.3523407136743992e-05, "loss": 0.3342, "step": 7831 }, { "epoch": 1.5439668769716088, "grad_norm": 0.48804412538773867, "learning_rate": 1.3521956468824505e-05, "loss": 0.3553, "step": 7832 }, { "epoch": 1.5441640378548895, "grad_norm": 0.48837222737277775, "learning_rate": 1.3520505716287178e-05, "loss": 0.3262, "step": 7833 }, { "epoch": 1.5443611987381702, "grad_norm": 0.47270681354383004, "learning_rate": 1.3519054879166867e-05, "loss": 0.3444, "step": 7834 }, { "epoch": 1.5445583596214512, "grad_norm": 0.4926982400357956, "learning_rate": 1.3517603957498426e-05, "loss": 0.3336, "step": 7835 }, { "epoch": 1.5447555205047319, "grad_norm": 0.49014739600256885, "learning_rate": 1.351615295131672e-05, "loss": 0.3392, "step": 7836 }, { "epoch": 1.5449526813880126, "grad_norm": 0.4966773713684002, "learning_rate": 1.3514701860656605e-05, "loss": 0.3234, "step": 7837 }, { "epoch": 1.5451498422712935, "grad_norm": 0.49585084596816736, "learning_rate": 1.351325068555295e-05, "loss": 0.3287, "step": 7838 }, { "epoch": 1.5453470031545742, "grad_norm": 0.4772798404368484, "learning_rate": 1.3511799426040617e-05, "loss": 0.3295, "step": 7839 }, { "epoch": 1.5455441640378549, "grad_norm": 0.49778402900715435, "learning_rate": 1.3510348082154476e-05, "loss": 0.3277, "step": 7840 }, { "epoch": 1.5457413249211358, "grad_norm": 0.4984788297997575, "learning_rate": 1.3508896653929392e-05, "loss": 0.3276, "step": 7841 }, { "epoch": 1.5459384858044163, "grad_norm": 1.3775125906338332, "learning_rate": 1.3507445141400247e-05, "loss": 0.3507, "step": 7842 }, { "epoch": 1.5461356466876972, "grad_norm": 0.4962221209215611, "learning_rate": 1.35059935446019e-05, "loss": 0.363, "step": 7843 }, { "epoch": 1.546332807570978, "grad_norm": 0.5016263606305518, "learning_rate": 1.3504541863569237e-05, "loss": 0.3119, "step": 7844 }, { "epoch": 1.5465299684542586, "grad_norm": 0.5307167375631622, "learning_rate": 1.3503090098337138e-05, "loss": 0.332, "step": 7845 }, { "epoch": 1.5467271293375395, "grad_norm": 0.46696657884783277, "learning_rate": 1.3501638248940475e-05, "loss": 0.3343, "step": 7846 }, { "epoch": 1.5469242902208202, "grad_norm": 0.5225512410564163, "learning_rate": 1.3500186315414133e-05, "loss": 0.3373, "step": 7847 }, { "epoch": 1.547121451104101, "grad_norm": 0.487864098453998, "learning_rate": 1.3498734297792994e-05, "loss": 0.3357, "step": 7848 }, { "epoch": 1.5473186119873819, "grad_norm": 0.4768617570887559, "learning_rate": 1.3497282196111949e-05, "loss": 0.352, "step": 7849 }, { "epoch": 1.5475157728706623, "grad_norm": 0.5331400342710803, "learning_rate": 1.3495830010405884e-05, "loss": 0.3285, "step": 7850 }, { "epoch": 1.5477129337539433, "grad_norm": 0.48686256381769405, "learning_rate": 1.3494377740709685e-05, "loss": 0.3352, "step": 7851 }, { "epoch": 1.547910094637224, "grad_norm": 0.4919653626792826, "learning_rate": 1.3492925387058249e-05, "loss": 0.349, "step": 7852 }, { "epoch": 1.5481072555205047, "grad_norm": 0.5069611129563922, "learning_rate": 1.3491472949486466e-05, "loss": 0.3184, "step": 7853 }, { "epoch": 1.5483044164037856, "grad_norm": 0.4979053390062899, "learning_rate": 1.3490020428029236e-05, "loss": 0.324, "step": 7854 }, { "epoch": 1.5485015772870663, "grad_norm": 0.5968503457298566, "learning_rate": 1.3488567822721453e-05, "loss": 0.3675, "step": 7855 }, { "epoch": 1.548698738170347, "grad_norm": 0.4798040440294239, "learning_rate": 1.3487115133598017e-05, "loss": 0.332, "step": 7856 }, { "epoch": 1.548895899053628, "grad_norm": 0.9676067949318208, "learning_rate": 1.3485662360693834e-05, "loss": 0.3271, "step": 7857 }, { "epoch": 1.5490930599369084, "grad_norm": 0.5290026297802422, "learning_rate": 1.3484209504043804e-05, "loss": 0.3601, "step": 7858 }, { "epoch": 1.5492902208201893, "grad_norm": 0.5572104887123693, "learning_rate": 1.3482756563682837e-05, "loss": 0.3474, "step": 7859 }, { "epoch": 1.54948738170347, "grad_norm": 0.48952981915700133, "learning_rate": 1.3481303539645838e-05, "loss": 0.349, "step": 7860 }, { "epoch": 1.5496845425867507, "grad_norm": 0.49539223483029654, "learning_rate": 1.347985043196772e-05, "loss": 0.3374, "step": 7861 }, { "epoch": 1.5498817034700316, "grad_norm": 0.4935516593268971, "learning_rate": 1.3478397240683387e-05, "loss": 0.3269, "step": 7862 }, { "epoch": 1.5500788643533123, "grad_norm": 0.5036803822601013, "learning_rate": 1.3476943965827765e-05, "loss": 0.3285, "step": 7863 }, { "epoch": 1.550276025236593, "grad_norm": 0.5028733026075429, "learning_rate": 1.3475490607435764e-05, "loss": 0.3494, "step": 7864 }, { "epoch": 1.550473186119874, "grad_norm": 0.4764330154599795, "learning_rate": 1.34740371655423e-05, "loss": 0.3291, "step": 7865 }, { "epoch": 1.5506703470031544, "grad_norm": 2.4174378790452895, "learning_rate": 1.3472583640182298e-05, "loss": 0.3636, "step": 7866 }, { "epoch": 1.5508675078864353, "grad_norm": 0.5015765985791284, "learning_rate": 1.3471130031390673e-05, "loss": 0.332, "step": 7867 }, { "epoch": 1.551064668769716, "grad_norm": 0.5309675714334715, "learning_rate": 1.346967633920236e-05, "loss": 0.3314, "step": 7868 }, { "epoch": 1.5512618296529967, "grad_norm": 0.4868835503536096, "learning_rate": 1.3468222563652274e-05, "loss": 0.3377, "step": 7869 }, { "epoch": 1.5514589905362777, "grad_norm": 0.5127927157453637, "learning_rate": 1.3466768704775348e-05, "loss": 0.3672, "step": 7870 }, { "epoch": 1.5516561514195584, "grad_norm": 0.4827580717241643, "learning_rate": 1.3465314762606513e-05, "loss": 0.3218, "step": 7871 }, { "epoch": 1.551853312302839, "grad_norm": 0.5736851092892326, "learning_rate": 1.3463860737180703e-05, "loss": 0.366, "step": 7872 }, { "epoch": 1.55205047318612, "grad_norm": 0.5318567397637377, "learning_rate": 1.3462406628532846e-05, "loss": 0.3378, "step": 7873 }, { "epoch": 1.5522476340694005, "grad_norm": 0.5155275111603366, "learning_rate": 1.3460952436697883e-05, "loss": 0.3438, "step": 7874 }, { "epoch": 1.5524447949526814, "grad_norm": 0.4775879907549808, "learning_rate": 1.345949816171075e-05, "loss": 0.3427, "step": 7875 }, { "epoch": 1.552641955835962, "grad_norm": 0.5168763642660567, "learning_rate": 1.3458043803606386e-05, "loss": 0.3597, "step": 7876 }, { "epoch": 1.5528391167192428, "grad_norm": 0.5349163438973762, "learning_rate": 1.3456589362419739e-05, "loss": 0.3805, "step": 7877 }, { "epoch": 1.5530362776025237, "grad_norm": 0.5844522900838138, "learning_rate": 1.3455134838185746e-05, "loss": 0.3263, "step": 7878 }, { "epoch": 1.5532334384858044, "grad_norm": 0.4829639630596844, "learning_rate": 1.3453680230939357e-05, "loss": 0.3256, "step": 7879 }, { "epoch": 1.5534305993690851, "grad_norm": 0.5218800494399819, "learning_rate": 1.345222554071552e-05, "loss": 0.3303, "step": 7880 }, { "epoch": 1.553627760252366, "grad_norm": 0.521156173705382, "learning_rate": 1.3450770767549181e-05, "loss": 0.3395, "step": 7881 }, { "epoch": 1.5538249211356467, "grad_norm": 0.4981289635139336, "learning_rate": 1.34493159114753e-05, "loss": 0.3327, "step": 7882 }, { "epoch": 1.5540220820189274, "grad_norm": 0.5448350587172559, "learning_rate": 1.3447860972528823e-05, "loss": 0.3581, "step": 7883 }, { "epoch": 1.5542192429022084, "grad_norm": 0.5056691581563505, "learning_rate": 1.3446405950744709e-05, "loss": 0.3432, "step": 7884 }, { "epoch": 1.5544164037854888, "grad_norm": 0.4953282778846118, "learning_rate": 1.344495084615792e-05, "loss": 0.3137, "step": 7885 }, { "epoch": 1.5546135646687698, "grad_norm": 0.4751287087730977, "learning_rate": 1.344349565880341e-05, "loss": 0.3207, "step": 7886 }, { "epoch": 1.5548107255520505, "grad_norm": 0.5085147826900555, "learning_rate": 1.3442040388716146e-05, "loss": 0.3499, "step": 7887 }, { "epoch": 1.5550078864353312, "grad_norm": 0.47206999350998413, "learning_rate": 1.3440585035931089e-05, "loss": 0.327, "step": 7888 }, { "epoch": 1.555205047318612, "grad_norm": 0.4488347810163907, "learning_rate": 1.3439129600483207e-05, "loss": 0.318, "step": 7889 }, { "epoch": 1.5554022082018928, "grad_norm": 0.5051353448066457, "learning_rate": 1.3437674082407463e-05, "loss": 0.3564, "step": 7890 }, { "epoch": 1.5555993690851735, "grad_norm": 0.49268581152861246, "learning_rate": 1.3436218481738834e-05, "loss": 0.3415, "step": 7891 }, { "epoch": 1.5557965299684544, "grad_norm": 0.45508654653177255, "learning_rate": 1.343476279851229e-05, "loss": 0.3255, "step": 7892 }, { "epoch": 1.555993690851735, "grad_norm": 0.5114403490145619, "learning_rate": 1.3433307032762799e-05, "loss": 0.3453, "step": 7893 }, { "epoch": 1.5561908517350158, "grad_norm": 0.5594515489170677, "learning_rate": 1.3431851184525343e-05, "loss": 0.3382, "step": 7894 }, { "epoch": 1.5563880126182965, "grad_norm": 0.5360994056754633, "learning_rate": 1.3430395253834902e-05, "loss": 0.359, "step": 7895 }, { "epoch": 1.5565851735015772, "grad_norm": 0.5154345101589657, "learning_rate": 1.3428939240726451e-05, "loss": 0.3317, "step": 7896 }, { "epoch": 1.5567823343848581, "grad_norm": 0.5196905638444523, "learning_rate": 1.3427483145234974e-05, "loss": 0.3562, "step": 7897 }, { "epoch": 1.5569794952681388, "grad_norm": 0.6914979302075561, "learning_rate": 1.342602696739545e-05, "loss": 0.3364, "step": 7898 }, { "epoch": 1.5571766561514195, "grad_norm": 0.4739027137576714, "learning_rate": 1.3424570707242875e-05, "loss": 0.3069, "step": 7899 }, { "epoch": 1.5573738170347005, "grad_norm": 0.4986733883254391, "learning_rate": 1.3423114364812229e-05, "loss": 0.3468, "step": 7900 }, { "epoch": 1.557570977917981, "grad_norm": 0.5096555609944065, "learning_rate": 1.3421657940138504e-05, "loss": 0.3436, "step": 7901 }, { "epoch": 1.5577681388012619, "grad_norm": 0.5170708802611125, "learning_rate": 1.342020143325669e-05, "loss": 0.3461, "step": 7902 }, { "epoch": 1.5579652996845426, "grad_norm": 0.5134132452327762, "learning_rate": 1.3418744844201783e-05, "loss": 0.3173, "step": 7903 }, { "epoch": 1.5581624605678233, "grad_norm": 0.48373651440601273, "learning_rate": 1.3417288173008778e-05, "loss": 0.3306, "step": 7904 }, { "epoch": 1.5583596214511042, "grad_norm": 0.5305843575820182, "learning_rate": 1.341583141971267e-05, "loss": 0.359, "step": 7905 }, { "epoch": 1.5585567823343849, "grad_norm": 0.4992440462111545, "learning_rate": 1.3414374584348466e-05, "loss": 0.3436, "step": 7906 }, { "epoch": 1.5587539432176656, "grad_norm": 0.4815491827718251, "learning_rate": 1.3412917666951159e-05, "loss": 0.3462, "step": 7907 }, { "epoch": 1.5589511041009465, "grad_norm": 0.4686319393529876, "learning_rate": 1.3411460667555762e-05, "loss": 0.3345, "step": 7908 }, { "epoch": 1.559148264984227, "grad_norm": 0.4699483011052357, "learning_rate": 1.341000358619727e-05, "loss": 0.3093, "step": 7909 }, { "epoch": 1.559345425867508, "grad_norm": 0.464632109167414, "learning_rate": 1.34085464229107e-05, "loss": 0.3376, "step": 7910 }, { "epoch": 1.5595425867507886, "grad_norm": 0.4823143160131754, "learning_rate": 1.3407089177731052e-05, "loss": 0.3385, "step": 7911 }, { "epoch": 1.5597397476340693, "grad_norm": 0.5218932690257738, "learning_rate": 1.3405631850693347e-05, "loss": 0.3565, "step": 7912 }, { "epoch": 1.5599369085173502, "grad_norm": 0.45189898716468596, "learning_rate": 1.3404174441832592e-05, "loss": 0.3183, "step": 7913 }, { "epoch": 1.560134069400631, "grad_norm": 0.5167028440182481, "learning_rate": 1.3402716951183807e-05, "loss": 0.3643, "step": 7914 }, { "epoch": 1.5603312302839116, "grad_norm": 0.48376625167026677, "learning_rate": 1.3401259378782005e-05, "loss": 0.3401, "step": 7915 }, { "epoch": 1.5605283911671926, "grad_norm": 0.4773795454869428, "learning_rate": 1.3399801724662209e-05, "loss": 0.3224, "step": 7916 }, { "epoch": 1.560725552050473, "grad_norm": 0.4945502868783542, "learning_rate": 1.3398343988859439e-05, "loss": 0.3307, "step": 7917 }, { "epoch": 1.560922712933754, "grad_norm": 0.5019400523084708, "learning_rate": 1.3396886171408717e-05, "loss": 0.3409, "step": 7918 }, { "epoch": 1.5611198738170347, "grad_norm": 0.4691324729142486, "learning_rate": 1.3395428272345067e-05, "loss": 0.325, "step": 7919 }, { "epoch": 1.5613170347003154, "grad_norm": 0.4672956168760278, "learning_rate": 1.3393970291703523e-05, "loss": 0.3032, "step": 7920 }, { "epoch": 1.5615141955835963, "grad_norm": 0.6166235750122606, "learning_rate": 1.3392512229519105e-05, "loss": 0.3498, "step": 7921 }, { "epoch": 1.561711356466877, "grad_norm": 0.4806514171065023, "learning_rate": 1.339105408582685e-05, "loss": 0.3212, "step": 7922 }, { "epoch": 1.5619085173501577, "grad_norm": 0.48674489059156, "learning_rate": 1.3389595860661793e-05, "loss": 0.3374, "step": 7923 }, { "epoch": 1.5621056782334386, "grad_norm": 0.502066062621961, "learning_rate": 1.3388137554058961e-05, "loss": 0.3328, "step": 7924 }, { "epoch": 1.562302839116719, "grad_norm": 0.479052777119119, "learning_rate": 1.33866791660534e-05, "loss": 0.3363, "step": 7925 }, { "epoch": 1.5625, "grad_norm": 0.512932795649453, "learning_rate": 1.3385220696680142e-05, "loss": 0.3632, "step": 7926 }, { "epoch": 1.562697160883281, "grad_norm": 0.5291035392129729, "learning_rate": 1.3383762145974233e-05, "loss": 0.3619, "step": 7927 }, { "epoch": 1.5628943217665614, "grad_norm": 0.49465140055838475, "learning_rate": 1.338230351397071e-05, "loss": 0.3417, "step": 7928 }, { "epoch": 1.5630914826498423, "grad_norm": 0.4967349452813738, "learning_rate": 1.3380844800704624e-05, "loss": 0.3243, "step": 7929 }, { "epoch": 1.563288643533123, "grad_norm": 0.47414238203469705, "learning_rate": 1.3379386006211021e-05, "loss": 0.3145, "step": 7930 }, { "epoch": 1.5634858044164037, "grad_norm": 0.5112298596770326, "learning_rate": 1.3377927130524943e-05, "loss": 0.3532, "step": 7931 }, { "epoch": 1.5636829652996846, "grad_norm": 0.49210192456304214, "learning_rate": 1.337646817368145e-05, "loss": 0.3367, "step": 7932 }, { "epoch": 1.5638801261829653, "grad_norm": 0.5028321558489999, "learning_rate": 1.3375009135715584e-05, "loss": 0.3532, "step": 7933 }, { "epoch": 1.564077287066246, "grad_norm": 0.45880232523040504, "learning_rate": 1.3373550016662414e-05, "loss": 0.3176, "step": 7934 }, { "epoch": 1.564274447949527, "grad_norm": 0.5003629680329704, "learning_rate": 1.337209081655698e-05, "loss": 0.3133, "step": 7935 }, { "epoch": 1.5644716088328074, "grad_norm": 0.5046115931642703, "learning_rate": 1.3370631535434356e-05, "loss": 0.3378, "step": 7936 }, { "epoch": 1.5646687697160884, "grad_norm": 0.47177267681957447, "learning_rate": 1.3369172173329588e-05, "loss": 0.322, "step": 7937 }, { "epoch": 1.564865930599369, "grad_norm": 0.5082813052750841, "learning_rate": 1.3367712730277748e-05, "loss": 0.3357, "step": 7938 }, { "epoch": 1.5650630914826498, "grad_norm": 0.509547876262465, "learning_rate": 1.33662532063139e-05, "loss": 0.3505, "step": 7939 }, { "epoch": 1.5652602523659307, "grad_norm": 0.4783658190779783, "learning_rate": 1.3364793601473105e-05, "loss": 0.3284, "step": 7940 }, { "epoch": 1.5654574132492114, "grad_norm": 0.5056684800626197, "learning_rate": 1.3363333915790435e-05, "loss": 0.3331, "step": 7941 }, { "epoch": 1.565654574132492, "grad_norm": 0.5243617531078861, "learning_rate": 1.336187414930096e-05, "loss": 0.3303, "step": 7942 }, { "epoch": 1.565851735015773, "grad_norm": 0.49836190761234994, "learning_rate": 1.336041430203975e-05, "loss": 0.3452, "step": 7943 }, { "epoch": 1.5660488958990535, "grad_norm": 0.4968929524695866, "learning_rate": 1.3358954374041882e-05, "loss": 0.3172, "step": 7944 }, { "epoch": 1.5662460567823344, "grad_norm": 0.4356287567333736, "learning_rate": 1.335749436534243e-05, "loss": 0.2964, "step": 7945 }, { "epoch": 1.5664432176656151, "grad_norm": 0.4984101635486525, "learning_rate": 1.335603427597647e-05, "loss": 0.355, "step": 7946 }, { "epoch": 1.5666403785488958, "grad_norm": 0.47682247093029434, "learning_rate": 1.3354574105979085e-05, "loss": 0.3365, "step": 7947 }, { "epoch": 1.5668375394321767, "grad_norm": 0.4845515350210312, "learning_rate": 1.3353113855385356e-05, "loss": 0.3208, "step": 7948 }, { "epoch": 1.5670347003154574, "grad_norm": 0.5275550760728646, "learning_rate": 1.3351653524230366e-05, "loss": 0.3523, "step": 7949 }, { "epoch": 1.5672318611987381, "grad_norm": 0.4735343037228871, "learning_rate": 1.3350193112549202e-05, "loss": 0.3279, "step": 7950 }, { "epoch": 1.567429022082019, "grad_norm": 0.46954231120457435, "learning_rate": 1.334873262037695e-05, "loss": 0.3417, "step": 7951 }, { "epoch": 1.5676261829652995, "grad_norm": 0.9705396445603504, "learning_rate": 1.3347272047748696e-05, "loss": 0.3745, "step": 7952 }, { "epoch": 1.5678233438485805, "grad_norm": 0.513132943966501, "learning_rate": 1.3345811394699542e-05, "loss": 0.3437, "step": 7953 }, { "epoch": 1.5680205047318612, "grad_norm": 0.4859610432771261, "learning_rate": 1.3344350661264568e-05, "loss": 0.3525, "step": 7954 }, { "epoch": 1.5682176656151419, "grad_norm": 0.5006180525289092, "learning_rate": 1.3342889847478884e-05, "loss": 0.3054, "step": 7955 }, { "epoch": 1.5684148264984228, "grad_norm": 0.49462467277505867, "learning_rate": 1.3341428953377574e-05, "loss": 0.3457, "step": 7956 }, { "epoch": 1.5686119873817035, "grad_norm": 0.4960059003422695, "learning_rate": 1.3339967978995746e-05, "loss": 0.3262, "step": 7957 }, { "epoch": 1.5688091482649842, "grad_norm": 0.5065110926656363, "learning_rate": 1.3338506924368494e-05, "loss": 0.3368, "step": 7958 }, { "epoch": 1.569006309148265, "grad_norm": 0.49924087045412563, "learning_rate": 1.3337045789530927e-05, "loss": 0.3544, "step": 7959 }, { "epoch": 1.5692034700315456, "grad_norm": 0.5450061812449806, "learning_rate": 1.3335584574518148e-05, "loss": 0.3455, "step": 7960 }, { "epoch": 1.5694006309148265, "grad_norm": 0.5370940243152774, "learning_rate": 1.333412327936526e-05, "loss": 0.3212, "step": 7961 }, { "epoch": 1.5695977917981072, "grad_norm": 0.48962467971522494, "learning_rate": 1.333266190410738e-05, "loss": 0.327, "step": 7962 }, { "epoch": 1.569794952681388, "grad_norm": 0.4967444956543661, "learning_rate": 1.3331200448779611e-05, "loss": 0.3412, "step": 7963 }, { "epoch": 1.5699921135646688, "grad_norm": 1.5137118599387835, "learning_rate": 1.332973891341707e-05, "loss": 0.3315, "step": 7964 }, { "epoch": 1.5701892744479495, "grad_norm": 0.5024737388313264, "learning_rate": 1.332827729805487e-05, "loss": 0.3528, "step": 7965 }, { "epoch": 1.5703864353312302, "grad_norm": 0.5259290240369583, "learning_rate": 1.3326815602728127e-05, "loss": 0.3713, "step": 7966 }, { "epoch": 1.5705835962145112, "grad_norm": 0.5279028181427355, "learning_rate": 1.332535382747196e-05, "loss": 0.3557, "step": 7967 }, { "epoch": 1.5707807570977916, "grad_norm": 0.7224209367198574, "learning_rate": 1.332389197232149e-05, "loss": 0.3688, "step": 7968 }, { "epoch": 1.5709779179810726, "grad_norm": 0.48099183427409264, "learning_rate": 1.3322430037311837e-05, "loss": 0.3116, "step": 7969 }, { "epoch": 1.5711750788643533, "grad_norm": 0.508998063080973, "learning_rate": 1.332096802247813e-05, "loss": 0.3365, "step": 7970 }, { "epoch": 1.571372239747634, "grad_norm": 0.49220793045424055, "learning_rate": 1.331950592785549e-05, "loss": 0.3522, "step": 7971 }, { "epoch": 1.5715694006309149, "grad_norm": 0.5332391139433758, "learning_rate": 1.3318043753479047e-05, "loss": 0.3892, "step": 7972 }, { "epoch": 1.5717665615141956, "grad_norm": 0.4951350369532224, "learning_rate": 1.3316581499383929e-05, "loss": 0.3295, "step": 7973 }, { "epoch": 1.5719637223974763, "grad_norm": 0.5140523975794388, "learning_rate": 1.3315119165605273e-05, "loss": 0.3443, "step": 7974 }, { "epoch": 1.5721608832807572, "grad_norm": 0.5191099345240461, "learning_rate": 1.3313656752178205e-05, "loss": 0.3456, "step": 7975 }, { "epoch": 1.572358044164038, "grad_norm": 0.5150428182108653, "learning_rate": 1.331219425913787e-05, "loss": 0.3262, "step": 7976 }, { "epoch": 1.5725552050473186, "grad_norm": 0.4909428641563583, "learning_rate": 1.3310731686519397e-05, "loss": 0.3172, "step": 7977 }, { "epoch": 1.5727523659305995, "grad_norm": 0.4961393599175189, "learning_rate": 1.3309269034357931e-05, "loss": 0.3097, "step": 7978 }, { "epoch": 1.57294952681388, "grad_norm": 0.5318028453318919, "learning_rate": 1.330780630268861e-05, "loss": 0.3635, "step": 7979 }, { "epoch": 1.573146687697161, "grad_norm": 0.5996323717685671, "learning_rate": 1.3306343491546581e-05, "loss": 0.3703, "step": 7980 }, { "epoch": 1.5733438485804416, "grad_norm": 1.5419005107960566, "learning_rate": 1.3304880600966985e-05, "loss": 0.3559, "step": 7981 }, { "epoch": 1.5735410094637223, "grad_norm": 0.4648861254663534, "learning_rate": 1.3303417630984972e-05, "loss": 0.31, "step": 7982 }, { "epoch": 1.5737381703470033, "grad_norm": 0.5014405931273911, "learning_rate": 1.3301954581635692e-05, "loss": 0.3542, "step": 7983 }, { "epoch": 1.573935331230284, "grad_norm": 0.47324297334032567, "learning_rate": 1.3300491452954292e-05, "loss": 0.3439, "step": 7984 }, { "epoch": 1.5741324921135647, "grad_norm": 0.5098324583372497, "learning_rate": 1.3299028244975929e-05, "loss": 0.3528, "step": 7985 }, { "epoch": 1.5743296529968456, "grad_norm": 0.48462830498298404, "learning_rate": 1.3297564957735752e-05, "loss": 0.3318, "step": 7986 }, { "epoch": 1.574526813880126, "grad_norm": 0.5391632212698709, "learning_rate": 1.3296101591268924e-05, "loss": 0.3632, "step": 7987 }, { "epoch": 1.574723974763407, "grad_norm": 0.48926713072275946, "learning_rate": 1.3294638145610598e-05, "loss": 0.314, "step": 7988 }, { "epoch": 1.5749211356466877, "grad_norm": 0.5220045491815719, "learning_rate": 1.3293174620795942e-05, "loss": 0.3469, "step": 7989 }, { "epoch": 1.5751182965299684, "grad_norm": 0.5103365539849146, "learning_rate": 1.329171101686011e-05, "loss": 0.3431, "step": 7990 }, { "epoch": 1.5753154574132493, "grad_norm": 0.49515948764836015, "learning_rate": 1.329024733383827e-05, "loss": 0.312, "step": 7991 }, { "epoch": 1.57551261829653, "grad_norm": 0.47764308725452986, "learning_rate": 1.328878357176559e-05, "loss": 0.3156, "step": 7992 }, { "epoch": 1.5757097791798107, "grad_norm": 0.4913518129002428, "learning_rate": 1.3287319730677237e-05, "loss": 0.3449, "step": 7993 }, { "epoch": 1.5759069400630916, "grad_norm": 0.5199989452514197, "learning_rate": 1.3285855810608377e-05, "loss": 0.3572, "step": 7994 }, { "epoch": 1.576104100946372, "grad_norm": 0.46619302997703116, "learning_rate": 1.3284391811594191e-05, "loss": 0.3115, "step": 7995 }, { "epoch": 1.576301261829653, "grad_norm": 0.532011015620942, "learning_rate": 1.3282927733669842e-05, "loss": 0.3377, "step": 7996 }, { "epoch": 1.5764984227129337, "grad_norm": 1.440253937845809, "learning_rate": 1.328146357687051e-05, "loss": 0.3688, "step": 7997 }, { "epoch": 1.5766955835962144, "grad_norm": 1.0850935873776184, "learning_rate": 1.3279999341231375e-05, "loss": 0.338, "step": 7998 }, { "epoch": 1.5768927444794953, "grad_norm": 0.5168104366063799, "learning_rate": 1.3278535026787614e-05, "loss": 0.3501, "step": 7999 }, { "epoch": 1.577089905362776, "grad_norm": 0.5139066874038835, "learning_rate": 1.3277070633574409e-05, "loss": 0.3414, "step": 8000 }, { "epoch": 1.5772870662460567, "grad_norm": 0.48443540557771675, "learning_rate": 1.3275606161626941e-05, "loss": 0.3271, "step": 8001 }, { "epoch": 1.5774842271293377, "grad_norm": 0.5239644881242183, "learning_rate": 1.32741416109804e-05, "loss": 0.3445, "step": 8002 }, { "epoch": 1.5776813880126181, "grad_norm": 0.46561515482433635, "learning_rate": 1.3272676981669968e-05, "loss": 0.3232, "step": 8003 }, { "epoch": 1.577878548895899, "grad_norm": 0.5355952021642951, "learning_rate": 1.327121227373084e-05, "loss": 0.3552, "step": 8004 }, { "epoch": 1.5780757097791798, "grad_norm": 0.511194087803704, "learning_rate": 1.3269747487198197e-05, "loss": 0.3651, "step": 8005 }, { "epoch": 1.5782728706624605, "grad_norm": 0.463393152472201, "learning_rate": 1.326828262210724e-05, "loss": 0.3282, "step": 8006 }, { "epoch": 1.5784700315457414, "grad_norm": 0.49704658525209167, "learning_rate": 1.326681767849316e-05, "loss": 0.3174, "step": 8007 }, { "epoch": 1.578667192429022, "grad_norm": 0.5412028232341971, "learning_rate": 1.3265352656391158e-05, "loss": 0.349, "step": 8008 }, { "epoch": 1.5788643533123028, "grad_norm": 0.4698860322570366, "learning_rate": 1.3263887555836425e-05, "loss": 0.3106, "step": 8009 }, { "epoch": 1.5790615141955837, "grad_norm": 0.4982618521435573, "learning_rate": 1.3262422376864168e-05, "loss": 0.3262, "step": 8010 }, { "epoch": 1.5792586750788642, "grad_norm": 0.539325905160115, "learning_rate": 1.3260957119509586e-05, "loss": 0.3502, "step": 8011 }, { "epoch": 1.5794558359621451, "grad_norm": 0.5288545896046527, "learning_rate": 1.325949178380788e-05, "loss": 0.3692, "step": 8012 }, { "epoch": 1.5796529968454258, "grad_norm": 0.4758545656821426, "learning_rate": 1.3258026369794261e-05, "loss": 0.3283, "step": 8013 }, { "epoch": 1.5798501577287065, "grad_norm": 0.5218405213561202, "learning_rate": 1.3256560877503936e-05, "loss": 0.3415, "step": 8014 }, { "epoch": 1.5800473186119874, "grad_norm": 0.4894784063780892, "learning_rate": 1.3255095306972112e-05, "loss": 0.3269, "step": 8015 }, { "epoch": 1.5802444794952681, "grad_norm": 0.5110496055324127, "learning_rate": 1.3253629658234002e-05, "loss": 0.3542, "step": 8016 }, { "epoch": 1.5804416403785488, "grad_norm": 0.49174003178088704, "learning_rate": 1.325216393132482e-05, "loss": 0.3394, "step": 8017 }, { "epoch": 1.5806388012618298, "grad_norm": 0.47299800973515416, "learning_rate": 1.3250698126279781e-05, "loss": 0.3419, "step": 8018 }, { "epoch": 1.5808359621451105, "grad_norm": 0.4822158106803239, "learning_rate": 1.32492322431341e-05, "loss": 0.3539, "step": 8019 }, { "epoch": 1.5810331230283912, "grad_norm": 0.47952679544937005, "learning_rate": 1.3247766281922998e-05, "loss": 0.3468, "step": 8020 }, { "epoch": 1.581230283911672, "grad_norm": 0.46714001850575293, "learning_rate": 1.3246300242681698e-05, "loss": 0.3468, "step": 8021 }, { "epoch": 1.5814274447949526, "grad_norm": 0.5770765745101369, "learning_rate": 1.3244834125445415e-05, "loss": 0.3658, "step": 8022 }, { "epoch": 1.5816246056782335, "grad_norm": 0.466026202641457, "learning_rate": 1.3243367930249386e-05, "loss": 0.3342, "step": 8023 }, { "epoch": 1.5818217665615142, "grad_norm": 0.5082981481974369, "learning_rate": 1.3241901657128827e-05, "loss": 0.3635, "step": 8024 }, { "epoch": 1.5820189274447949, "grad_norm": 0.48542475906619664, "learning_rate": 1.3240435306118973e-05, "loss": 0.3553, "step": 8025 }, { "epoch": 1.5822160883280758, "grad_norm": 0.4787474049672265, "learning_rate": 1.3238968877255044e-05, "loss": 0.3357, "step": 8026 }, { "epoch": 1.5824132492113565, "grad_norm": 0.5075996681809781, "learning_rate": 1.3237502370572287e-05, "loss": 0.3479, "step": 8027 }, { "epoch": 1.5826104100946372, "grad_norm": 0.5129452355663832, "learning_rate": 1.3236035786105922e-05, "loss": 0.3598, "step": 8028 }, { "epoch": 1.5828075709779181, "grad_norm": 0.45167882333931725, "learning_rate": 1.3234569123891197e-05, "loss": 0.3199, "step": 8029 }, { "epoch": 1.5830047318611986, "grad_norm": 0.4891940893562189, "learning_rate": 1.3233102383963341e-05, "loss": 0.3588, "step": 8030 }, { "epoch": 1.5832018927444795, "grad_norm": 0.4924263041994256, "learning_rate": 1.3231635566357599e-05, "loss": 0.3558, "step": 8031 }, { "epoch": 1.5833990536277602, "grad_norm": 0.5691351727091879, "learning_rate": 1.3230168671109207e-05, "loss": 0.3215, "step": 8032 }, { "epoch": 1.583596214511041, "grad_norm": 0.5454873000853145, "learning_rate": 1.3228701698253415e-05, "loss": 0.3561, "step": 8033 }, { "epoch": 1.5837933753943219, "grad_norm": 4.541252989056811, "learning_rate": 1.3227234647825463e-05, "loss": 0.329, "step": 8034 }, { "epoch": 1.5839905362776026, "grad_norm": 0.5120743849675218, "learning_rate": 1.3225767519860597e-05, "loss": 0.342, "step": 8035 }, { "epoch": 1.5841876971608833, "grad_norm": 0.5241749900315733, "learning_rate": 1.3224300314394073e-05, "loss": 0.3847, "step": 8036 }, { "epoch": 1.5843848580441642, "grad_norm": 0.5165462915069103, "learning_rate": 1.3222833031461133e-05, "loss": 0.3414, "step": 8037 }, { "epoch": 1.5845820189274447, "grad_norm": 0.5189206607460046, "learning_rate": 1.3221365671097038e-05, "loss": 0.3291, "step": 8038 }, { "epoch": 1.5847791798107256, "grad_norm": 0.5059501509473748, "learning_rate": 1.3219898233337036e-05, "loss": 0.3249, "step": 8039 }, { "epoch": 1.5849763406940063, "grad_norm": 0.4904431912501636, "learning_rate": 1.321843071821639e-05, "loss": 0.3435, "step": 8040 }, { "epoch": 1.585173501577287, "grad_norm": 0.45852925927612104, "learning_rate": 1.3216963125770345e-05, "loss": 0.3055, "step": 8041 }, { "epoch": 1.585370662460568, "grad_norm": 0.5589366409498242, "learning_rate": 1.3215495456034179e-05, "loss": 0.3253, "step": 8042 }, { "epoch": 1.5855678233438486, "grad_norm": 0.49532146559960627, "learning_rate": 1.3214027709043142e-05, "loss": 0.3442, "step": 8043 }, { "epoch": 1.5857649842271293, "grad_norm": 0.49902148390151746, "learning_rate": 1.3212559884832503e-05, "loss": 0.3499, "step": 8044 }, { "epoch": 1.5859621451104102, "grad_norm": 0.5117292560455523, "learning_rate": 1.3211091983437524e-05, "loss": 0.3502, "step": 8045 }, { "epoch": 1.5861593059936907, "grad_norm": 0.4829964496583425, "learning_rate": 1.3209624004893476e-05, "loss": 0.321, "step": 8046 }, { "epoch": 1.5863564668769716, "grad_norm": 0.5776878181785877, "learning_rate": 1.3208155949235621e-05, "loss": 0.3194, "step": 8047 }, { "epoch": 1.5865536277602523, "grad_norm": 0.44848690865415186, "learning_rate": 1.3206687816499242e-05, "loss": 0.3077, "step": 8048 }, { "epoch": 1.586750788643533, "grad_norm": 0.4881456482953011, "learning_rate": 1.3205219606719606e-05, "loss": 0.3246, "step": 8049 }, { "epoch": 1.586947949526814, "grad_norm": 0.5011767838202971, "learning_rate": 1.3203751319931983e-05, "loss": 0.355, "step": 8050 }, { "epoch": 1.5871451104100947, "grad_norm": 0.5664107192411584, "learning_rate": 1.320228295617166e-05, "loss": 0.3579, "step": 8051 }, { "epoch": 1.5873422712933754, "grad_norm": 3.4551884902309777, "learning_rate": 1.3200814515473905e-05, "loss": 0.3417, "step": 8052 }, { "epoch": 1.5875394321766563, "grad_norm": 0.5286194871009565, "learning_rate": 1.3199345997874007e-05, "loss": 0.3351, "step": 8053 }, { "epoch": 1.5877365930599368, "grad_norm": 0.4625651878543429, "learning_rate": 1.3197877403407242e-05, "loss": 0.3198, "step": 8054 }, { "epoch": 1.5879337539432177, "grad_norm": 0.6352734118743283, "learning_rate": 1.31964087321089e-05, "loss": 0.3448, "step": 8055 }, { "epoch": 1.5881309148264984, "grad_norm": 0.503782340045354, "learning_rate": 1.3194939984014263e-05, "loss": 0.354, "step": 8056 }, { "epoch": 1.588328075709779, "grad_norm": 0.5794061421651925, "learning_rate": 1.3193471159158621e-05, "loss": 0.3621, "step": 8057 }, { "epoch": 1.58852523659306, "grad_norm": 0.49275969055339675, "learning_rate": 1.3192002257577263e-05, "loss": 0.3333, "step": 8058 }, { "epoch": 1.5887223974763407, "grad_norm": 0.4815640944601471, "learning_rate": 1.319053327930548e-05, "loss": 0.341, "step": 8059 }, { "epoch": 1.5889195583596214, "grad_norm": 0.4613014999066501, "learning_rate": 1.3189064224378562e-05, "loss": 0.3101, "step": 8060 }, { "epoch": 1.5891167192429023, "grad_norm": 0.48626447035569825, "learning_rate": 1.3187595092831813e-05, "loss": 0.3146, "step": 8061 }, { "epoch": 1.589313880126183, "grad_norm": 0.4577848208645168, "learning_rate": 1.3186125884700522e-05, "loss": 0.3175, "step": 8062 }, { "epoch": 1.5895110410094637, "grad_norm": 0.49457076312235915, "learning_rate": 1.3184656600019992e-05, "loss": 0.353, "step": 8063 }, { "epoch": 1.5897082018927446, "grad_norm": 0.47999535138775207, "learning_rate": 1.318318723882552e-05, "loss": 0.3392, "step": 8064 }, { "epoch": 1.5899053627760251, "grad_norm": 0.5418783986590079, "learning_rate": 1.3181717801152414e-05, "loss": 0.3415, "step": 8065 }, { "epoch": 1.590102523659306, "grad_norm": 0.5096260948506632, "learning_rate": 1.3180248287035977e-05, "loss": 0.3262, "step": 8066 }, { "epoch": 1.5902996845425867, "grad_norm": 0.4906247366527042, "learning_rate": 1.3178778696511511e-05, "loss": 0.3384, "step": 8067 }, { "epoch": 1.5904968454258674, "grad_norm": 0.470211162945986, "learning_rate": 1.317730902961433e-05, "loss": 0.3244, "step": 8068 }, { "epoch": 1.5906940063091484, "grad_norm": 0.49115378811432053, "learning_rate": 1.3175839286379734e-05, "loss": 0.3422, "step": 8069 }, { "epoch": 1.590891167192429, "grad_norm": 0.5013805949559059, "learning_rate": 1.3174369466843048e-05, "loss": 0.3441, "step": 8070 }, { "epoch": 1.5910883280757098, "grad_norm": 0.4876708210398381, "learning_rate": 1.3172899571039577e-05, "loss": 0.3444, "step": 8071 }, { "epoch": 1.5912854889589907, "grad_norm": 0.45242714845948756, "learning_rate": 1.3171429599004641e-05, "loss": 0.3162, "step": 8072 }, { "epoch": 1.5914826498422712, "grad_norm": 0.47227360673852276, "learning_rate": 1.316995955077355e-05, "loss": 0.3263, "step": 8073 }, { "epoch": 1.591679810725552, "grad_norm": 0.4875936546980924, "learning_rate": 1.3168489426381635e-05, "loss": 0.3398, "step": 8074 }, { "epoch": 1.5918769716088328, "grad_norm": 0.483433090727928, "learning_rate": 1.3167019225864203e-05, "loss": 0.3262, "step": 8075 }, { "epoch": 1.5920741324921135, "grad_norm": 0.4983273920225542, "learning_rate": 1.3165548949256586e-05, "loss": 0.3528, "step": 8076 }, { "epoch": 1.5922712933753944, "grad_norm": 0.48450953137363295, "learning_rate": 1.3164078596594107e-05, "loss": 0.3452, "step": 8077 }, { "epoch": 1.5924684542586751, "grad_norm": 0.4623023529103843, "learning_rate": 1.3162608167912091e-05, "loss": 0.3421, "step": 8078 }, { "epoch": 1.5926656151419558, "grad_norm": 1.1891945757617244, "learning_rate": 1.3161137663245869e-05, "loss": 0.335, "step": 8079 }, { "epoch": 1.5928627760252367, "grad_norm": 0.46625342878642473, "learning_rate": 1.3159667082630768e-05, "loss": 0.3017, "step": 8080 }, { "epoch": 1.5930599369085172, "grad_norm": 0.4885959593564677, "learning_rate": 1.3158196426102121e-05, "loss": 0.3207, "step": 8081 }, { "epoch": 1.5932570977917981, "grad_norm": 0.5374079371161415, "learning_rate": 1.315672569369526e-05, "loss": 0.3292, "step": 8082 }, { "epoch": 1.5934542586750788, "grad_norm": 0.47767332144300273, "learning_rate": 1.3155254885445526e-05, "loss": 0.3134, "step": 8083 }, { "epoch": 1.5936514195583595, "grad_norm": 0.4563667095812108, "learning_rate": 1.3153784001388249e-05, "loss": 0.3181, "step": 8084 }, { "epoch": 1.5938485804416405, "grad_norm": 0.4891329550595928, "learning_rate": 1.315231304155877e-05, "loss": 0.352, "step": 8085 }, { "epoch": 1.5940457413249212, "grad_norm": 0.4746370559092902, "learning_rate": 1.3150842005992434e-05, "loss": 0.3107, "step": 8086 }, { "epoch": 1.5942429022082019, "grad_norm": 0.5821961651457958, "learning_rate": 1.3149370894724583e-05, "loss": 0.3215, "step": 8087 }, { "epoch": 1.5944400630914828, "grad_norm": 0.4692821907783642, "learning_rate": 1.3147899707790557e-05, "loss": 0.3383, "step": 8088 }, { "epoch": 1.5946372239747633, "grad_norm": 0.6369717589355536, "learning_rate": 1.3146428445225708e-05, "loss": 0.3516, "step": 8089 }, { "epoch": 1.5948343848580442, "grad_norm": 0.5091082227275152, "learning_rate": 1.3144957107065379e-05, "loss": 0.3598, "step": 8090 }, { "epoch": 1.5950315457413249, "grad_norm": 0.4761782875190844, "learning_rate": 1.3143485693344925e-05, "loss": 0.3409, "step": 8091 }, { "epoch": 1.5952287066246056, "grad_norm": 0.46602014777166734, "learning_rate": 1.3142014204099696e-05, "loss": 0.3135, "step": 8092 }, { "epoch": 1.5954258675078865, "grad_norm": 0.4514528986309173, "learning_rate": 1.3140542639365047e-05, "loss": 0.3134, "step": 8093 }, { "epoch": 1.5956230283911672, "grad_norm": 0.5297263361088499, "learning_rate": 1.3139070999176326e-05, "loss": 0.3419, "step": 8094 }, { "epoch": 1.595820189274448, "grad_norm": 0.5469443697151118, "learning_rate": 1.3137599283568902e-05, "loss": 0.3563, "step": 8095 }, { "epoch": 1.5960173501577288, "grad_norm": 0.4957278314999418, "learning_rate": 1.3136127492578126e-05, "loss": 0.355, "step": 8096 }, { "epoch": 1.5962145110410093, "grad_norm": 0.5166791104046938, "learning_rate": 1.3134655626239363e-05, "loss": 0.3296, "step": 8097 }, { "epoch": 1.5964116719242902, "grad_norm": 0.5016830354531797, "learning_rate": 1.3133183684587974e-05, "loss": 0.3412, "step": 8098 }, { "epoch": 1.596608832807571, "grad_norm": 0.4893337636428677, "learning_rate": 1.3131711667659323e-05, "loss": 0.339, "step": 8099 }, { "epoch": 1.5968059936908516, "grad_norm": 0.467330301371642, "learning_rate": 1.3130239575488777e-05, "loss": 0.2984, "step": 8100 }, { "epoch": 1.5970031545741326, "grad_norm": 3.5270122225585343, "learning_rate": 1.3128767408111704e-05, "loss": 0.3816, "step": 8101 }, { "epoch": 1.5972003154574133, "grad_norm": 0.46859234609245476, "learning_rate": 1.3127295165563476e-05, "loss": 0.3137, "step": 8102 }, { "epoch": 1.597397476340694, "grad_norm": 0.5086912187536752, "learning_rate": 1.3125822847879464e-05, "loss": 0.364, "step": 8103 }, { "epoch": 1.5975946372239749, "grad_norm": 0.46400065943464436, "learning_rate": 1.312435045509504e-05, "loss": 0.3094, "step": 8104 }, { "epoch": 1.5977917981072554, "grad_norm": 0.5054767816862595, "learning_rate": 1.3122877987245579e-05, "loss": 0.3287, "step": 8105 }, { "epoch": 1.5979889589905363, "grad_norm": 0.49607387723822316, "learning_rate": 1.3121405444366459e-05, "loss": 0.3377, "step": 8106 }, { "epoch": 1.598186119873817, "grad_norm": 0.4921049056516874, "learning_rate": 1.3119932826493063e-05, "loss": 0.3322, "step": 8107 }, { "epoch": 1.5983832807570977, "grad_norm": 0.4850703791480183, "learning_rate": 1.3118460133660766e-05, "loss": 0.317, "step": 8108 }, { "epoch": 1.5985804416403786, "grad_norm": 0.4917675892451221, "learning_rate": 1.3116987365904951e-05, "loss": 0.3443, "step": 8109 }, { "epoch": 1.5987776025236593, "grad_norm": 0.5527143499695473, "learning_rate": 1.3115514523261008e-05, "loss": 0.317, "step": 8110 }, { "epoch": 1.59897476340694, "grad_norm": 0.4889167905883699, "learning_rate": 1.3114041605764319e-05, "loss": 0.3288, "step": 8111 }, { "epoch": 1.599171924290221, "grad_norm": 0.4604739649545089, "learning_rate": 1.3112568613450271e-05, "loss": 0.3237, "step": 8112 }, { "epoch": 1.5993690851735016, "grad_norm": 0.4826891223593189, "learning_rate": 1.3111095546354257e-05, "loss": 0.3372, "step": 8113 }, { "epoch": 1.5995662460567823, "grad_norm": 0.4627352710214227, "learning_rate": 1.3109622404511669e-05, "loss": 0.3274, "step": 8114 }, { "epoch": 1.5997634069400632, "grad_norm": 0.5183424926340384, "learning_rate": 1.3108149187957895e-05, "loss": 0.3668, "step": 8115 }, { "epoch": 1.5999605678233437, "grad_norm": 0.4890735359860389, "learning_rate": 1.3106675896728334e-05, "loss": 0.3281, "step": 8116 }, { "epoch": 1.6001577287066246, "grad_norm": 0.4663459951593309, "learning_rate": 1.3105202530858386e-05, "loss": 0.3157, "step": 8117 }, { "epoch": 1.6003548895899053, "grad_norm": 0.4642219805501338, "learning_rate": 1.310372909038344e-05, "loss": 0.322, "step": 8118 }, { "epoch": 1.600552050473186, "grad_norm": 0.4883535083604666, "learning_rate": 1.3102255575338912e-05, "loss": 0.3544, "step": 8119 }, { "epoch": 1.600749211356467, "grad_norm": 0.4938965248583283, "learning_rate": 1.3100781985760188e-05, "loss": 0.3415, "step": 8120 }, { "epoch": 1.6009463722397477, "grad_norm": 0.508009918172584, "learning_rate": 1.3099308321682685e-05, "loss": 0.3645, "step": 8121 }, { "epoch": 1.6011435331230284, "grad_norm": 0.4977394454270282, "learning_rate": 1.30978345831418e-05, "loss": 0.37, "step": 8122 }, { "epoch": 1.6013406940063093, "grad_norm": 0.4827810334398335, "learning_rate": 1.3096360770172947e-05, "loss": 0.3373, "step": 8123 }, { "epoch": 1.6015378548895898, "grad_norm": 0.5238029508010671, "learning_rate": 1.309488688281153e-05, "loss": 0.3907, "step": 8124 }, { "epoch": 1.6017350157728707, "grad_norm": 0.4897188747786206, "learning_rate": 1.3093412921092967e-05, "loss": 0.3295, "step": 8125 }, { "epoch": 1.6019321766561514, "grad_norm": 0.4957516780778494, "learning_rate": 1.3091938885052665e-05, "loss": 0.35, "step": 8126 }, { "epoch": 1.602129337539432, "grad_norm": 0.5216427832225966, "learning_rate": 1.3090464774726042e-05, "loss": 0.3486, "step": 8127 }, { "epoch": 1.602326498422713, "grad_norm": 0.5159704540850552, "learning_rate": 1.3088990590148516e-05, "loss": 0.355, "step": 8128 }, { "epoch": 1.6025236593059937, "grad_norm": 0.49842376144042355, "learning_rate": 1.3087516331355501e-05, "loss": 0.3439, "step": 8129 }, { "epoch": 1.6027208201892744, "grad_norm": 0.4666686177899867, "learning_rate": 1.3086041998382419e-05, "loss": 0.3322, "step": 8130 }, { "epoch": 1.6029179810725553, "grad_norm": 0.6474512146454534, "learning_rate": 1.3084567591264694e-05, "loss": 0.3618, "step": 8131 }, { "epoch": 1.6031151419558358, "grad_norm": 0.5026757411329433, "learning_rate": 1.308309311003775e-05, "loss": 0.3374, "step": 8132 }, { "epoch": 1.6033123028391167, "grad_norm": 0.4966462917789033, "learning_rate": 1.308161855473701e-05, "loss": 0.326, "step": 8133 }, { "epoch": 1.6035094637223974, "grad_norm": 0.48096431260184735, "learning_rate": 1.3080143925397904e-05, "loss": 0.3368, "step": 8134 }, { "epoch": 1.6037066246056781, "grad_norm": 0.48619845262827444, "learning_rate": 1.3078669222055858e-05, "loss": 0.3617, "step": 8135 }, { "epoch": 1.603903785488959, "grad_norm": 1.8367614071058735, "learning_rate": 1.3077194444746307e-05, "loss": 0.3497, "step": 8136 }, { "epoch": 1.6041009463722398, "grad_norm": 0.512344448724785, "learning_rate": 1.3075719593504674e-05, "loss": 0.3413, "step": 8137 }, { "epoch": 1.6042981072555205, "grad_norm": 0.5211149611982421, "learning_rate": 1.3074244668366412e-05, "loss": 0.3447, "step": 8138 }, { "epoch": 1.6044952681388014, "grad_norm": 0.48739207546058483, "learning_rate": 1.3072769669366938e-05, "loss": 0.3083, "step": 8139 }, { "epoch": 1.6046924290220819, "grad_norm": 0.49212067031603235, "learning_rate": 1.3071294596541701e-05, "loss": 0.3196, "step": 8140 }, { "epoch": 1.6048895899053628, "grad_norm": 3.7626457727782956, "learning_rate": 1.3069819449926136e-05, "loss": 0.4303, "step": 8141 }, { "epoch": 1.6050867507886435, "grad_norm": 0.5830689224709581, "learning_rate": 1.3068344229555692e-05, "loss": 0.3308, "step": 8142 }, { "epoch": 1.6052839116719242, "grad_norm": 0.49521045756115595, "learning_rate": 1.30668689354658e-05, "loss": 0.3391, "step": 8143 }, { "epoch": 1.6054810725552051, "grad_norm": 0.5145784616836976, "learning_rate": 1.3065393567691914e-05, "loss": 0.3544, "step": 8144 }, { "epoch": 1.6056782334384858, "grad_norm": 0.5028725718766839, "learning_rate": 1.3063918126269483e-05, "loss": 0.3444, "step": 8145 }, { "epoch": 1.6058753943217665, "grad_norm": 0.5769436395830195, "learning_rate": 1.3062442611233949e-05, "loss": 0.3543, "step": 8146 }, { "epoch": 1.6060725552050474, "grad_norm": 0.5022646683039049, "learning_rate": 1.3060967022620766e-05, "loss": 0.3165, "step": 8147 }, { "epoch": 1.606269716088328, "grad_norm": 0.4827891396259431, "learning_rate": 1.3059491360465384e-05, "loss": 0.3357, "step": 8148 }, { "epoch": 1.6064668769716088, "grad_norm": 0.49635418716653323, "learning_rate": 1.305801562480326e-05, "loss": 0.3121, "step": 8149 }, { "epoch": 1.6066640378548895, "grad_norm": 0.4857542068874944, "learning_rate": 1.3056539815669846e-05, "loss": 0.3357, "step": 8150 }, { "epoch": 1.6068611987381702, "grad_norm": 0.45147908050559366, "learning_rate": 1.3055063933100602e-05, "loss": 0.2883, "step": 8151 }, { "epoch": 1.6070583596214512, "grad_norm": 0.46546726459791526, "learning_rate": 1.3053587977130988e-05, "loss": 0.3118, "step": 8152 }, { "epoch": 1.6072555205047319, "grad_norm": 0.4993862309452187, "learning_rate": 1.3052111947796463e-05, "loss": 0.3594, "step": 8153 }, { "epoch": 1.6074526813880126, "grad_norm": 0.4879439935041709, "learning_rate": 1.305063584513249e-05, "loss": 0.3151, "step": 8154 }, { "epoch": 1.6076498422712935, "grad_norm": 0.48554820642690266, "learning_rate": 1.3049159669174534e-05, "loss": 0.3031, "step": 8155 }, { "epoch": 1.6078470031545742, "grad_norm": 0.500184176222356, "learning_rate": 1.3047683419958062e-05, "loss": 0.3406, "step": 8156 }, { "epoch": 1.6080441640378549, "grad_norm": 0.5063923589890583, "learning_rate": 1.3046207097518542e-05, "loss": 0.3499, "step": 8157 }, { "epoch": 1.6082413249211358, "grad_norm": 0.49816615378885987, "learning_rate": 1.3044730701891442e-05, "loss": 0.3237, "step": 8158 }, { "epoch": 1.6084384858044163, "grad_norm": 0.4925753536085062, "learning_rate": 1.3043254233112237e-05, "loss": 0.3368, "step": 8159 }, { "epoch": 1.6086356466876972, "grad_norm": 0.5310015500392122, "learning_rate": 1.3041777691216395e-05, "loss": 0.3725, "step": 8160 }, { "epoch": 1.608832807570978, "grad_norm": 0.49537678955838665, "learning_rate": 1.3040301076239398e-05, "loss": 0.3287, "step": 8161 }, { "epoch": 1.6090299684542586, "grad_norm": 0.5527761110782208, "learning_rate": 1.3038824388216718e-05, "loss": 0.3726, "step": 8162 }, { "epoch": 1.6092271293375395, "grad_norm": 0.447484220574138, "learning_rate": 1.3037347627183835e-05, "loss": 0.3021, "step": 8163 }, { "epoch": 1.6094242902208202, "grad_norm": 0.5179693625577074, "learning_rate": 1.3035870793176229e-05, "loss": 0.3551, "step": 8164 }, { "epoch": 1.609621451104101, "grad_norm": 0.46927906569820993, "learning_rate": 1.3034393886229381e-05, "loss": 0.3183, "step": 8165 }, { "epoch": 1.6098186119873819, "grad_norm": 0.498692120573005, "learning_rate": 1.3032916906378782e-05, "loss": 0.3039, "step": 8166 }, { "epoch": 1.6100157728706623, "grad_norm": 0.501698354263306, "learning_rate": 1.3031439853659906e-05, "loss": 0.3314, "step": 8167 }, { "epoch": 1.6102129337539433, "grad_norm": 0.8075569329846617, "learning_rate": 1.302996272810825e-05, "loss": 0.3544, "step": 8168 }, { "epoch": 1.610410094637224, "grad_norm": 0.4790951853547528, "learning_rate": 1.3028485529759296e-05, "loss": 0.3349, "step": 8169 }, { "epoch": 1.6106072555205047, "grad_norm": 0.48026262439420764, "learning_rate": 1.3027008258648538e-05, "loss": 0.3649, "step": 8170 }, { "epoch": 1.6108044164037856, "grad_norm": 0.4922180547792553, "learning_rate": 1.3025530914811473e-05, "loss": 0.3143, "step": 8171 }, { "epoch": 1.6110015772870663, "grad_norm": 0.5198306988127153, "learning_rate": 1.3024053498283588e-05, "loss": 0.3275, "step": 8172 }, { "epoch": 1.611198738170347, "grad_norm": 0.4742201615022806, "learning_rate": 1.3022576009100382e-05, "loss": 0.3558, "step": 8173 }, { "epoch": 1.611395899053628, "grad_norm": 0.5014496532783586, "learning_rate": 1.3021098447297358e-05, "loss": 0.3412, "step": 8174 }, { "epoch": 1.6115930599369084, "grad_norm": 0.49858321941500644, "learning_rate": 1.3019620812910008e-05, "loss": 0.3349, "step": 8175 }, { "epoch": 1.6117902208201893, "grad_norm": 0.4876528140518932, "learning_rate": 1.3018143105973835e-05, "loss": 0.3295, "step": 8176 }, { "epoch": 1.61198738170347, "grad_norm": 0.5014129411656993, "learning_rate": 1.3016665326524343e-05, "loss": 0.3169, "step": 8177 }, { "epoch": 1.6121845425867507, "grad_norm": 0.4709155858213729, "learning_rate": 1.301518747459704e-05, "loss": 0.3033, "step": 8178 }, { "epoch": 1.6123817034700316, "grad_norm": 0.569369280596523, "learning_rate": 1.3013709550227429e-05, "loss": 0.3567, "step": 8179 }, { "epoch": 1.6125788643533123, "grad_norm": 0.49624380854953204, "learning_rate": 1.3012231553451018e-05, "loss": 0.3198, "step": 8180 }, { "epoch": 1.612776025236593, "grad_norm": 0.4572480950811159, "learning_rate": 1.301075348430332e-05, "loss": 0.3102, "step": 8181 }, { "epoch": 1.612973186119874, "grad_norm": 0.5168349496318023, "learning_rate": 1.3009275342819842e-05, "loss": 0.3265, "step": 8182 }, { "epoch": 1.6131703470031544, "grad_norm": 0.6022754654599433, "learning_rate": 1.3007797129036104e-05, "loss": 0.3463, "step": 8183 }, { "epoch": 1.6133675078864353, "grad_norm": 0.49921945706957105, "learning_rate": 1.3006318842987615e-05, "loss": 0.3402, "step": 8184 }, { "epoch": 1.613564668769716, "grad_norm": 0.4786378011895483, "learning_rate": 1.3004840484709897e-05, "loss": 0.3151, "step": 8185 }, { "epoch": 1.6137618296529967, "grad_norm": 0.5091965495386273, "learning_rate": 1.3003362054238465e-05, "loss": 0.3439, "step": 8186 }, { "epoch": 1.6139589905362777, "grad_norm": 0.5075959256110124, "learning_rate": 1.3001883551608843e-05, "loss": 0.3539, "step": 8187 }, { "epoch": 1.6141561514195584, "grad_norm": 0.51624022913439, "learning_rate": 1.3000404976856546e-05, "loss": 0.362, "step": 8188 }, { "epoch": 1.614353312302839, "grad_norm": 0.5353531514337778, "learning_rate": 1.2998926330017109e-05, "loss": 0.3716, "step": 8189 }, { "epoch": 1.61455047318612, "grad_norm": 0.48775573515644133, "learning_rate": 1.2997447611126049e-05, "loss": 0.3223, "step": 8190 }, { "epoch": 1.6147476340694005, "grad_norm": 0.5058365439971066, "learning_rate": 1.2995968820218896e-05, "loss": 0.3351, "step": 8191 }, { "epoch": 1.6149447949526814, "grad_norm": 0.4873527967441035, "learning_rate": 1.2994489957331183e-05, "loss": 0.3134, "step": 8192 }, { "epoch": 1.615141955835962, "grad_norm": 0.5142383938117662, "learning_rate": 1.2993011022498434e-05, "loss": 0.365, "step": 8193 }, { "epoch": 1.6153391167192428, "grad_norm": 0.5405740913312057, "learning_rate": 1.2991532015756185e-05, "loss": 0.3483, "step": 8194 }, { "epoch": 1.6155362776025237, "grad_norm": 0.5168750664395806, "learning_rate": 1.2990052937139972e-05, "loss": 0.343, "step": 8195 }, { "epoch": 1.6157334384858044, "grad_norm": 0.49456698674100874, "learning_rate": 1.298857378668533e-05, "loss": 0.3316, "step": 8196 }, { "epoch": 1.6159305993690851, "grad_norm": 0.5197649456360487, "learning_rate": 1.2987094564427794e-05, "loss": 0.3375, "step": 8197 }, { "epoch": 1.616127760252366, "grad_norm": 0.49653557262255904, "learning_rate": 1.2985615270402904e-05, "loss": 0.3166, "step": 8198 }, { "epoch": 1.6163249211356467, "grad_norm": 0.5287246948423061, "learning_rate": 1.2984135904646206e-05, "loss": 0.3684, "step": 8199 }, { "epoch": 1.6165220820189274, "grad_norm": 0.5612379255016995, "learning_rate": 1.298265646719324e-05, "loss": 0.3378, "step": 8200 }, { "epoch": 1.6167192429022084, "grad_norm": 0.5045525899494582, "learning_rate": 1.2981176958079549e-05, "loss": 0.3534, "step": 8201 }, { "epoch": 1.6169164037854888, "grad_norm": 0.5021046772968188, "learning_rate": 1.2979697377340681e-05, "loss": 0.3276, "step": 8202 }, { "epoch": 1.6171135646687698, "grad_norm": 0.5041966851488991, "learning_rate": 1.2978217725012183e-05, "loss": 0.3378, "step": 8203 }, { "epoch": 1.6173107255520505, "grad_norm": 0.4830420145017819, "learning_rate": 1.2976738001129608e-05, "loss": 0.3352, "step": 8204 }, { "epoch": 1.6175078864353312, "grad_norm": 0.511796475168007, "learning_rate": 1.2975258205728503e-05, "loss": 0.3401, "step": 8205 }, { "epoch": 1.617705047318612, "grad_norm": 0.4940584230053775, "learning_rate": 1.2973778338844425e-05, "loss": 0.3433, "step": 8206 }, { "epoch": 1.6179022082018928, "grad_norm": 0.5293789678998656, "learning_rate": 1.2972298400512926e-05, "loss": 0.3809, "step": 8207 }, { "epoch": 1.6180993690851735, "grad_norm": 0.49843756683541707, "learning_rate": 1.2970818390769569e-05, "loss": 0.3397, "step": 8208 }, { "epoch": 1.6182965299684544, "grad_norm": 0.45831650251490347, "learning_rate": 1.2969338309649901e-05, "loss": 0.302, "step": 8209 }, { "epoch": 1.618493690851735, "grad_norm": 0.4793435986752458, "learning_rate": 1.2967858157189495e-05, "loss": 0.3479, "step": 8210 }, { "epoch": 1.6186908517350158, "grad_norm": 0.48039522018416736, "learning_rate": 1.2966377933423901e-05, "loss": 0.3395, "step": 8211 }, { "epoch": 1.6188880126182965, "grad_norm": 0.49848149873721165, "learning_rate": 1.2964897638388694e-05, "loss": 0.3391, "step": 8212 }, { "epoch": 1.6190851735015772, "grad_norm": 0.4735212283074649, "learning_rate": 1.296341727211943e-05, "loss": 0.3421, "step": 8213 }, { "epoch": 1.6192823343848581, "grad_norm": 0.5087062921882801, "learning_rate": 1.296193683465168e-05, "loss": 0.342, "step": 8214 }, { "epoch": 1.6194794952681388, "grad_norm": 0.5258996248721838, "learning_rate": 1.2960456326021013e-05, "loss": 0.354, "step": 8215 }, { "epoch": 1.6196766561514195, "grad_norm": 0.47955351066783924, "learning_rate": 1.2958975746263e-05, "loss": 0.3241, "step": 8216 }, { "epoch": 1.6198738170347005, "grad_norm": 0.49147931279397045, "learning_rate": 1.295749509541321e-05, "loss": 0.3606, "step": 8217 }, { "epoch": 1.620070977917981, "grad_norm": 0.5002125077620726, "learning_rate": 1.2956014373507219e-05, "loss": 0.3514, "step": 8218 }, { "epoch": 1.6202681388012619, "grad_norm": 0.5136920075059789, "learning_rate": 1.2954533580580603e-05, "loss": 0.3622, "step": 8219 }, { "epoch": 1.6204652996845426, "grad_norm": 0.5015775386249709, "learning_rate": 1.2953052716668939e-05, "loss": 0.3395, "step": 8220 }, { "epoch": 1.6206624605678233, "grad_norm": 0.48326937347111654, "learning_rate": 1.2951571781807804e-05, "loss": 0.3227, "step": 8221 }, { "epoch": 1.6208596214511042, "grad_norm": 0.5751545472906557, "learning_rate": 1.295009077603278e-05, "loss": 0.3488, "step": 8222 }, { "epoch": 1.6210567823343849, "grad_norm": 0.5320211920407559, "learning_rate": 1.2948609699379451e-05, "loss": 0.3413, "step": 8223 }, { "epoch": 1.6212539432176656, "grad_norm": 0.4687064635801877, "learning_rate": 1.2947128551883399e-05, "loss": 0.3454, "step": 8224 }, { "epoch": 1.6214511041009465, "grad_norm": 0.49744433210961403, "learning_rate": 1.294564733358021e-05, "loss": 0.3515, "step": 8225 }, { "epoch": 1.621648264984227, "grad_norm": 0.5069074005833163, "learning_rate": 1.2944166044505467e-05, "loss": 0.3457, "step": 8226 }, { "epoch": 1.621845425867508, "grad_norm": 0.47976757490607047, "learning_rate": 1.294268468469477e-05, "loss": 0.3453, "step": 8227 }, { "epoch": 1.6220425867507886, "grad_norm": 0.4952595442891473, "learning_rate": 1.29412032541837e-05, "loss": 0.32, "step": 8228 }, { "epoch": 1.6222397476340693, "grad_norm": 0.46297510160068384, "learning_rate": 1.2939721753007857e-05, "loss": 0.3111, "step": 8229 }, { "epoch": 1.6224369085173502, "grad_norm": 0.5043310082743702, "learning_rate": 1.2938240181202828e-05, "loss": 0.3748, "step": 8230 }, { "epoch": 1.622634069400631, "grad_norm": 0.5095253493670123, "learning_rate": 1.2936758538804215e-05, "loss": 0.3488, "step": 8231 }, { "epoch": 1.6228312302839116, "grad_norm": 0.4503216039107211, "learning_rate": 1.2935276825847614e-05, "loss": 0.3134, "step": 8232 }, { "epoch": 1.6230283911671926, "grad_norm": 0.5050115110615685, "learning_rate": 1.293379504236862e-05, "loss": 0.3343, "step": 8233 }, { "epoch": 1.623225552050473, "grad_norm": 0.5141044664399913, "learning_rate": 1.293231318840284e-05, "loss": 0.3302, "step": 8234 }, { "epoch": 1.623422712933754, "grad_norm": 0.4852074392315996, "learning_rate": 1.2930831263985873e-05, "loss": 0.3247, "step": 8235 }, { "epoch": 1.6236198738170347, "grad_norm": 0.5009665060672993, "learning_rate": 1.2929349269153326e-05, "loss": 0.3452, "step": 8236 }, { "epoch": 1.6238170347003154, "grad_norm": 0.48558116806406154, "learning_rate": 1.29278672039408e-05, "loss": 0.3345, "step": 8237 }, { "epoch": 1.6240141955835963, "grad_norm": 0.45949344463355146, "learning_rate": 1.292638506838391e-05, "loss": 0.3232, "step": 8238 }, { "epoch": 1.624211356466877, "grad_norm": 0.4606410064917042, "learning_rate": 1.2924902862518262e-05, "loss": 0.3166, "step": 8239 }, { "epoch": 1.6244085173501577, "grad_norm": 0.47453261506907685, "learning_rate": 1.2923420586379466e-05, "loss": 0.3172, "step": 8240 }, { "epoch": 1.6246056782334386, "grad_norm": 0.49860886417423067, "learning_rate": 1.2921938240003138e-05, "loss": 0.3468, "step": 8241 }, { "epoch": 1.624802839116719, "grad_norm": 0.4842849858710501, "learning_rate": 1.2920455823424892e-05, "loss": 0.3301, "step": 8242 }, { "epoch": 1.625, "grad_norm": 0.4715791270451386, "learning_rate": 1.2918973336680339e-05, "loss": 0.3352, "step": 8243 }, { "epoch": 1.625197160883281, "grad_norm": 0.4857708781749364, "learning_rate": 1.2917490779805105e-05, "loss": 0.3422, "step": 8244 }, { "epoch": 1.6253943217665614, "grad_norm": 1.2910001595707934, "learning_rate": 1.2916008152834803e-05, "loss": 0.3522, "step": 8245 }, { "epoch": 1.6255914826498423, "grad_norm": 0.46922968126191145, "learning_rate": 1.2914525455805056e-05, "loss": 0.3209, "step": 8246 }, { "epoch": 1.625788643533123, "grad_norm": 0.5124348833505142, "learning_rate": 1.291304268875149e-05, "loss": 0.3444, "step": 8247 }, { "epoch": 1.6259858044164037, "grad_norm": 0.4617262156966585, "learning_rate": 1.2911559851709728e-05, "loss": 0.3357, "step": 8248 }, { "epoch": 1.6261829652996846, "grad_norm": 0.4682239861424123, "learning_rate": 1.2910076944715394e-05, "loss": 0.3258, "step": 8249 }, { "epoch": 1.6263801261829653, "grad_norm": 0.4926977054765841, "learning_rate": 1.2908593967804117e-05, "loss": 0.3357, "step": 8250 }, { "epoch": 1.626577287066246, "grad_norm": 0.47277698075135877, "learning_rate": 1.290711092101153e-05, "loss": 0.3236, "step": 8251 }, { "epoch": 1.626774447949527, "grad_norm": 0.46612901619189784, "learning_rate": 1.2905627804373259e-05, "loss": 0.3218, "step": 8252 }, { "epoch": 1.6269716088328074, "grad_norm": 0.46710838609859795, "learning_rate": 1.2904144617924946e-05, "loss": 0.32, "step": 8253 }, { "epoch": 1.6271687697160884, "grad_norm": 0.46480402221238837, "learning_rate": 1.2902661361702214e-05, "loss": 0.322, "step": 8254 }, { "epoch": 1.627365930599369, "grad_norm": 0.4992203321218696, "learning_rate": 1.2901178035740709e-05, "loss": 0.317, "step": 8255 }, { "epoch": 1.6275630914826498, "grad_norm": 0.5101087513586251, "learning_rate": 1.2899694640076062e-05, "loss": 0.3448, "step": 8256 }, { "epoch": 1.6277602523659307, "grad_norm": 0.48008287555221985, "learning_rate": 1.289821117474392e-05, "loss": 0.3251, "step": 8257 }, { "epoch": 1.6279574132492114, "grad_norm": 0.4797668818388636, "learning_rate": 1.2896727639779916e-05, "loss": 0.3361, "step": 8258 }, { "epoch": 1.628154574132492, "grad_norm": 0.4713917003117169, "learning_rate": 1.2895244035219701e-05, "loss": 0.3121, "step": 8259 }, { "epoch": 1.628351735015773, "grad_norm": 0.8057901594512715, "learning_rate": 1.2893760361098915e-05, "loss": 0.3256, "step": 8260 }, { "epoch": 1.6285488958990535, "grad_norm": 0.4876468535149187, "learning_rate": 1.2892276617453208e-05, "loss": 0.3393, "step": 8261 }, { "epoch": 1.6287460567823344, "grad_norm": 0.5166514990575025, "learning_rate": 1.2890792804318224e-05, "loss": 0.3577, "step": 8262 }, { "epoch": 1.6289432176656151, "grad_norm": 0.4791140344047714, "learning_rate": 1.2889308921729616e-05, "loss": 0.3388, "step": 8263 }, { "epoch": 1.6291403785488958, "grad_norm": 0.4812157889038476, "learning_rate": 1.2887824969723035e-05, "loss": 0.3236, "step": 8264 }, { "epoch": 1.6293375394321767, "grad_norm": 0.45798962884288835, "learning_rate": 1.2886340948334132e-05, "loss": 0.328, "step": 8265 }, { "epoch": 1.6295347003154574, "grad_norm": 0.8222202615392995, "learning_rate": 1.2884856857598564e-05, "loss": 0.3577, "step": 8266 }, { "epoch": 1.6297318611987381, "grad_norm": 0.5528989353996063, "learning_rate": 1.2883372697551987e-05, "loss": 0.3358, "step": 8267 }, { "epoch": 1.629929022082019, "grad_norm": 0.49179851604347374, "learning_rate": 1.2881888468230059e-05, "loss": 0.3182, "step": 8268 }, { "epoch": 1.6301261829652995, "grad_norm": 0.6056266880805475, "learning_rate": 1.2880404169668438e-05, "loss": 0.3249, "step": 8269 }, { "epoch": 1.6303233438485805, "grad_norm": 0.7467253138778867, "learning_rate": 1.2878919801902791e-05, "loss": 0.3381, "step": 8270 }, { "epoch": 1.6305205047318612, "grad_norm": 0.48977301540090146, "learning_rate": 1.2877435364968776e-05, "loss": 0.3175, "step": 8271 }, { "epoch": 1.6307176656151419, "grad_norm": 0.507009351089707, "learning_rate": 1.2875950858902057e-05, "loss": 0.3186, "step": 8272 }, { "epoch": 1.6309148264984228, "grad_norm": 0.48004983350279057, "learning_rate": 1.2874466283738303e-05, "loss": 0.3183, "step": 8273 }, { "epoch": 1.6311119873817035, "grad_norm": 0.6427227001497235, "learning_rate": 1.2872981639513187e-05, "loss": 0.36, "step": 8274 }, { "epoch": 1.6313091482649842, "grad_norm": 0.5043727739166707, "learning_rate": 1.2871496926262365e-05, "loss": 0.3388, "step": 8275 }, { "epoch": 1.631506309148265, "grad_norm": 0.537296723687631, "learning_rate": 1.2870012144021524e-05, "loss": 0.3812, "step": 8276 }, { "epoch": 1.6317034700315456, "grad_norm": 0.4600951770649924, "learning_rate": 1.2868527292826325e-05, "loss": 0.3042, "step": 8277 }, { "epoch": 1.6319006309148265, "grad_norm": 0.4961373527723629, "learning_rate": 1.2867042372712453e-05, "loss": 0.3288, "step": 8278 }, { "epoch": 1.6320977917981072, "grad_norm": 0.49399953361553006, "learning_rate": 1.2865557383715574e-05, "loss": 0.3309, "step": 8279 }, { "epoch": 1.632294952681388, "grad_norm": 0.4951800992829749, "learning_rate": 1.2864072325871372e-05, "loss": 0.316, "step": 8280 }, { "epoch": 1.6324921135646688, "grad_norm": 0.5172023636303266, "learning_rate": 1.2862587199215528e-05, "loss": 0.3361, "step": 8281 }, { "epoch": 1.6326892744479495, "grad_norm": 0.44561120909851326, "learning_rate": 1.2861102003783722e-05, "loss": 0.3092, "step": 8282 }, { "epoch": 1.6328864353312302, "grad_norm": 0.4804875200058334, "learning_rate": 1.2859616739611636e-05, "loss": 0.3166, "step": 8283 }, { "epoch": 1.6330835962145112, "grad_norm": 1.2364757919941514, "learning_rate": 1.2858131406734953e-05, "loss": 0.3109, "step": 8284 }, { "epoch": 1.6332807570977916, "grad_norm": 0.4655120000897447, "learning_rate": 1.2856646005189367e-05, "loss": 0.3197, "step": 8285 }, { "epoch": 1.6334779179810726, "grad_norm": 0.49106292754923186, "learning_rate": 1.285516053501055e-05, "loss": 0.3198, "step": 8286 }, { "epoch": 1.6336750788643533, "grad_norm": 0.48718403958723305, "learning_rate": 1.2853674996234209e-05, "loss": 0.3309, "step": 8287 }, { "epoch": 1.633872239747634, "grad_norm": 0.5300102271215745, "learning_rate": 1.2852189388896027e-05, "loss": 0.3611, "step": 8288 }, { "epoch": 1.6340694006309149, "grad_norm": 0.5758537724813633, "learning_rate": 1.2850703713031698e-05, "loss": 0.321, "step": 8289 }, { "epoch": 1.6342665615141956, "grad_norm": 0.4930932386464715, "learning_rate": 1.2849217968676916e-05, "loss": 0.3411, "step": 8290 }, { "epoch": 1.6344637223974763, "grad_norm": 0.47643339025266535, "learning_rate": 1.284773215586738e-05, "loss": 0.3325, "step": 8291 }, { "epoch": 1.6346608832807572, "grad_norm": 0.5102517200602528, "learning_rate": 1.2846246274638783e-05, "loss": 0.3085, "step": 8292 }, { "epoch": 1.634858044164038, "grad_norm": 0.4964470884863604, "learning_rate": 1.2844760325026827e-05, "loss": 0.3336, "step": 8293 }, { "epoch": 1.6350552050473186, "grad_norm": 0.4830571944820674, "learning_rate": 1.2843274307067212e-05, "loss": 0.2993, "step": 8294 }, { "epoch": 1.6352523659305995, "grad_norm": 0.604904792281781, "learning_rate": 1.2841788220795648e-05, "loss": 0.382, "step": 8295 }, { "epoch": 1.63544952681388, "grad_norm": 0.5476960221303971, "learning_rate": 1.2840302066247828e-05, "loss": 0.3219, "step": 8296 }, { "epoch": 1.635646687697161, "grad_norm": 0.5578041142423232, "learning_rate": 1.2838815843459467e-05, "loss": 0.3686, "step": 8297 }, { "epoch": 1.6358438485804416, "grad_norm": 0.4786603253362986, "learning_rate": 1.2837329552466268e-05, "loss": 0.3073, "step": 8298 }, { "epoch": 1.6360410094637223, "grad_norm": 0.5103008039139185, "learning_rate": 1.2835843193303941e-05, "loss": 0.3315, "step": 8299 }, { "epoch": 1.6362381703470033, "grad_norm": 0.5547846433262801, "learning_rate": 1.2834356766008198e-05, "loss": 0.3745, "step": 8300 }, { "epoch": 1.636435331230284, "grad_norm": 0.47914878602175826, "learning_rate": 1.283287027061475e-05, "loss": 0.3504, "step": 8301 }, { "epoch": 1.6366324921135647, "grad_norm": 0.48804434383979656, "learning_rate": 1.2831383707159316e-05, "loss": 0.2993, "step": 8302 }, { "epoch": 1.6368296529968456, "grad_norm": 0.4802703000186615, "learning_rate": 1.2829897075677602e-05, "loss": 0.3225, "step": 8303 }, { "epoch": 1.637026813880126, "grad_norm": 0.4776966853109427, "learning_rate": 1.2828410376205338e-05, "loss": 0.3372, "step": 8304 }, { "epoch": 1.637223974763407, "grad_norm": 0.4575440033807344, "learning_rate": 1.2826923608778234e-05, "loss": 0.3184, "step": 8305 }, { "epoch": 1.6374211356466877, "grad_norm": 0.5436332250882948, "learning_rate": 1.2825436773432014e-05, "loss": 0.3861, "step": 8306 }, { "epoch": 1.6376182965299684, "grad_norm": 0.48668387601687046, "learning_rate": 1.2823949870202402e-05, "loss": 0.3331, "step": 8307 }, { "epoch": 1.6378154574132493, "grad_norm": 0.5285033833556001, "learning_rate": 1.2822462899125118e-05, "loss": 0.3535, "step": 8308 }, { "epoch": 1.63801261829653, "grad_norm": 0.47806217212282204, "learning_rate": 1.2820975860235892e-05, "loss": 0.3247, "step": 8309 }, { "epoch": 1.6382097791798107, "grad_norm": 0.47295818635975434, "learning_rate": 1.2819488753570448e-05, "loss": 0.3155, "step": 8310 }, { "epoch": 1.6384069400630916, "grad_norm": 0.49044071641561593, "learning_rate": 1.2818001579164516e-05, "loss": 0.3129, "step": 8311 }, { "epoch": 1.638604100946372, "grad_norm": 0.49641282989980273, "learning_rate": 1.2816514337053829e-05, "loss": 0.339, "step": 8312 }, { "epoch": 1.638801261829653, "grad_norm": 0.49283456075636484, "learning_rate": 1.2815027027274114e-05, "loss": 0.3328, "step": 8313 }, { "epoch": 1.6389984227129337, "grad_norm": 0.509363681934031, "learning_rate": 1.281353964986111e-05, "loss": 0.338, "step": 8314 }, { "epoch": 1.6391955835962144, "grad_norm": 0.4973258630677114, "learning_rate": 1.2812052204850547e-05, "loss": 0.3491, "step": 8315 }, { "epoch": 1.6393927444794953, "grad_norm": 0.5065183080300542, "learning_rate": 1.2810564692278167e-05, "loss": 0.3394, "step": 8316 }, { "epoch": 1.639589905362776, "grad_norm": 0.4506338288617181, "learning_rate": 1.2809077112179708e-05, "loss": 0.3088, "step": 8317 }, { "epoch": 1.6397870662460567, "grad_norm": 0.5475036275756043, "learning_rate": 1.2807589464590908e-05, "loss": 0.3665, "step": 8318 }, { "epoch": 1.6399842271293377, "grad_norm": 0.5165355996297737, "learning_rate": 1.280610174954751e-05, "loss": 0.3362, "step": 8319 }, { "epoch": 1.6401813880126181, "grad_norm": 0.4791278974491389, "learning_rate": 1.2804613967085258e-05, "loss": 0.3059, "step": 8320 }, { "epoch": 1.640378548895899, "grad_norm": 0.4683684775788071, "learning_rate": 1.28031261172399e-05, "loss": 0.3264, "step": 8321 }, { "epoch": 1.6405757097791798, "grad_norm": 0.4783064134174778, "learning_rate": 1.2801638200047173e-05, "loss": 0.3351, "step": 8322 }, { "epoch": 1.6407728706624605, "grad_norm": 0.4818861779472011, "learning_rate": 1.2800150215542839e-05, "loss": 0.3351, "step": 8323 }, { "epoch": 1.6409700315457414, "grad_norm": 0.46380844410707833, "learning_rate": 1.2798662163762635e-05, "loss": 0.3361, "step": 8324 }, { "epoch": 1.641167192429022, "grad_norm": 1.7966899299353296, "learning_rate": 1.2797174044742324e-05, "loss": 0.3874, "step": 8325 }, { "epoch": 1.6413643533123028, "grad_norm": 0.4735261266189423, "learning_rate": 1.2795685858517651e-05, "loss": 0.3191, "step": 8326 }, { "epoch": 1.6415615141955837, "grad_norm": 0.4845463952330695, "learning_rate": 1.2794197605124375e-05, "loss": 0.356, "step": 8327 }, { "epoch": 1.6417586750788642, "grad_norm": 0.503438921518367, "learning_rate": 1.279270928459825e-05, "loss": 0.3515, "step": 8328 }, { "epoch": 1.6419558359621451, "grad_norm": 0.4899624719397338, "learning_rate": 1.2791220896975037e-05, "loss": 0.3158, "step": 8329 }, { "epoch": 1.6421529968454258, "grad_norm": 0.4669534661764057, "learning_rate": 1.2789732442290493e-05, "loss": 0.3215, "step": 8330 }, { "epoch": 1.6423501577287065, "grad_norm": 0.5173134441620917, "learning_rate": 1.2788243920580381e-05, "loss": 0.3774, "step": 8331 }, { "epoch": 1.6425473186119874, "grad_norm": 0.4980287720201502, "learning_rate": 1.2786755331880464e-05, "loss": 0.3449, "step": 8332 }, { "epoch": 1.6427444794952681, "grad_norm": 0.5047789983811025, "learning_rate": 1.2785266676226507e-05, "loss": 0.3501, "step": 8333 }, { "epoch": 1.6429416403785488, "grad_norm": 0.468218461046473, "learning_rate": 1.2783777953654273e-05, "loss": 0.3191, "step": 8334 }, { "epoch": 1.6431388012618298, "grad_norm": 0.48844626736338487, "learning_rate": 1.2782289164199534e-05, "loss": 0.3438, "step": 8335 }, { "epoch": 1.6433359621451105, "grad_norm": 0.5111075206278436, "learning_rate": 1.2780800307898057e-05, "loss": 0.3595, "step": 8336 }, { "epoch": 1.6435331230283912, "grad_norm": 0.5097957258324088, "learning_rate": 1.2779311384785609e-05, "loss": 0.3501, "step": 8337 }, { "epoch": 1.643730283911672, "grad_norm": 0.4715524753760921, "learning_rate": 1.2777822394897971e-05, "loss": 0.3133, "step": 8338 }, { "epoch": 1.6439274447949526, "grad_norm": 0.5158213364397016, "learning_rate": 1.2776333338270912e-05, "loss": 0.3584, "step": 8339 }, { "epoch": 1.6441246056782335, "grad_norm": 0.5119422614957586, "learning_rate": 1.277484421494021e-05, "loss": 0.3578, "step": 8340 }, { "epoch": 1.6443217665615142, "grad_norm": 0.4651826749966539, "learning_rate": 1.2773355024941636e-05, "loss": 0.3213, "step": 8341 }, { "epoch": 1.6445189274447949, "grad_norm": 0.5550710370506761, "learning_rate": 1.277186576831098e-05, "loss": 0.3653, "step": 8342 }, { "epoch": 1.6447160883280758, "grad_norm": 0.4788442285113236, "learning_rate": 1.2770376445084014e-05, "loss": 0.3207, "step": 8343 }, { "epoch": 1.6449132492113565, "grad_norm": 0.5097909767918353, "learning_rate": 1.2768887055296527e-05, "loss": 0.3457, "step": 8344 }, { "epoch": 1.6451104100946372, "grad_norm": 0.5052842170050793, "learning_rate": 1.2767397598984293e-05, "loss": 0.3592, "step": 8345 }, { "epoch": 1.6453075709779181, "grad_norm": 0.525965947087887, "learning_rate": 1.2765908076183107e-05, "loss": 0.3717, "step": 8346 }, { "epoch": 1.6455047318611986, "grad_norm": 0.4520388814224274, "learning_rate": 1.2764418486928748e-05, "loss": 0.3079, "step": 8347 }, { "epoch": 1.6457018927444795, "grad_norm": 0.4948960396210387, "learning_rate": 1.276292883125701e-05, "loss": 0.3625, "step": 8348 }, { "epoch": 1.6458990536277602, "grad_norm": 0.48398295575215156, "learning_rate": 1.2761439109203683e-05, "loss": 0.3362, "step": 8349 }, { "epoch": 1.646096214511041, "grad_norm": 0.45308149007619875, "learning_rate": 1.2759949320804559e-05, "loss": 0.3072, "step": 8350 }, { "epoch": 1.6462933753943219, "grad_norm": 0.47227300867152255, "learning_rate": 1.2758459466095432e-05, "loss": 0.3099, "step": 8351 }, { "epoch": 1.6464905362776026, "grad_norm": 0.5261569725940197, "learning_rate": 1.275696954511209e-05, "loss": 0.3195, "step": 8352 }, { "epoch": 1.6466876971608833, "grad_norm": 0.5172694259251647, "learning_rate": 1.2755479557890337e-05, "loss": 0.3382, "step": 8353 }, { "epoch": 1.6468848580441642, "grad_norm": 0.4596032582706732, "learning_rate": 1.2753989504465967e-05, "loss": 0.3264, "step": 8354 }, { "epoch": 1.6470820189274447, "grad_norm": 0.4700775556886168, "learning_rate": 1.275249938487478e-05, "loss": 0.3145, "step": 8355 }, { "epoch": 1.6472791798107256, "grad_norm": 0.5182439165674259, "learning_rate": 1.2751009199152584e-05, "loss": 0.3204, "step": 8356 }, { "epoch": 1.6474763406940063, "grad_norm": 0.4892848082259555, "learning_rate": 1.2749518947335173e-05, "loss": 0.3589, "step": 8357 }, { "epoch": 1.647673501577287, "grad_norm": 0.5181167588333921, "learning_rate": 1.2748028629458356e-05, "loss": 0.3517, "step": 8358 }, { "epoch": 1.647870662460568, "grad_norm": 0.5273555196791303, "learning_rate": 1.2746538245557938e-05, "loss": 0.3611, "step": 8359 }, { "epoch": 1.6480678233438486, "grad_norm": 0.4769301710367526, "learning_rate": 1.2745047795669728e-05, "loss": 0.3385, "step": 8360 }, { "epoch": 1.6482649842271293, "grad_norm": 0.5168568047514019, "learning_rate": 1.274355727982953e-05, "loss": 0.3357, "step": 8361 }, { "epoch": 1.6484621451104102, "grad_norm": 0.47721045303944054, "learning_rate": 1.2742066698073164e-05, "loss": 0.3318, "step": 8362 }, { "epoch": 1.6486593059936907, "grad_norm": 0.5244966742590467, "learning_rate": 1.2740576050436433e-05, "loss": 0.3629, "step": 8363 }, { "epoch": 1.6488564668769716, "grad_norm": 0.4763494332427964, "learning_rate": 1.273908533695516e-05, "loss": 0.355, "step": 8364 }, { "epoch": 1.6490536277602523, "grad_norm": 0.4791256346523753, "learning_rate": 1.2737594557665152e-05, "loss": 0.3291, "step": 8365 }, { "epoch": 1.649250788643533, "grad_norm": 0.49789637045484475, "learning_rate": 1.2736103712602232e-05, "loss": 0.3177, "step": 8366 }, { "epoch": 1.649447949526814, "grad_norm": 0.4568801611649524, "learning_rate": 1.2734612801802217e-05, "loss": 0.3141, "step": 8367 }, { "epoch": 1.6496451104100947, "grad_norm": 0.49441685311553696, "learning_rate": 1.2733121825300927e-05, "loss": 0.3359, "step": 8368 }, { "epoch": 1.6498422712933754, "grad_norm": 0.5050247460468098, "learning_rate": 1.2731630783134182e-05, "loss": 0.3715, "step": 8369 }, { "epoch": 1.6500394321766563, "grad_norm": 0.5015943491120733, "learning_rate": 1.273013967533781e-05, "loss": 0.3383, "step": 8370 }, { "epoch": 1.6502365930599368, "grad_norm": 0.506827984233287, "learning_rate": 1.2728648501947633e-05, "loss": 0.3357, "step": 8371 }, { "epoch": 1.6504337539432177, "grad_norm": 0.44237770072174254, "learning_rate": 1.2727157262999481e-05, "loss": 0.2997, "step": 8372 }, { "epoch": 1.6506309148264984, "grad_norm": 0.48053734591353375, "learning_rate": 1.2725665958529177e-05, "loss": 0.3193, "step": 8373 }, { "epoch": 1.650828075709779, "grad_norm": 0.5146248090989906, "learning_rate": 1.2724174588572556e-05, "loss": 0.3481, "step": 8374 }, { "epoch": 1.65102523659306, "grad_norm": 0.47551521631075744, "learning_rate": 1.272268315316544e-05, "loss": 0.327, "step": 8375 }, { "epoch": 1.6512223974763407, "grad_norm": 0.4877727357397369, "learning_rate": 1.2721191652343674e-05, "loss": 0.3283, "step": 8376 }, { "epoch": 1.6514195583596214, "grad_norm": 1.4180824108129741, "learning_rate": 1.2719700086143088e-05, "loss": 0.3418, "step": 8377 }, { "epoch": 1.6516167192429023, "grad_norm": 0.5155193108625191, "learning_rate": 1.2718208454599515e-05, "loss": 0.3532, "step": 8378 }, { "epoch": 1.651813880126183, "grad_norm": 0.47241976046112605, "learning_rate": 1.2716716757748795e-05, "loss": 0.3526, "step": 8379 }, { "epoch": 1.6520110410094637, "grad_norm": 0.4883813868684643, "learning_rate": 1.2715224995626769e-05, "loss": 0.321, "step": 8380 }, { "epoch": 1.6522082018927446, "grad_norm": 0.468988201610082, "learning_rate": 1.2713733168269275e-05, "loss": 0.3357, "step": 8381 }, { "epoch": 1.6524053627760251, "grad_norm": 0.4474868601000789, "learning_rate": 1.2712241275712156e-05, "loss": 0.3386, "step": 8382 }, { "epoch": 1.652602523659306, "grad_norm": 1.0602180659250304, "learning_rate": 1.2710749317991255e-05, "loss": 0.3677, "step": 8383 }, { "epoch": 1.6527996845425867, "grad_norm": 0.4626736577510362, "learning_rate": 1.2709257295142421e-05, "loss": 0.3171, "step": 8384 }, { "epoch": 1.6529968454258674, "grad_norm": 0.7287485310553602, "learning_rate": 1.2707765207201497e-05, "loss": 0.3376, "step": 8385 }, { "epoch": 1.6531940063091484, "grad_norm": 0.49508196335788296, "learning_rate": 1.2706273054204334e-05, "loss": 0.3154, "step": 8386 }, { "epoch": 1.653391167192429, "grad_norm": 0.5335563258052243, "learning_rate": 1.2704780836186781e-05, "loss": 0.335, "step": 8387 }, { "epoch": 1.6535883280757098, "grad_norm": 0.49204084068864695, "learning_rate": 1.270328855318469e-05, "loss": 0.3457, "step": 8388 }, { "epoch": 1.6537854889589907, "grad_norm": 0.7179327613194991, "learning_rate": 1.2701796205233916e-05, "loss": 0.3262, "step": 8389 }, { "epoch": 1.6539826498422712, "grad_norm": 0.5875255445299163, "learning_rate": 1.270030379237031e-05, "loss": 0.3445, "step": 8390 }, { "epoch": 1.654179810725552, "grad_norm": 0.48571857002344315, "learning_rate": 1.2698811314629734e-05, "loss": 0.3384, "step": 8391 }, { "epoch": 1.6543769716088328, "grad_norm": 0.5420200873446145, "learning_rate": 1.269731877204804e-05, "loss": 0.3471, "step": 8392 }, { "epoch": 1.6545741324921135, "grad_norm": 0.4978969822403536, "learning_rate": 1.2695826164661093e-05, "loss": 0.3361, "step": 8393 }, { "epoch": 1.6547712933753944, "grad_norm": 0.5619686050490151, "learning_rate": 1.269433349250475e-05, "loss": 0.3448, "step": 8394 }, { "epoch": 1.6549684542586751, "grad_norm": 0.5254844107289572, "learning_rate": 1.2692840755614873e-05, "loss": 0.3367, "step": 8395 }, { "epoch": 1.6551656151419558, "grad_norm": 0.5011006680947409, "learning_rate": 1.269134795402733e-05, "loss": 0.3418, "step": 8396 }, { "epoch": 1.6553627760252367, "grad_norm": 0.569081232736909, "learning_rate": 1.2689855087777988e-05, "loss": 0.3265, "step": 8397 }, { "epoch": 1.6555599369085172, "grad_norm": 0.49741199855508267, "learning_rate": 1.2688362156902707e-05, "loss": 0.3367, "step": 8398 }, { "epoch": 1.6557570977917981, "grad_norm": 0.5473920091128143, "learning_rate": 1.2686869161437364e-05, "loss": 0.3747, "step": 8399 }, { "epoch": 1.6559542586750788, "grad_norm": 0.5076794965437509, "learning_rate": 1.2685376101417823e-05, "loss": 0.3506, "step": 8400 }, { "epoch": 1.6561514195583595, "grad_norm": 0.5162556295971144, "learning_rate": 1.268388297687996e-05, "loss": 0.3288, "step": 8401 }, { "epoch": 1.6563485804416405, "grad_norm": 0.5519110540607234, "learning_rate": 1.2682389787859646e-05, "loss": 0.3312, "step": 8402 }, { "epoch": 1.6565457413249212, "grad_norm": 0.5118402740090534, "learning_rate": 1.268089653439276e-05, "loss": 0.2988, "step": 8403 }, { "epoch": 1.6567429022082019, "grad_norm": 0.5055886550149764, "learning_rate": 1.2679403216515171e-05, "loss": 0.3293, "step": 8404 }, { "epoch": 1.6569400630914828, "grad_norm": 0.5299784103251353, "learning_rate": 1.2677909834262764e-05, "loss": 0.3818, "step": 8405 }, { "epoch": 1.6571372239747633, "grad_norm": 0.4721883334342438, "learning_rate": 1.267641638767142e-05, "loss": 0.3164, "step": 8406 }, { "epoch": 1.6573343848580442, "grad_norm": 0.46598843868067574, "learning_rate": 1.2674922876777014e-05, "loss": 0.3242, "step": 8407 }, { "epoch": 1.6575315457413249, "grad_norm": 0.4908947433651475, "learning_rate": 1.2673429301615431e-05, "loss": 0.361, "step": 8408 }, { "epoch": 1.6577287066246056, "grad_norm": 0.5044190970850814, "learning_rate": 1.2671935662222556e-05, "loss": 0.3492, "step": 8409 }, { "epoch": 1.6579258675078865, "grad_norm": 0.4885883648458966, "learning_rate": 1.2670441958634278e-05, "loss": 0.3405, "step": 8410 }, { "epoch": 1.6581230283911672, "grad_norm": 0.4992792901348933, "learning_rate": 1.2668948190886479e-05, "loss": 0.3495, "step": 8411 }, { "epoch": 1.658320189274448, "grad_norm": 0.4766119795887696, "learning_rate": 1.2667454359015053e-05, "loss": 0.324, "step": 8412 }, { "epoch": 1.6585173501577288, "grad_norm": 0.49210628144064317, "learning_rate": 1.2665960463055884e-05, "loss": 0.3262, "step": 8413 }, { "epoch": 1.6587145110410093, "grad_norm": 0.5123201240998976, "learning_rate": 1.2664466503044872e-05, "loss": 0.3375, "step": 8414 }, { "epoch": 1.6589116719242902, "grad_norm": 0.48206936397556427, "learning_rate": 1.2662972479017905e-05, "loss": 0.3495, "step": 8415 }, { "epoch": 1.659108832807571, "grad_norm": 0.46491900044693, "learning_rate": 1.2661478391010877e-05, "loss": 0.3371, "step": 8416 }, { "epoch": 1.6593059936908516, "grad_norm": 0.4754604595322926, "learning_rate": 1.2659984239059693e-05, "loss": 0.345, "step": 8417 }, { "epoch": 1.6595031545741326, "grad_norm": 0.485110553505231, "learning_rate": 1.2658490023200238e-05, "loss": 0.3457, "step": 8418 }, { "epoch": 1.6597003154574133, "grad_norm": 0.5020242501635195, "learning_rate": 1.2656995743468428e-05, "loss": 0.3449, "step": 8419 }, { "epoch": 1.659897476340694, "grad_norm": 0.47335087548835225, "learning_rate": 1.2655501399900147e-05, "loss": 0.3344, "step": 8420 }, { "epoch": 1.6600946372239749, "grad_norm": 0.7292549190195249, "learning_rate": 1.2654006992531314e-05, "loss": 0.3618, "step": 8421 }, { "epoch": 1.6602917981072554, "grad_norm": 0.46704737896112347, "learning_rate": 1.265251252139782e-05, "loss": 0.3379, "step": 8422 }, { "epoch": 1.6604889589905363, "grad_norm": 0.47828307265868714, "learning_rate": 1.2651017986535578e-05, "loss": 0.3546, "step": 8423 }, { "epoch": 1.660686119873817, "grad_norm": 0.5046362588351052, "learning_rate": 1.2649523387980497e-05, "loss": 0.3535, "step": 8424 }, { "epoch": 1.6608832807570977, "grad_norm": 0.529973462052246, "learning_rate": 1.264802872576848e-05, "loss": 0.3614, "step": 8425 }, { "epoch": 1.6610804416403786, "grad_norm": 0.47162369718261343, "learning_rate": 1.2646533999935442e-05, "loss": 0.3486, "step": 8426 }, { "epoch": 1.6612776025236593, "grad_norm": 0.5125003435688313, "learning_rate": 1.2645039210517291e-05, "loss": 0.35, "step": 8427 }, { "epoch": 1.66147476340694, "grad_norm": 0.49775247171182835, "learning_rate": 1.2643544357549946e-05, "loss": 0.3468, "step": 8428 }, { "epoch": 1.661671924290221, "grad_norm": 0.4979574735742323, "learning_rate": 1.2642049441069318e-05, "loss": 0.3479, "step": 8429 }, { "epoch": 1.6618690851735016, "grad_norm": 0.49155198733720284, "learning_rate": 1.2640554461111324e-05, "loss": 0.3534, "step": 8430 }, { "epoch": 1.6620662460567823, "grad_norm": 0.49261265160510403, "learning_rate": 1.2639059417711882e-05, "loss": 0.3377, "step": 8431 }, { "epoch": 1.6622634069400632, "grad_norm": 0.47430655424794055, "learning_rate": 1.2637564310906913e-05, "loss": 0.3409, "step": 8432 }, { "epoch": 1.6624605678233437, "grad_norm": 0.4847102680872918, "learning_rate": 1.2636069140732338e-05, "loss": 0.3385, "step": 8433 }, { "epoch": 1.6626577287066246, "grad_norm": 0.47723013743075726, "learning_rate": 1.263457390722408e-05, "loss": 0.3318, "step": 8434 }, { "epoch": 1.6628548895899053, "grad_norm": 0.4724871482409827, "learning_rate": 1.2633078610418062e-05, "loss": 0.3122, "step": 8435 }, { "epoch": 1.663052050473186, "grad_norm": 0.4828083436400746, "learning_rate": 1.2631583250350208e-05, "loss": 0.344, "step": 8436 }, { "epoch": 1.663249211356467, "grad_norm": 0.4461653149295655, "learning_rate": 1.2630087827056445e-05, "loss": 0.3247, "step": 8437 }, { "epoch": 1.6634463722397477, "grad_norm": 0.5069753338240224, "learning_rate": 1.262859234057271e-05, "loss": 0.3152, "step": 8438 }, { "epoch": 1.6636435331230284, "grad_norm": 0.460521307249503, "learning_rate": 1.2627096790934921e-05, "loss": 0.3223, "step": 8439 }, { "epoch": 1.6638406940063093, "grad_norm": 0.4604716994762646, "learning_rate": 1.2625601178179021e-05, "loss": 0.3283, "step": 8440 }, { "epoch": 1.6640378548895898, "grad_norm": 0.5234481706217016, "learning_rate": 1.2624105502340935e-05, "loss": 0.3534, "step": 8441 }, { "epoch": 1.6642350157728707, "grad_norm": 0.5252701025916288, "learning_rate": 1.2622609763456604e-05, "loss": 0.3829, "step": 8442 }, { "epoch": 1.6644321766561514, "grad_norm": 0.48070551451863064, "learning_rate": 1.262111396156196e-05, "loss": 0.3495, "step": 8443 }, { "epoch": 1.664629337539432, "grad_norm": 0.48384766288562747, "learning_rate": 1.2619618096692942e-05, "loss": 0.3316, "step": 8444 }, { "epoch": 1.664826498422713, "grad_norm": 0.5062594443871286, "learning_rate": 1.2618122168885489e-05, "loss": 0.3348, "step": 8445 }, { "epoch": 1.6650236593059937, "grad_norm": 0.4907130059730268, "learning_rate": 1.2616626178175544e-05, "loss": 0.3559, "step": 8446 }, { "epoch": 1.6652208201892744, "grad_norm": 0.5082080738735469, "learning_rate": 1.2615130124599047e-05, "loss": 0.3292, "step": 8447 }, { "epoch": 1.6654179810725553, "grad_norm": 0.5032607745506708, "learning_rate": 1.2613634008191944e-05, "loss": 0.3291, "step": 8448 }, { "epoch": 1.6656151419558358, "grad_norm": 0.47422112873571903, "learning_rate": 1.2612137828990178e-05, "loss": 0.341, "step": 8449 }, { "epoch": 1.6658123028391167, "grad_norm": 0.48408723981425067, "learning_rate": 1.2610641587029697e-05, "loss": 0.342, "step": 8450 }, { "epoch": 1.6660094637223974, "grad_norm": 0.47815519859652317, "learning_rate": 1.2609145282346452e-05, "loss": 0.3042, "step": 8451 }, { "epoch": 1.6662066246056781, "grad_norm": 0.45905147453962514, "learning_rate": 1.2607648914976386e-05, "loss": 0.3413, "step": 8452 }, { "epoch": 1.666403785488959, "grad_norm": 0.4686501349889585, "learning_rate": 1.2606152484955458e-05, "loss": 0.32, "step": 8453 }, { "epoch": 1.6666009463722398, "grad_norm": 0.5027813358909118, "learning_rate": 1.2604655992319618e-05, "loss": 0.3577, "step": 8454 }, { "epoch": 1.6667981072555205, "grad_norm": 0.48965015962544906, "learning_rate": 1.260315943710482e-05, "loss": 0.3383, "step": 8455 }, { "epoch": 1.6669952681388014, "grad_norm": 0.48574589911473337, "learning_rate": 1.2601662819347017e-05, "loss": 0.3396, "step": 8456 }, { "epoch": 1.6671924290220819, "grad_norm": 0.44679877307708993, "learning_rate": 1.2600166139082175e-05, "loss": 0.3054, "step": 8457 }, { "epoch": 1.6673895899053628, "grad_norm": 0.5131395778523016, "learning_rate": 1.2598669396346244e-05, "loss": 0.3606, "step": 8458 }, { "epoch": 1.6675867507886435, "grad_norm": 0.48971170022752913, "learning_rate": 1.259717259117519e-05, "loss": 0.3588, "step": 8459 }, { "epoch": 1.6677839116719242, "grad_norm": 0.49066716607132943, "learning_rate": 1.259567572360497e-05, "loss": 0.3364, "step": 8460 }, { "epoch": 1.6679810725552051, "grad_norm": 0.4509623703464088, "learning_rate": 1.2594178793671554e-05, "loss": 0.3219, "step": 8461 }, { "epoch": 1.6681782334384858, "grad_norm": 0.46371182929662663, "learning_rate": 1.25926818014109e-05, "loss": 0.3491, "step": 8462 }, { "epoch": 1.6683753943217665, "grad_norm": 0.49080219225409744, "learning_rate": 1.259118474685898e-05, "loss": 0.3511, "step": 8463 }, { "epoch": 1.6685725552050474, "grad_norm": 0.4985969802063767, "learning_rate": 1.258968763005176e-05, "loss": 0.3487, "step": 8464 }, { "epoch": 1.668769716088328, "grad_norm": 0.49622170543145033, "learning_rate": 1.2588190451025209e-05, "loss": 0.333, "step": 8465 }, { "epoch": 1.6689668769716088, "grad_norm": 0.4511106753090801, "learning_rate": 1.2586693209815298e-05, "loss": 0.3192, "step": 8466 }, { "epoch": 1.6691640378548895, "grad_norm": 0.479506900351286, "learning_rate": 1.2585195906457998e-05, "loss": 0.3485, "step": 8467 }, { "epoch": 1.6693611987381702, "grad_norm": 0.4715197277159651, "learning_rate": 1.2583698540989288e-05, "loss": 0.3276, "step": 8468 }, { "epoch": 1.6695583596214512, "grad_norm": 0.4798793765470226, "learning_rate": 1.2582201113445136e-05, "loss": 0.3414, "step": 8469 }, { "epoch": 1.6697555205047319, "grad_norm": 0.4505832073301154, "learning_rate": 1.2580703623861525e-05, "loss": 0.3292, "step": 8470 }, { "epoch": 1.6699526813880126, "grad_norm": 0.4695096328518373, "learning_rate": 1.257920607227443e-05, "loss": 0.3511, "step": 8471 }, { "epoch": 1.6701498422712935, "grad_norm": 0.475920897049246, "learning_rate": 1.2577708458719836e-05, "loss": 0.3333, "step": 8472 }, { "epoch": 1.6703470031545742, "grad_norm": 0.4629881113952917, "learning_rate": 1.2576210783233715e-05, "loss": 0.3427, "step": 8473 }, { "epoch": 1.6705441640378549, "grad_norm": 0.4579101805999366, "learning_rate": 1.2574713045852059e-05, "loss": 0.3249, "step": 8474 }, { "epoch": 1.6707413249211358, "grad_norm": 0.5062337208811096, "learning_rate": 1.2573215246610845e-05, "loss": 0.3161, "step": 8475 }, { "epoch": 1.6709384858044163, "grad_norm": 0.5716309498702405, "learning_rate": 1.2571717385546067e-05, "loss": 0.3362, "step": 8476 }, { "epoch": 1.6711356466876972, "grad_norm": 0.45248380776560626, "learning_rate": 1.2570219462693703e-05, "loss": 0.3345, "step": 8477 }, { "epoch": 1.671332807570978, "grad_norm": 0.4930136541420434, "learning_rate": 1.2568721478089752e-05, "loss": 0.3379, "step": 8478 }, { "epoch": 1.6715299684542586, "grad_norm": 0.5282094905423682, "learning_rate": 1.2567223431770193e-05, "loss": 0.3808, "step": 8479 }, { "epoch": 1.6717271293375395, "grad_norm": 0.49446422793627104, "learning_rate": 1.256572532377103e-05, "loss": 0.3307, "step": 8480 }, { "epoch": 1.6719242902208202, "grad_norm": 0.4665091200670554, "learning_rate": 1.2564227154128248e-05, "loss": 0.3129, "step": 8481 }, { "epoch": 1.672121451104101, "grad_norm": 0.5106508095556581, "learning_rate": 1.256272892287784e-05, "loss": 0.3647, "step": 8482 }, { "epoch": 1.6723186119873819, "grad_norm": 0.48462370149465533, "learning_rate": 1.256123063005581e-05, "loss": 0.345, "step": 8483 }, { "epoch": 1.6725157728706623, "grad_norm": 0.46959052479288327, "learning_rate": 1.2559732275698147e-05, "loss": 0.3305, "step": 8484 }, { "epoch": 1.6727129337539433, "grad_norm": 0.4863277709967929, "learning_rate": 1.2558233859840861e-05, "loss": 0.3402, "step": 8485 }, { "epoch": 1.672910094637224, "grad_norm": 0.48685079228972755, "learning_rate": 1.255673538251994e-05, "loss": 0.3153, "step": 8486 }, { "epoch": 1.6731072555205047, "grad_norm": 0.4893988869401554, "learning_rate": 1.2555236843771398e-05, "loss": 0.3546, "step": 8487 }, { "epoch": 1.6733044164037856, "grad_norm": 0.4806560567289344, "learning_rate": 1.2553738243631228e-05, "loss": 0.3762, "step": 8488 }, { "epoch": 1.6735015772870663, "grad_norm": 0.4744744228607181, "learning_rate": 1.2552239582135446e-05, "loss": 0.3557, "step": 8489 }, { "epoch": 1.673698738170347, "grad_norm": 0.5065072339151813, "learning_rate": 1.2550740859320047e-05, "loss": 0.3514, "step": 8490 }, { "epoch": 1.673895899053628, "grad_norm": 0.49720037597529226, "learning_rate": 1.2549242075221047e-05, "loss": 0.3621, "step": 8491 }, { "epoch": 1.6740930599369084, "grad_norm": 0.4862962373562821, "learning_rate": 1.2547743229874452e-05, "loss": 0.3278, "step": 8492 }, { "epoch": 1.6742902208201893, "grad_norm": 0.4862623198322913, "learning_rate": 1.2546244323316276e-05, "loss": 0.3649, "step": 8493 }, { "epoch": 1.67448738170347, "grad_norm": 0.46675920503742757, "learning_rate": 1.254474535558253e-05, "loss": 0.3142, "step": 8494 }, { "epoch": 1.6746845425867507, "grad_norm": 6.314767157106249, "learning_rate": 1.2543246326709227e-05, "loss": 0.3485, "step": 8495 }, { "epoch": 1.6748817034700316, "grad_norm": 0.5032826105490751, "learning_rate": 1.2541747236732382e-05, "loss": 0.334, "step": 8496 }, { "epoch": 1.6750788643533123, "grad_norm": 0.4565883254189662, "learning_rate": 1.2540248085688013e-05, "loss": 0.3298, "step": 8497 }, { "epoch": 1.675276025236593, "grad_norm": 0.47701955741607316, "learning_rate": 1.253874887361214e-05, "loss": 0.3419, "step": 8498 }, { "epoch": 1.675473186119874, "grad_norm": 0.5192006825879065, "learning_rate": 1.253724960054078e-05, "loss": 0.349, "step": 8499 }, { "epoch": 1.6756703470031544, "grad_norm": 0.498046580157121, "learning_rate": 1.2535750266509955e-05, "loss": 0.359, "step": 8500 }, { "epoch": 1.6758675078864353, "grad_norm": 0.45577657594308446, "learning_rate": 1.2534250871555687e-05, "loss": 0.3262, "step": 8501 }, { "epoch": 1.676064668769716, "grad_norm": 0.4735721377691079, "learning_rate": 1.2532751415714001e-05, "loss": 0.2878, "step": 8502 }, { "epoch": 1.6762618296529967, "grad_norm": 0.4771932029118233, "learning_rate": 1.2531251899020925e-05, "loss": 0.3162, "step": 8503 }, { "epoch": 1.6764589905362777, "grad_norm": 0.4757910741943044, "learning_rate": 1.252975232151248e-05, "loss": 0.3357, "step": 8504 }, { "epoch": 1.6766561514195584, "grad_norm": 0.5121567601038779, "learning_rate": 1.2528252683224697e-05, "loss": 0.3436, "step": 8505 }, { "epoch": 1.676853312302839, "grad_norm": 0.5520693211608968, "learning_rate": 1.2526752984193613e-05, "loss": 0.3188, "step": 8506 }, { "epoch": 1.67705047318612, "grad_norm": 0.47244929893727605, "learning_rate": 1.2525253224455249e-05, "loss": 0.3074, "step": 8507 }, { "epoch": 1.6772476340694005, "grad_norm": 0.4911902577750642, "learning_rate": 1.252375340404565e-05, "loss": 0.3437, "step": 8508 }, { "epoch": 1.6774447949526814, "grad_norm": 0.4702219725532421, "learning_rate": 1.2522253523000834e-05, "loss": 0.2994, "step": 8509 }, { "epoch": 1.677641955835962, "grad_norm": 0.4719943895332728, "learning_rate": 1.2520753581356852e-05, "loss": 0.3319, "step": 8510 }, { "epoch": 1.6778391167192428, "grad_norm": 0.46273321896963937, "learning_rate": 1.251925357914973e-05, "loss": 0.3156, "step": 8511 }, { "epoch": 1.6780362776025237, "grad_norm": 0.45943840807810565, "learning_rate": 1.2517753516415516e-05, "loss": 0.3108, "step": 8512 }, { "epoch": 1.6782334384858044, "grad_norm": 0.4958574879886677, "learning_rate": 1.2516253393190245e-05, "loss": 0.3415, "step": 8513 }, { "epoch": 1.6784305993690851, "grad_norm": 0.471413234518349, "learning_rate": 1.251475320950996e-05, "loss": 0.3336, "step": 8514 }, { "epoch": 1.678627760252366, "grad_norm": 3.3034750186313615, "learning_rate": 1.2513252965410706e-05, "loss": 0.3529, "step": 8515 }, { "epoch": 1.6788249211356467, "grad_norm": 0.5058575196517642, "learning_rate": 1.2511752660928523e-05, "loss": 0.3583, "step": 8516 }, { "epoch": 1.6790220820189274, "grad_norm": 0.5210370780798075, "learning_rate": 1.251025229609946e-05, "loss": 0.3627, "step": 8517 }, { "epoch": 1.6792192429022084, "grad_norm": 0.9679223222367713, "learning_rate": 1.2508751870959563e-05, "loss": 0.3305, "step": 8518 }, { "epoch": 1.6794164037854888, "grad_norm": 0.955205787142468, "learning_rate": 1.2507251385544885e-05, "loss": 0.3384, "step": 8519 }, { "epoch": 1.6796135646687698, "grad_norm": 0.5760713418529518, "learning_rate": 1.2505750839891473e-05, "loss": 0.3412, "step": 8520 }, { "epoch": 1.6798107255520505, "grad_norm": 0.4672872118729301, "learning_rate": 1.2504250234035378e-05, "loss": 0.3238, "step": 8521 }, { "epoch": 1.6800078864353312, "grad_norm": 0.46960680724931947, "learning_rate": 1.2502749568012655e-05, "loss": 0.3368, "step": 8522 }, { "epoch": 1.680205047318612, "grad_norm": 0.4892990248131517, "learning_rate": 1.2501248841859358e-05, "loss": 0.3222, "step": 8523 }, { "epoch": 1.6804022082018928, "grad_norm": 0.9521603633960574, "learning_rate": 1.2499748055611543e-05, "loss": 0.3473, "step": 8524 }, { "epoch": 1.6805993690851735, "grad_norm": 0.4961229036962787, "learning_rate": 1.2498247209305267e-05, "loss": 0.34, "step": 8525 }, { "epoch": 1.6807965299684544, "grad_norm": 0.4641387089623987, "learning_rate": 1.2496746302976588e-05, "loss": 0.3262, "step": 8526 }, { "epoch": 1.680993690851735, "grad_norm": 0.5208046380242989, "learning_rate": 1.2495245336661575e-05, "loss": 0.357, "step": 8527 }, { "epoch": 1.6811908517350158, "grad_norm": 0.6377815976714571, "learning_rate": 1.2493744310396276e-05, "loss": 0.3446, "step": 8528 }, { "epoch": 1.6813880126182965, "grad_norm": 0.48508566333163605, "learning_rate": 1.249224322421677e-05, "loss": 0.3265, "step": 8529 }, { "epoch": 1.6815851735015772, "grad_norm": 0.4640062435177377, "learning_rate": 1.2490742078159107e-05, "loss": 0.3453, "step": 8530 }, { "epoch": 1.6817823343848581, "grad_norm": 0.48082524719773534, "learning_rate": 1.248924087225936e-05, "loss": 0.3428, "step": 8531 }, { "epoch": 1.6819794952681388, "grad_norm": 0.46881859443066326, "learning_rate": 1.24877396065536e-05, "loss": 0.3073, "step": 8532 }, { "epoch": 1.6821766561514195, "grad_norm": 0.4883737738622232, "learning_rate": 1.248623828107789e-05, "loss": 0.3472, "step": 8533 }, { "epoch": 1.6823738170347005, "grad_norm": 0.45689282889724375, "learning_rate": 1.2484736895868306e-05, "loss": 0.3097, "step": 8534 }, { "epoch": 1.682570977917981, "grad_norm": 0.47112167138042593, "learning_rate": 1.2483235450960914e-05, "loss": 0.3444, "step": 8535 }, { "epoch": 1.6827681388012619, "grad_norm": 0.46214308971573514, "learning_rate": 1.2481733946391792e-05, "loss": 0.3189, "step": 8536 }, { "epoch": 1.6829652996845426, "grad_norm": 0.46005114791319535, "learning_rate": 1.2480232382197013e-05, "loss": 0.3178, "step": 8537 }, { "epoch": 1.6831624605678233, "grad_norm": 0.4767717118901013, "learning_rate": 1.2478730758412652e-05, "loss": 0.3362, "step": 8538 }, { "epoch": 1.6833596214511042, "grad_norm": 0.4599960998922992, "learning_rate": 1.247722907507479e-05, "loss": 0.3195, "step": 8539 }, { "epoch": 1.6835567823343849, "grad_norm": 0.4951705881711061, "learning_rate": 1.2475727332219505e-05, "loss": 0.346, "step": 8540 }, { "epoch": 1.6837539432176656, "grad_norm": 0.5046721506851792, "learning_rate": 1.2474225529882878e-05, "loss": 0.3255, "step": 8541 }, { "epoch": 1.6839511041009465, "grad_norm": 0.4829302359712838, "learning_rate": 1.247272366810099e-05, "loss": 0.3429, "step": 8542 }, { "epoch": 1.684148264984227, "grad_norm": 0.46391312466042783, "learning_rate": 1.2471221746909923e-05, "loss": 0.3325, "step": 8543 }, { "epoch": 1.684345425867508, "grad_norm": 0.47200457284310665, "learning_rate": 1.2469719766345763e-05, "loss": 0.3388, "step": 8544 }, { "epoch": 1.6845425867507886, "grad_norm": 0.4746525342051994, "learning_rate": 1.2468217726444595e-05, "loss": 0.3293, "step": 8545 }, { "epoch": 1.6847397476340693, "grad_norm": 0.523726717599289, "learning_rate": 1.2466715627242514e-05, "loss": 0.3564, "step": 8546 }, { "epoch": 1.6849369085173502, "grad_norm": 0.47113498305611295, "learning_rate": 1.2465213468775602e-05, "loss": 0.3459, "step": 8547 }, { "epoch": 1.685134069400631, "grad_norm": 0.4768683631487932, "learning_rate": 1.2463711251079951e-05, "loss": 0.3419, "step": 8548 }, { "epoch": 1.6853312302839116, "grad_norm": 0.4796081831360892, "learning_rate": 1.2462208974191652e-05, "loss": 0.337, "step": 8549 }, { "epoch": 1.6855283911671926, "grad_norm": 0.47022166990324243, "learning_rate": 1.24607066381468e-05, "loss": 0.3343, "step": 8550 }, { "epoch": 1.685725552050473, "grad_norm": 0.47554608946434046, "learning_rate": 1.245920424298149e-05, "loss": 0.3205, "step": 8551 }, { "epoch": 1.685922712933754, "grad_norm": 0.616505992571845, "learning_rate": 1.2457701788731812e-05, "loss": 0.3296, "step": 8552 }, { "epoch": 1.6861198738170347, "grad_norm": 0.4878539182332263, "learning_rate": 1.2456199275433878e-05, "loss": 0.3525, "step": 8553 }, { "epoch": 1.6863170347003154, "grad_norm": 0.4414899321505671, "learning_rate": 1.2454696703123773e-05, "loss": 0.3206, "step": 8554 }, { "epoch": 1.6865141955835963, "grad_norm": 0.48286881495321865, "learning_rate": 1.2453194071837606e-05, "loss": 0.3238, "step": 8555 }, { "epoch": 1.686711356466877, "grad_norm": 0.4742179590973439, "learning_rate": 1.2451691381611472e-05, "loss": 0.3197, "step": 8556 }, { "epoch": 1.6869085173501577, "grad_norm": 0.47834655573536045, "learning_rate": 1.2450188632481484e-05, "loss": 0.3265, "step": 8557 }, { "epoch": 1.6871056782334386, "grad_norm": 0.4343626858698186, "learning_rate": 1.2448685824483735e-05, "loss": 0.3035, "step": 8558 }, { "epoch": 1.687302839116719, "grad_norm": 0.48021663657417696, "learning_rate": 1.244718295765434e-05, "loss": 0.3275, "step": 8559 }, { "epoch": 1.6875, "grad_norm": 0.4680145546040287, "learning_rate": 1.2445680032029403e-05, "loss": 0.3164, "step": 8560 }, { "epoch": 1.687697160883281, "grad_norm": 0.46324619631494696, "learning_rate": 1.2444177047645036e-05, "loss": 0.3164, "step": 8561 }, { "epoch": 1.6878943217665614, "grad_norm": 0.4788496971926555, "learning_rate": 1.2442674004537345e-05, "loss": 0.3412, "step": 8562 }, { "epoch": 1.6880914826498423, "grad_norm": 0.47439352264543966, "learning_rate": 1.2441170902742445e-05, "loss": 0.3272, "step": 8563 }, { "epoch": 1.688288643533123, "grad_norm": 0.49561553090817284, "learning_rate": 1.2439667742296448e-05, "loss": 0.3302, "step": 8564 }, { "epoch": 1.6884858044164037, "grad_norm": 0.4800560302995904, "learning_rate": 1.2438164523235467e-05, "loss": 0.3406, "step": 8565 }, { "epoch": 1.6886829652996846, "grad_norm": 0.49067895256143157, "learning_rate": 1.2436661245595623e-05, "loss": 0.325, "step": 8566 }, { "epoch": 1.6888801261829653, "grad_norm": 0.47817903605159684, "learning_rate": 1.2435157909413029e-05, "loss": 0.3247, "step": 8567 }, { "epoch": 1.689077287066246, "grad_norm": 0.48136123183849766, "learning_rate": 1.2433654514723806e-05, "loss": 0.3454, "step": 8568 }, { "epoch": 1.689274447949527, "grad_norm": 0.5051580981062126, "learning_rate": 1.2432151061564071e-05, "loss": 0.3601, "step": 8569 }, { "epoch": 1.6894716088328074, "grad_norm": 0.461511250819562, "learning_rate": 1.2430647549969949e-05, "loss": 0.3233, "step": 8570 }, { "epoch": 1.6896687697160884, "grad_norm": 0.47678716555909567, "learning_rate": 1.2429143979977562e-05, "loss": 0.3427, "step": 8571 }, { "epoch": 1.689865930599369, "grad_norm": 0.4486388389800133, "learning_rate": 1.2427640351623037e-05, "loss": 0.3183, "step": 8572 }, { "epoch": 1.6900630914826498, "grad_norm": 0.5299380107151339, "learning_rate": 1.2426136664942495e-05, "loss": 0.3656, "step": 8573 }, { "epoch": 1.6902602523659307, "grad_norm": 0.46937533374559753, "learning_rate": 1.2424632919972068e-05, "loss": 0.33, "step": 8574 }, { "epoch": 1.6904574132492114, "grad_norm": 0.4979192584567516, "learning_rate": 1.2423129116747878e-05, "loss": 0.3224, "step": 8575 }, { "epoch": 1.690654574132492, "grad_norm": 1.0231877748006113, "learning_rate": 1.2421625255306067e-05, "loss": 0.3506, "step": 8576 }, { "epoch": 1.690851735015773, "grad_norm": 0.47898806414813694, "learning_rate": 1.242012133568275e-05, "loss": 0.3408, "step": 8577 }, { "epoch": 1.6910488958990535, "grad_norm": 0.47674507248699505, "learning_rate": 1.2418617357914078e-05, "loss": 0.3402, "step": 8578 }, { "epoch": 1.6912460567823344, "grad_norm": 0.45179630277653227, "learning_rate": 1.2417113322036172e-05, "loss": 0.3154, "step": 8579 }, { "epoch": 1.6914432176656151, "grad_norm": 0.46052841582017556, "learning_rate": 1.2415609228085171e-05, "loss": 0.3126, "step": 8580 }, { "epoch": 1.6916403785488958, "grad_norm": 0.44611469096894074, "learning_rate": 1.2414105076097214e-05, "loss": 0.3166, "step": 8581 }, { "epoch": 1.6918375394321767, "grad_norm": 0.4576223803492312, "learning_rate": 1.241260086610844e-05, "loss": 0.2939, "step": 8582 }, { "epoch": 1.6920347003154574, "grad_norm": 0.4557877387492244, "learning_rate": 1.2411096598154985e-05, "loss": 0.3268, "step": 8583 }, { "epoch": 1.6922318611987381, "grad_norm": 0.46572552067045964, "learning_rate": 1.2409592272272995e-05, "loss": 0.351, "step": 8584 }, { "epoch": 1.692429022082019, "grad_norm": 0.5122610064442623, "learning_rate": 1.2408087888498608e-05, "loss": 0.3605, "step": 8585 }, { "epoch": 1.6926261829652995, "grad_norm": 0.49403230477515936, "learning_rate": 1.2406583446867972e-05, "loss": 0.3342, "step": 8586 }, { "epoch": 1.6928233438485805, "grad_norm": 0.5017202216659983, "learning_rate": 1.240507894741723e-05, "loss": 0.3491, "step": 8587 }, { "epoch": 1.6930205047318612, "grad_norm": 0.4899331141459395, "learning_rate": 1.2403574390182529e-05, "loss": 0.3607, "step": 8588 }, { "epoch": 1.6932176656151419, "grad_norm": 0.4930643507100292, "learning_rate": 1.2402069775200018e-05, "loss": 0.3346, "step": 8589 }, { "epoch": 1.6934148264984228, "grad_norm": 0.47713702195641194, "learning_rate": 1.2400565102505846e-05, "loss": 0.3293, "step": 8590 }, { "epoch": 1.6936119873817035, "grad_norm": 0.4783126509399956, "learning_rate": 1.2399060372136165e-05, "loss": 0.343, "step": 8591 }, { "epoch": 1.6938091482649842, "grad_norm": 0.48348496039775457, "learning_rate": 1.2397555584127127e-05, "loss": 0.3281, "step": 8592 }, { "epoch": 1.694006309148265, "grad_norm": 0.5160059605204272, "learning_rate": 1.2396050738514884e-05, "loss": 0.3423, "step": 8593 }, { "epoch": 1.6942034700315456, "grad_norm": 0.45441215261046347, "learning_rate": 1.2394545835335591e-05, "loss": 0.314, "step": 8594 }, { "epoch": 1.6944006309148265, "grad_norm": 0.4836399537284616, "learning_rate": 1.239304087462541e-05, "loss": 0.327, "step": 8595 }, { "epoch": 1.6945977917981072, "grad_norm": 0.46510484102779165, "learning_rate": 1.2391535856420492e-05, "loss": 0.3528, "step": 8596 }, { "epoch": 1.694794952681388, "grad_norm": 0.4430475484441221, "learning_rate": 1.2390030780757e-05, "loss": 0.31, "step": 8597 }, { "epoch": 1.6949921135646688, "grad_norm": 0.4674652671591372, "learning_rate": 1.2388525647671092e-05, "loss": 0.3148, "step": 8598 }, { "epoch": 1.6951892744479495, "grad_norm": 0.5054421188784337, "learning_rate": 1.2387020457198937e-05, "loss": 0.3795, "step": 8599 }, { "epoch": 1.6953864353312302, "grad_norm": 0.4930491695671332, "learning_rate": 1.238551520937669e-05, "loss": 0.3515, "step": 8600 }, { "epoch": 1.6955835962145112, "grad_norm": 0.464472471942234, "learning_rate": 1.2384009904240517e-05, "loss": 0.3178, "step": 8601 }, { "epoch": 1.6957807570977916, "grad_norm": 0.46922888632843546, "learning_rate": 1.238250454182659e-05, "loss": 0.3421, "step": 8602 }, { "epoch": 1.6959779179810726, "grad_norm": 0.5117715606317115, "learning_rate": 1.238099912217107e-05, "loss": 0.3657, "step": 8603 }, { "epoch": 1.6961750788643533, "grad_norm": 0.45140287026735587, "learning_rate": 1.237949364531013e-05, "loss": 0.3224, "step": 8604 }, { "epoch": 1.696372239747634, "grad_norm": 0.49574769053532836, "learning_rate": 1.2377988111279937e-05, "loss": 0.3462, "step": 8605 }, { "epoch": 1.6965694006309149, "grad_norm": 0.46230478380221307, "learning_rate": 1.2376482520116666e-05, "loss": 0.3373, "step": 8606 }, { "epoch": 1.6967665615141956, "grad_norm": 0.5059151436193712, "learning_rate": 1.237497687185649e-05, "loss": 0.3359, "step": 8607 }, { "epoch": 1.6969637223974763, "grad_norm": 0.48234255694436523, "learning_rate": 1.237347116653558e-05, "loss": 0.3479, "step": 8608 }, { "epoch": 1.6971608832807572, "grad_norm": 0.4396592143423356, "learning_rate": 1.2371965404190116e-05, "loss": 0.3067, "step": 8609 }, { "epoch": 1.697358044164038, "grad_norm": 0.49065585389386196, "learning_rate": 1.2370459584856271e-05, "loss": 0.3445, "step": 8610 }, { "epoch": 1.6975552050473186, "grad_norm": 0.49051628604620695, "learning_rate": 1.2368953708570226e-05, "loss": 0.3402, "step": 8611 }, { "epoch": 1.6977523659305995, "grad_norm": 0.5254870988260015, "learning_rate": 1.2367447775368163e-05, "loss": 0.3454, "step": 8612 }, { "epoch": 1.69794952681388, "grad_norm": 0.4562980109810731, "learning_rate": 1.2365941785286258e-05, "loss": 0.3074, "step": 8613 }, { "epoch": 1.698146687697161, "grad_norm": 0.5678148667222066, "learning_rate": 1.2364435738360696e-05, "loss": 0.3396, "step": 8614 }, { "epoch": 1.6983438485804416, "grad_norm": 0.4911585049022961, "learning_rate": 1.2362929634627663e-05, "loss": 0.3346, "step": 8615 }, { "epoch": 1.6985410094637223, "grad_norm": 0.4681297072028885, "learning_rate": 1.2361423474123343e-05, "loss": 0.3082, "step": 8616 }, { "epoch": 1.6987381703470033, "grad_norm": 0.5174736460144449, "learning_rate": 1.235991725688392e-05, "loss": 0.3814, "step": 8617 }, { "epoch": 1.698935331230284, "grad_norm": 0.46929685353298345, "learning_rate": 1.2358410982945586e-05, "loss": 0.3355, "step": 8618 }, { "epoch": 1.6991324921135647, "grad_norm": 0.4922370451892625, "learning_rate": 1.2356904652344528e-05, "loss": 0.3268, "step": 8619 }, { "epoch": 1.6993296529968456, "grad_norm": 0.4596598424424867, "learning_rate": 1.2355398265116937e-05, "loss": 0.3293, "step": 8620 }, { "epoch": 1.699526813880126, "grad_norm": 0.5001218713531277, "learning_rate": 1.235389182129901e-05, "loss": 0.3669, "step": 8621 }, { "epoch": 1.699723974763407, "grad_norm": 0.4896954521807538, "learning_rate": 1.2352385320926929e-05, "loss": 0.3477, "step": 8622 }, { "epoch": 1.6999211356466877, "grad_norm": 0.49063376422553284, "learning_rate": 1.2350878764036904e-05, "loss": 0.372, "step": 8623 }, { "epoch": 1.7001182965299684, "grad_norm": 0.46066188918467005, "learning_rate": 1.2349372150665117e-05, "loss": 0.3251, "step": 8624 }, { "epoch": 1.7003154574132493, "grad_norm": 0.48615185578096526, "learning_rate": 1.2347865480847778e-05, "loss": 0.3356, "step": 8625 }, { "epoch": 1.70051261829653, "grad_norm": 0.46350483669286957, "learning_rate": 1.2346358754621078e-05, "loss": 0.3367, "step": 8626 }, { "epoch": 1.7007097791798107, "grad_norm": 0.4534163321107968, "learning_rate": 1.2344851972021219e-05, "loss": 0.3297, "step": 8627 }, { "epoch": 1.7009069400630916, "grad_norm": 0.46024896781400154, "learning_rate": 1.2343345133084403e-05, "loss": 0.3161, "step": 8628 }, { "epoch": 1.701104100946372, "grad_norm": 0.4597911141388873, "learning_rate": 1.2341838237846833e-05, "loss": 0.3251, "step": 8629 }, { "epoch": 1.701301261829653, "grad_norm": 0.47755657347184777, "learning_rate": 1.2340331286344713e-05, "loss": 0.3439, "step": 8630 }, { "epoch": 1.7014984227129337, "grad_norm": 0.4595385546420116, "learning_rate": 1.233882427861425e-05, "loss": 0.3236, "step": 8631 }, { "epoch": 1.7016955835962144, "grad_norm": 0.47878136809716293, "learning_rate": 1.233731721469165e-05, "loss": 0.3182, "step": 8632 }, { "epoch": 1.7018927444794953, "grad_norm": 0.46749188234424455, "learning_rate": 1.2335810094613123e-05, "loss": 0.3346, "step": 8633 }, { "epoch": 1.702089905362776, "grad_norm": 0.4620951279690006, "learning_rate": 1.2334302918414875e-05, "loss": 0.324, "step": 8634 }, { "epoch": 1.7022870662460567, "grad_norm": 0.489807733516085, "learning_rate": 1.2332795686133121e-05, "loss": 0.3252, "step": 8635 }, { "epoch": 1.7024842271293377, "grad_norm": 0.4804183436816162, "learning_rate": 1.2331288397804072e-05, "loss": 0.3598, "step": 8636 }, { "epoch": 1.7026813880126181, "grad_norm": 0.5026867518086552, "learning_rate": 1.2329781053463944e-05, "loss": 0.36, "step": 8637 }, { "epoch": 1.702878548895899, "grad_norm": 0.4759113828318395, "learning_rate": 1.2328273653148945e-05, "loss": 0.3411, "step": 8638 }, { "epoch": 1.7030757097791798, "grad_norm": 0.48031817244407715, "learning_rate": 1.2326766196895301e-05, "loss": 0.3496, "step": 8639 }, { "epoch": 1.7032728706624605, "grad_norm": 0.47282002859928735, "learning_rate": 1.2325258684739223e-05, "loss": 0.3215, "step": 8640 }, { "epoch": 1.7034700315457414, "grad_norm": 3.191557994037556, "learning_rate": 1.2323751116716932e-05, "loss": 0.3359, "step": 8641 }, { "epoch": 1.703667192429022, "grad_norm": 0.5967525800282695, "learning_rate": 1.2322243492864651e-05, "loss": 0.3469, "step": 8642 }, { "epoch": 1.7038643533123028, "grad_norm": 0.4551107597567062, "learning_rate": 1.2320735813218599e-05, "loss": 0.3291, "step": 8643 }, { "epoch": 1.7040615141955837, "grad_norm": 0.4620889896670438, "learning_rate": 1.2319228077815001e-05, "loss": 0.33, "step": 8644 }, { "epoch": 1.7042586750788642, "grad_norm": 0.4853258209863878, "learning_rate": 1.231772028669008e-05, "loss": 0.3261, "step": 8645 }, { "epoch": 1.7044558359621451, "grad_norm": 0.48368889919551544, "learning_rate": 1.2316212439880065e-05, "loss": 0.354, "step": 8646 }, { "epoch": 1.7046529968454258, "grad_norm": 0.5542920344424767, "learning_rate": 1.2314704537421177e-05, "loss": 0.3408, "step": 8647 }, { "epoch": 1.7048501577287065, "grad_norm": 0.47647351780117475, "learning_rate": 1.2313196579349648e-05, "loss": 0.3274, "step": 8648 }, { "epoch": 1.7050473186119874, "grad_norm": 0.46934633413559235, "learning_rate": 1.2311688565701711e-05, "loss": 0.3418, "step": 8649 }, { "epoch": 1.7052444794952681, "grad_norm": 0.48432919465079566, "learning_rate": 1.2310180496513595e-05, "loss": 0.3193, "step": 8650 }, { "epoch": 1.7054416403785488, "grad_norm": 0.4661153258537868, "learning_rate": 1.2308672371821532e-05, "loss": 0.3321, "step": 8651 }, { "epoch": 1.7056388012618298, "grad_norm": 0.4826086437714458, "learning_rate": 1.2307164191661756e-05, "loss": 0.338, "step": 8652 }, { "epoch": 1.7058359621451105, "grad_norm": 0.4637877696238651, "learning_rate": 1.2305655956070504e-05, "loss": 0.3245, "step": 8653 }, { "epoch": 1.7060331230283912, "grad_norm": 0.4640340301452536, "learning_rate": 1.2304147665084007e-05, "loss": 0.3176, "step": 8654 }, { "epoch": 1.706230283911672, "grad_norm": 0.49786805036849485, "learning_rate": 1.230263931873851e-05, "loss": 0.3619, "step": 8655 }, { "epoch": 1.7064274447949526, "grad_norm": 0.5747013253276394, "learning_rate": 1.2301130917070245e-05, "loss": 0.3151, "step": 8656 }, { "epoch": 1.7066246056782335, "grad_norm": 0.5511325174270215, "learning_rate": 1.2299622460115461e-05, "loss": 0.36, "step": 8657 }, { "epoch": 1.7068217665615142, "grad_norm": 0.5072499961226788, "learning_rate": 1.2298113947910393e-05, "loss": 0.3349, "step": 8658 }, { "epoch": 1.7070189274447949, "grad_norm": 0.48091952766514195, "learning_rate": 1.2296605380491288e-05, "loss": 0.3491, "step": 8659 }, { "epoch": 1.7072160883280758, "grad_norm": 0.4798754544496718, "learning_rate": 1.2295096757894389e-05, "loss": 0.3471, "step": 8660 }, { "epoch": 1.7074132492113565, "grad_norm": 0.6307603113857772, "learning_rate": 1.2293588080155943e-05, "loss": 0.3413, "step": 8661 }, { "epoch": 1.7076104100946372, "grad_norm": 0.4779048637501002, "learning_rate": 1.2292079347312194e-05, "loss": 0.3337, "step": 8662 }, { "epoch": 1.7078075709779181, "grad_norm": 0.5247659710582877, "learning_rate": 1.2290570559399395e-05, "loss": 0.37, "step": 8663 }, { "epoch": 1.7080047318611986, "grad_norm": 0.4617016132383226, "learning_rate": 1.2289061716453795e-05, "loss": 0.3223, "step": 8664 }, { "epoch": 1.7082018927444795, "grad_norm": 0.47698713563356776, "learning_rate": 1.2287552818511641e-05, "loss": 0.3247, "step": 8665 }, { "epoch": 1.7083990536277602, "grad_norm": 0.46484751275332564, "learning_rate": 1.2286043865609188e-05, "loss": 0.3319, "step": 8666 }, { "epoch": 1.708596214511041, "grad_norm": 0.473092049126572, "learning_rate": 1.2284534857782694e-05, "loss": 0.3515, "step": 8667 }, { "epoch": 1.7087933753943219, "grad_norm": 0.49275200785968454, "learning_rate": 1.2283025795068407e-05, "loss": 0.3342, "step": 8668 }, { "epoch": 1.7089905362776026, "grad_norm": 0.5026513536426332, "learning_rate": 1.2281516677502586e-05, "loss": 0.3411, "step": 8669 }, { "epoch": 1.7091876971608833, "grad_norm": 0.46869592654591175, "learning_rate": 1.2280007505121491e-05, "loss": 0.332, "step": 8670 }, { "epoch": 1.7093848580441642, "grad_norm": 0.5099270759799205, "learning_rate": 1.2278498277961377e-05, "loss": 0.3446, "step": 8671 }, { "epoch": 1.7095820189274447, "grad_norm": 2.050432417214253, "learning_rate": 1.2276988996058511e-05, "loss": 0.4219, "step": 8672 }, { "epoch": 1.7097791798107256, "grad_norm": 0.4725441540525176, "learning_rate": 1.227547965944915e-05, "loss": 0.3432, "step": 8673 }, { "epoch": 1.7099763406940063, "grad_norm": 0.4993708255622457, "learning_rate": 1.227397026816956e-05, "loss": 0.3513, "step": 8674 }, { "epoch": 1.710173501577287, "grad_norm": 0.4887912737158361, "learning_rate": 1.2272460822255996e-05, "loss": 0.322, "step": 8675 }, { "epoch": 1.710370662460568, "grad_norm": 0.46993623199235296, "learning_rate": 1.2270951321744736e-05, "loss": 0.3599, "step": 8676 }, { "epoch": 1.7105678233438486, "grad_norm": 0.4643011227098681, "learning_rate": 1.2269441766672042e-05, "loss": 0.3012, "step": 8677 }, { "epoch": 1.7107649842271293, "grad_norm": 0.5506263319435446, "learning_rate": 1.2267932157074178e-05, "loss": 0.3621, "step": 8678 }, { "epoch": 1.7109621451104102, "grad_norm": 0.4919225236599655, "learning_rate": 1.2266422492987423e-05, "loss": 0.338, "step": 8679 }, { "epoch": 1.7111593059936907, "grad_norm": 0.4896517148433233, "learning_rate": 1.2264912774448037e-05, "loss": 0.3527, "step": 8680 }, { "epoch": 1.7113564668769716, "grad_norm": 0.48839509275100296, "learning_rate": 1.22634030014923e-05, "loss": 0.3464, "step": 8681 }, { "epoch": 1.7115536277602523, "grad_norm": 0.4721548932185601, "learning_rate": 1.2261893174156485e-05, "loss": 0.3297, "step": 8682 }, { "epoch": 1.711750788643533, "grad_norm": 0.4911637333121339, "learning_rate": 1.2260383292476862e-05, "loss": 0.3285, "step": 8683 }, { "epoch": 1.711947949526814, "grad_norm": 0.459378104968867, "learning_rate": 1.2258873356489713e-05, "loss": 0.3292, "step": 8684 }, { "epoch": 1.7121451104100947, "grad_norm": 0.4643594445198744, "learning_rate": 1.2257363366231311e-05, "loss": 0.3354, "step": 8685 }, { "epoch": 1.7123422712933754, "grad_norm": 0.4875758149219096, "learning_rate": 1.2255853321737935e-05, "loss": 0.3461, "step": 8686 }, { "epoch": 1.7125394321766563, "grad_norm": 0.48234734526715584, "learning_rate": 1.225434322304587e-05, "loss": 0.3432, "step": 8687 }, { "epoch": 1.7127365930599368, "grad_norm": 0.4765321853764992, "learning_rate": 1.2252833070191388e-05, "loss": 0.335, "step": 8688 }, { "epoch": 1.7129337539432177, "grad_norm": 0.4922172475680278, "learning_rate": 1.2251322863210785e-05, "loss": 0.3532, "step": 8689 }, { "epoch": 1.7131309148264984, "grad_norm": 0.49132588894060586, "learning_rate": 1.224981260214033e-05, "loss": 0.3471, "step": 8690 }, { "epoch": 1.713328075709779, "grad_norm": 0.4726091865577661, "learning_rate": 1.2248302287016321e-05, "loss": 0.3148, "step": 8691 }, { "epoch": 1.71352523659306, "grad_norm": 0.44365990430314184, "learning_rate": 1.2246791917875034e-05, "loss": 0.2972, "step": 8692 }, { "epoch": 1.7137223974763407, "grad_norm": 0.5487336291279901, "learning_rate": 1.2245281494752765e-05, "loss": 0.3856, "step": 8693 }, { "epoch": 1.7139195583596214, "grad_norm": 0.4920272311780737, "learning_rate": 1.2243771017685797e-05, "loss": 0.3366, "step": 8694 }, { "epoch": 1.7141167192429023, "grad_norm": 0.5015165415124818, "learning_rate": 1.2242260486710427e-05, "loss": 0.3512, "step": 8695 }, { "epoch": 1.714313880126183, "grad_norm": 0.5239711415916627, "learning_rate": 1.224074990186294e-05, "loss": 0.381, "step": 8696 }, { "epoch": 1.7145110410094637, "grad_norm": 0.4568955886255038, "learning_rate": 1.2239239263179635e-05, "loss": 0.3209, "step": 8697 }, { "epoch": 1.7147082018927446, "grad_norm": 0.49060093715398756, "learning_rate": 1.2237728570696801e-05, "loss": 0.3439, "step": 8698 }, { "epoch": 1.7149053627760251, "grad_norm": 0.47975117071356893, "learning_rate": 1.2236217824450739e-05, "loss": 0.344, "step": 8699 }, { "epoch": 1.715102523659306, "grad_norm": 0.5258786604080068, "learning_rate": 1.2234707024477742e-05, "loss": 0.3348, "step": 8700 }, { "epoch": 1.7152996845425867, "grad_norm": 0.48137432039450917, "learning_rate": 1.2233196170814105e-05, "loss": 0.3525, "step": 8701 }, { "epoch": 1.7154968454258674, "grad_norm": 0.5014397647686436, "learning_rate": 1.2231685263496137e-05, "loss": 0.3438, "step": 8702 }, { "epoch": 1.7156940063091484, "grad_norm": 0.4708182769624843, "learning_rate": 1.2230174302560132e-05, "loss": 0.3342, "step": 8703 }, { "epoch": 1.715891167192429, "grad_norm": 0.512195415565872, "learning_rate": 1.2228663288042392e-05, "loss": 0.3553, "step": 8704 }, { "epoch": 1.7160883280757098, "grad_norm": 0.4788767330139541, "learning_rate": 1.2227152219979224e-05, "loss": 0.3536, "step": 8705 }, { "epoch": 1.7162854889589907, "grad_norm": 0.4625920435496106, "learning_rate": 1.2225641098406928e-05, "loss": 0.2878, "step": 8706 }, { "epoch": 1.7164826498422712, "grad_norm": 0.5070101532510752, "learning_rate": 1.2224129923361813e-05, "loss": 0.3453, "step": 8707 }, { "epoch": 1.716679810725552, "grad_norm": 0.4988378522423907, "learning_rate": 1.2222618694880187e-05, "loss": 0.3697, "step": 8708 }, { "epoch": 1.7168769716088328, "grad_norm": 0.5033574544758712, "learning_rate": 1.2221107412998352e-05, "loss": 0.3118, "step": 8709 }, { "epoch": 1.7170741324921135, "grad_norm": 0.4890282116096399, "learning_rate": 1.2219596077752629e-05, "loss": 0.325, "step": 8710 }, { "epoch": 1.7172712933753944, "grad_norm": 0.4760094568747338, "learning_rate": 1.221808468917932e-05, "loss": 0.3281, "step": 8711 }, { "epoch": 1.7174684542586751, "grad_norm": 0.48764166466975545, "learning_rate": 1.221657324731474e-05, "loss": 0.3331, "step": 8712 }, { "epoch": 1.7176656151419558, "grad_norm": 0.4517243985708021, "learning_rate": 1.22150617521952e-05, "loss": 0.34, "step": 8713 }, { "epoch": 1.7178627760252367, "grad_norm": 0.5139727951740557, "learning_rate": 1.2213550203857025e-05, "loss": 0.3511, "step": 8714 }, { "epoch": 1.7180599369085172, "grad_norm": 0.45072979492101384, "learning_rate": 1.2212038602336518e-05, "loss": 0.3228, "step": 8715 }, { "epoch": 1.7182570977917981, "grad_norm": 0.4844004632125416, "learning_rate": 1.2210526947670003e-05, "loss": 0.3361, "step": 8716 }, { "epoch": 1.7184542586750788, "grad_norm": 0.48156386430034853, "learning_rate": 1.22090152398938e-05, "loss": 0.3579, "step": 8717 }, { "epoch": 1.7186514195583595, "grad_norm": 0.4801222110814163, "learning_rate": 1.2207503479044224e-05, "loss": 0.3327, "step": 8718 }, { "epoch": 1.7188485804416405, "grad_norm": 0.4633028361510336, "learning_rate": 1.2205991665157604e-05, "loss": 0.3361, "step": 8719 }, { "epoch": 1.7190457413249212, "grad_norm": 0.4954639943287471, "learning_rate": 1.2204479798270252e-05, "loss": 0.3555, "step": 8720 }, { "epoch": 1.7192429022082019, "grad_norm": 0.5071392340252393, "learning_rate": 1.2202967878418504e-05, "loss": 0.3311, "step": 8721 }, { "epoch": 1.7194400630914828, "grad_norm": 0.4924112369717431, "learning_rate": 1.2201455905638673e-05, "loss": 0.3343, "step": 8722 }, { "epoch": 1.7196372239747633, "grad_norm": 0.5281013648224745, "learning_rate": 1.2199943879967092e-05, "loss": 0.3661, "step": 8723 }, { "epoch": 1.7198343848580442, "grad_norm": 0.4188521660470496, "learning_rate": 1.2198431801440087e-05, "loss": 0.2792, "step": 8724 }, { "epoch": 1.7200315457413249, "grad_norm": 0.514554223490542, "learning_rate": 1.2196919670093989e-05, "loss": 0.351, "step": 8725 }, { "epoch": 1.7202287066246056, "grad_norm": 0.4773669903370585, "learning_rate": 1.2195407485965129e-05, "loss": 0.3371, "step": 8726 }, { "epoch": 1.7204258675078865, "grad_norm": 0.4486097522414558, "learning_rate": 1.2193895249089833e-05, "loss": 0.3221, "step": 8727 }, { "epoch": 1.7206230283911672, "grad_norm": 0.4514574538208784, "learning_rate": 1.2192382959504438e-05, "loss": 0.3146, "step": 8728 }, { "epoch": 1.720820189274448, "grad_norm": 0.4906075691840977, "learning_rate": 1.2190870617245279e-05, "loss": 0.3452, "step": 8729 }, { "epoch": 1.7210173501577288, "grad_norm": 0.4728367195594932, "learning_rate": 1.2189358222348685e-05, "loss": 0.34, "step": 8730 }, { "epoch": 1.7212145110410093, "grad_norm": 0.47426809412562415, "learning_rate": 1.2187845774850999e-05, "loss": 0.2974, "step": 8731 }, { "epoch": 1.7214116719242902, "grad_norm": 0.4676329793425546, "learning_rate": 1.2186333274788558e-05, "loss": 0.3375, "step": 8732 }, { "epoch": 1.721608832807571, "grad_norm": 0.467140693302685, "learning_rate": 1.2184820722197696e-05, "loss": 0.3224, "step": 8733 }, { "epoch": 1.7218059936908516, "grad_norm": 0.492299563007178, "learning_rate": 1.2183308117114759e-05, "loss": 0.3401, "step": 8734 }, { "epoch": 1.7220031545741326, "grad_norm": 0.47469376128492763, "learning_rate": 1.2181795459576085e-05, "loss": 0.3676, "step": 8735 }, { "epoch": 1.7222003154574133, "grad_norm": 0.482130929800137, "learning_rate": 1.2180282749618017e-05, "loss": 0.332, "step": 8736 }, { "epoch": 1.722397476340694, "grad_norm": 0.5031498208900129, "learning_rate": 1.2178769987276902e-05, "loss": 0.3525, "step": 8737 }, { "epoch": 1.7225946372239749, "grad_norm": 0.48826854933895736, "learning_rate": 1.2177257172589086e-05, "loss": 0.3464, "step": 8738 }, { "epoch": 1.7227917981072554, "grad_norm": 0.475972420949733, "learning_rate": 1.2175744305590907e-05, "loss": 0.3193, "step": 8739 }, { "epoch": 1.7229889589905363, "grad_norm": 0.49949539660408465, "learning_rate": 1.2174231386318724e-05, "loss": 0.3567, "step": 8740 }, { "epoch": 1.723186119873817, "grad_norm": 0.49325281815052363, "learning_rate": 1.2172718414808877e-05, "loss": 0.3581, "step": 8741 }, { "epoch": 1.7233832807570977, "grad_norm": 0.4777530691912601, "learning_rate": 1.2171205391097724e-05, "loss": 0.3277, "step": 8742 }, { "epoch": 1.7235804416403786, "grad_norm": 0.47301218354355706, "learning_rate": 1.216969231522161e-05, "loss": 0.3346, "step": 8743 }, { "epoch": 1.7237776025236593, "grad_norm": 0.6201310559734784, "learning_rate": 1.2168179187216893e-05, "loss": 0.3247, "step": 8744 }, { "epoch": 1.72397476340694, "grad_norm": 0.4669955469618345, "learning_rate": 1.2166666007119925e-05, "loss": 0.3382, "step": 8745 }, { "epoch": 1.724171924290221, "grad_norm": 0.47156930793491, "learning_rate": 1.2165152774967061e-05, "loss": 0.348, "step": 8746 }, { "epoch": 1.7243690851735016, "grad_norm": 0.5146898131222495, "learning_rate": 1.2163639490794659e-05, "loss": 0.3423, "step": 8747 }, { "epoch": 1.7245662460567823, "grad_norm": 0.4748624471052609, "learning_rate": 1.2162126154639073e-05, "loss": 0.3329, "step": 8748 }, { "epoch": 1.7247634069400632, "grad_norm": 0.5041545803945943, "learning_rate": 1.2160612766536668e-05, "loss": 0.3429, "step": 8749 }, { "epoch": 1.7249605678233437, "grad_norm": 0.4560606048941914, "learning_rate": 1.21590993265238e-05, "loss": 0.3329, "step": 8750 }, { "epoch": 1.7251577287066246, "grad_norm": 0.4920455686195489, "learning_rate": 1.2157585834636834e-05, "loss": 0.3369, "step": 8751 }, { "epoch": 1.7253548895899053, "grad_norm": 0.4817930597858489, "learning_rate": 1.2156072290912126e-05, "loss": 0.3123, "step": 8752 }, { "epoch": 1.725552050473186, "grad_norm": 0.4933196910433103, "learning_rate": 1.2154558695386049e-05, "loss": 0.3625, "step": 8753 }, { "epoch": 1.725749211356467, "grad_norm": 0.43787576840008613, "learning_rate": 1.2153045048094963e-05, "loss": 0.3305, "step": 8754 }, { "epoch": 1.7259463722397477, "grad_norm": 0.4646297207054528, "learning_rate": 1.2151531349075236e-05, "loss": 0.3276, "step": 8755 }, { "epoch": 1.7261435331230284, "grad_norm": 0.459618597411763, "learning_rate": 1.2150017598363236e-05, "loss": 0.3039, "step": 8756 }, { "epoch": 1.7263406940063093, "grad_norm": 0.48044932107882354, "learning_rate": 1.2148503795995332e-05, "loss": 0.3209, "step": 8757 }, { "epoch": 1.7265378548895898, "grad_norm": 0.48647578029445776, "learning_rate": 1.2146989942007891e-05, "loss": 0.3308, "step": 8758 }, { "epoch": 1.7267350157728707, "grad_norm": 0.4518527350354567, "learning_rate": 1.2145476036437294e-05, "loss": 0.3233, "step": 8759 }, { "epoch": 1.7269321766561514, "grad_norm": 0.48074502143691444, "learning_rate": 1.21439620793199e-05, "loss": 0.3538, "step": 8760 }, { "epoch": 1.727129337539432, "grad_norm": 0.4662467836506099, "learning_rate": 1.2142448070692096e-05, "loss": 0.3411, "step": 8761 }, { "epoch": 1.727326498422713, "grad_norm": 0.5184102450636691, "learning_rate": 1.2140934010590249e-05, "loss": 0.3508, "step": 8762 }, { "epoch": 1.7275236593059937, "grad_norm": 0.54097531724488, "learning_rate": 1.213941989905074e-05, "loss": 0.3291, "step": 8763 }, { "epoch": 1.7277208201892744, "grad_norm": 0.4815838736915963, "learning_rate": 1.2137905736109946e-05, "loss": 0.3279, "step": 8764 }, { "epoch": 1.7279179810725553, "grad_norm": 0.47938770142115866, "learning_rate": 1.213639152180424e-05, "loss": 0.3222, "step": 8765 }, { "epoch": 1.7281151419558358, "grad_norm": 0.44462625843755743, "learning_rate": 1.2134877256170012e-05, "loss": 0.3056, "step": 8766 }, { "epoch": 1.7283123028391167, "grad_norm": 0.48782541427748793, "learning_rate": 1.2133362939243638e-05, "loss": 0.3382, "step": 8767 }, { "epoch": 1.7285094637223974, "grad_norm": 0.47686312944629256, "learning_rate": 1.2131848571061501e-05, "loss": 0.3336, "step": 8768 }, { "epoch": 1.7287066246056781, "grad_norm": 0.4972392941654052, "learning_rate": 1.2130334151659987e-05, "loss": 0.3457, "step": 8769 }, { "epoch": 1.728903785488959, "grad_norm": 0.44658596263964356, "learning_rate": 1.2128819681075476e-05, "loss": 0.3113, "step": 8770 }, { "epoch": 1.7291009463722398, "grad_norm": 0.6132762769001334, "learning_rate": 1.2127305159344358e-05, "loss": 0.2762, "step": 8771 }, { "epoch": 1.7292981072555205, "grad_norm": 0.5184653807356759, "learning_rate": 1.2125790586503024e-05, "loss": 0.3589, "step": 8772 }, { "epoch": 1.7294952681388014, "grad_norm": 0.48887264222267973, "learning_rate": 1.2124275962587857e-05, "loss": 0.3458, "step": 8773 }, { "epoch": 1.7296924290220819, "grad_norm": 0.48006157666774124, "learning_rate": 1.212276128763525e-05, "loss": 0.3261, "step": 8774 }, { "epoch": 1.7298895899053628, "grad_norm": 0.4705936894192923, "learning_rate": 1.2121246561681592e-05, "loss": 0.3318, "step": 8775 }, { "epoch": 1.7300867507886435, "grad_norm": 0.4680969494433612, "learning_rate": 1.2119731784763278e-05, "loss": 0.3278, "step": 8776 }, { "epoch": 1.7302839116719242, "grad_norm": 0.5204916184909986, "learning_rate": 1.21182169569167e-05, "loss": 0.3461, "step": 8777 }, { "epoch": 1.7304810725552051, "grad_norm": 0.4682144834045916, "learning_rate": 1.2116702078178255e-05, "loss": 0.3115, "step": 8778 }, { "epoch": 1.7306782334384858, "grad_norm": 0.4684797179301652, "learning_rate": 1.2115187148584338e-05, "loss": 0.3297, "step": 8779 }, { "epoch": 1.7308753943217665, "grad_norm": 0.5149570001266986, "learning_rate": 1.2113672168171347e-05, "loss": 0.3411, "step": 8780 }, { "epoch": 1.7310725552050474, "grad_norm": 0.4918293028719042, "learning_rate": 1.2112157136975678e-05, "loss": 0.3302, "step": 8781 }, { "epoch": 1.731269716088328, "grad_norm": 0.4680392185456968, "learning_rate": 1.2110642055033737e-05, "loss": 0.313, "step": 8782 }, { "epoch": 1.7314668769716088, "grad_norm": 0.4675694840904993, "learning_rate": 1.2109126922381917e-05, "loss": 0.3383, "step": 8783 }, { "epoch": 1.7316640378548895, "grad_norm": 0.5051346681197235, "learning_rate": 1.2107611739056624e-05, "loss": 0.3679, "step": 8784 }, { "epoch": 1.7318611987381702, "grad_norm": 0.48965432777518686, "learning_rate": 1.2106096505094264e-05, "loss": 0.3412, "step": 8785 }, { "epoch": 1.7320583596214512, "grad_norm": 0.5179854164790161, "learning_rate": 1.2104581220531237e-05, "loss": 0.3659, "step": 8786 }, { "epoch": 1.7322555205047319, "grad_norm": 0.4814569942652689, "learning_rate": 1.2103065885403955e-05, "loss": 0.3369, "step": 8787 }, { "epoch": 1.7324526813880126, "grad_norm": 0.4885561961779997, "learning_rate": 1.2101550499748818e-05, "loss": 0.3446, "step": 8788 }, { "epoch": 1.7326498422712935, "grad_norm": 0.49161198978038273, "learning_rate": 1.210003506360224e-05, "loss": 0.3177, "step": 8789 }, { "epoch": 1.7328470031545742, "grad_norm": 0.48634200842172337, "learning_rate": 1.2098519577000627e-05, "loss": 0.3553, "step": 8790 }, { "epoch": 1.7330441640378549, "grad_norm": 0.48325106568491305, "learning_rate": 1.2097004039980391e-05, "loss": 0.3519, "step": 8791 }, { "epoch": 1.7332413249211358, "grad_norm": 0.4641044710550502, "learning_rate": 1.2095488452577946e-05, "loss": 0.3123, "step": 8792 }, { "epoch": 1.7334384858044163, "grad_norm": 0.47358093899817655, "learning_rate": 1.2093972814829701e-05, "loss": 0.3162, "step": 8793 }, { "epoch": 1.7336356466876972, "grad_norm": 0.4913256828106219, "learning_rate": 1.2092457126772074e-05, "loss": 0.3531, "step": 8794 }, { "epoch": 1.733832807570978, "grad_norm": 0.4912311679309798, "learning_rate": 1.2090941388441482e-05, "loss": 0.3359, "step": 8795 }, { "epoch": 1.7340299684542586, "grad_norm": 0.47304417855315983, "learning_rate": 1.2089425599874335e-05, "loss": 0.3022, "step": 8796 }, { "epoch": 1.7342271293375395, "grad_norm": 0.4672250909616135, "learning_rate": 1.208790976110706e-05, "loss": 0.3027, "step": 8797 }, { "epoch": 1.7344242902208202, "grad_norm": 0.4647836250132962, "learning_rate": 1.2086393872176067e-05, "loss": 0.3144, "step": 8798 }, { "epoch": 1.734621451104101, "grad_norm": 0.5010956167290279, "learning_rate": 1.2084877933117784e-05, "loss": 0.3641, "step": 8799 }, { "epoch": 1.7348186119873819, "grad_norm": 0.46610623517280136, "learning_rate": 1.2083361943968628e-05, "loss": 0.3291, "step": 8800 }, { "epoch": 1.7350157728706623, "grad_norm": 0.5069304515479853, "learning_rate": 1.2081845904765026e-05, "loss": 0.3766, "step": 8801 }, { "epoch": 1.7352129337539433, "grad_norm": 0.4418811498806394, "learning_rate": 1.2080329815543398e-05, "loss": 0.3065, "step": 8802 }, { "epoch": 1.735410094637224, "grad_norm": 0.5417087924566732, "learning_rate": 1.2078813676340171e-05, "loss": 0.322, "step": 8803 }, { "epoch": 1.7356072555205047, "grad_norm": 0.4885878208354627, "learning_rate": 1.2077297487191771e-05, "loss": 0.3166, "step": 8804 }, { "epoch": 1.7358044164037856, "grad_norm": 0.46819488040628704, "learning_rate": 1.2075781248134624e-05, "loss": 0.3308, "step": 8805 }, { "epoch": 1.7360015772870663, "grad_norm": 0.4725168433278604, "learning_rate": 1.2074264959205167e-05, "loss": 0.3248, "step": 8806 }, { "epoch": 1.736198738170347, "grad_norm": 0.4685757443962958, "learning_rate": 1.2072748620439816e-05, "loss": 0.3162, "step": 8807 }, { "epoch": 1.736395899053628, "grad_norm": 0.4944655388806196, "learning_rate": 1.2071232231875017e-05, "loss": 0.3465, "step": 8808 }, { "epoch": 1.7365930599369084, "grad_norm": 0.48192011814447067, "learning_rate": 1.2069715793547192e-05, "loss": 0.3312, "step": 8809 }, { "epoch": 1.7367902208201893, "grad_norm": 0.5000553460824155, "learning_rate": 1.2068199305492781e-05, "loss": 0.3439, "step": 8810 }, { "epoch": 1.73698738170347, "grad_norm": 0.47422303429265467, "learning_rate": 1.2066682767748212e-05, "loss": 0.3176, "step": 8811 }, { "epoch": 1.7371845425867507, "grad_norm": 0.449422917568121, "learning_rate": 1.2065166180349928e-05, "loss": 0.3241, "step": 8812 }, { "epoch": 1.7373817034700316, "grad_norm": 0.49595270985884926, "learning_rate": 1.2063649543334364e-05, "loss": 0.3461, "step": 8813 }, { "epoch": 1.7375788643533123, "grad_norm": 0.44630470527387806, "learning_rate": 1.2062132856737958e-05, "loss": 0.3099, "step": 8814 }, { "epoch": 1.737776025236593, "grad_norm": 2.0412021740875836, "learning_rate": 1.2060616120597149e-05, "loss": 0.3194, "step": 8815 }, { "epoch": 1.737973186119874, "grad_norm": 0.47472434744895176, "learning_rate": 1.2059099334948376e-05, "loss": 0.3325, "step": 8816 }, { "epoch": 1.7381703470031544, "grad_norm": 0.481225491809552, "learning_rate": 1.2057582499828086e-05, "loss": 0.336, "step": 8817 }, { "epoch": 1.7383675078864353, "grad_norm": 0.44821446660142955, "learning_rate": 1.205606561527272e-05, "loss": 0.3134, "step": 8818 }, { "epoch": 1.738564668769716, "grad_norm": 0.4813063073960643, "learning_rate": 1.205454868131872e-05, "loss": 0.3246, "step": 8819 }, { "epoch": 1.7387618296529967, "grad_norm": 0.4753107258555097, "learning_rate": 1.2053031698002533e-05, "loss": 0.333, "step": 8820 }, { "epoch": 1.7389589905362777, "grad_norm": 0.5613865154278634, "learning_rate": 1.2051514665360606e-05, "loss": 0.3334, "step": 8821 }, { "epoch": 1.7391561514195584, "grad_norm": 0.5166325707314708, "learning_rate": 1.2049997583429389e-05, "loss": 0.3388, "step": 8822 }, { "epoch": 1.739353312302839, "grad_norm": 0.4689529203647351, "learning_rate": 1.2048480452245328e-05, "loss": 0.3409, "step": 8823 }, { "epoch": 1.73955047318612, "grad_norm": 0.5124805778270197, "learning_rate": 1.2046963271844876e-05, "loss": 0.3589, "step": 8824 }, { "epoch": 1.7397476340694005, "grad_norm": 0.4555862428808988, "learning_rate": 1.2045446042264482e-05, "loss": 0.3218, "step": 8825 }, { "epoch": 1.7399447949526814, "grad_norm": 0.46880230449353444, "learning_rate": 1.2043928763540598e-05, "loss": 0.3204, "step": 8826 }, { "epoch": 1.740141955835962, "grad_norm": 0.4661584032901464, "learning_rate": 1.2042411435709683e-05, "loss": 0.3008, "step": 8827 }, { "epoch": 1.7403391167192428, "grad_norm": 0.49560394022565896, "learning_rate": 1.2040894058808183e-05, "loss": 0.3304, "step": 8828 }, { "epoch": 1.7405362776025237, "grad_norm": 0.477044636888031, "learning_rate": 1.2039376632872565e-05, "loss": 0.3348, "step": 8829 }, { "epoch": 1.7407334384858044, "grad_norm": 0.4768628146082518, "learning_rate": 1.2037859157939278e-05, "loss": 0.3517, "step": 8830 }, { "epoch": 1.7409305993690851, "grad_norm": 0.46710532512756137, "learning_rate": 1.2036341634044785e-05, "loss": 0.3378, "step": 8831 }, { "epoch": 1.741127760252366, "grad_norm": 0.4838108367315246, "learning_rate": 1.2034824061225545e-05, "loss": 0.3403, "step": 8832 }, { "epoch": 1.7413249211356467, "grad_norm": 0.4814792464848545, "learning_rate": 1.2033306439518017e-05, "loss": 0.328, "step": 8833 }, { "epoch": 1.7415220820189274, "grad_norm": 0.4927086846869189, "learning_rate": 1.2031788768958666e-05, "loss": 0.3458, "step": 8834 }, { "epoch": 1.7417192429022084, "grad_norm": 0.4617442660897952, "learning_rate": 1.203027104958395e-05, "loss": 0.3377, "step": 8835 }, { "epoch": 1.7419164037854888, "grad_norm": 0.48638088204825836, "learning_rate": 1.2028753281430343e-05, "loss": 0.3269, "step": 8836 }, { "epoch": 1.7421135646687698, "grad_norm": 0.5049199749667796, "learning_rate": 1.20272354645343e-05, "loss": 0.3766, "step": 8837 }, { "epoch": 1.7423107255520505, "grad_norm": 0.48188659695790537, "learning_rate": 1.2025717598932293e-05, "loss": 0.3414, "step": 8838 }, { "epoch": 1.7425078864353312, "grad_norm": 0.4738832139863155, "learning_rate": 1.2024199684660792e-05, "loss": 0.3253, "step": 8839 }, { "epoch": 1.742705047318612, "grad_norm": 0.477503317970411, "learning_rate": 1.202268172175626e-05, "loss": 0.3344, "step": 8840 }, { "epoch": 1.7429022082018928, "grad_norm": 0.5304814544412956, "learning_rate": 1.2021163710255173e-05, "loss": 0.334, "step": 8841 }, { "epoch": 1.7430993690851735, "grad_norm": 0.508157775015286, "learning_rate": 1.2019645650193999e-05, "loss": 0.3512, "step": 8842 }, { "epoch": 1.7432965299684544, "grad_norm": 0.5017879226850557, "learning_rate": 1.2018127541609212e-05, "loss": 0.333, "step": 8843 }, { "epoch": 1.743493690851735, "grad_norm": 0.4867632085277509, "learning_rate": 1.2016609384537287e-05, "loss": 0.3366, "step": 8844 }, { "epoch": 1.7436908517350158, "grad_norm": 0.5105248897104497, "learning_rate": 1.2015091179014696e-05, "loss": 0.3556, "step": 8845 }, { "epoch": 1.7438880126182965, "grad_norm": 0.4608424473916468, "learning_rate": 1.2013572925077919e-05, "loss": 0.3371, "step": 8846 }, { "epoch": 1.7440851735015772, "grad_norm": 0.45038670620879856, "learning_rate": 1.2012054622763425e-05, "loss": 0.3052, "step": 8847 }, { "epoch": 1.7442823343848581, "grad_norm": 0.5081471358396383, "learning_rate": 1.2010536272107706e-05, "loss": 0.3455, "step": 8848 }, { "epoch": 1.7444794952681388, "grad_norm": 0.4644321538998626, "learning_rate": 1.200901787314723e-05, "loss": 0.3091, "step": 8849 }, { "epoch": 1.7446766561514195, "grad_norm": 0.48115698751245345, "learning_rate": 1.2007499425918483e-05, "loss": 0.3482, "step": 8850 }, { "epoch": 1.7448738170347005, "grad_norm": 0.5228016067341045, "learning_rate": 1.2005980930457946e-05, "loss": 0.3275, "step": 8851 }, { "epoch": 1.745070977917981, "grad_norm": 0.583985733994483, "learning_rate": 1.2004462386802098e-05, "loss": 0.3648, "step": 8852 }, { "epoch": 1.7452681388012619, "grad_norm": 0.45939749735947777, "learning_rate": 1.2002943794987432e-05, "loss": 0.3389, "step": 8853 }, { "epoch": 1.7454652996845426, "grad_norm": 0.4648960544103313, "learning_rate": 1.2001425155050423e-05, "loss": 0.3412, "step": 8854 }, { "epoch": 1.7456624605678233, "grad_norm": 0.4793461007710896, "learning_rate": 1.1999906467027568e-05, "loss": 0.3385, "step": 8855 }, { "epoch": 1.7458596214511042, "grad_norm": 0.4611557554227105, "learning_rate": 1.1998387730955345e-05, "loss": 0.2932, "step": 8856 }, { "epoch": 1.7460567823343849, "grad_norm": 0.4679923259603527, "learning_rate": 1.1996868946870252e-05, "loss": 0.3347, "step": 8857 }, { "epoch": 1.7462539432176656, "grad_norm": 0.47215734774680346, "learning_rate": 1.1995350114808772e-05, "loss": 0.3437, "step": 8858 }, { "epoch": 1.7464511041009465, "grad_norm": 0.4648472195023899, "learning_rate": 1.1993831234807401e-05, "loss": 0.3623, "step": 8859 }, { "epoch": 1.746648264984227, "grad_norm": 0.473482682420203, "learning_rate": 1.1992312306902625e-05, "loss": 0.3372, "step": 8860 }, { "epoch": 1.746845425867508, "grad_norm": 0.46094418222077305, "learning_rate": 1.1990793331130944e-05, "loss": 0.3221, "step": 8861 }, { "epoch": 1.7470425867507886, "grad_norm": 0.5016161341201105, "learning_rate": 1.1989274307528848e-05, "loss": 0.3539, "step": 8862 }, { "epoch": 1.7472397476340693, "grad_norm": 0.49099642673007926, "learning_rate": 1.1987755236132839e-05, "loss": 0.3462, "step": 8863 }, { "epoch": 1.7474369085173502, "grad_norm": 0.48487743130133104, "learning_rate": 1.1986236116979406e-05, "loss": 0.3585, "step": 8864 }, { "epoch": 1.747634069400631, "grad_norm": 0.46908185404595404, "learning_rate": 1.1984716950105054e-05, "loss": 0.3306, "step": 8865 }, { "epoch": 1.7478312302839116, "grad_norm": 0.4832688212778634, "learning_rate": 1.1983197735546275e-05, "loss": 0.3631, "step": 8866 }, { "epoch": 1.7480283911671926, "grad_norm": 0.4602457165492578, "learning_rate": 1.1981678473339576e-05, "loss": 0.3071, "step": 8867 }, { "epoch": 1.748225552050473, "grad_norm": 0.43528249927314083, "learning_rate": 1.1980159163521454e-05, "loss": 0.3135, "step": 8868 }, { "epoch": 1.748422712933754, "grad_norm": 0.4695790802517424, "learning_rate": 1.1978639806128416e-05, "loss": 0.331, "step": 8869 }, { "epoch": 1.7486198738170347, "grad_norm": 0.5280131284138565, "learning_rate": 1.1977120401196963e-05, "loss": 0.3353, "step": 8870 }, { "epoch": 1.7488170347003154, "grad_norm": 0.4681473144779894, "learning_rate": 1.1975600948763597e-05, "loss": 0.3286, "step": 8871 }, { "epoch": 1.7490141955835963, "grad_norm": 0.4604810167753171, "learning_rate": 1.197408144886483e-05, "loss": 0.3244, "step": 8872 }, { "epoch": 1.749211356466877, "grad_norm": 0.46975125282307023, "learning_rate": 1.1972561901537164e-05, "loss": 0.3175, "step": 8873 }, { "epoch": 1.7494085173501577, "grad_norm": 0.4932348572031116, "learning_rate": 1.1971042306817113e-05, "loss": 0.3538, "step": 8874 }, { "epoch": 1.7496056782334386, "grad_norm": 0.46969073205555756, "learning_rate": 1.196952266474118e-05, "loss": 0.3453, "step": 8875 }, { "epoch": 1.749802839116719, "grad_norm": 0.4808934210094641, "learning_rate": 1.1968002975345882e-05, "loss": 0.3415, "step": 8876 }, { "epoch": 1.749802839116719, "eval_loss": 0.42366865277290344, "eval_runtime": 344.3756, "eval_samples_per_second": 23.608, "eval_steps_per_second": 1.478, "step": 8876 }, { "epoch": 1.75, "grad_norm": 0.48060930083956127, "learning_rate": 1.1966483238667725e-05, "loss": 0.3601, "step": 8877 }, { "epoch": 1.750197160883281, "grad_norm": 0.4821573663894791, "learning_rate": 1.1964963454743228e-05, "loss": 0.3615, "step": 8878 }, { "epoch": 1.7503943217665614, "grad_norm": 0.47251836827858207, "learning_rate": 1.1963443623608897e-05, "loss": 0.3372, "step": 8879 }, { "epoch": 1.7505914826498423, "grad_norm": 0.47137428118391156, "learning_rate": 1.1961923745301256e-05, "loss": 0.3314, "step": 8880 }, { "epoch": 1.750788643533123, "grad_norm": 0.4957152153915973, "learning_rate": 1.1960403819856815e-05, "loss": 0.3537, "step": 8881 }, { "epoch": 1.7509858044164037, "grad_norm": 0.4470537419100556, "learning_rate": 1.1958883847312092e-05, "loss": 0.3231, "step": 8882 }, { "epoch": 1.7511829652996846, "grad_norm": 0.4608113402256386, "learning_rate": 1.1957363827703612e-05, "loss": 0.3211, "step": 8883 }, { "epoch": 1.7513801261829653, "grad_norm": 0.49969096886092734, "learning_rate": 1.1955843761067886e-05, "loss": 0.337, "step": 8884 }, { "epoch": 1.751577287066246, "grad_norm": 0.480041688896454, "learning_rate": 1.1954323647441439e-05, "loss": 0.3579, "step": 8885 }, { "epoch": 1.751774447949527, "grad_norm": 0.45142752291944904, "learning_rate": 1.1952803486860794e-05, "loss": 0.314, "step": 8886 }, { "epoch": 1.7519716088328074, "grad_norm": 0.48553207420875816, "learning_rate": 1.1951283279362471e-05, "loss": 0.3485, "step": 8887 }, { "epoch": 1.7521687697160884, "grad_norm": 0.4724742952583676, "learning_rate": 1.1949763024982997e-05, "loss": 0.3496, "step": 8888 }, { "epoch": 1.752365930599369, "grad_norm": 0.4854477143262649, "learning_rate": 1.1948242723758896e-05, "loss": 0.3421, "step": 8889 }, { "epoch": 1.7525630914826498, "grad_norm": 0.46735796360977727, "learning_rate": 1.1946722375726694e-05, "loss": 0.3375, "step": 8890 }, { "epoch": 1.7527602523659307, "grad_norm": 0.4637010683614967, "learning_rate": 1.194520198092292e-05, "loss": 0.3038, "step": 8891 }, { "epoch": 1.7529574132492114, "grad_norm": 0.4793650345380348, "learning_rate": 1.1943681539384103e-05, "loss": 0.3252, "step": 8892 }, { "epoch": 1.753154574132492, "grad_norm": 0.4636919118333324, "learning_rate": 1.194216105114677e-05, "loss": 0.3256, "step": 8893 }, { "epoch": 1.753351735015773, "grad_norm": 0.47895800206523675, "learning_rate": 1.194064051624745e-05, "loss": 0.325, "step": 8894 }, { "epoch": 1.7535488958990535, "grad_norm": 0.46796297495549594, "learning_rate": 1.1939119934722685e-05, "loss": 0.322, "step": 8895 }, { "epoch": 1.7537460567823344, "grad_norm": 0.46995032074065873, "learning_rate": 1.1937599306609e-05, "loss": 0.35, "step": 8896 }, { "epoch": 1.7539432176656151, "grad_norm": 55.908718330694654, "learning_rate": 1.193607863194293e-05, "loss": 0.5092, "step": 8897 }, { "epoch": 1.7541403785488958, "grad_norm": 0.5121025623411513, "learning_rate": 1.1934557910761013e-05, "loss": 0.3565, "step": 8898 }, { "epoch": 1.7543375394321767, "grad_norm": 0.45082554071248027, "learning_rate": 1.1933037143099786e-05, "loss": 0.3044, "step": 8899 }, { "epoch": 1.7545347003154574, "grad_norm": 0.47611001136680525, "learning_rate": 1.1931516328995782e-05, "loss": 0.3135, "step": 8900 }, { "epoch": 1.7547318611987381, "grad_norm": 0.4768767652243685, "learning_rate": 1.1929995468485545e-05, "loss": 0.3442, "step": 8901 }, { "epoch": 1.754929022082019, "grad_norm": 0.49669976262352766, "learning_rate": 1.1928474561605612e-05, "loss": 0.3118, "step": 8902 }, { "epoch": 1.7551261829652995, "grad_norm": 0.4900205596977599, "learning_rate": 1.1926953608392522e-05, "loss": 0.3165, "step": 8903 }, { "epoch": 1.7553233438485805, "grad_norm": 0.48731950619097536, "learning_rate": 1.1925432608882826e-05, "loss": 0.3524, "step": 8904 }, { "epoch": 1.7555205047318612, "grad_norm": 0.5192299995573778, "learning_rate": 1.1923911563113053e-05, "loss": 0.3761, "step": 8905 }, { "epoch": 1.7557176656151419, "grad_norm": 0.4698410536685136, "learning_rate": 1.1922390471119763e-05, "loss": 0.3271, "step": 8906 }, { "epoch": 1.7559148264984228, "grad_norm": 0.4891636860035787, "learning_rate": 1.1920869332939488e-05, "loss": 0.3466, "step": 8907 }, { "epoch": 1.7561119873817035, "grad_norm": 0.49905335282315777, "learning_rate": 1.1919348148608782e-05, "loss": 0.3497, "step": 8908 }, { "epoch": 1.7563091482649842, "grad_norm": 0.5085891428975882, "learning_rate": 1.1917826918164193e-05, "loss": 0.3384, "step": 8909 }, { "epoch": 1.756506309148265, "grad_norm": 0.49833646762619443, "learning_rate": 1.1916305641642265e-05, "loss": 0.3471, "step": 8910 }, { "epoch": 1.7567034700315456, "grad_norm": 0.49886395851113163, "learning_rate": 1.1914784319079554e-05, "loss": 0.3834, "step": 8911 }, { "epoch": 1.7569006309148265, "grad_norm": 0.5085736701155745, "learning_rate": 1.1913262950512605e-05, "loss": 0.348, "step": 8912 }, { "epoch": 1.7570977917981072, "grad_norm": 0.5032507524362276, "learning_rate": 1.1911741535977972e-05, "loss": 0.3574, "step": 8913 }, { "epoch": 1.757294952681388, "grad_norm": 0.48298477571501336, "learning_rate": 1.1910220075512213e-05, "loss": 0.318, "step": 8914 }, { "epoch": 1.7574921135646688, "grad_norm": 0.48667269740164176, "learning_rate": 1.1908698569151877e-05, "loss": 0.3329, "step": 8915 }, { "epoch": 1.7576892744479495, "grad_norm": 0.47116779180865714, "learning_rate": 1.190717701693352e-05, "loss": 0.3365, "step": 8916 }, { "epoch": 1.7578864353312302, "grad_norm": 0.4997514621039943, "learning_rate": 1.19056554188937e-05, "loss": 0.3235, "step": 8917 }, { "epoch": 1.7580835962145112, "grad_norm": 0.48309653739189073, "learning_rate": 1.1904133775068974e-05, "loss": 0.3161, "step": 8918 }, { "epoch": 1.7582807570977916, "grad_norm": 0.5210440531213502, "learning_rate": 1.1902612085495902e-05, "loss": 0.3631, "step": 8919 }, { "epoch": 1.7584779179810726, "grad_norm": 0.4921227572826234, "learning_rate": 1.1901090350211037e-05, "loss": 0.3294, "step": 8920 }, { "epoch": 1.7586750788643533, "grad_norm": 0.49727990636910413, "learning_rate": 1.1899568569250951e-05, "loss": 0.3416, "step": 8921 }, { "epoch": 1.758872239747634, "grad_norm": 0.5053613903983315, "learning_rate": 1.1898046742652196e-05, "loss": 0.3138, "step": 8922 }, { "epoch": 1.7590694006309149, "grad_norm": 0.45734876191997076, "learning_rate": 1.1896524870451344e-05, "loss": 0.3403, "step": 8923 }, { "epoch": 1.7592665615141956, "grad_norm": 0.43524997755300077, "learning_rate": 1.1895002952684952e-05, "loss": 0.3275, "step": 8924 }, { "epoch": 1.7594637223974763, "grad_norm": 0.46379750213806303, "learning_rate": 1.189348098938959e-05, "loss": 0.3241, "step": 8925 }, { "epoch": 1.7596608832807572, "grad_norm": 0.5193732674368089, "learning_rate": 1.1891958980601819e-05, "loss": 0.3229, "step": 8926 }, { "epoch": 1.759858044164038, "grad_norm": 0.5374560365743167, "learning_rate": 1.1890436926358214e-05, "loss": 0.3549, "step": 8927 }, { "epoch": 1.7600552050473186, "grad_norm": 0.5510070896034697, "learning_rate": 1.1888914826695336e-05, "loss": 0.3427, "step": 8928 }, { "epoch": 1.7602523659305995, "grad_norm": 0.500101497587192, "learning_rate": 1.1887392681649761e-05, "loss": 0.3519, "step": 8929 }, { "epoch": 1.76044952681388, "grad_norm": 0.5025929287914618, "learning_rate": 1.1885870491258054e-05, "loss": 0.3415, "step": 8930 }, { "epoch": 1.760646687697161, "grad_norm": 0.4493713242328926, "learning_rate": 1.1884348255556793e-05, "loss": 0.327, "step": 8931 }, { "epoch": 1.7608438485804416, "grad_norm": 0.49432274508122315, "learning_rate": 1.1882825974582546e-05, "loss": 0.3382, "step": 8932 }, { "epoch": 1.7610410094637223, "grad_norm": 0.4970567230452097, "learning_rate": 1.1881303648371889e-05, "loss": 0.3509, "step": 8933 }, { "epoch": 1.7612381703470033, "grad_norm": 0.47318188032190406, "learning_rate": 1.1879781276961396e-05, "loss": 0.3271, "step": 8934 }, { "epoch": 1.761435331230284, "grad_norm": 0.5242959226678022, "learning_rate": 1.1878258860387644e-05, "loss": 0.3749, "step": 8935 }, { "epoch": 1.7616324921135647, "grad_norm": 0.46123437786870813, "learning_rate": 1.1876736398687212e-05, "loss": 0.3346, "step": 8936 }, { "epoch": 1.7618296529968456, "grad_norm": 0.4597024041990773, "learning_rate": 1.1875213891896676e-05, "loss": 0.3226, "step": 8937 }, { "epoch": 1.762026813880126, "grad_norm": 0.4708652177872067, "learning_rate": 1.1873691340052615e-05, "loss": 0.3292, "step": 8938 }, { "epoch": 1.762223974763407, "grad_norm": 0.4520141786068151, "learning_rate": 1.1872168743191613e-05, "loss": 0.3026, "step": 8939 }, { "epoch": 1.7624211356466877, "grad_norm": 0.45200547022438, "learning_rate": 1.1870646101350247e-05, "loss": 0.3254, "step": 8940 }, { "epoch": 1.7626182965299684, "grad_norm": 0.4656284736142514, "learning_rate": 1.18691234145651e-05, "loss": 0.3146, "step": 8941 }, { "epoch": 1.7628154574132493, "grad_norm": 0.45349874844695887, "learning_rate": 1.1867600682872764e-05, "loss": 0.3145, "step": 8942 }, { "epoch": 1.76301261829653, "grad_norm": 0.4977307339476207, "learning_rate": 1.1866077906309812e-05, "loss": 0.3329, "step": 8943 }, { "epoch": 1.7632097791798107, "grad_norm": 0.47172564633086056, "learning_rate": 1.1864555084912839e-05, "loss": 0.32, "step": 8944 }, { "epoch": 1.7634069400630916, "grad_norm": 0.6274170785811407, "learning_rate": 1.1863032218718424e-05, "loss": 0.3686, "step": 8945 }, { "epoch": 1.763604100946372, "grad_norm": 0.4976279806243257, "learning_rate": 1.1861509307763166e-05, "loss": 0.3433, "step": 8946 }, { "epoch": 1.763801261829653, "grad_norm": 0.5010044169487304, "learning_rate": 1.1859986352083644e-05, "loss": 0.3308, "step": 8947 }, { "epoch": 1.7639984227129337, "grad_norm": 0.4709732593585705, "learning_rate": 1.185846335171645e-05, "loss": 0.3243, "step": 8948 }, { "epoch": 1.7641955835962144, "grad_norm": 0.46672840740361604, "learning_rate": 1.1856940306698182e-05, "loss": 0.3288, "step": 8949 }, { "epoch": 1.7643927444794953, "grad_norm": 0.4876404296213418, "learning_rate": 1.1855417217065427e-05, "loss": 0.3454, "step": 8950 }, { "epoch": 1.764589905362776, "grad_norm": 0.7344474694941918, "learning_rate": 1.1853894082854778e-05, "loss": 0.3343, "step": 8951 }, { "epoch": 1.7647870662460567, "grad_norm": 0.5281718261013864, "learning_rate": 1.185237090410283e-05, "loss": 0.3559, "step": 8952 }, { "epoch": 1.7649842271293377, "grad_norm": 0.45336944968705656, "learning_rate": 1.1850847680846181e-05, "loss": 0.3367, "step": 8953 }, { "epoch": 1.7651813880126181, "grad_norm": 0.48515961309525846, "learning_rate": 1.1849324413121424e-05, "loss": 0.35, "step": 8954 }, { "epoch": 1.765378548895899, "grad_norm": 0.4683396505847219, "learning_rate": 1.184780110096516e-05, "loss": 0.3291, "step": 8955 }, { "epoch": 1.7655757097791798, "grad_norm": 0.47251430009899725, "learning_rate": 1.1846277744413988e-05, "loss": 0.3143, "step": 8956 }, { "epoch": 1.7657728706624605, "grad_norm": 0.47184209011286815, "learning_rate": 1.1844754343504503e-05, "loss": 0.3256, "step": 8957 }, { "epoch": 1.7659700315457414, "grad_norm": 0.4783002948832379, "learning_rate": 1.1843230898273312e-05, "loss": 0.3487, "step": 8958 }, { "epoch": 1.766167192429022, "grad_norm": 0.5175752592893191, "learning_rate": 1.1841707408757012e-05, "loss": 0.348, "step": 8959 }, { "epoch": 1.7663643533123028, "grad_norm": 0.48774710368554364, "learning_rate": 1.184018387499221e-05, "loss": 0.3489, "step": 8960 }, { "epoch": 1.7665615141955837, "grad_norm": 0.5091512140833712, "learning_rate": 1.183866029701551e-05, "loss": 0.3437, "step": 8961 }, { "epoch": 1.7667586750788642, "grad_norm": 0.4579012245288681, "learning_rate": 1.1837136674863512e-05, "loss": 0.3116, "step": 8962 }, { "epoch": 1.7669558359621451, "grad_norm": 0.4607106881483517, "learning_rate": 1.1835613008572828e-05, "loss": 0.3182, "step": 8963 }, { "epoch": 1.7671529968454258, "grad_norm": 0.48460789200010285, "learning_rate": 1.1834089298180062e-05, "loss": 0.3515, "step": 8964 }, { "epoch": 1.7673501577287065, "grad_norm": 0.5066322063807643, "learning_rate": 1.1832565543721828e-05, "loss": 0.3611, "step": 8965 }, { "epoch": 1.7675473186119874, "grad_norm": 0.4619333716892967, "learning_rate": 1.1831041745234728e-05, "loss": 0.3183, "step": 8966 }, { "epoch": 1.7677444794952681, "grad_norm": 0.46313697004860765, "learning_rate": 1.1829517902755375e-05, "loss": 0.322, "step": 8967 }, { "epoch": 1.7679416403785488, "grad_norm": 0.6763284941264115, "learning_rate": 1.1827994016320381e-05, "loss": 0.3173, "step": 8968 }, { "epoch": 1.7681388012618298, "grad_norm": 0.47210950027001125, "learning_rate": 1.1826470085966357e-05, "loss": 0.3312, "step": 8969 }, { "epoch": 1.7683359621451105, "grad_norm": 0.47290245061079667, "learning_rate": 1.1824946111729922e-05, "loss": 0.3043, "step": 8970 }, { "epoch": 1.7685331230283912, "grad_norm": 0.479790151905134, "learning_rate": 1.1823422093647684e-05, "loss": 0.3469, "step": 8971 }, { "epoch": 1.768730283911672, "grad_norm": 0.4581566972384319, "learning_rate": 1.1821898031756265e-05, "loss": 0.33, "step": 8972 }, { "epoch": 1.7689274447949526, "grad_norm": 0.4721423210829706, "learning_rate": 1.1820373926092274e-05, "loss": 0.3159, "step": 8973 }, { "epoch": 1.7691246056782335, "grad_norm": 0.49894798175464766, "learning_rate": 1.181884977669234e-05, "loss": 0.3572, "step": 8974 }, { "epoch": 1.7693217665615142, "grad_norm": 0.49659191839512695, "learning_rate": 1.181732558359307e-05, "loss": 0.334, "step": 8975 }, { "epoch": 1.7695189274447949, "grad_norm": 0.5171496834529237, "learning_rate": 1.181580134683109e-05, "loss": 0.345, "step": 8976 }, { "epoch": 1.7697160883280758, "grad_norm": 0.448236524997749, "learning_rate": 1.1814277066443023e-05, "loss": 0.2887, "step": 8977 }, { "epoch": 1.7699132492113565, "grad_norm": 5.333849505830885, "learning_rate": 1.1812752742465488e-05, "loss": 0.3734, "step": 8978 }, { "epoch": 1.7701104100946372, "grad_norm": 0.49076446223605413, "learning_rate": 1.1811228374935107e-05, "loss": 0.3396, "step": 8979 }, { "epoch": 1.7703075709779181, "grad_norm": 0.5459828067722964, "learning_rate": 1.1809703963888506e-05, "loss": 0.3123, "step": 8980 }, { "epoch": 1.7705047318611986, "grad_norm": 0.48374652525635975, "learning_rate": 1.180817950936231e-05, "loss": 0.3285, "step": 8981 }, { "epoch": 1.7707018927444795, "grad_norm": 0.4755660557838733, "learning_rate": 1.1806655011393144e-05, "loss": 0.3507, "step": 8982 }, { "epoch": 1.7708990536277602, "grad_norm": 0.6006920450403271, "learning_rate": 1.1805130470017639e-05, "loss": 0.3379, "step": 8983 }, { "epoch": 1.771096214511041, "grad_norm": 0.5009897132212682, "learning_rate": 1.180360588527242e-05, "loss": 0.3573, "step": 8984 }, { "epoch": 1.7712933753943219, "grad_norm": 0.4620563729527647, "learning_rate": 1.1802081257194116e-05, "loss": 0.3262, "step": 8985 }, { "epoch": 1.7714905362776026, "grad_norm": 0.4769119308524991, "learning_rate": 1.180055658581936e-05, "loss": 0.3184, "step": 8986 }, { "epoch": 1.7716876971608833, "grad_norm": 0.4487850697936526, "learning_rate": 1.179903187118478e-05, "loss": 0.3179, "step": 8987 }, { "epoch": 1.7718848580441642, "grad_norm": 0.4677363529785754, "learning_rate": 1.179750711332701e-05, "loss": 0.3009, "step": 8988 }, { "epoch": 1.7720820189274447, "grad_norm": 0.48154564362378643, "learning_rate": 1.179598231228269e-05, "loss": 0.3517, "step": 8989 }, { "epoch": 1.7722791798107256, "grad_norm": 0.4608411338811139, "learning_rate": 1.1794457468088443e-05, "loss": 0.3284, "step": 8990 }, { "epoch": 1.7724763406940063, "grad_norm": 0.5027074279023396, "learning_rate": 1.1792932580780913e-05, "loss": 0.3442, "step": 8991 }, { "epoch": 1.772673501577287, "grad_norm": 0.4399957421863353, "learning_rate": 1.1791407650396731e-05, "loss": 0.3108, "step": 8992 }, { "epoch": 1.772870662460568, "grad_norm": 0.45380590331375986, "learning_rate": 1.1789882676972541e-05, "loss": 0.2994, "step": 8993 }, { "epoch": 1.7730678233438486, "grad_norm": 0.5204613664594859, "learning_rate": 1.1788357660544976e-05, "loss": 0.3543, "step": 8994 }, { "epoch": 1.7732649842271293, "grad_norm": 0.48732752300245125, "learning_rate": 1.1786832601150677e-05, "loss": 0.3421, "step": 8995 }, { "epoch": 1.7734621451104102, "grad_norm": 0.473467187951441, "learning_rate": 1.1785307498826288e-05, "loss": 0.3338, "step": 8996 }, { "epoch": 1.7736593059936907, "grad_norm": 0.48137614570470716, "learning_rate": 1.1783782353608449e-05, "loss": 0.3533, "step": 8997 }, { "epoch": 1.7738564668769716, "grad_norm": 0.4665674415229966, "learning_rate": 1.1782257165533802e-05, "loss": 0.3309, "step": 8998 }, { "epoch": 1.7740536277602523, "grad_norm": 0.47639470639484743, "learning_rate": 1.1780731934638992e-05, "loss": 0.3181, "step": 8999 }, { "epoch": 1.774250788643533, "grad_norm": 0.4888316448783417, "learning_rate": 1.177920666096066e-05, "loss": 0.3469, "step": 9000 }, { "epoch": 1.774447949526814, "grad_norm": 0.488778406351827, "learning_rate": 1.177768134453546e-05, "loss": 0.3744, "step": 9001 }, { "epoch": 1.7746451104100947, "grad_norm": 0.44883233582815296, "learning_rate": 1.177615598540003e-05, "loss": 0.3301, "step": 9002 }, { "epoch": 1.7748422712933754, "grad_norm": 0.4506843011240513, "learning_rate": 1.1774630583591024e-05, "loss": 0.3038, "step": 9003 }, { "epoch": 1.7750394321766563, "grad_norm": 0.49508062610720943, "learning_rate": 1.1773105139145088e-05, "loss": 0.3583, "step": 9004 }, { "epoch": 1.7752365930599368, "grad_norm": 0.5164716499986945, "learning_rate": 1.1771579652098874e-05, "loss": 0.382, "step": 9005 }, { "epoch": 1.7754337539432177, "grad_norm": 0.4882485785032844, "learning_rate": 1.1770054122489031e-05, "loss": 0.337, "step": 9006 }, { "epoch": 1.7756309148264984, "grad_norm": 0.4663980919945463, "learning_rate": 1.1768528550352216e-05, "loss": 0.3192, "step": 9007 }, { "epoch": 1.775828075709779, "grad_norm": 0.43682922880005404, "learning_rate": 1.1767002935725076e-05, "loss": 0.3017, "step": 9008 }, { "epoch": 1.77602523659306, "grad_norm": 0.4867060152007287, "learning_rate": 1.1765477278644264e-05, "loss": 0.3299, "step": 9009 }, { "epoch": 1.7762223974763407, "grad_norm": 0.5124563775251622, "learning_rate": 1.1763951579146444e-05, "loss": 0.3266, "step": 9010 }, { "epoch": 1.7764195583596214, "grad_norm": 0.47078691041664955, "learning_rate": 1.1762425837268263e-05, "loss": 0.3356, "step": 9011 }, { "epoch": 1.7766167192429023, "grad_norm": 0.4916978260239702, "learning_rate": 1.1760900053046386e-05, "loss": 0.3288, "step": 9012 }, { "epoch": 1.776813880126183, "grad_norm": 0.4956955563685568, "learning_rate": 1.1759374226517464e-05, "loss": 0.3384, "step": 9013 }, { "epoch": 1.7770110410094637, "grad_norm": 0.4728851012371811, "learning_rate": 1.1757848357718162e-05, "loss": 0.3307, "step": 9014 }, { "epoch": 1.7772082018927446, "grad_norm": 0.503512410418657, "learning_rate": 1.1756322446685134e-05, "loss": 0.3366, "step": 9015 }, { "epoch": 1.7774053627760251, "grad_norm": 0.5196421133302855, "learning_rate": 1.1754796493455048e-05, "loss": 0.3633, "step": 9016 }, { "epoch": 1.777602523659306, "grad_norm": 0.49369062439862366, "learning_rate": 1.1753270498064561e-05, "loss": 0.3453, "step": 9017 }, { "epoch": 1.7777996845425867, "grad_norm": 0.4653735508627901, "learning_rate": 1.1751744460550338e-05, "loss": 0.3296, "step": 9018 }, { "epoch": 1.7779968454258674, "grad_norm": 0.5068627718887688, "learning_rate": 1.1750218380949047e-05, "loss": 0.3514, "step": 9019 }, { "epoch": 1.7781940063091484, "grad_norm": 0.47122281155877815, "learning_rate": 1.1748692259297347e-05, "loss": 0.3461, "step": 9020 }, { "epoch": 1.778391167192429, "grad_norm": 0.4566301776497934, "learning_rate": 1.174716609563191e-05, "loss": 0.3396, "step": 9021 }, { "epoch": 1.7785883280757098, "grad_norm": 0.4861860829634978, "learning_rate": 1.1745639889989398e-05, "loss": 0.3213, "step": 9022 }, { "epoch": 1.7787854889589907, "grad_norm": 0.4783918537273916, "learning_rate": 1.1744113642406483e-05, "loss": 0.3139, "step": 9023 }, { "epoch": 1.7789826498422712, "grad_norm": 0.4813446133963429, "learning_rate": 1.1742587352919833e-05, "loss": 0.3439, "step": 9024 }, { "epoch": 1.779179810725552, "grad_norm": 0.5247316124213983, "learning_rate": 1.1741061021566118e-05, "loss": 0.3264, "step": 9025 }, { "epoch": 1.7793769716088328, "grad_norm": 0.4666782968847508, "learning_rate": 1.173953464838201e-05, "loss": 0.3286, "step": 9026 }, { "epoch": 1.7795741324921135, "grad_norm": 0.4530221240481721, "learning_rate": 1.1738008233404181e-05, "loss": 0.3428, "step": 9027 }, { "epoch": 1.7797712933753944, "grad_norm": 0.4647953682965125, "learning_rate": 1.1736481776669307e-05, "loss": 0.3247, "step": 9028 }, { "epoch": 1.7799684542586751, "grad_norm": 0.49727784750492343, "learning_rate": 1.1734955278214057e-05, "loss": 0.3619, "step": 9029 }, { "epoch": 1.7801656151419558, "grad_norm": 0.49005891060030987, "learning_rate": 1.1733428738075108e-05, "loss": 0.3562, "step": 9030 }, { "epoch": 1.7803627760252367, "grad_norm": 0.734206410674185, "learning_rate": 1.1731902156289142e-05, "loss": 0.3285, "step": 9031 }, { "epoch": 1.7805599369085172, "grad_norm": 0.4681761876719222, "learning_rate": 1.173037553289283e-05, "loss": 0.3409, "step": 9032 }, { "epoch": 1.7807570977917981, "grad_norm": 0.46203491841893324, "learning_rate": 1.1728848867922853e-05, "loss": 0.3186, "step": 9033 }, { "epoch": 1.7809542586750788, "grad_norm": 0.4835537269864709, "learning_rate": 1.1727322161415888e-05, "loss": 0.3312, "step": 9034 }, { "epoch": 1.7811514195583595, "grad_norm": 0.44861177894612153, "learning_rate": 1.1725795413408618e-05, "loss": 0.3016, "step": 9035 }, { "epoch": 1.7813485804416405, "grad_norm": 0.4566247359785835, "learning_rate": 1.1724268623937725e-05, "loss": 0.3094, "step": 9036 }, { "epoch": 1.7815457413249212, "grad_norm": 0.5013017232922788, "learning_rate": 1.1722741793039885e-05, "loss": 0.3556, "step": 9037 }, { "epoch": 1.7817429022082019, "grad_norm": 0.4828140840198569, "learning_rate": 1.172121492075179e-05, "loss": 0.3398, "step": 9038 }, { "epoch": 1.7819400630914828, "grad_norm": 0.4836084111183976, "learning_rate": 1.171968800711012e-05, "loss": 0.3516, "step": 9039 }, { "epoch": 1.7821372239747633, "grad_norm": 0.45421673308016836, "learning_rate": 1.1718161052151562e-05, "loss": 0.2943, "step": 9040 }, { "epoch": 1.7823343848580442, "grad_norm": 0.46771437738564686, "learning_rate": 1.1716634055912796e-05, "loss": 0.358, "step": 9041 }, { "epoch": 1.7825315457413249, "grad_norm": 0.47666003792216416, "learning_rate": 1.1715107018430522e-05, "loss": 0.3418, "step": 9042 }, { "epoch": 1.7827287066246056, "grad_norm": 0.4840693365790069, "learning_rate": 1.1713579939741415e-05, "loss": 0.3198, "step": 9043 }, { "epoch": 1.7829258675078865, "grad_norm": 0.46910784593485333, "learning_rate": 1.1712052819882171e-05, "loss": 0.3262, "step": 9044 }, { "epoch": 1.7831230283911672, "grad_norm": 0.4896822327451005, "learning_rate": 1.171052565888948e-05, "loss": 0.3433, "step": 9045 }, { "epoch": 1.783320189274448, "grad_norm": 0.476154410913374, "learning_rate": 1.1708998456800034e-05, "loss": 0.3148, "step": 9046 }, { "epoch": 1.7835173501577288, "grad_norm": 0.5217716610338343, "learning_rate": 1.170747121365052e-05, "loss": 0.3312, "step": 9047 }, { "epoch": 1.7837145110410093, "grad_norm": 0.49551436556952305, "learning_rate": 1.1705943929477639e-05, "loss": 0.348, "step": 9048 }, { "epoch": 1.7839116719242902, "grad_norm": 0.4920600225347391, "learning_rate": 1.170441660431808e-05, "loss": 0.349, "step": 9049 }, { "epoch": 1.784108832807571, "grad_norm": 0.4378125833985353, "learning_rate": 1.1702889238208539e-05, "loss": 0.312, "step": 9050 }, { "epoch": 1.7843059936908516, "grad_norm": 0.43674490442544767, "learning_rate": 1.1701361831185714e-05, "loss": 0.2936, "step": 9051 }, { "epoch": 1.7845031545741326, "grad_norm": 0.4842053898639523, "learning_rate": 1.1699834383286299e-05, "loss": 0.3474, "step": 9052 }, { "epoch": 1.7847003154574133, "grad_norm": 0.4529779374085598, "learning_rate": 1.1698306894546995e-05, "loss": 0.3128, "step": 9053 }, { "epoch": 1.784897476340694, "grad_norm": 0.4801304915326945, "learning_rate": 1.16967793650045e-05, "loss": 0.3324, "step": 9054 }, { "epoch": 1.7850946372239749, "grad_norm": 0.5173598334277061, "learning_rate": 1.1695251794695514e-05, "loss": 0.3415, "step": 9055 }, { "epoch": 1.7852917981072554, "grad_norm": 0.4654828527782663, "learning_rate": 1.169372418365674e-05, "loss": 0.3172, "step": 9056 }, { "epoch": 1.7854889589905363, "grad_norm": 0.4563252768971276, "learning_rate": 1.1692196531924877e-05, "loss": 0.3165, "step": 9057 }, { "epoch": 1.785686119873817, "grad_norm": 0.48839701961063525, "learning_rate": 1.169066883953663e-05, "loss": 0.3321, "step": 9058 }, { "epoch": 1.7858832807570977, "grad_norm": 0.4668548629020815, "learning_rate": 1.1689141106528703e-05, "loss": 0.3229, "step": 9059 }, { "epoch": 1.7860804416403786, "grad_norm": 0.48668760035869535, "learning_rate": 1.16876133329378e-05, "loss": 0.3411, "step": 9060 }, { "epoch": 1.7862776025236593, "grad_norm": 0.4648244886523526, "learning_rate": 1.168608551880063e-05, "loss": 0.337, "step": 9061 }, { "epoch": 1.78647476340694, "grad_norm": 0.48604760080798454, "learning_rate": 1.1684557664153893e-05, "loss": 0.3144, "step": 9062 }, { "epoch": 1.786671924290221, "grad_norm": 0.4783043678898303, "learning_rate": 1.1683029769034304e-05, "loss": 0.3225, "step": 9063 }, { "epoch": 1.7868690851735016, "grad_norm": 0.47691055878103017, "learning_rate": 1.168150183347857e-05, "loss": 0.3317, "step": 9064 }, { "epoch": 1.7870662460567823, "grad_norm": 0.4868775598571958, "learning_rate": 1.16799738575234e-05, "loss": 0.3326, "step": 9065 }, { "epoch": 1.7872634069400632, "grad_norm": 0.45774853271321925, "learning_rate": 1.1678445841205506e-05, "loss": 0.3183, "step": 9066 }, { "epoch": 1.7874605678233437, "grad_norm": 0.9497850183764717, "learning_rate": 1.1676917784561599e-05, "loss": 0.3714, "step": 9067 }, { "epoch": 1.7876577287066246, "grad_norm": 0.4768647293354841, "learning_rate": 1.1675389687628389e-05, "loss": 0.3462, "step": 9068 }, { "epoch": 1.7878548895899053, "grad_norm": 0.4783944603727662, "learning_rate": 1.1673861550442596e-05, "loss": 0.3372, "step": 9069 }, { "epoch": 1.788052050473186, "grad_norm": 0.46436887776659636, "learning_rate": 1.167233337304093e-05, "loss": 0.3318, "step": 9070 }, { "epoch": 1.788249211356467, "grad_norm": 4.345180913181195, "learning_rate": 1.1670805155460108e-05, "loss": 0.3453, "step": 9071 }, { "epoch": 1.7884463722397477, "grad_norm": 0.5159025017330271, "learning_rate": 1.1669276897736847e-05, "loss": 0.3449, "step": 9072 }, { "epoch": 1.7886435331230284, "grad_norm": 0.5595335747614689, "learning_rate": 1.1667748599907864e-05, "loss": 0.3788, "step": 9073 }, { "epoch": 1.7888406940063093, "grad_norm": 0.5325186373611229, "learning_rate": 1.1666220262009877e-05, "loss": 0.353, "step": 9074 }, { "epoch": 1.7890378548895898, "grad_norm": 1.2209348375435845, "learning_rate": 1.1664691884079606e-05, "loss": 0.3454, "step": 9075 }, { "epoch": 1.7892350157728707, "grad_norm": 0.4529621721978549, "learning_rate": 1.1663163466153775e-05, "loss": 0.3219, "step": 9076 }, { "epoch": 1.7894321766561514, "grad_norm": 0.5786960304033676, "learning_rate": 1.16616350082691e-05, "loss": 0.3458, "step": 9077 }, { "epoch": 1.789629337539432, "grad_norm": 0.46513659246814454, "learning_rate": 1.1660106510462305e-05, "loss": 0.3205, "step": 9078 }, { "epoch": 1.789826498422713, "grad_norm": 0.49129392716967757, "learning_rate": 1.1658577972770115e-05, "loss": 0.3643, "step": 9079 }, { "epoch": 1.7900236593059937, "grad_norm": 0.4946634044459266, "learning_rate": 1.1657049395229255e-05, "loss": 0.3585, "step": 9080 }, { "epoch": 1.7902208201892744, "grad_norm": 0.4625534608044079, "learning_rate": 1.1655520777876446e-05, "loss": 0.3295, "step": 9081 }, { "epoch": 1.7904179810725553, "grad_norm": 0.4770599185465994, "learning_rate": 1.1653992120748421e-05, "loss": 0.3232, "step": 9082 }, { "epoch": 1.7906151419558358, "grad_norm": 0.5732472260639709, "learning_rate": 1.1652463423881898e-05, "loss": 0.3939, "step": 9083 }, { "epoch": 1.7908123028391167, "grad_norm": 0.5032610006719888, "learning_rate": 1.1650934687313615e-05, "loss": 0.3256, "step": 9084 }, { "epoch": 1.7910094637223974, "grad_norm": 0.5315497537438166, "learning_rate": 1.1649405911080298e-05, "loss": 0.3533, "step": 9085 }, { "epoch": 1.7912066246056781, "grad_norm": 0.47972476193122204, "learning_rate": 1.1647877095218671e-05, "loss": 0.3087, "step": 9086 }, { "epoch": 1.791403785488959, "grad_norm": 0.49904975210777663, "learning_rate": 1.1646348239765475e-05, "loss": 0.333, "step": 9087 }, { "epoch": 1.7916009463722398, "grad_norm": 0.4982533128788867, "learning_rate": 1.164481934475743e-05, "loss": 0.3569, "step": 9088 }, { "epoch": 1.7917981072555205, "grad_norm": 0.9122269960563105, "learning_rate": 1.1643290410231282e-05, "loss": 0.3395, "step": 9089 }, { "epoch": 1.7919952681388014, "grad_norm": 0.4557932362062137, "learning_rate": 1.1641761436223753e-05, "loss": 0.3024, "step": 9090 }, { "epoch": 1.7921924290220819, "grad_norm": 0.4941263997632442, "learning_rate": 1.1640232422771586e-05, "loss": 0.3476, "step": 9091 }, { "epoch": 1.7923895899053628, "grad_norm": 0.5155585824775918, "learning_rate": 1.1638703369911517e-05, "loss": 0.3596, "step": 9092 }, { "epoch": 1.7925867507886435, "grad_norm": 1.4023286561635016, "learning_rate": 1.1637174277680277e-05, "loss": 0.3335, "step": 9093 }, { "epoch": 1.7927839116719242, "grad_norm": 0.5065005447760449, "learning_rate": 1.1635645146114607e-05, "loss": 0.355, "step": 9094 }, { "epoch": 1.7929810725552051, "grad_norm": 0.526370677958319, "learning_rate": 1.1634115975251245e-05, "loss": 0.3056, "step": 9095 }, { "epoch": 1.7931782334384858, "grad_norm": 0.486266087331147, "learning_rate": 1.1632586765126929e-05, "loss": 0.3269, "step": 9096 }, { "epoch": 1.7933753943217665, "grad_norm": 0.4580937992770503, "learning_rate": 1.1631057515778403e-05, "loss": 0.3137, "step": 9097 }, { "epoch": 1.7935725552050474, "grad_norm": 0.5054061865722272, "learning_rate": 1.1629528227242408e-05, "loss": 0.3398, "step": 9098 }, { "epoch": 1.793769716088328, "grad_norm": 0.47624328346608846, "learning_rate": 1.1627998899555684e-05, "loss": 0.3217, "step": 9099 }, { "epoch": 1.7939668769716088, "grad_norm": 0.47501826137837944, "learning_rate": 1.1626469532754975e-05, "loss": 0.3436, "step": 9100 }, { "epoch": 1.7941640378548895, "grad_norm": 0.5083570086033906, "learning_rate": 1.1624940126877027e-05, "loss": 0.3342, "step": 9101 }, { "epoch": 1.7943611987381702, "grad_norm": 0.43834857149607065, "learning_rate": 1.1623410681958583e-05, "loss": 0.3176, "step": 9102 }, { "epoch": 1.7945583596214512, "grad_norm": 0.46797550187905224, "learning_rate": 1.1621881198036389e-05, "loss": 0.3296, "step": 9103 }, { "epoch": 1.7947555205047319, "grad_norm": 0.747473707199481, "learning_rate": 1.1620351675147195e-05, "loss": 0.336, "step": 9104 }, { "epoch": 1.7949526813880126, "grad_norm": 0.4864213421889568, "learning_rate": 1.1618822113327743e-05, "loss": 0.346, "step": 9105 }, { "epoch": 1.7951498422712935, "grad_norm": 0.48393670272857564, "learning_rate": 1.1617292512614793e-05, "loss": 0.3323, "step": 9106 }, { "epoch": 1.7953470031545742, "grad_norm": 0.42975326675170367, "learning_rate": 1.161576287304508e-05, "loss": 0.2663, "step": 9107 }, { "epoch": 1.7955441640378549, "grad_norm": 0.4861731628110563, "learning_rate": 1.1614233194655371e-05, "loss": 0.3257, "step": 9108 }, { "epoch": 1.7957413249211358, "grad_norm": 0.8570529099252474, "learning_rate": 1.1612703477482403e-05, "loss": 0.32, "step": 9109 }, { "epoch": 1.7959384858044163, "grad_norm": 0.4868050444217125, "learning_rate": 1.161117372156294e-05, "loss": 0.3324, "step": 9110 }, { "epoch": 1.7961356466876972, "grad_norm": 0.5166058354822001, "learning_rate": 1.1609643926933727e-05, "loss": 0.3296, "step": 9111 }, { "epoch": 1.796332807570978, "grad_norm": 0.454694260080299, "learning_rate": 1.1608114093631523e-05, "loss": 0.3057, "step": 9112 }, { "epoch": 1.7965299684542586, "grad_norm": 0.4873463120948357, "learning_rate": 1.1606584221693084e-05, "loss": 0.3327, "step": 9113 }, { "epoch": 1.7967271293375395, "grad_norm": 0.46258370737085464, "learning_rate": 1.1605054311155165e-05, "loss": 0.2937, "step": 9114 }, { "epoch": 1.7969242902208202, "grad_norm": 0.5089951413840461, "learning_rate": 1.1603524362054525e-05, "loss": 0.347, "step": 9115 }, { "epoch": 1.797121451104101, "grad_norm": 0.49112572937967497, "learning_rate": 1.1601994374427921e-05, "loss": 0.3328, "step": 9116 }, { "epoch": 1.7973186119873819, "grad_norm": 0.4743944704648095, "learning_rate": 1.160046434831211e-05, "loss": 0.316, "step": 9117 }, { "epoch": 1.7975157728706623, "grad_norm": 0.5012809900872015, "learning_rate": 1.1598934283743855e-05, "loss": 0.3618, "step": 9118 }, { "epoch": 1.7977129337539433, "grad_norm": 0.4632607405847242, "learning_rate": 1.1597404180759917e-05, "loss": 0.3282, "step": 9119 }, { "epoch": 1.797910094637224, "grad_norm": 0.4893372010154669, "learning_rate": 1.1595874039397055e-05, "loss": 0.3256, "step": 9120 }, { "epoch": 1.7981072555205047, "grad_norm": 0.49627890794355706, "learning_rate": 1.1594343859692037e-05, "loss": 0.3487, "step": 9121 }, { "epoch": 1.7983044164037856, "grad_norm": 0.49100214178136625, "learning_rate": 1.1592813641681621e-05, "loss": 0.3388, "step": 9122 }, { "epoch": 1.7985015772870663, "grad_norm": 0.4542961165749031, "learning_rate": 1.1591283385402577e-05, "loss": 0.3299, "step": 9123 }, { "epoch": 1.798698738170347, "grad_norm": 0.4801501150330402, "learning_rate": 1.1589753090891667e-05, "loss": 0.3323, "step": 9124 }, { "epoch": 1.798895899053628, "grad_norm": 0.4851771812965912, "learning_rate": 1.158822275818566e-05, "loss": 0.3357, "step": 9125 }, { "epoch": 1.7990930599369084, "grad_norm": 0.45182412694780133, "learning_rate": 1.158669238732132e-05, "loss": 0.305, "step": 9126 }, { "epoch": 1.7992902208201893, "grad_norm": 0.47897297293682256, "learning_rate": 1.158516197833542e-05, "loss": 0.3231, "step": 9127 }, { "epoch": 1.79948738170347, "grad_norm": 0.43389581021319834, "learning_rate": 1.1583631531264723e-05, "loss": 0.3173, "step": 9128 }, { "epoch": 1.7996845425867507, "grad_norm": 0.47731478351982176, "learning_rate": 1.1582101046146008e-05, "loss": 0.3394, "step": 9129 }, { "epoch": 1.7998817034700316, "grad_norm": 0.435886477568427, "learning_rate": 1.1580570523016036e-05, "loss": 0.2997, "step": 9130 }, { "epoch": 1.8000788643533123, "grad_norm": 0.5238566166040188, "learning_rate": 1.1579039961911591e-05, "loss": 0.3432, "step": 9131 }, { "epoch": 1.800276025236593, "grad_norm": 0.5379605041705956, "learning_rate": 1.1577509362869433e-05, "loss": 0.357, "step": 9132 }, { "epoch": 1.800473186119874, "grad_norm": 0.449491662574799, "learning_rate": 1.1575978725926347e-05, "loss": 0.3106, "step": 9133 }, { "epoch": 1.8006703470031544, "grad_norm": 0.5080608887234477, "learning_rate": 1.1574448051119101e-05, "loss": 0.3672, "step": 9134 }, { "epoch": 1.8008675078864353, "grad_norm": 0.5323631360585213, "learning_rate": 1.1572917338484471e-05, "loss": 0.3742, "step": 9135 }, { "epoch": 1.801064668769716, "grad_norm": 0.5031109008850254, "learning_rate": 1.1571386588059236e-05, "loss": 0.3406, "step": 9136 }, { "epoch": 1.8012618296529967, "grad_norm": 0.49254801312177554, "learning_rate": 1.1569855799880174e-05, "loss": 0.3412, "step": 9137 }, { "epoch": 1.8014589905362777, "grad_norm": 0.4836870483836016, "learning_rate": 1.1568324973984065e-05, "loss": 0.3256, "step": 9138 }, { "epoch": 1.8016561514195584, "grad_norm": 0.5079333391604857, "learning_rate": 1.1566794110407681e-05, "loss": 0.3129, "step": 9139 }, { "epoch": 1.801853312302839, "grad_norm": 0.48306063016614514, "learning_rate": 1.156526320918781e-05, "loss": 0.3263, "step": 9140 }, { "epoch": 1.80205047318612, "grad_norm": 0.46236068282322035, "learning_rate": 1.1563732270361228e-05, "loss": 0.3218, "step": 9141 }, { "epoch": 1.8022476340694005, "grad_norm": 0.46822948046376184, "learning_rate": 1.1562201293964716e-05, "loss": 0.3159, "step": 9142 }, { "epoch": 1.8024447949526814, "grad_norm": 0.457275091142312, "learning_rate": 1.1560670280035065e-05, "loss": 0.3422, "step": 9143 }, { "epoch": 1.802641955835962, "grad_norm": 0.45010333968693195, "learning_rate": 1.155913922860905e-05, "loss": 0.3038, "step": 9144 }, { "epoch": 1.8028391167192428, "grad_norm": 0.460887214721349, "learning_rate": 1.155760813972346e-05, "loss": 0.3184, "step": 9145 }, { "epoch": 1.8030362776025237, "grad_norm": 0.47074306673977423, "learning_rate": 1.1556077013415084e-05, "loss": 0.327, "step": 9146 }, { "epoch": 1.8032334384858044, "grad_norm": 0.44541887094425797, "learning_rate": 1.15545458497207e-05, "loss": 0.314, "step": 9147 }, { "epoch": 1.8034305993690851, "grad_norm": 0.46761848114178406, "learning_rate": 1.1553014648677104e-05, "loss": 0.3236, "step": 9148 }, { "epoch": 1.803627760252366, "grad_norm": 0.47178941550054526, "learning_rate": 1.1551483410321075e-05, "loss": 0.3619, "step": 9149 }, { "epoch": 1.8038249211356467, "grad_norm": 0.5060596355050806, "learning_rate": 1.1549952134689414e-05, "loss": 0.3555, "step": 9150 }, { "epoch": 1.8040220820189274, "grad_norm": 0.48942752728337136, "learning_rate": 1.1548420821818902e-05, "loss": 0.3512, "step": 9151 }, { "epoch": 1.8042192429022084, "grad_norm": 0.4656609142013564, "learning_rate": 1.1546889471746333e-05, "loss": 0.3254, "step": 9152 }, { "epoch": 1.8044164037854888, "grad_norm": 0.5678872974738812, "learning_rate": 1.1545358084508497e-05, "loss": 0.3503, "step": 9153 }, { "epoch": 1.8046135646687698, "grad_norm": 0.4746836161670529, "learning_rate": 1.154382666014219e-05, "loss": 0.3026, "step": 9154 }, { "epoch": 1.8048107255520505, "grad_norm": 0.4919723933858667, "learning_rate": 1.1542295198684206e-05, "loss": 0.3567, "step": 9155 }, { "epoch": 1.8050078864353312, "grad_norm": 0.4658920446396069, "learning_rate": 1.1540763700171334e-05, "loss": 0.3139, "step": 9156 }, { "epoch": 1.805205047318612, "grad_norm": 0.4998610736336036, "learning_rate": 1.1539232164640378e-05, "loss": 0.3537, "step": 9157 }, { "epoch": 1.8054022082018928, "grad_norm": 0.4838154631555964, "learning_rate": 1.1537700592128126e-05, "loss": 0.3231, "step": 9158 }, { "epoch": 1.8055993690851735, "grad_norm": 0.481476456192672, "learning_rate": 1.1536168982671378e-05, "loss": 0.3587, "step": 9159 }, { "epoch": 1.8057965299684544, "grad_norm": 0.6593255142170363, "learning_rate": 1.1534637336306935e-05, "loss": 0.3702, "step": 9160 }, { "epoch": 1.805993690851735, "grad_norm": 0.47074519329216513, "learning_rate": 1.1533105653071594e-05, "loss": 0.3292, "step": 9161 }, { "epoch": 1.8061908517350158, "grad_norm": 0.47104591661023276, "learning_rate": 1.1531573933002156e-05, "loss": 0.3273, "step": 9162 }, { "epoch": 1.8063880126182965, "grad_norm": 0.4962259307049863, "learning_rate": 1.153004217613542e-05, "loss": 0.3381, "step": 9163 }, { "epoch": 1.8065851735015772, "grad_norm": 0.47352216159289334, "learning_rate": 1.152851038250819e-05, "loss": 0.3293, "step": 9164 }, { "epoch": 1.8067823343848581, "grad_norm": 0.4661605793322122, "learning_rate": 1.1526978552157266e-05, "loss": 0.3221, "step": 9165 }, { "epoch": 1.8069794952681388, "grad_norm": 0.45860831067733454, "learning_rate": 1.1525446685119452e-05, "loss": 0.3341, "step": 9166 }, { "epoch": 1.8071766561514195, "grad_norm": 0.4704413143236224, "learning_rate": 1.1523914781431555e-05, "loss": 0.3071, "step": 9167 }, { "epoch": 1.8073738170347005, "grad_norm": 0.47559115700741583, "learning_rate": 1.1522382841130377e-05, "loss": 0.3467, "step": 9168 }, { "epoch": 1.807570977917981, "grad_norm": 0.4852070321473458, "learning_rate": 1.1520850864252724e-05, "loss": 0.3132, "step": 9169 }, { "epoch": 1.8077681388012619, "grad_norm": 0.5150731573019411, "learning_rate": 1.1519318850835406e-05, "loss": 0.3426, "step": 9170 }, { "epoch": 1.8079652996845426, "grad_norm": 0.46442119276118826, "learning_rate": 1.1517786800915229e-05, "loss": 0.318, "step": 9171 }, { "epoch": 1.8081624605678233, "grad_norm": 0.47760132684470824, "learning_rate": 1.1516254714529001e-05, "loss": 0.3138, "step": 9172 }, { "epoch": 1.8083596214511042, "grad_norm": 0.5063732452560264, "learning_rate": 1.1514722591713529e-05, "loss": 0.3535, "step": 9173 }, { "epoch": 1.8085567823343849, "grad_norm": 0.46353551339902077, "learning_rate": 1.1513190432505634e-05, "loss": 0.3085, "step": 9174 }, { "epoch": 1.8087539432176656, "grad_norm": 0.45193354361169114, "learning_rate": 1.1511658236942114e-05, "loss": 0.2871, "step": 9175 }, { "epoch": 1.8089511041009465, "grad_norm": 0.5095923560172854, "learning_rate": 1.1510126005059793e-05, "loss": 0.3514, "step": 9176 }, { "epoch": 1.809148264984227, "grad_norm": 0.45401003471081053, "learning_rate": 1.1508593736895475e-05, "loss": 0.32, "step": 9177 }, { "epoch": 1.809345425867508, "grad_norm": 0.4670090425518402, "learning_rate": 1.150706143248598e-05, "loss": 0.3338, "step": 9178 }, { "epoch": 1.8095425867507886, "grad_norm": 0.49260267310440076, "learning_rate": 1.1505529091868117e-05, "loss": 0.3646, "step": 9179 }, { "epoch": 1.8097397476340693, "grad_norm": 0.4892695211612479, "learning_rate": 1.1503996715078707e-05, "loss": 0.3159, "step": 9180 }, { "epoch": 1.8099369085173502, "grad_norm": 0.5269511149927203, "learning_rate": 1.1502464302154566e-05, "loss": 0.3476, "step": 9181 }, { "epoch": 1.810134069400631, "grad_norm": 0.477149389147672, "learning_rate": 1.150093185313251e-05, "loss": 0.3394, "step": 9182 }, { "epoch": 1.8103312302839116, "grad_norm": 0.4911874732780398, "learning_rate": 1.1499399368049356e-05, "loss": 0.3367, "step": 9183 }, { "epoch": 1.8105283911671926, "grad_norm": 0.7822176448681528, "learning_rate": 1.1497866846941926e-05, "loss": 0.3192, "step": 9184 }, { "epoch": 1.810725552050473, "grad_norm": 0.4936248418394106, "learning_rate": 1.1496334289847038e-05, "loss": 0.3429, "step": 9185 }, { "epoch": 1.810922712933754, "grad_norm": 0.47779103622401925, "learning_rate": 1.1494801696801515e-05, "loss": 0.3391, "step": 9186 }, { "epoch": 1.8111198738170347, "grad_norm": 0.4911635374833865, "learning_rate": 1.1493269067842175e-05, "loss": 0.3392, "step": 9187 }, { "epoch": 1.8113170347003154, "grad_norm": 0.4604725274713814, "learning_rate": 1.1491736403005844e-05, "loss": 0.324, "step": 9188 }, { "epoch": 1.8115141955835963, "grad_norm": 0.5032560937410412, "learning_rate": 1.1490203702329346e-05, "loss": 0.3603, "step": 9189 }, { "epoch": 1.811711356466877, "grad_norm": 0.6385600610637551, "learning_rate": 1.1488670965849505e-05, "loss": 0.3369, "step": 9190 }, { "epoch": 1.8119085173501577, "grad_norm": 0.4885866435640685, "learning_rate": 1.1487138193603142e-05, "loss": 0.3309, "step": 9191 }, { "epoch": 1.8121056782334386, "grad_norm": 0.5183397825685779, "learning_rate": 1.1485605385627088e-05, "loss": 0.3343, "step": 9192 }, { "epoch": 1.812302839116719, "grad_norm": 0.4989032074820882, "learning_rate": 1.1484072541958167e-05, "loss": 0.3397, "step": 9193 }, { "epoch": 1.8125, "grad_norm": 0.5033661808175196, "learning_rate": 1.1482539662633208e-05, "loss": 0.3417, "step": 9194 }, { "epoch": 1.812697160883281, "grad_norm": 5.86204139038015, "learning_rate": 1.1481006747689043e-05, "loss": 0.328, "step": 9195 }, { "epoch": 1.8128943217665614, "grad_norm": 0.5051699316872136, "learning_rate": 1.1479473797162492e-05, "loss": 0.3173, "step": 9196 }, { "epoch": 1.8130914826498423, "grad_norm": 0.5070080136339032, "learning_rate": 1.1477940811090398e-05, "loss": 0.3271, "step": 9197 }, { "epoch": 1.813288643533123, "grad_norm": 0.4811441468791982, "learning_rate": 1.1476407789509583e-05, "loss": 0.3268, "step": 9198 }, { "epoch": 1.8134858044164037, "grad_norm": 0.4785568333077474, "learning_rate": 1.1474874732456884e-05, "loss": 0.3212, "step": 9199 }, { "epoch": 1.8136829652996846, "grad_norm": 0.4698877579251847, "learning_rate": 1.147334163996913e-05, "loss": 0.3072, "step": 9200 }, { "epoch": 1.8138801261829653, "grad_norm": 0.47991128581690556, "learning_rate": 1.1471808512083156e-05, "loss": 0.3416, "step": 9201 }, { "epoch": 1.814077287066246, "grad_norm": 0.46961744858878407, "learning_rate": 1.1470275348835797e-05, "loss": 0.3187, "step": 9202 }, { "epoch": 1.814274447949527, "grad_norm": 0.5056112272060148, "learning_rate": 1.146874215026389e-05, "loss": 0.3397, "step": 9203 }, { "epoch": 1.8144716088328074, "grad_norm": 0.4972247643473326, "learning_rate": 1.1467208916404271e-05, "loss": 0.3404, "step": 9204 }, { "epoch": 1.8146687697160884, "grad_norm": 0.4723317093599732, "learning_rate": 1.1465675647293772e-05, "loss": 0.304, "step": 9205 }, { "epoch": 1.814865930599369, "grad_norm": 0.5198317290369266, "learning_rate": 1.1464142342969242e-05, "loss": 0.3535, "step": 9206 }, { "epoch": 1.8150630914826498, "grad_norm": 2.3677318794574895, "learning_rate": 1.1462609003467508e-05, "loss": 0.4222, "step": 9207 }, { "epoch": 1.8152602523659307, "grad_norm": 0.4794343279462834, "learning_rate": 1.1461075628825416e-05, "loss": 0.3156, "step": 9208 }, { "epoch": 1.8154574132492114, "grad_norm": 0.48662660979986266, "learning_rate": 1.1459542219079808e-05, "loss": 0.3432, "step": 9209 }, { "epoch": 1.815654574132492, "grad_norm": 0.48035488934776704, "learning_rate": 1.1458008774267518e-05, "loss": 0.3383, "step": 9210 }, { "epoch": 1.815851735015773, "grad_norm": 0.48382873384961195, "learning_rate": 1.1456475294425396e-05, "loss": 0.327, "step": 9211 }, { "epoch": 1.8160488958990535, "grad_norm": 0.47241477963626444, "learning_rate": 1.1454941779590283e-05, "loss": 0.3271, "step": 9212 }, { "epoch": 1.8162460567823344, "grad_norm": 0.46721123265648445, "learning_rate": 1.1453408229799017e-05, "loss": 0.3125, "step": 9213 }, { "epoch": 1.8164432176656151, "grad_norm": 0.4739931333373127, "learning_rate": 1.1451874645088455e-05, "loss": 0.3265, "step": 9214 }, { "epoch": 1.8166403785488958, "grad_norm": 0.5189118810899088, "learning_rate": 1.145034102549543e-05, "loss": 0.3672, "step": 9215 }, { "epoch": 1.8168375394321767, "grad_norm": 0.49357938108124044, "learning_rate": 1.1448807371056798e-05, "loss": 0.3343, "step": 9216 }, { "epoch": 1.8170347003154574, "grad_norm": 0.476806812670295, "learning_rate": 1.14472736818094e-05, "loss": 0.3544, "step": 9217 }, { "epoch": 1.8172318611987381, "grad_norm": 0.5426318526785301, "learning_rate": 1.1445739957790087e-05, "loss": 0.3388, "step": 9218 }, { "epoch": 1.817429022082019, "grad_norm": 0.48940437942449916, "learning_rate": 1.1444206199035708e-05, "loss": 0.3436, "step": 9219 }, { "epoch": 1.8176261829652995, "grad_norm": 0.47285073767586155, "learning_rate": 1.1442672405583109e-05, "loss": 0.342, "step": 9220 }, { "epoch": 1.8178233438485805, "grad_norm": 0.4668025709102716, "learning_rate": 1.1441138577469147e-05, "loss": 0.327, "step": 9221 }, { "epoch": 1.8180205047318612, "grad_norm": 0.45875055734217174, "learning_rate": 1.1439604714730666e-05, "loss": 0.3192, "step": 9222 }, { "epoch": 1.8182176656151419, "grad_norm": 0.4409526335848642, "learning_rate": 1.1438070817404527e-05, "loss": 0.3101, "step": 9223 }, { "epoch": 1.8184148264984228, "grad_norm": 0.45114553583024647, "learning_rate": 1.1436536885527576e-05, "loss": 0.3342, "step": 9224 }, { "epoch": 1.8186119873817035, "grad_norm": 0.4806686518283818, "learning_rate": 1.1435002919136671e-05, "loss": 0.3391, "step": 9225 }, { "epoch": 1.8188091482649842, "grad_norm": 0.5044312994268182, "learning_rate": 1.1433468918268663e-05, "loss": 0.3555, "step": 9226 }, { "epoch": 1.819006309148265, "grad_norm": 0.49263755874516185, "learning_rate": 1.1431934882960412e-05, "loss": 0.3607, "step": 9227 }, { "epoch": 1.8192034700315456, "grad_norm": 0.4536998690340305, "learning_rate": 1.1430400813248772e-05, "loss": 0.2918, "step": 9228 }, { "epoch": 1.8194006309148265, "grad_norm": 0.532141725281099, "learning_rate": 1.1428866709170599e-05, "loss": 0.355, "step": 9229 }, { "epoch": 1.8195977917981072, "grad_norm": 0.4635187686939784, "learning_rate": 1.1427332570762754e-05, "loss": 0.3211, "step": 9230 }, { "epoch": 1.819794952681388, "grad_norm": 0.602694804528996, "learning_rate": 1.1425798398062093e-05, "loss": 0.3574, "step": 9231 }, { "epoch": 1.8199921135646688, "grad_norm": 0.49418051715015165, "learning_rate": 1.1424264191105481e-05, "loss": 0.3526, "step": 9232 }, { "epoch": 1.8201892744479495, "grad_norm": 0.47528793595220115, "learning_rate": 1.1422729949929772e-05, "loss": 0.3408, "step": 9233 }, { "epoch": 1.8203864353312302, "grad_norm": 0.5040867868946186, "learning_rate": 1.142119567457183e-05, "loss": 0.3401, "step": 9234 }, { "epoch": 1.8205835962145112, "grad_norm": 0.4709221541985399, "learning_rate": 1.141966136506852e-05, "loss": 0.322, "step": 9235 }, { "epoch": 1.8207807570977916, "grad_norm": 0.5048500824063943, "learning_rate": 1.14181270214567e-05, "loss": 0.3449, "step": 9236 }, { "epoch": 1.8209779179810726, "grad_norm": 0.5520156337084202, "learning_rate": 1.1416592643773236e-05, "loss": 0.3485, "step": 9237 }, { "epoch": 1.8211750788643533, "grad_norm": 0.4662159013287956, "learning_rate": 1.1415058232054995e-05, "loss": 0.3116, "step": 9238 }, { "epoch": 1.821372239747634, "grad_norm": 0.4583750639044251, "learning_rate": 1.1413523786338838e-05, "loss": 0.314, "step": 9239 }, { "epoch": 1.8215694006309149, "grad_norm": 0.47754415155408275, "learning_rate": 1.1411989306661635e-05, "loss": 0.3322, "step": 9240 }, { "epoch": 1.8217665615141956, "grad_norm": 0.4974506056160547, "learning_rate": 1.1410454793060251e-05, "loss": 0.3507, "step": 9241 }, { "epoch": 1.8219637223974763, "grad_norm": 0.4734526011420336, "learning_rate": 1.1408920245571558e-05, "loss": 0.3168, "step": 9242 }, { "epoch": 1.8221608832807572, "grad_norm": 0.46295318137196184, "learning_rate": 1.1407385664232415e-05, "loss": 0.3195, "step": 9243 }, { "epoch": 1.822358044164038, "grad_norm": 0.4997671967465472, "learning_rate": 1.1405851049079706e-05, "loss": 0.3458, "step": 9244 }, { "epoch": 1.8225552050473186, "grad_norm": 0.4727316629472926, "learning_rate": 1.1404316400150288e-05, "loss": 0.3358, "step": 9245 }, { "epoch": 1.8227523659305995, "grad_norm": 0.4600159341304322, "learning_rate": 1.1402781717481042e-05, "loss": 0.3019, "step": 9246 }, { "epoch": 1.82294952681388, "grad_norm": 0.4832490316233833, "learning_rate": 1.1401247001108828e-05, "loss": 0.3215, "step": 9247 }, { "epoch": 1.823146687697161, "grad_norm": 0.4552192341859621, "learning_rate": 1.1399712251070532e-05, "loss": 0.3036, "step": 9248 }, { "epoch": 1.8233438485804416, "grad_norm": 0.45254379685961227, "learning_rate": 1.1398177467403022e-05, "loss": 0.3207, "step": 9249 }, { "epoch": 1.8235410094637223, "grad_norm": 0.48097837033082497, "learning_rate": 1.1396642650143171e-05, "loss": 0.3383, "step": 9250 }, { "epoch": 1.8237381703470033, "grad_norm": 2.4890867749317755, "learning_rate": 1.1395107799327856e-05, "loss": 0.3532, "step": 9251 }, { "epoch": 1.823935331230284, "grad_norm": 0.5014918794110886, "learning_rate": 1.1393572914993954e-05, "loss": 0.3663, "step": 9252 }, { "epoch": 1.8241324921135647, "grad_norm": 0.5245523336186161, "learning_rate": 1.1392037997178338e-05, "loss": 0.3445, "step": 9253 }, { "epoch": 1.8243296529968456, "grad_norm": 0.5134602963738575, "learning_rate": 1.1390503045917892e-05, "loss": 0.3631, "step": 9254 }, { "epoch": 1.824526813880126, "grad_norm": 0.4701559391883266, "learning_rate": 1.1388968061249486e-05, "loss": 0.3283, "step": 9255 }, { "epoch": 1.824723974763407, "grad_norm": 0.47107381684930266, "learning_rate": 1.1387433043210006e-05, "loss": 0.3278, "step": 9256 }, { "epoch": 1.8249211356466877, "grad_norm": 0.47653058629900974, "learning_rate": 1.138589799183633e-05, "loss": 0.3238, "step": 9257 }, { "epoch": 1.8251182965299684, "grad_norm": 0.48085309963347217, "learning_rate": 1.138436290716534e-05, "loss": 0.3355, "step": 9258 }, { "epoch": 1.8253154574132493, "grad_norm": 0.46243620388093765, "learning_rate": 1.1382827789233912e-05, "loss": 0.3259, "step": 9259 }, { "epoch": 1.82551261829653, "grad_norm": 0.45802315988824477, "learning_rate": 1.1381292638078935e-05, "loss": 0.3149, "step": 9260 }, { "epoch": 1.8257097791798107, "grad_norm": 0.4794295563766952, "learning_rate": 1.1379757453737293e-05, "loss": 0.3337, "step": 9261 }, { "epoch": 1.8259069400630916, "grad_norm": 0.4758846030069048, "learning_rate": 1.1378222236245862e-05, "loss": 0.324, "step": 9262 }, { "epoch": 1.826104100946372, "grad_norm": 0.4709615758764996, "learning_rate": 1.1376686985641536e-05, "loss": 0.3223, "step": 9263 }, { "epoch": 1.826301261829653, "grad_norm": 0.49999866958616157, "learning_rate": 1.1375151701961191e-05, "loss": 0.3621, "step": 9264 }, { "epoch": 1.8264984227129337, "grad_norm": 0.5326305619030082, "learning_rate": 1.1373616385241726e-05, "loss": 0.3455, "step": 9265 }, { "epoch": 1.8266955835962144, "grad_norm": 0.45691910409746533, "learning_rate": 1.1372081035520015e-05, "loss": 0.3085, "step": 9266 }, { "epoch": 1.8268927444794953, "grad_norm": 0.4587323635393261, "learning_rate": 1.1370545652832958e-05, "loss": 0.3322, "step": 9267 }, { "epoch": 1.827089905362776, "grad_norm": 0.4826907976994329, "learning_rate": 1.1369010237217435e-05, "loss": 0.3391, "step": 9268 }, { "epoch": 1.8272870662460567, "grad_norm": 0.4816253057556002, "learning_rate": 1.1367474788710338e-05, "loss": 0.3408, "step": 9269 }, { "epoch": 1.8274842271293377, "grad_norm": 0.5249937140583107, "learning_rate": 1.1365939307348559e-05, "loss": 0.3464, "step": 9270 }, { "epoch": 1.8276813880126181, "grad_norm": 0.48583193536640634, "learning_rate": 1.1364403793168988e-05, "loss": 0.3469, "step": 9271 }, { "epoch": 1.827878548895899, "grad_norm": 0.5051024554952468, "learning_rate": 1.1362868246208519e-05, "loss": 0.3261, "step": 9272 }, { "epoch": 1.8280757097791798, "grad_norm": 0.4968850771719625, "learning_rate": 1.1361332666504038e-05, "loss": 0.3272, "step": 9273 }, { "epoch": 1.8282728706624605, "grad_norm": 0.47294764285242963, "learning_rate": 1.135979705409245e-05, "loss": 0.3247, "step": 9274 }, { "epoch": 1.8284700315457414, "grad_norm": 0.4606069743822721, "learning_rate": 1.1358261409010636e-05, "loss": 0.3035, "step": 9275 }, { "epoch": 1.828667192429022, "grad_norm": 0.4771239186952894, "learning_rate": 1.1356725731295501e-05, "loss": 0.332, "step": 9276 }, { "epoch": 1.8288643533123028, "grad_norm": 0.4478351388311611, "learning_rate": 1.1355190020983937e-05, "loss": 0.3076, "step": 9277 }, { "epoch": 1.8290615141955837, "grad_norm": 0.46550837164514514, "learning_rate": 1.1353654278112841e-05, "loss": 0.3273, "step": 9278 }, { "epoch": 1.8292586750788642, "grad_norm": 0.4811875611417182, "learning_rate": 1.1352118502719115e-05, "loss": 0.3424, "step": 9279 }, { "epoch": 1.8294558359621451, "grad_norm": 0.4600111130971501, "learning_rate": 1.135058269483965e-05, "loss": 0.3362, "step": 9280 }, { "epoch": 1.8296529968454258, "grad_norm": 0.4475616886590054, "learning_rate": 1.1349046854511347e-05, "loss": 0.3139, "step": 9281 }, { "epoch": 1.8298501577287065, "grad_norm": 0.4806604678847091, "learning_rate": 1.1347510981771108e-05, "loss": 0.3672, "step": 9282 }, { "epoch": 1.8300473186119874, "grad_norm": 0.5175341707370579, "learning_rate": 1.1345975076655832e-05, "loss": 0.339, "step": 9283 }, { "epoch": 1.8302444794952681, "grad_norm": 0.5158084398771197, "learning_rate": 1.134443913920242e-05, "loss": 0.3422, "step": 9284 }, { "epoch": 1.8304416403785488, "grad_norm": 0.44628621178473027, "learning_rate": 1.1342903169447778e-05, "loss": 0.3123, "step": 9285 }, { "epoch": 1.8306388012618298, "grad_norm": 0.491244561834142, "learning_rate": 1.1341367167428806e-05, "loss": 0.3356, "step": 9286 }, { "epoch": 1.8308359621451105, "grad_norm": 0.4618159349464898, "learning_rate": 1.1339831133182405e-05, "loss": 0.3195, "step": 9287 }, { "epoch": 1.8310331230283912, "grad_norm": 0.46357096939810233, "learning_rate": 1.1338295066745482e-05, "loss": 0.3277, "step": 9288 }, { "epoch": 1.831230283911672, "grad_norm": 0.44971140458495884, "learning_rate": 1.1336758968154943e-05, "loss": 0.3236, "step": 9289 }, { "epoch": 1.8314274447949526, "grad_norm": 0.4528759786598488, "learning_rate": 1.1335222837447692e-05, "loss": 0.3091, "step": 9290 }, { "epoch": 1.8316246056782335, "grad_norm": 0.4797326570038922, "learning_rate": 1.1333686674660643e-05, "loss": 0.3388, "step": 9291 }, { "epoch": 1.8318217665615142, "grad_norm": 0.9510910814957004, "learning_rate": 1.133215047983069e-05, "loss": 0.3446, "step": 9292 }, { "epoch": 1.8320189274447949, "grad_norm": 0.47801369290214624, "learning_rate": 1.1330614252994753e-05, "loss": 0.337, "step": 9293 }, { "epoch": 1.8322160883280758, "grad_norm": 0.4798818841814811, "learning_rate": 1.1329077994189736e-05, "loss": 0.3283, "step": 9294 }, { "epoch": 1.8324132492113565, "grad_norm": 0.4649250612908643, "learning_rate": 1.132754170345255e-05, "loss": 0.3143, "step": 9295 }, { "epoch": 1.8326104100946372, "grad_norm": 0.5111682031018355, "learning_rate": 1.1326005380820106e-05, "loss": 0.3489, "step": 9296 }, { "epoch": 1.8328075709779181, "grad_norm": 0.479500042771139, "learning_rate": 1.1324469026329314e-05, "loss": 0.3467, "step": 9297 }, { "epoch": 1.8330047318611986, "grad_norm": 0.48760322704258524, "learning_rate": 1.1322932640017087e-05, "loss": 0.338, "step": 9298 }, { "epoch": 1.8332018927444795, "grad_norm": 0.4615980037814281, "learning_rate": 1.132139622192034e-05, "loss": 0.3211, "step": 9299 }, { "epoch": 1.8333990536277602, "grad_norm": 0.478582392329709, "learning_rate": 1.1319859772075982e-05, "loss": 0.3437, "step": 9300 }, { "epoch": 1.833596214511041, "grad_norm": 0.45732711770150825, "learning_rate": 1.1318323290520935e-05, "loss": 0.307, "step": 9301 }, { "epoch": 1.8337933753943219, "grad_norm": 0.4785336546942506, "learning_rate": 1.1316786777292103e-05, "loss": 0.3467, "step": 9302 }, { "epoch": 1.8339905362776026, "grad_norm": 0.48770360639670446, "learning_rate": 1.1315250232426411e-05, "loss": 0.3178, "step": 9303 }, { "epoch": 1.8341876971608833, "grad_norm": 0.4909997945985066, "learning_rate": 1.1313713655960773e-05, "loss": 0.339, "step": 9304 }, { "epoch": 1.8343848580441642, "grad_norm": 0.46943047019703255, "learning_rate": 1.1312177047932107e-05, "loss": 0.3233, "step": 9305 }, { "epoch": 1.8345820189274447, "grad_norm": 0.4637968220251219, "learning_rate": 1.1310640408377331e-05, "loss": 0.3354, "step": 9306 }, { "epoch": 1.8347791798107256, "grad_norm": 0.4811381690374539, "learning_rate": 1.1309103737333363e-05, "loss": 0.3206, "step": 9307 }, { "epoch": 1.8349763406940063, "grad_norm": 0.4841461834363835, "learning_rate": 1.1307567034837123e-05, "loss": 0.3355, "step": 9308 }, { "epoch": 1.835173501577287, "grad_norm": 0.5075047264571099, "learning_rate": 1.1306030300925531e-05, "loss": 0.3364, "step": 9309 }, { "epoch": 1.835370662460568, "grad_norm": 0.48761090026693843, "learning_rate": 1.1304493535635512e-05, "loss": 0.334, "step": 9310 }, { "epoch": 1.8355678233438486, "grad_norm": 0.4707017965699666, "learning_rate": 1.1302956739003981e-05, "loss": 0.3206, "step": 9311 }, { "epoch": 1.8357649842271293, "grad_norm": 0.47390782968886325, "learning_rate": 1.1301419911067871e-05, "loss": 0.3581, "step": 9312 }, { "epoch": 1.8359621451104102, "grad_norm": 0.5179730801274194, "learning_rate": 1.1299883051864095e-05, "loss": 0.3345, "step": 9313 }, { "epoch": 1.8361593059936907, "grad_norm": 0.4565817457776229, "learning_rate": 1.1298346161429585e-05, "loss": 0.324, "step": 9314 }, { "epoch": 1.8363564668769716, "grad_norm": 0.4743170511252832, "learning_rate": 1.1296809239801258e-05, "loss": 0.3257, "step": 9315 }, { "epoch": 1.8365536277602523, "grad_norm": 0.5062603340279239, "learning_rate": 1.129527228701605e-05, "loss": 0.3397, "step": 9316 }, { "epoch": 1.836750788643533, "grad_norm": 0.4627524681098276, "learning_rate": 1.129373530311088e-05, "loss": 0.3008, "step": 9317 }, { "epoch": 1.836947949526814, "grad_norm": 0.45366614630787505, "learning_rate": 1.1292198288122678e-05, "loss": 0.3239, "step": 9318 }, { "epoch": 1.8371451104100947, "grad_norm": 0.49553583595035205, "learning_rate": 1.1290661242088373e-05, "loss": 0.3335, "step": 9319 }, { "epoch": 1.8373422712933754, "grad_norm": 0.4656239756890831, "learning_rate": 1.1289124165044889e-05, "loss": 0.3343, "step": 9320 }, { "epoch": 1.8375394321766563, "grad_norm": 0.4592207478297998, "learning_rate": 1.1287587057029164e-05, "loss": 0.31, "step": 9321 }, { "epoch": 1.8377365930599368, "grad_norm": 0.48719866177332166, "learning_rate": 1.1286049918078118e-05, "loss": 0.3601, "step": 9322 }, { "epoch": 1.8379337539432177, "grad_norm": 0.4609017660568222, "learning_rate": 1.1284512748228686e-05, "loss": 0.3532, "step": 9323 }, { "epoch": 1.8381309148264984, "grad_norm": 0.471845332576064, "learning_rate": 1.1282975547517805e-05, "loss": 0.3428, "step": 9324 }, { "epoch": 1.838328075709779, "grad_norm": 0.45141311669092293, "learning_rate": 1.1281438315982403e-05, "loss": 0.3313, "step": 9325 }, { "epoch": 1.83852523659306, "grad_norm": 0.4846124283156861, "learning_rate": 1.127990105365941e-05, "loss": 0.3437, "step": 9326 }, { "epoch": 1.8387223974763407, "grad_norm": 0.4706449746560727, "learning_rate": 1.1278363760585767e-05, "loss": 0.3239, "step": 9327 }, { "epoch": 1.8389195583596214, "grad_norm": 0.46592281748659253, "learning_rate": 1.1276826436798406e-05, "loss": 0.2927, "step": 9328 }, { "epoch": 1.8391167192429023, "grad_norm": 0.4506091488970719, "learning_rate": 1.1275289082334257e-05, "loss": 0.3299, "step": 9329 }, { "epoch": 1.839313880126183, "grad_norm": 0.49247797496055923, "learning_rate": 1.1273751697230262e-05, "loss": 0.3177, "step": 9330 }, { "epoch": 1.8395110410094637, "grad_norm": 0.4823856031434032, "learning_rate": 1.1272214281523359e-05, "loss": 0.3369, "step": 9331 }, { "epoch": 1.8397082018927446, "grad_norm": 0.44703802630892214, "learning_rate": 1.127067683525048e-05, "loss": 0.3009, "step": 9332 }, { "epoch": 1.8399053627760251, "grad_norm": 0.4352564519007158, "learning_rate": 1.1269139358448573e-05, "loss": 0.2883, "step": 9333 }, { "epoch": 1.840102523659306, "grad_norm": 0.4897196424734099, "learning_rate": 1.1267601851154569e-05, "loss": 0.3292, "step": 9334 }, { "epoch": 1.8402996845425867, "grad_norm": 0.47061110718447907, "learning_rate": 1.1266064313405404e-05, "loss": 0.3198, "step": 9335 }, { "epoch": 1.8404968454258674, "grad_norm": 0.4725374845188905, "learning_rate": 1.1264526745238032e-05, "loss": 0.3303, "step": 9336 }, { "epoch": 1.8406940063091484, "grad_norm": 0.46014350127590964, "learning_rate": 1.1262989146689378e-05, "loss": 0.3047, "step": 9337 }, { "epoch": 1.840891167192429, "grad_norm": 0.48716019997900245, "learning_rate": 1.12614515177964e-05, "loss": 0.3345, "step": 9338 }, { "epoch": 1.8410883280757098, "grad_norm": 0.462419364697719, "learning_rate": 1.125991385859603e-05, "loss": 0.3345, "step": 9339 }, { "epoch": 1.8412854889589907, "grad_norm": 0.4838141067825107, "learning_rate": 1.1258376169125218e-05, "loss": 0.3418, "step": 9340 }, { "epoch": 1.8414826498422712, "grad_norm": 0.4638010611429982, "learning_rate": 1.1256838449420902e-05, "loss": 0.3397, "step": 9341 }, { "epoch": 1.841679810725552, "grad_norm": 0.4742578303058376, "learning_rate": 1.125530069952003e-05, "loss": 0.3369, "step": 9342 }, { "epoch": 1.8418769716088328, "grad_norm": 0.6012207890435203, "learning_rate": 1.1253762919459548e-05, "loss": 0.3922, "step": 9343 }, { "epoch": 1.8420741324921135, "grad_norm": 0.4640385924151846, "learning_rate": 1.1252225109276404e-05, "loss": 0.3276, "step": 9344 }, { "epoch": 1.8422712933753944, "grad_norm": 0.46720201438034464, "learning_rate": 1.1250687269007544e-05, "loss": 0.3231, "step": 9345 }, { "epoch": 1.8424684542586751, "grad_norm": 0.4809439170465585, "learning_rate": 1.1249149398689912e-05, "loss": 0.3359, "step": 9346 }, { "epoch": 1.8426656151419558, "grad_norm": 0.46092655938380556, "learning_rate": 1.1247611498360463e-05, "loss": 0.3288, "step": 9347 }, { "epoch": 1.8428627760252367, "grad_norm": 0.46354010282488706, "learning_rate": 1.124607356805614e-05, "loss": 0.3085, "step": 9348 }, { "epoch": 1.8430599369085172, "grad_norm": 0.4580081566598378, "learning_rate": 1.1244535607813898e-05, "loss": 0.3361, "step": 9349 }, { "epoch": 1.8432570977917981, "grad_norm": 0.49486686537167013, "learning_rate": 1.1242997617670685e-05, "loss": 0.3191, "step": 9350 }, { "epoch": 1.8434542586750788, "grad_norm": 0.4738563965909639, "learning_rate": 1.1241459597663453e-05, "loss": 0.358, "step": 9351 }, { "epoch": 1.8436514195583595, "grad_norm": 0.46422380906100263, "learning_rate": 1.1239921547829156e-05, "loss": 0.3178, "step": 9352 }, { "epoch": 1.8438485804416405, "grad_norm": 0.4863236006835687, "learning_rate": 1.1238383468204744e-05, "loss": 0.3352, "step": 9353 }, { "epoch": 1.8440457413249212, "grad_norm": 0.5448224040290974, "learning_rate": 1.1236845358827174e-05, "loss": 0.3508, "step": 9354 }, { "epoch": 1.8442429022082019, "grad_norm": 0.5119358198476269, "learning_rate": 1.1235307219733396e-05, "loss": 0.3626, "step": 9355 }, { "epoch": 1.8444400630914828, "grad_norm": 0.48695813930072723, "learning_rate": 1.1233769050960366e-05, "loss": 0.3276, "step": 9356 }, { "epoch": 1.8446372239747633, "grad_norm": 0.45412864850612233, "learning_rate": 1.1232230852545042e-05, "loss": 0.3367, "step": 9357 }, { "epoch": 1.8448343848580442, "grad_norm": 0.41684857678310244, "learning_rate": 1.1230692624524379e-05, "loss": 0.2956, "step": 9358 }, { "epoch": 1.8450315457413249, "grad_norm": 0.4765022219720706, "learning_rate": 1.1229154366935337e-05, "loss": 0.3451, "step": 9359 }, { "epoch": 1.8452287066246056, "grad_norm": 0.4451789441382438, "learning_rate": 1.1227616079814869e-05, "loss": 0.3078, "step": 9360 }, { "epoch": 1.8454258675078865, "grad_norm": 0.4783197235034446, "learning_rate": 1.1226077763199941e-05, "loss": 0.3355, "step": 9361 }, { "epoch": 1.8456230283911672, "grad_norm": 0.45356257014627016, "learning_rate": 1.12245394171275e-05, "loss": 0.3263, "step": 9362 }, { "epoch": 1.845820189274448, "grad_norm": 0.6426674681188173, "learning_rate": 1.1223001041634517e-05, "loss": 0.3478, "step": 9363 }, { "epoch": 1.8460173501577288, "grad_norm": 0.5256663639141104, "learning_rate": 1.122146263675795e-05, "loss": 0.3536, "step": 9364 }, { "epoch": 1.8462145110410093, "grad_norm": 0.49322701169852573, "learning_rate": 1.121992420253476e-05, "loss": 0.3507, "step": 9365 }, { "epoch": 1.8464116719242902, "grad_norm": 0.49133913419542524, "learning_rate": 1.1218385739001908e-05, "loss": 0.3297, "step": 9366 }, { "epoch": 1.846608832807571, "grad_norm": 0.4936609617307454, "learning_rate": 1.1216847246196356e-05, "loss": 0.3569, "step": 9367 }, { "epoch": 1.8468059936908516, "grad_norm": 0.4662341419677541, "learning_rate": 1.121530872415507e-05, "loss": 0.2912, "step": 9368 }, { "epoch": 1.8470031545741326, "grad_norm": 0.4643026216620009, "learning_rate": 1.1213770172915012e-05, "loss": 0.3205, "step": 9369 }, { "epoch": 1.8472003154574133, "grad_norm": 0.4620634065880125, "learning_rate": 1.121223159251315e-05, "loss": 0.3324, "step": 9370 }, { "epoch": 1.847397476340694, "grad_norm": 0.45144218730120467, "learning_rate": 1.1210692982986447e-05, "loss": 0.3085, "step": 9371 }, { "epoch": 1.8475946372239749, "grad_norm": 0.481324283568681, "learning_rate": 1.120915434437187e-05, "loss": 0.3384, "step": 9372 }, { "epoch": 1.8477917981072554, "grad_norm": 0.48469219856799683, "learning_rate": 1.1207615676706387e-05, "loss": 0.3237, "step": 9373 }, { "epoch": 1.8479889589905363, "grad_norm": 0.43868531841936526, "learning_rate": 1.1206076980026963e-05, "loss": 0.331, "step": 9374 }, { "epoch": 1.848186119873817, "grad_norm": 0.49905037754094533, "learning_rate": 1.120453825437057e-05, "loss": 0.3504, "step": 9375 }, { "epoch": 1.8483832807570977, "grad_norm": 0.47057721365819305, "learning_rate": 1.1202999499774174e-05, "loss": 0.3341, "step": 9376 }, { "epoch": 1.8485804416403786, "grad_norm": 0.5115538854447896, "learning_rate": 1.1201460716274745e-05, "loss": 0.3557, "step": 9377 }, { "epoch": 1.8487776025236593, "grad_norm": 0.49106393680197397, "learning_rate": 1.1199921903909258e-05, "loss": 0.3348, "step": 9378 }, { "epoch": 1.84897476340694, "grad_norm": 0.4783550963001248, "learning_rate": 1.119838306271468e-05, "loss": 0.3447, "step": 9379 }, { "epoch": 1.849171924290221, "grad_norm": 15.982642688070285, "learning_rate": 1.1196844192727984e-05, "loss": 0.3471, "step": 9380 }, { "epoch": 1.8493690851735016, "grad_norm": 0.49460635668203545, "learning_rate": 1.119530529398614e-05, "loss": 0.3547, "step": 9381 }, { "epoch": 1.8495662460567823, "grad_norm": 0.4767109854012262, "learning_rate": 1.1193766366526128e-05, "loss": 0.3347, "step": 9382 }, { "epoch": 1.8497634069400632, "grad_norm": 0.4344678704925985, "learning_rate": 1.1192227410384915e-05, "loss": 0.2973, "step": 9383 }, { "epoch": 1.8499605678233437, "grad_norm": 0.4942642287727857, "learning_rate": 1.1190688425599478e-05, "loss": 0.3555, "step": 9384 }, { "epoch": 1.8501577287066246, "grad_norm": 0.45936308831216105, "learning_rate": 1.1189149412206795e-05, "loss": 0.3256, "step": 9385 }, { "epoch": 1.8503548895899053, "grad_norm": 0.48263042540453904, "learning_rate": 1.1187610370243837e-05, "loss": 0.337, "step": 9386 }, { "epoch": 1.850552050473186, "grad_norm": 0.48546115028194037, "learning_rate": 1.1186071299747588e-05, "loss": 0.3529, "step": 9387 }, { "epoch": 1.850749211356467, "grad_norm": 1.2555124942050995, "learning_rate": 1.1184532200755017e-05, "loss": 0.3508, "step": 9388 }, { "epoch": 1.8509463722397477, "grad_norm": 0.517094897963285, "learning_rate": 1.1182993073303107e-05, "loss": 0.3507, "step": 9389 }, { "epoch": 1.8511435331230284, "grad_norm": 0.4931959158526763, "learning_rate": 1.1181453917428835e-05, "loss": 0.3404, "step": 9390 }, { "epoch": 1.8513406940063093, "grad_norm": 0.4720258856465021, "learning_rate": 1.117991473316918e-05, "loss": 0.3271, "step": 9391 }, { "epoch": 1.8515378548895898, "grad_norm": 0.4663809318007513, "learning_rate": 1.1178375520561126e-05, "loss": 0.3231, "step": 9392 }, { "epoch": 1.8517350157728707, "grad_norm": 0.4791754199299042, "learning_rate": 1.1176836279641649e-05, "loss": 0.3495, "step": 9393 }, { "epoch": 1.8519321766561514, "grad_norm": 0.44613918375908324, "learning_rate": 1.1175297010447734e-05, "loss": 0.3234, "step": 9394 }, { "epoch": 1.852129337539432, "grad_norm": 0.5340353989096874, "learning_rate": 1.1173757713016362e-05, "loss": 0.3315, "step": 9395 }, { "epoch": 1.852326498422713, "grad_norm": 0.46860519180264953, "learning_rate": 1.1172218387384517e-05, "loss": 0.3234, "step": 9396 }, { "epoch": 1.8525236593059937, "grad_norm": 0.46923350765977884, "learning_rate": 1.117067903358918e-05, "loss": 0.3152, "step": 9397 }, { "epoch": 1.8527208201892744, "grad_norm": 0.4989908576360559, "learning_rate": 1.1169139651667334e-05, "loss": 0.3334, "step": 9398 }, { "epoch": 1.8529179810725553, "grad_norm": 0.48850368752631385, "learning_rate": 1.1167600241655969e-05, "loss": 0.3168, "step": 9399 }, { "epoch": 1.8531151419558358, "grad_norm": 0.506462305386158, "learning_rate": 1.116606080359207e-05, "loss": 0.3354, "step": 9400 }, { "epoch": 1.8533123028391167, "grad_norm": 0.4783276972037197, "learning_rate": 1.1164521337512618e-05, "loss": 0.3444, "step": 9401 }, { "epoch": 1.8535094637223974, "grad_norm": 0.49486017910579383, "learning_rate": 1.1162981843454603e-05, "loss": 0.322, "step": 9402 }, { "epoch": 1.8537066246056781, "grad_norm": 0.48872781997990056, "learning_rate": 1.1161442321455013e-05, "loss": 0.3376, "step": 9403 }, { "epoch": 1.853903785488959, "grad_norm": 0.49644332416860143, "learning_rate": 1.1159902771550836e-05, "loss": 0.3282, "step": 9404 }, { "epoch": 1.8541009463722398, "grad_norm": 0.483535918659441, "learning_rate": 1.115836319377906e-05, "loss": 0.3394, "step": 9405 }, { "epoch": 1.8542981072555205, "grad_norm": 0.45628473409890424, "learning_rate": 1.115682358817668e-05, "loss": 0.3345, "step": 9406 }, { "epoch": 1.8544952681388014, "grad_norm": 0.5002627802730854, "learning_rate": 1.1155283954780676e-05, "loss": 0.3342, "step": 9407 }, { "epoch": 1.8546924290220819, "grad_norm": 0.46363502477326907, "learning_rate": 1.1153744293628049e-05, "loss": 0.341, "step": 9408 }, { "epoch": 1.8548895899053628, "grad_norm": 0.45754159982180126, "learning_rate": 1.115220460475578e-05, "loss": 0.322, "step": 9409 }, { "epoch": 1.8550867507886435, "grad_norm": 0.4811557071074235, "learning_rate": 1.1150664888200874e-05, "loss": 0.3448, "step": 9410 }, { "epoch": 1.8552839116719242, "grad_norm": 0.45695254954143394, "learning_rate": 1.1149125144000315e-05, "loss": 0.3172, "step": 9411 }, { "epoch": 1.8554810725552051, "grad_norm": 0.4611849986321203, "learning_rate": 1.1147585372191099e-05, "loss": 0.3324, "step": 9412 }, { "epoch": 1.8556782334384858, "grad_norm": 0.4951261883208246, "learning_rate": 1.114604557281022e-05, "loss": 0.3467, "step": 9413 }, { "epoch": 1.8558753943217665, "grad_norm": 0.45894758304595423, "learning_rate": 1.1144505745894674e-05, "loss": 0.3354, "step": 9414 }, { "epoch": 1.8560725552050474, "grad_norm": 0.47361962578122113, "learning_rate": 1.1142965891481456e-05, "loss": 0.338, "step": 9415 }, { "epoch": 1.856269716088328, "grad_norm": 0.46717084409335563, "learning_rate": 1.1141426009607562e-05, "loss": 0.3181, "step": 9416 }, { "epoch": 1.8564668769716088, "grad_norm": 0.46910516420489423, "learning_rate": 1.1139886100309987e-05, "loss": 0.3114, "step": 9417 }, { "epoch": 1.8566640378548895, "grad_norm": 0.45627899747872586, "learning_rate": 1.1138346163625732e-05, "loss": 0.3362, "step": 9418 }, { "epoch": 1.8568611987381702, "grad_norm": 0.488764972259954, "learning_rate": 1.1136806199591794e-05, "loss": 0.318, "step": 9419 }, { "epoch": 1.8570583596214512, "grad_norm": 0.48207706868844363, "learning_rate": 1.1135266208245173e-05, "loss": 0.3252, "step": 9420 }, { "epoch": 1.8572555205047319, "grad_norm": 0.4533304622987771, "learning_rate": 1.1133726189622865e-05, "loss": 0.3134, "step": 9421 }, { "epoch": 1.8574526813880126, "grad_norm": 0.5209819124401526, "learning_rate": 1.1132186143761872e-05, "loss": 0.3225, "step": 9422 }, { "epoch": 1.8576498422712935, "grad_norm": 0.4764148070256464, "learning_rate": 1.1130646070699196e-05, "loss": 0.335, "step": 9423 }, { "epoch": 1.8578470031545742, "grad_norm": 0.46903416952858623, "learning_rate": 1.1129105970471836e-05, "loss": 0.3372, "step": 9424 }, { "epoch": 1.8580441640378549, "grad_norm": 0.4386714428559711, "learning_rate": 1.1127565843116798e-05, "loss": 0.2949, "step": 9425 }, { "epoch": 1.8582413249211358, "grad_norm": 0.4453659320932465, "learning_rate": 1.1126025688671081e-05, "loss": 0.3091, "step": 9426 }, { "epoch": 1.8584384858044163, "grad_norm": 0.6218845162516824, "learning_rate": 1.1124485507171691e-05, "loss": 0.367, "step": 9427 }, { "epoch": 1.8586356466876972, "grad_norm": 0.5032362384746675, "learning_rate": 1.112294529865563e-05, "loss": 0.3386, "step": 9428 }, { "epoch": 1.858832807570978, "grad_norm": 0.46700700293384795, "learning_rate": 1.1121405063159906e-05, "loss": 0.344, "step": 9429 }, { "epoch": 1.8590299684542586, "grad_norm": 0.4650433144492055, "learning_rate": 1.111986480072152e-05, "loss": 0.3257, "step": 9430 }, { "epoch": 1.8592271293375395, "grad_norm": 0.470873212417909, "learning_rate": 1.1118324511377482e-05, "loss": 0.3012, "step": 9431 }, { "epoch": 1.8594242902208202, "grad_norm": 0.4694211550117268, "learning_rate": 1.1116784195164797e-05, "loss": 0.3423, "step": 9432 }, { "epoch": 1.859621451104101, "grad_norm": 0.46797333442001665, "learning_rate": 1.1115243852120472e-05, "loss": 0.3196, "step": 9433 }, { "epoch": 1.8598186119873819, "grad_norm": 0.4847132759132295, "learning_rate": 1.1113703482281515e-05, "loss": 0.3374, "step": 9434 }, { "epoch": 1.8600157728706623, "grad_norm": 0.48758924974039297, "learning_rate": 1.1112163085684935e-05, "loss": 0.3437, "step": 9435 }, { "epoch": 1.8602129337539433, "grad_norm": 0.48133455329307273, "learning_rate": 1.111062266236774e-05, "loss": 0.3175, "step": 9436 }, { "epoch": 1.860410094637224, "grad_norm": 0.4906755288935251, "learning_rate": 1.1109082212366944e-05, "loss": 0.318, "step": 9437 }, { "epoch": 1.8606072555205047, "grad_norm": 0.4633813435313994, "learning_rate": 1.1107541735719554e-05, "loss": 0.3103, "step": 9438 }, { "epoch": 1.8608044164037856, "grad_norm": 0.5163900036338083, "learning_rate": 1.110600123246258e-05, "loss": 0.3263, "step": 9439 }, { "epoch": 1.8610015772870663, "grad_norm": 0.5956374108250188, "learning_rate": 1.1104460702633038e-05, "loss": 0.3441, "step": 9440 }, { "epoch": 1.861198738170347, "grad_norm": 0.4763548264026657, "learning_rate": 1.1102920146267938e-05, "loss": 0.3032, "step": 9441 }, { "epoch": 1.861395899053628, "grad_norm": 0.4958429310254217, "learning_rate": 1.1101379563404291e-05, "loss": 0.3171, "step": 9442 }, { "epoch": 1.8615930599369084, "grad_norm": 0.4894185337405908, "learning_rate": 1.1099838954079117e-05, "loss": 0.3349, "step": 9443 }, { "epoch": 1.8617902208201893, "grad_norm": 0.48658157622850023, "learning_rate": 1.1098298318329421e-05, "loss": 0.3516, "step": 9444 }, { "epoch": 1.86198738170347, "grad_norm": 0.477421882843568, "learning_rate": 1.1096757656192226e-05, "loss": 0.3442, "step": 9445 }, { "epoch": 1.8621845425867507, "grad_norm": 0.4587814387324469, "learning_rate": 1.1095216967704548e-05, "loss": 0.3155, "step": 9446 }, { "epoch": 1.8623817034700316, "grad_norm": 0.46345456592813855, "learning_rate": 1.1093676252903395e-05, "loss": 0.3245, "step": 9447 }, { "epoch": 1.8625788643533123, "grad_norm": 0.49258102378140683, "learning_rate": 1.1092135511825795e-05, "loss": 0.3385, "step": 9448 }, { "epoch": 1.862776025236593, "grad_norm": 0.4885376518923072, "learning_rate": 1.1090594744508754e-05, "loss": 0.3429, "step": 9449 }, { "epoch": 1.862973186119874, "grad_norm": 0.5003966905526008, "learning_rate": 1.1089053950989301e-05, "loss": 0.3572, "step": 9450 }, { "epoch": 1.8631703470031544, "grad_norm": 0.48522433245566304, "learning_rate": 1.1087513131304446e-05, "loss": 0.3174, "step": 9451 }, { "epoch": 1.8633675078864353, "grad_norm": 0.5072726660997204, "learning_rate": 1.1085972285491213e-05, "loss": 0.3142, "step": 9452 }, { "epoch": 1.863564668769716, "grad_norm": 0.4492624933232977, "learning_rate": 1.1084431413586625e-05, "loss": 0.3264, "step": 9453 }, { "epoch": 1.8637618296529967, "grad_norm": 0.47440212617966615, "learning_rate": 1.1082890515627696e-05, "loss": 0.3221, "step": 9454 }, { "epoch": 1.8639589905362777, "grad_norm": 0.4449072907644378, "learning_rate": 1.108134959165145e-05, "loss": 0.3093, "step": 9455 }, { "epoch": 1.8641561514195584, "grad_norm": 0.4454938083744686, "learning_rate": 1.1079808641694909e-05, "loss": 0.3215, "step": 9456 }, { "epoch": 1.864353312302839, "grad_norm": 0.472087185708321, "learning_rate": 1.10782676657951e-05, "loss": 0.3397, "step": 9457 }, { "epoch": 1.86455047318612, "grad_norm": 0.4678074204749738, "learning_rate": 1.1076726663989037e-05, "loss": 0.3284, "step": 9458 }, { "epoch": 1.8647476340694005, "grad_norm": 0.46656067923498784, "learning_rate": 1.107518563631375e-05, "loss": 0.3283, "step": 9459 }, { "epoch": 1.8649447949526814, "grad_norm": 0.4635770298520586, "learning_rate": 1.1073644582806263e-05, "loss": 0.3187, "step": 9460 }, { "epoch": 1.865141955835962, "grad_norm": 0.44301184098285906, "learning_rate": 1.1072103503503599e-05, "loss": 0.3063, "step": 9461 }, { "epoch": 1.8653391167192428, "grad_norm": 0.4628837166980266, "learning_rate": 1.1070562398442789e-05, "loss": 0.3335, "step": 9462 }, { "epoch": 1.8655362776025237, "grad_norm": 0.6943912518558549, "learning_rate": 1.106902126766085e-05, "loss": 0.3459, "step": 9463 }, { "epoch": 1.8657334384858044, "grad_norm": 0.5689959265430884, "learning_rate": 1.1067480111194817e-05, "loss": 0.3639, "step": 9464 }, { "epoch": 1.8659305993690851, "grad_norm": 0.478559291667436, "learning_rate": 1.1065938929081714e-05, "loss": 0.3357, "step": 9465 }, { "epoch": 1.866127760252366, "grad_norm": 0.4676687566999173, "learning_rate": 1.1064397721358571e-05, "loss": 0.3022, "step": 9466 }, { "epoch": 1.8663249211356467, "grad_norm": 0.46290577274262135, "learning_rate": 1.1062856488062414e-05, "loss": 0.3096, "step": 9467 }, { "epoch": 1.8665220820189274, "grad_norm": 0.5040641995883025, "learning_rate": 1.1061315229230276e-05, "loss": 0.3394, "step": 9468 }, { "epoch": 1.8667192429022084, "grad_norm": 0.45321672662297996, "learning_rate": 1.1059773944899183e-05, "loss": 0.3369, "step": 9469 }, { "epoch": 1.8669164037854888, "grad_norm": 0.4487604982696012, "learning_rate": 1.1058232635106167e-05, "loss": 0.3296, "step": 9470 }, { "epoch": 1.8671135646687698, "grad_norm": 0.5159266623498804, "learning_rate": 1.1056691299888262e-05, "loss": 0.3446, "step": 9471 }, { "epoch": 1.8673107255520505, "grad_norm": 0.48017706589184767, "learning_rate": 1.1055149939282497e-05, "loss": 0.3296, "step": 9472 }, { "epoch": 1.8675078864353312, "grad_norm": 0.47518034239431195, "learning_rate": 1.1053608553325901e-05, "loss": 0.3314, "step": 9473 }, { "epoch": 1.867705047318612, "grad_norm": 0.4352311407076608, "learning_rate": 1.1052067142055516e-05, "loss": 0.3109, "step": 9474 }, { "epoch": 1.8679022082018928, "grad_norm": 0.47370145361968125, "learning_rate": 1.1050525705508369e-05, "loss": 0.3371, "step": 9475 }, { "epoch": 1.8680993690851735, "grad_norm": 0.5177165653942762, "learning_rate": 1.1048984243721496e-05, "loss": 0.353, "step": 9476 }, { "epoch": 1.8682965299684544, "grad_norm": 0.4685075059578066, "learning_rate": 1.104744275673193e-05, "loss": 0.3392, "step": 9477 }, { "epoch": 1.868493690851735, "grad_norm": 0.4547513508412821, "learning_rate": 1.1045901244576713e-05, "loss": 0.3096, "step": 9478 }, { "epoch": 1.8686908517350158, "grad_norm": 0.4715660539717716, "learning_rate": 1.104435970729287e-05, "loss": 0.3337, "step": 9479 }, { "epoch": 1.8688880126182965, "grad_norm": 0.5173676222111224, "learning_rate": 1.1042818144917449e-05, "loss": 0.3429, "step": 9480 }, { "epoch": 1.8690851735015772, "grad_norm": 0.4748763088025496, "learning_rate": 1.1041276557487482e-05, "loss": 0.3285, "step": 9481 }, { "epoch": 1.8692823343848581, "grad_norm": 0.4570571040060861, "learning_rate": 1.1039734945040004e-05, "loss": 0.3291, "step": 9482 }, { "epoch": 1.8694794952681388, "grad_norm": 0.5076175558770766, "learning_rate": 1.103819330761206e-05, "loss": 0.3466, "step": 9483 }, { "epoch": 1.8696766561514195, "grad_norm": 0.43967840426291815, "learning_rate": 1.1036651645240683e-05, "loss": 0.3072, "step": 9484 }, { "epoch": 1.8698738170347005, "grad_norm": 0.488582403780876, "learning_rate": 1.1035109957962918e-05, "loss": 0.3156, "step": 9485 }, { "epoch": 1.870070977917981, "grad_norm": 0.47909357015758164, "learning_rate": 1.10335682458158e-05, "loss": 0.3425, "step": 9486 }, { "epoch": 1.8702681388012619, "grad_norm": 0.47912178648751735, "learning_rate": 1.1032026508836376e-05, "loss": 0.3053, "step": 9487 }, { "epoch": 1.8704652996845426, "grad_norm": 0.480061395371812, "learning_rate": 1.103048474706168e-05, "loss": 0.364, "step": 9488 }, { "epoch": 1.8706624605678233, "grad_norm": 0.46424946405738576, "learning_rate": 1.102894296052876e-05, "loss": 0.3152, "step": 9489 }, { "epoch": 1.8708596214511042, "grad_norm": 0.46382382583955356, "learning_rate": 1.1027401149274658e-05, "loss": 0.331, "step": 9490 }, { "epoch": 1.8710567823343849, "grad_norm": 0.4707474802471004, "learning_rate": 1.1025859313336415e-05, "loss": 0.3308, "step": 9491 }, { "epoch": 1.8712539432176656, "grad_norm": 0.46377381491584824, "learning_rate": 1.1024317452751076e-05, "loss": 0.327, "step": 9492 }, { "epoch": 1.8714511041009465, "grad_norm": 0.44780835687333587, "learning_rate": 1.1022775567555686e-05, "loss": 0.3347, "step": 9493 }, { "epoch": 1.871648264984227, "grad_norm": 0.45613465092557437, "learning_rate": 1.1021233657787285e-05, "loss": 0.3223, "step": 9494 }, { "epoch": 1.871845425867508, "grad_norm": 0.4667856099355164, "learning_rate": 1.1019691723482928e-05, "loss": 0.3262, "step": 9495 }, { "epoch": 1.8720425867507886, "grad_norm": 0.5087693131824845, "learning_rate": 1.1018149764679653e-05, "loss": 0.3565, "step": 9496 }, { "epoch": 1.8722397476340693, "grad_norm": 0.47261955625623386, "learning_rate": 1.1016607781414514e-05, "loss": 0.3387, "step": 9497 }, { "epoch": 1.8724369085173502, "grad_norm": 0.4507061778033932, "learning_rate": 1.101506577372455e-05, "loss": 0.3378, "step": 9498 }, { "epoch": 1.872634069400631, "grad_norm": 0.4914038805520105, "learning_rate": 1.1013523741646817e-05, "loss": 0.3635, "step": 9499 }, { "epoch": 1.8728312302839116, "grad_norm": 0.46523334735695693, "learning_rate": 1.1011981685218355e-05, "loss": 0.3163, "step": 9500 }, { "epoch": 1.8730283911671926, "grad_norm": 0.461046105271434, "learning_rate": 1.1010439604476222e-05, "loss": 0.3027, "step": 9501 }, { "epoch": 1.873225552050473, "grad_norm": 0.4917301931134718, "learning_rate": 1.1008897499457466e-05, "loss": 0.3536, "step": 9502 }, { "epoch": 1.873422712933754, "grad_norm": 0.4931865848981832, "learning_rate": 1.100735537019913e-05, "loss": 0.338, "step": 9503 }, { "epoch": 1.8736198738170347, "grad_norm": 0.465842711087265, "learning_rate": 1.1005813216738273e-05, "loss": 0.3327, "step": 9504 }, { "epoch": 1.8738170347003154, "grad_norm": 0.46721071233696176, "learning_rate": 1.1004271039111943e-05, "loss": 0.3349, "step": 9505 }, { "epoch": 1.8740141955835963, "grad_norm": 0.47405778733750703, "learning_rate": 1.1002728837357192e-05, "loss": 0.3388, "step": 9506 }, { "epoch": 1.874211356466877, "grad_norm": 0.4734967427166212, "learning_rate": 1.1001186611511071e-05, "loss": 0.3385, "step": 9507 }, { "epoch": 1.8744085173501577, "grad_norm": 0.5255895043873378, "learning_rate": 1.099964436161064e-05, "loss": 0.3372, "step": 9508 }, { "epoch": 1.8746056782334386, "grad_norm": 0.4787785522800639, "learning_rate": 1.0998102087692946e-05, "loss": 0.3355, "step": 9509 }, { "epoch": 1.874802839116719, "grad_norm": 0.5023739258469361, "learning_rate": 1.0996559789795045e-05, "loss": 0.3478, "step": 9510 }, { "epoch": 1.875, "grad_norm": 0.46464061305129783, "learning_rate": 1.0995017467953994e-05, "loss": 0.3331, "step": 9511 }, { "epoch": 1.875197160883281, "grad_norm": 0.4995781916354229, "learning_rate": 1.0993475122206846e-05, "loss": 0.3407, "step": 9512 }, { "epoch": 1.8753943217665614, "grad_norm": 0.4733291385390516, "learning_rate": 1.0991932752590657e-05, "loss": 0.3197, "step": 9513 }, { "epoch": 1.8755914826498423, "grad_norm": 0.5105493418105221, "learning_rate": 1.0990390359142488e-05, "loss": 0.3507, "step": 9514 }, { "epoch": 1.875788643533123, "grad_norm": 0.4942055666455541, "learning_rate": 1.098884794189939e-05, "loss": 0.3403, "step": 9515 }, { "epoch": 1.8759858044164037, "grad_norm": 0.4900283583301513, "learning_rate": 1.0987305500898427e-05, "loss": 0.3239, "step": 9516 }, { "epoch": 1.8761829652996846, "grad_norm": 0.4555388435909809, "learning_rate": 1.0985763036176648e-05, "loss": 0.3254, "step": 9517 }, { "epoch": 1.8763801261829653, "grad_norm": 0.521257593570195, "learning_rate": 1.0984220547771127e-05, "loss": 0.3489, "step": 9518 }, { "epoch": 1.876577287066246, "grad_norm": 0.45372277802554656, "learning_rate": 1.098267803571891e-05, "loss": 0.2968, "step": 9519 }, { "epoch": 1.876774447949527, "grad_norm": 0.4731885774415591, "learning_rate": 1.098113550005706e-05, "loss": 0.348, "step": 9520 }, { "epoch": 1.8769716088328074, "grad_norm": 0.4704727543410908, "learning_rate": 1.0979592940822643e-05, "loss": 0.3237, "step": 9521 }, { "epoch": 1.8771687697160884, "grad_norm": 0.4716211417273583, "learning_rate": 1.0978050358052715e-05, "loss": 0.3332, "step": 9522 }, { "epoch": 1.877365930599369, "grad_norm": 0.4656934460308149, "learning_rate": 1.0976507751784343e-05, "loss": 0.3253, "step": 9523 }, { "epoch": 1.8775630914826498, "grad_norm": 0.45429803312317435, "learning_rate": 1.097496512205458e-05, "loss": 0.3213, "step": 9524 }, { "epoch": 1.8777602523659307, "grad_norm": 0.4941603848877281, "learning_rate": 1.0973422468900498e-05, "loss": 0.3314, "step": 9525 }, { "epoch": 1.8779574132492114, "grad_norm": 0.4921863038345052, "learning_rate": 1.0971879792359154e-05, "loss": 0.3161, "step": 9526 }, { "epoch": 1.878154574132492, "grad_norm": 0.46662564704980725, "learning_rate": 1.097033709246762e-05, "loss": 0.3451, "step": 9527 }, { "epoch": 1.878351735015773, "grad_norm": 0.4434045432752366, "learning_rate": 1.0968794369262954e-05, "loss": 0.3125, "step": 9528 }, { "epoch": 1.8785488958990535, "grad_norm": 0.4809233182201705, "learning_rate": 1.0967251622782223e-05, "loss": 0.3559, "step": 9529 }, { "epoch": 1.8787460567823344, "grad_norm": 0.4744798298419412, "learning_rate": 1.0965708853062493e-05, "loss": 0.3364, "step": 9530 }, { "epoch": 1.8789432176656151, "grad_norm": 0.47368018073228213, "learning_rate": 1.0964166060140831e-05, "loss": 0.3171, "step": 9531 }, { "epoch": 1.8791403785488958, "grad_norm": 0.5127652861424508, "learning_rate": 1.0962623244054302e-05, "loss": 0.3484, "step": 9532 }, { "epoch": 1.8793375394321767, "grad_norm": 0.4858636735208188, "learning_rate": 1.0961080404839974e-05, "loss": 0.3506, "step": 9533 }, { "epoch": 1.8795347003154574, "grad_norm": 0.4957252696098338, "learning_rate": 1.0959537542534916e-05, "loss": 0.3296, "step": 9534 }, { "epoch": 1.8797318611987381, "grad_norm": 0.5147145847410192, "learning_rate": 1.0957994657176197e-05, "loss": 0.3467, "step": 9535 }, { "epoch": 1.879929022082019, "grad_norm": 0.4335412701687705, "learning_rate": 1.0956451748800883e-05, "loss": 0.3149, "step": 9536 }, { "epoch": 1.8801261829652995, "grad_norm": 0.46479740344238335, "learning_rate": 1.0954908817446047e-05, "loss": 0.3318, "step": 9537 }, { "epoch": 1.8803233438485805, "grad_norm": 0.4683628747889899, "learning_rate": 1.0953365863148757e-05, "loss": 0.3287, "step": 9538 }, { "epoch": 1.8805205047318612, "grad_norm": 0.472736232015758, "learning_rate": 1.0951822885946084e-05, "loss": 0.3136, "step": 9539 }, { "epoch": 1.8807176656151419, "grad_norm": 0.4788330670775191, "learning_rate": 1.0950279885875098e-05, "loss": 0.3368, "step": 9540 }, { "epoch": 1.8809148264984228, "grad_norm": 0.4760770871051563, "learning_rate": 1.0948736862972873e-05, "loss": 0.3262, "step": 9541 }, { "epoch": 1.8811119873817035, "grad_norm": 0.456235107469069, "learning_rate": 1.0947193817276485e-05, "loss": 0.3283, "step": 9542 }, { "epoch": 1.8813091482649842, "grad_norm": 0.4949947731416457, "learning_rate": 1.0945650748822998e-05, "loss": 0.3504, "step": 9543 }, { "epoch": 1.881506309148265, "grad_norm": 0.5538356410729359, "learning_rate": 1.0944107657649494e-05, "loss": 0.319, "step": 9544 }, { "epoch": 1.8817034700315456, "grad_norm": 0.5051031777149498, "learning_rate": 1.0942564543793039e-05, "loss": 0.3364, "step": 9545 }, { "epoch": 1.8819006309148265, "grad_norm": 0.4953979151686171, "learning_rate": 1.0941021407290717e-05, "loss": 0.3856, "step": 9546 }, { "epoch": 1.8820977917981072, "grad_norm": 0.4632524126131711, "learning_rate": 1.0939478248179594e-05, "loss": 0.3225, "step": 9547 }, { "epoch": 1.882294952681388, "grad_norm": 0.46099411952055647, "learning_rate": 1.093793506649675e-05, "loss": 0.3221, "step": 9548 }, { "epoch": 1.8824921135646688, "grad_norm": 0.48987986252693555, "learning_rate": 1.093639186227926e-05, "loss": 0.3414, "step": 9549 }, { "epoch": 1.8826892744479495, "grad_norm": 0.5892412407244659, "learning_rate": 1.0934848635564203e-05, "loss": 0.3317, "step": 9550 }, { "epoch": 1.8828864353312302, "grad_norm": 0.464294311971776, "learning_rate": 1.0933305386388656e-05, "loss": 0.3274, "step": 9551 }, { "epoch": 1.8830835962145112, "grad_norm": 0.48576088684391133, "learning_rate": 1.0931762114789695e-05, "loss": 0.3357, "step": 9552 }, { "epoch": 1.8832807570977916, "grad_norm": 0.44811752905575575, "learning_rate": 1.0930218820804398e-05, "loss": 0.3113, "step": 9553 }, { "epoch": 1.8834779179810726, "grad_norm": 0.4927473006999295, "learning_rate": 1.0928675504469843e-05, "loss": 0.3513, "step": 9554 }, { "epoch": 1.8836750788643533, "grad_norm": 0.45362869939341455, "learning_rate": 1.0927132165823113e-05, "loss": 0.3272, "step": 9555 }, { "epoch": 1.883872239747634, "grad_norm": 0.45381446754200727, "learning_rate": 1.0925588804901286e-05, "loss": 0.3233, "step": 9556 }, { "epoch": 1.8840694006309149, "grad_norm": 0.44466594957868893, "learning_rate": 1.0924045421741442e-05, "loss": 0.3331, "step": 9557 }, { "epoch": 1.8842665615141956, "grad_norm": 0.5061065779984183, "learning_rate": 1.0922502016380663e-05, "loss": 0.3372, "step": 9558 }, { "epoch": 1.8844637223974763, "grad_norm": 0.5421030190329682, "learning_rate": 1.092095858885603e-05, "loss": 0.3578, "step": 9559 }, { "epoch": 1.8846608832807572, "grad_norm": 0.5002948397548355, "learning_rate": 1.0919415139204625e-05, "loss": 0.3225, "step": 9560 }, { "epoch": 1.884858044164038, "grad_norm": 0.49365892934666894, "learning_rate": 1.0917871667463533e-05, "loss": 0.3349, "step": 9561 }, { "epoch": 1.8850552050473186, "grad_norm": 0.4803469375107623, "learning_rate": 1.091632817366983e-05, "loss": 0.3515, "step": 9562 }, { "epoch": 1.8852523659305995, "grad_norm": 0.49730327018911846, "learning_rate": 1.091478465786061e-05, "loss": 0.3453, "step": 9563 }, { "epoch": 1.88544952681388, "grad_norm": 0.5162410884212086, "learning_rate": 1.0913241120072947e-05, "loss": 0.34, "step": 9564 }, { "epoch": 1.885646687697161, "grad_norm": 0.4919385394499981, "learning_rate": 1.0911697560343937e-05, "loss": 0.347, "step": 9565 }, { "epoch": 1.8858438485804416, "grad_norm": 0.48694966659079764, "learning_rate": 1.0910153978710654e-05, "loss": 0.3522, "step": 9566 }, { "epoch": 1.8860410094637223, "grad_norm": 0.488293991846791, "learning_rate": 1.0908610375210193e-05, "loss": 0.3287, "step": 9567 }, { "epoch": 1.8862381703470033, "grad_norm": 0.48621217336734124, "learning_rate": 1.0907066749879632e-05, "loss": 0.3401, "step": 9568 }, { "epoch": 1.886435331230284, "grad_norm": 0.47778447085942694, "learning_rate": 1.0905523102756061e-05, "loss": 0.3199, "step": 9569 }, { "epoch": 1.8866324921135647, "grad_norm": 0.47090656319170815, "learning_rate": 1.0903979433876573e-05, "loss": 0.3117, "step": 9570 }, { "epoch": 1.8868296529968456, "grad_norm": 0.4595803156099776, "learning_rate": 1.0902435743278248e-05, "loss": 0.3295, "step": 9571 }, { "epoch": 1.887026813880126, "grad_norm": 0.441046356319071, "learning_rate": 1.0900892030998181e-05, "loss": 0.2994, "step": 9572 }, { "epoch": 1.887223974763407, "grad_norm": 0.46255381742264307, "learning_rate": 1.089934829707345e-05, "loss": 0.3106, "step": 9573 }, { "epoch": 1.8874211356466877, "grad_norm": 0.48070659599098814, "learning_rate": 1.0897804541541159e-05, "loss": 0.3268, "step": 9574 }, { "epoch": 1.8876182965299684, "grad_norm": 0.47064959203140777, "learning_rate": 1.0896260764438387e-05, "loss": 0.3443, "step": 9575 }, { "epoch": 1.8878154574132493, "grad_norm": 0.48437387551779304, "learning_rate": 1.089471696580223e-05, "loss": 0.3274, "step": 9576 }, { "epoch": 1.88801261829653, "grad_norm": 0.4936834752237707, "learning_rate": 1.0893173145669777e-05, "loss": 0.354, "step": 9577 }, { "epoch": 1.8882097791798107, "grad_norm": 0.47759344988792984, "learning_rate": 1.089162930407812e-05, "loss": 0.3548, "step": 9578 }, { "epoch": 1.8884069400630916, "grad_norm": 0.4673787356484052, "learning_rate": 1.089008544106435e-05, "loss": 0.3184, "step": 9579 }, { "epoch": 1.888604100946372, "grad_norm": 5.043034026594802, "learning_rate": 1.0888541556665562e-05, "loss": 0.3314, "step": 9580 }, { "epoch": 1.888801261829653, "grad_norm": 0.49514263389138563, "learning_rate": 1.0886997650918848e-05, "loss": 0.3212, "step": 9581 }, { "epoch": 1.8889984227129337, "grad_norm": 0.45933092936924486, "learning_rate": 1.08854537238613e-05, "loss": 0.3102, "step": 9582 }, { "epoch": 1.8891955835962144, "grad_norm": 0.4590562261492261, "learning_rate": 1.0883909775530013e-05, "loss": 0.3114, "step": 9583 }, { "epoch": 1.8893927444794953, "grad_norm": 0.4834863213201024, "learning_rate": 1.0882365805962083e-05, "loss": 0.3437, "step": 9584 }, { "epoch": 1.889589905362776, "grad_norm": 0.4659483454174267, "learning_rate": 1.0880821815194602e-05, "loss": 0.316, "step": 9585 }, { "epoch": 1.8897870662460567, "grad_norm": 0.4768864743117075, "learning_rate": 1.087927780326467e-05, "loss": 0.3356, "step": 9586 }, { "epoch": 1.8899842271293377, "grad_norm": 0.459781264277085, "learning_rate": 1.087773377020938e-05, "loss": 0.3245, "step": 9587 }, { "epoch": 1.8901813880126181, "grad_norm": 0.4583408374138778, "learning_rate": 1.0876189716065825e-05, "loss": 0.323, "step": 9588 }, { "epoch": 1.890378548895899, "grad_norm": 0.4504538132773227, "learning_rate": 1.0874645640871114e-05, "loss": 0.3059, "step": 9589 }, { "epoch": 1.8905757097791798, "grad_norm": 0.47695654088042766, "learning_rate": 1.087310154466233e-05, "loss": 0.3227, "step": 9590 }, { "epoch": 1.8907728706624605, "grad_norm": 0.4482267647409884, "learning_rate": 1.0871557427476585e-05, "loss": 0.3133, "step": 9591 }, { "epoch": 1.8909700315457414, "grad_norm": 0.46554460153463734, "learning_rate": 1.0870013289350964e-05, "loss": 0.3353, "step": 9592 }, { "epoch": 1.891167192429022, "grad_norm": 0.4672677533066259, "learning_rate": 1.0868469130322581e-05, "loss": 0.314, "step": 9593 }, { "epoch": 1.8913643533123028, "grad_norm": 0.6158071510944071, "learning_rate": 1.086692495042852e-05, "loss": 0.3526, "step": 9594 }, { "epoch": 1.8915615141955837, "grad_norm": 0.43646887186476924, "learning_rate": 1.0865380749705892e-05, "loss": 0.3137, "step": 9595 }, { "epoch": 1.8917586750788642, "grad_norm": 0.5134052948284018, "learning_rate": 1.0863836528191795e-05, "loss": 0.3521, "step": 9596 }, { "epoch": 1.8919558359621451, "grad_norm": 0.4882031174942994, "learning_rate": 1.0862292285923331e-05, "loss": 0.3404, "step": 9597 }, { "epoch": 1.8921529968454258, "grad_norm": 0.44241922335608497, "learning_rate": 1.08607480229376e-05, "loss": 0.3076, "step": 9598 }, { "epoch": 1.8923501577287065, "grad_norm": 0.48941048003891413, "learning_rate": 1.0859203739271702e-05, "loss": 0.3379, "step": 9599 }, { "epoch": 1.8925473186119874, "grad_norm": 0.5021082128605233, "learning_rate": 1.0857659434962744e-05, "loss": 0.3502, "step": 9600 }, { "epoch": 1.8927444794952681, "grad_norm": 0.49462878367308655, "learning_rate": 1.0856115110047829e-05, "loss": 0.344, "step": 9601 }, { "epoch": 1.8929416403785488, "grad_norm": 0.5710152300950999, "learning_rate": 1.0854570764564057e-05, "loss": 0.3439, "step": 9602 }, { "epoch": 1.8931388012618298, "grad_norm": 0.4611516368668063, "learning_rate": 1.0853026398548535e-05, "loss": 0.3331, "step": 9603 }, { "epoch": 1.8933359621451105, "grad_norm": 0.45617109895197894, "learning_rate": 1.0851482012038366e-05, "loss": 0.299, "step": 9604 }, { "epoch": 1.8935331230283912, "grad_norm": 0.4710318513048329, "learning_rate": 1.0849937605070658e-05, "loss": 0.3069, "step": 9605 }, { "epoch": 1.893730283911672, "grad_norm": 0.5515208008448891, "learning_rate": 1.0848393177682513e-05, "loss": 0.3783, "step": 9606 }, { "epoch": 1.8939274447949526, "grad_norm": 0.48486227843023466, "learning_rate": 1.0846848729911037e-05, "loss": 0.337, "step": 9607 }, { "epoch": 1.8941246056782335, "grad_norm": 0.4820429170759435, "learning_rate": 1.084530426179334e-05, "loss": 0.3312, "step": 9608 }, { "epoch": 1.8943217665615142, "grad_norm": 0.5600154595642701, "learning_rate": 1.0843759773366526e-05, "loss": 0.3339, "step": 9609 }, { "epoch": 1.8945189274447949, "grad_norm": 0.48414804671739764, "learning_rate": 1.0842215264667708e-05, "loss": 0.3515, "step": 9610 }, { "epoch": 1.8947160883280758, "grad_norm": 0.46300778683793087, "learning_rate": 1.0840670735733984e-05, "loss": 0.3089, "step": 9611 }, { "epoch": 1.8949132492113565, "grad_norm": 0.4933738100516299, "learning_rate": 1.0839126186602475e-05, "loss": 0.3337, "step": 9612 }, { "epoch": 1.8951104100946372, "grad_norm": 0.4854372334064325, "learning_rate": 1.0837581617310279e-05, "loss": 0.3506, "step": 9613 }, { "epoch": 1.8953075709779181, "grad_norm": 0.458378537894229, "learning_rate": 1.0836037027894515e-05, "loss": 0.3205, "step": 9614 }, { "epoch": 1.8955047318611986, "grad_norm": 0.4564763917102619, "learning_rate": 1.0834492418392281e-05, "loss": 0.3117, "step": 9615 }, { "epoch": 1.8957018927444795, "grad_norm": 0.508560600760082, "learning_rate": 1.0832947788840699e-05, "loss": 0.3454, "step": 9616 }, { "epoch": 1.8958990536277602, "grad_norm": 0.4692816776007284, "learning_rate": 1.0831403139276875e-05, "loss": 0.3364, "step": 9617 }, { "epoch": 1.896096214511041, "grad_norm": 0.4606352560271506, "learning_rate": 1.0829858469737921e-05, "loss": 0.3357, "step": 9618 }, { "epoch": 1.8962933753943219, "grad_norm": 0.4547120875924028, "learning_rate": 1.082831378026095e-05, "loss": 0.3286, "step": 9619 }, { "epoch": 1.8964905362776026, "grad_norm": 0.5162498136227341, "learning_rate": 1.0826769070883073e-05, "loss": 0.3436, "step": 9620 }, { "epoch": 1.8966876971608833, "grad_norm": 0.4804924944515462, "learning_rate": 1.0825224341641403e-05, "loss": 0.3225, "step": 9621 }, { "epoch": 1.8968848580441642, "grad_norm": 0.4755043731674942, "learning_rate": 1.0823679592573052e-05, "loss": 0.3484, "step": 9622 }, { "epoch": 1.8970820189274447, "grad_norm": 0.47229749756388667, "learning_rate": 1.0822134823715139e-05, "loss": 0.3348, "step": 9623 }, { "epoch": 1.8972791798107256, "grad_norm": 0.443728175982202, "learning_rate": 1.0820590035104773e-05, "loss": 0.3156, "step": 9624 }, { "epoch": 1.8974763406940063, "grad_norm": 0.45201673400276804, "learning_rate": 1.0819045226779071e-05, "loss": 0.3132, "step": 9625 }, { "epoch": 1.897673501577287, "grad_norm": 0.46420972816236206, "learning_rate": 1.0817500398775147e-05, "loss": 0.3285, "step": 9626 }, { "epoch": 1.897870662460568, "grad_norm": 0.4878624846038825, "learning_rate": 1.0815955551130117e-05, "loss": 0.3621, "step": 9627 }, { "epoch": 1.8980678233438486, "grad_norm": 0.46796009590591403, "learning_rate": 1.0814410683881098e-05, "loss": 0.3341, "step": 9628 }, { "epoch": 1.8982649842271293, "grad_norm": 0.46658974852311247, "learning_rate": 1.0812865797065209e-05, "loss": 0.3476, "step": 9629 }, { "epoch": 1.8984621451104102, "grad_norm": 0.48330294725808115, "learning_rate": 1.0811320890719558e-05, "loss": 0.3436, "step": 9630 }, { "epoch": 1.8986593059936907, "grad_norm": 0.47789134303226777, "learning_rate": 1.0809775964881278e-05, "loss": 0.3399, "step": 9631 }, { "epoch": 1.8988564668769716, "grad_norm": 0.462828839034776, "learning_rate": 1.0808231019587472e-05, "loss": 0.3406, "step": 9632 }, { "epoch": 1.8990536277602523, "grad_norm": 0.4498197895663266, "learning_rate": 1.0806686054875268e-05, "loss": 0.3139, "step": 9633 }, { "epoch": 1.899250788643533, "grad_norm": 0.5032026325434863, "learning_rate": 1.080514107078178e-05, "loss": 0.333, "step": 9634 }, { "epoch": 1.899447949526814, "grad_norm": 0.4669772787243432, "learning_rate": 1.0803596067344134e-05, "loss": 0.319, "step": 9635 }, { "epoch": 1.8996451104100947, "grad_norm": 0.4506661711957322, "learning_rate": 1.0802051044599441e-05, "loss": 0.3223, "step": 9636 }, { "epoch": 1.8998422712933754, "grad_norm": 0.47399636404681555, "learning_rate": 1.0800506002584825e-05, "loss": 0.3244, "step": 9637 }, { "epoch": 1.9000394321766563, "grad_norm": 0.4508527663354465, "learning_rate": 1.0798960941337411e-05, "loss": 0.3369, "step": 9638 }, { "epoch": 1.9002365930599368, "grad_norm": 0.5017850647875128, "learning_rate": 1.0797415860894313e-05, "loss": 0.3421, "step": 9639 }, { "epoch": 1.9004337539432177, "grad_norm": 0.4883584730902664, "learning_rate": 1.0795870761292661e-05, "loss": 0.3313, "step": 9640 }, { "epoch": 1.9006309148264984, "grad_norm": 0.4761447190397066, "learning_rate": 1.079432564256957e-05, "loss": 0.3124, "step": 9641 }, { "epoch": 1.900828075709779, "grad_norm": 0.44783562310151726, "learning_rate": 1.0792780504762168e-05, "loss": 0.2974, "step": 9642 }, { "epoch": 1.90102523659306, "grad_norm": 0.47770652935997016, "learning_rate": 1.0791235347907573e-05, "loss": 0.3256, "step": 9643 }, { "epoch": 1.9012223974763407, "grad_norm": 0.5877815822517798, "learning_rate": 1.0789690172042912e-05, "loss": 0.2918, "step": 9644 }, { "epoch": 1.9014195583596214, "grad_norm": 0.4726070261592881, "learning_rate": 1.078814497720531e-05, "loss": 0.321, "step": 9645 }, { "epoch": 1.9016167192429023, "grad_norm": 0.4907105853731056, "learning_rate": 1.0786599763431891e-05, "loss": 0.3557, "step": 9646 }, { "epoch": 1.901813880126183, "grad_norm": 0.4597358486182623, "learning_rate": 1.078505453075978e-05, "loss": 0.3013, "step": 9647 }, { "epoch": 1.9020110410094637, "grad_norm": 0.5048614261322799, "learning_rate": 1.0783509279226099e-05, "loss": 0.3458, "step": 9648 }, { "epoch": 1.9022082018927446, "grad_norm": 0.4938247930425652, "learning_rate": 1.0781964008867979e-05, "loss": 0.3517, "step": 9649 }, { "epoch": 1.9024053627760251, "grad_norm": 0.4798368494115209, "learning_rate": 1.0780418719722544e-05, "loss": 0.3387, "step": 9650 }, { "epoch": 1.902602523659306, "grad_norm": 0.4594536938455119, "learning_rate": 1.0778873411826918e-05, "loss": 0.3178, "step": 9651 }, { "epoch": 1.9027996845425867, "grad_norm": 0.4600549598776842, "learning_rate": 1.0777328085218232e-05, "loss": 0.3114, "step": 9652 }, { "epoch": 1.9029968454258674, "grad_norm": 0.6555278594200854, "learning_rate": 1.0775782739933614e-05, "loss": 0.3139, "step": 9653 }, { "epoch": 1.9031940063091484, "grad_norm": 0.47716562507209187, "learning_rate": 1.077423737601019e-05, "loss": 0.3301, "step": 9654 }, { "epoch": 1.903391167192429, "grad_norm": 0.47582609005457543, "learning_rate": 1.0772691993485091e-05, "loss": 0.3196, "step": 9655 }, { "epoch": 1.9035883280757098, "grad_norm": 0.4654364416610627, "learning_rate": 1.0771146592395443e-05, "loss": 0.3211, "step": 9656 }, { "epoch": 1.9037854889589907, "grad_norm": 0.46468189188527265, "learning_rate": 1.0769601172778379e-05, "loss": 0.3355, "step": 9657 }, { "epoch": 1.9039826498422712, "grad_norm": 0.45410022068436584, "learning_rate": 1.0768055734671023e-05, "loss": 0.3109, "step": 9658 }, { "epoch": 1.904179810725552, "grad_norm": 0.48221225731467116, "learning_rate": 1.0766510278110514e-05, "loss": 0.3421, "step": 9659 }, { "epoch": 1.9043769716088328, "grad_norm": 0.47483955562907587, "learning_rate": 1.0764964803133975e-05, "loss": 0.324, "step": 9660 }, { "epoch": 1.9045741324921135, "grad_norm": 0.5016946982181942, "learning_rate": 1.0763419309778544e-05, "loss": 0.3586, "step": 9661 }, { "epoch": 1.9047712933753944, "grad_norm": 0.4816207167555914, "learning_rate": 1.0761873798081343e-05, "loss": 0.3425, "step": 9662 }, { "epoch": 1.9049684542586751, "grad_norm": 0.46646055121198043, "learning_rate": 1.0760328268079517e-05, "loss": 0.3387, "step": 9663 }, { "epoch": 1.9051656151419558, "grad_norm": 0.4558246279347243, "learning_rate": 1.075878271981019e-05, "loss": 0.3035, "step": 9664 }, { "epoch": 1.9053627760252367, "grad_norm": 0.49067026661890945, "learning_rate": 1.0757237153310496e-05, "loss": 0.3373, "step": 9665 }, { "epoch": 1.9055599369085172, "grad_norm": 0.4654982629990961, "learning_rate": 1.0755691568617573e-05, "loss": 0.3214, "step": 9666 }, { "epoch": 1.9057570977917981, "grad_norm": 0.48044344209508094, "learning_rate": 1.0754145965768548e-05, "loss": 0.3276, "step": 9667 }, { "epoch": 1.9059542586750788, "grad_norm": 0.4639232573098766, "learning_rate": 1.075260034480056e-05, "loss": 0.3367, "step": 9668 }, { "epoch": 1.9061514195583595, "grad_norm": 0.4610967414683846, "learning_rate": 1.0751054705750744e-05, "loss": 0.3195, "step": 9669 }, { "epoch": 1.9063485804416405, "grad_norm": 0.4456558119416812, "learning_rate": 1.0749509048656231e-05, "loss": 0.3163, "step": 9670 }, { "epoch": 1.9065457413249212, "grad_norm": 0.4679939791043013, "learning_rate": 1.074796337355416e-05, "loss": 0.3354, "step": 9671 }, { "epoch": 1.9067429022082019, "grad_norm": 0.4886824586751215, "learning_rate": 1.074641768048167e-05, "loss": 0.3325, "step": 9672 }, { "epoch": 1.9069400630914828, "grad_norm": 0.6283610239404007, "learning_rate": 1.074487196947589e-05, "loss": 0.3256, "step": 9673 }, { "epoch": 1.9071372239747633, "grad_norm": 0.47479516642440406, "learning_rate": 1.0743326240573964e-05, "loss": 0.3345, "step": 9674 }, { "epoch": 1.9073343848580442, "grad_norm": 0.48102011928532773, "learning_rate": 1.0741780493813025e-05, "loss": 0.3687, "step": 9675 }, { "epoch": 1.9075315457413249, "grad_norm": 0.44787874825005314, "learning_rate": 1.0740234729230213e-05, "loss": 0.3302, "step": 9676 }, { "epoch": 1.9077287066246056, "grad_norm": 0.4786425290897722, "learning_rate": 1.0738688946862661e-05, "loss": 0.3335, "step": 9677 }, { "epoch": 1.9079258675078865, "grad_norm": 0.47312252556976, "learning_rate": 1.073714314674752e-05, "loss": 0.3166, "step": 9678 }, { "epoch": 1.9081230283911672, "grad_norm": 0.4382653645073557, "learning_rate": 1.0735597328921914e-05, "loss": 0.3034, "step": 9679 }, { "epoch": 1.908320189274448, "grad_norm": 0.44778499820386447, "learning_rate": 1.0734051493422996e-05, "loss": 0.3082, "step": 9680 }, { "epoch": 1.9085173501577288, "grad_norm": 0.46994341815466767, "learning_rate": 1.0732505640287895e-05, "loss": 0.3271, "step": 9681 }, { "epoch": 1.9087145110410093, "grad_norm": 0.5035019762587363, "learning_rate": 1.0730959769553762e-05, "loss": 0.3197, "step": 9682 }, { "epoch": 1.9089116719242902, "grad_norm": 0.4658601369964348, "learning_rate": 1.0729413881257725e-05, "loss": 0.3325, "step": 9683 }, { "epoch": 1.909108832807571, "grad_norm": 0.48330429263892694, "learning_rate": 1.0727867975436936e-05, "loss": 0.322, "step": 9684 }, { "epoch": 1.9093059936908516, "grad_norm": 0.489211526633787, "learning_rate": 1.072632205212853e-05, "loss": 0.3395, "step": 9685 }, { "epoch": 1.9095031545741326, "grad_norm": 0.49250668046451535, "learning_rate": 1.0724776111369654e-05, "loss": 0.3657, "step": 9686 }, { "epoch": 1.9097003154574133, "grad_norm": 0.4740052263296321, "learning_rate": 1.072323015319745e-05, "loss": 0.3284, "step": 9687 }, { "epoch": 1.909897476340694, "grad_norm": 0.46842386445090717, "learning_rate": 1.0721684177649056e-05, "loss": 0.3339, "step": 9688 }, { "epoch": 1.9100946372239749, "grad_norm": 0.46625362508947077, "learning_rate": 1.0720138184761621e-05, "loss": 0.342, "step": 9689 }, { "epoch": 1.9102917981072554, "grad_norm": 0.4848740749436997, "learning_rate": 1.0718592174572285e-05, "loss": 0.3375, "step": 9690 }, { "epoch": 1.9104889589905363, "grad_norm": 0.48490516898151786, "learning_rate": 1.0717046147118193e-05, "loss": 0.3189, "step": 9691 }, { "epoch": 1.910686119873817, "grad_norm": 0.46286777382849786, "learning_rate": 1.071550010243649e-05, "loss": 0.3207, "step": 9692 }, { "epoch": 1.9108832807570977, "grad_norm": 0.4509100884863312, "learning_rate": 1.071395404056432e-05, "loss": 0.3283, "step": 9693 }, { "epoch": 1.9110804416403786, "grad_norm": 0.4959185650773511, "learning_rate": 1.071240796153883e-05, "loss": 0.3451, "step": 9694 }, { "epoch": 1.9112776025236593, "grad_norm": 0.4708104478931626, "learning_rate": 1.0710861865397166e-05, "loss": 0.348, "step": 9695 }, { "epoch": 1.91147476340694, "grad_norm": 0.48697087265632166, "learning_rate": 1.0709315752176472e-05, "loss": 0.3357, "step": 9696 }, { "epoch": 1.911671924290221, "grad_norm": 0.49229875756280933, "learning_rate": 1.0707769621913897e-05, "loss": 0.3522, "step": 9697 }, { "epoch": 1.9118690851735016, "grad_norm": 0.4787942182967307, "learning_rate": 1.0706223474646581e-05, "loss": 0.3469, "step": 9698 }, { "epoch": 1.9120662460567823, "grad_norm": 0.46221031697674925, "learning_rate": 1.0704677310411686e-05, "loss": 0.3215, "step": 9699 }, { "epoch": 1.9122634069400632, "grad_norm": 0.49657974351122547, "learning_rate": 1.0703131129246347e-05, "loss": 0.3449, "step": 9700 }, { "epoch": 1.9124605678233437, "grad_norm": 0.4700103591052916, "learning_rate": 1.070158493118772e-05, "loss": 0.3143, "step": 9701 }, { "epoch": 1.9126577287066246, "grad_norm": 0.5045654021745932, "learning_rate": 1.0700038716272944e-05, "loss": 0.3246, "step": 9702 }, { "epoch": 1.9128548895899053, "grad_norm": 0.4708499061368683, "learning_rate": 1.0698492484539178e-05, "loss": 0.3436, "step": 9703 }, { "epoch": 1.913052050473186, "grad_norm": 0.4808287774080232, "learning_rate": 1.0696946236023566e-05, "loss": 0.3474, "step": 9704 }, { "epoch": 1.913249211356467, "grad_norm": 0.4919946738804181, "learning_rate": 1.0695399970763258e-05, "loss": 0.331, "step": 9705 }, { "epoch": 1.9134463722397477, "grad_norm": 0.48323751424237105, "learning_rate": 1.069385368879541e-05, "loss": 0.3436, "step": 9706 }, { "epoch": 1.9136435331230284, "grad_norm": 0.4694330210445866, "learning_rate": 1.0692307390157164e-05, "loss": 0.3144, "step": 9707 }, { "epoch": 1.9138406940063093, "grad_norm": 0.4689921139270199, "learning_rate": 1.069076107488568e-05, "loss": 0.3196, "step": 9708 }, { "epoch": 1.9140378548895898, "grad_norm": 0.46415715567906024, "learning_rate": 1.0689214743018102e-05, "loss": 0.3288, "step": 9709 }, { "epoch": 1.9142350157728707, "grad_norm": 0.515132913417703, "learning_rate": 1.0687668394591586e-05, "loss": 0.3284, "step": 9710 }, { "epoch": 1.9144321766561514, "grad_norm": 0.4513787042971108, "learning_rate": 1.068612202964328e-05, "loss": 0.311, "step": 9711 }, { "epoch": 1.914629337539432, "grad_norm": 0.502933214732867, "learning_rate": 1.0684575648210343e-05, "loss": 0.3317, "step": 9712 }, { "epoch": 1.914826498422713, "grad_norm": 0.45248819432377035, "learning_rate": 1.0683029250329924e-05, "loss": 0.3317, "step": 9713 }, { "epoch": 1.9150236593059937, "grad_norm": 0.46741383909252043, "learning_rate": 1.0681482836039176e-05, "loss": 0.3067, "step": 9714 }, { "epoch": 1.9152208201892744, "grad_norm": 0.47201782201403564, "learning_rate": 1.0679936405375255e-05, "loss": 0.3541, "step": 9715 }, { "epoch": 1.9154179810725553, "grad_norm": 0.4945029413774007, "learning_rate": 1.0678389958375316e-05, "loss": 0.3423, "step": 9716 }, { "epoch": 1.9156151419558358, "grad_norm": 0.5418610271930345, "learning_rate": 1.067684349507651e-05, "loss": 0.3696, "step": 9717 }, { "epoch": 1.9158123028391167, "grad_norm": 0.5060226118055992, "learning_rate": 1.0675297015515993e-05, "loss": 0.3504, "step": 9718 }, { "epoch": 1.9160094637223974, "grad_norm": 0.44874325788032515, "learning_rate": 1.0673750519730923e-05, "loss": 0.2836, "step": 9719 }, { "epoch": 1.9162066246056781, "grad_norm": 0.4762929005302375, "learning_rate": 1.0672204007758453e-05, "loss": 0.3362, "step": 9720 }, { "epoch": 1.916403785488959, "grad_norm": 0.4643277801877159, "learning_rate": 1.0670657479635742e-05, "loss": 0.3236, "step": 9721 }, { "epoch": 1.9166009463722398, "grad_norm": 0.49866980334934874, "learning_rate": 1.0669110935399944e-05, "loss": 0.3474, "step": 9722 }, { "epoch": 1.9167981072555205, "grad_norm": 0.46636862046761957, "learning_rate": 1.0667564375088218e-05, "loss": 0.3371, "step": 9723 }, { "epoch": 1.9169952681388014, "grad_norm": 0.4624733590198138, "learning_rate": 1.066601779873772e-05, "loss": 0.3149, "step": 9724 }, { "epoch": 1.9171924290220819, "grad_norm": 0.4477193303841462, "learning_rate": 1.0664471206385607e-05, "loss": 0.2978, "step": 9725 }, { "epoch": 1.9173895899053628, "grad_norm": 0.4436196530416114, "learning_rate": 1.0662924598069035e-05, "loss": 0.302, "step": 9726 }, { "epoch": 1.9175867507886435, "grad_norm": 0.4980769406665461, "learning_rate": 1.0661377973825173e-05, "loss": 0.355, "step": 9727 }, { "epoch": 1.9177839116719242, "grad_norm": 0.4692953470690748, "learning_rate": 1.0659831333691166e-05, "loss": 0.3222, "step": 9728 }, { "epoch": 1.9179810725552051, "grad_norm": 0.4855708808826588, "learning_rate": 1.0658284677704187e-05, "loss": 0.3387, "step": 9729 }, { "epoch": 1.9181782334384858, "grad_norm": 0.4926783442889073, "learning_rate": 1.0656738005901382e-05, "loss": 0.3263, "step": 9730 }, { "epoch": 1.9183753943217665, "grad_norm": 0.49008026881045197, "learning_rate": 1.0655191318319921e-05, "loss": 0.3788, "step": 9731 }, { "epoch": 1.9185725552050474, "grad_norm": 0.47996998660573686, "learning_rate": 1.0653644614996958e-05, "loss": 0.3424, "step": 9732 }, { "epoch": 1.918769716088328, "grad_norm": 0.6127908630549663, "learning_rate": 1.0652097895969657e-05, "loss": 0.3558, "step": 9733 }, { "epoch": 1.9189668769716088, "grad_norm": 0.4627932710115664, "learning_rate": 1.0650551161275182e-05, "loss": 0.3155, "step": 9734 }, { "epoch": 1.9191640378548895, "grad_norm": 0.45413739024853905, "learning_rate": 1.064900441095069e-05, "loss": 0.3031, "step": 9735 }, { "epoch": 1.9193611987381702, "grad_norm": 0.4495957982119002, "learning_rate": 1.0647457645033343e-05, "loss": 0.3198, "step": 9736 }, { "epoch": 1.9195583596214512, "grad_norm": 0.44672878440578423, "learning_rate": 1.0645910863560306e-05, "loss": 0.3065, "step": 9737 }, { "epoch": 1.9197555205047319, "grad_norm": 0.4540120569214132, "learning_rate": 1.0644364066568742e-05, "loss": 0.316, "step": 9738 }, { "epoch": 1.9199526813880126, "grad_norm": 1.8452643662313766, "learning_rate": 1.0642817254095809e-05, "loss": 0.3175, "step": 9739 }, { "epoch": 1.9201498422712935, "grad_norm": 0.5187412295956796, "learning_rate": 1.0641270426178677e-05, "loss": 0.3445, "step": 9740 }, { "epoch": 1.9203470031545742, "grad_norm": 0.4665160701342525, "learning_rate": 1.0639723582854505e-05, "loss": 0.3309, "step": 9741 }, { "epoch": 1.9205441640378549, "grad_norm": 0.4551439613909609, "learning_rate": 1.0638176724160458e-05, "loss": 0.3291, "step": 9742 }, { "epoch": 1.9207413249211358, "grad_norm": 0.5160208376705452, "learning_rate": 1.0636629850133705e-05, "loss": 0.3391, "step": 9743 }, { "epoch": 1.9209384858044163, "grad_norm": 0.49989041787707916, "learning_rate": 1.0635082960811403e-05, "loss": 0.3444, "step": 9744 }, { "epoch": 1.9211356466876972, "grad_norm": 0.4964849127683154, "learning_rate": 1.063353605623072e-05, "loss": 0.3492, "step": 9745 }, { "epoch": 1.921332807570978, "grad_norm": 0.6049573542199426, "learning_rate": 1.0631989136428828e-05, "loss": 0.3631, "step": 9746 }, { "epoch": 1.9215299684542586, "grad_norm": 0.4756893237472388, "learning_rate": 1.0630442201442884e-05, "loss": 0.3283, "step": 9747 }, { "epoch": 1.9217271293375395, "grad_norm": 0.6792395870038784, "learning_rate": 1.0628895251310063e-05, "loss": 0.3448, "step": 9748 }, { "epoch": 1.9219242902208202, "grad_norm": 0.4749099206929661, "learning_rate": 1.0627348286067521e-05, "loss": 0.3263, "step": 9749 }, { "epoch": 1.922121451104101, "grad_norm": 0.46009586758698623, "learning_rate": 1.0625801305752436e-05, "loss": 0.3141, "step": 9750 }, { "epoch": 1.9223186119873819, "grad_norm": 0.46687228371318157, "learning_rate": 1.062425431040197e-05, "loss": 0.3375, "step": 9751 }, { "epoch": 1.9225157728706623, "grad_norm": 0.5157651333274518, "learning_rate": 1.062270730005329e-05, "loss": 0.3194, "step": 9752 }, { "epoch": 1.9227129337539433, "grad_norm": 0.5008517739542978, "learning_rate": 1.0621160274743564e-05, "loss": 0.3596, "step": 9753 }, { "epoch": 1.922910094637224, "grad_norm": 0.44671709049249914, "learning_rate": 1.0619613234509967e-05, "loss": 0.3119, "step": 9754 }, { "epoch": 1.9231072555205047, "grad_norm": 0.4313404095262679, "learning_rate": 1.0618066179389663e-05, "loss": 0.2903, "step": 9755 }, { "epoch": 1.9233044164037856, "grad_norm": 0.44712186986789765, "learning_rate": 1.0616519109419815e-05, "loss": 0.31, "step": 9756 }, { "epoch": 1.9235015772870663, "grad_norm": 0.48107891788084883, "learning_rate": 1.0614972024637606e-05, "loss": 0.3371, "step": 9757 }, { "epoch": 1.923698738170347, "grad_norm": 0.4302137346468184, "learning_rate": 1.0613424925080194e-05, "loss": 0.2886, "step": 9758 }, { "epoch": 1.923895899053628, "grad_norm": 0.4463000909131128, "learning_rate": 1.0611877810784756e-05, "loss": 0.3051, "step": 9759 }, { "epoch": 1.9240930599369084, "grad_norm": 0.46877366951404054, "learning_rate": 1.061033068178846e-05, "loss": 0.3319, "step": 9760 }, { "epoch": 1.9242902208201893, "grad_norm": 0.45058760214666865, "learning_rate": 1.0608783538128479e-05, "loss": 0.3294, "step": 9761 }, { "epoch": 1.92448738170347, "grad_norm": 0.4673506711450003, "learning_rate": 1.0607236379841984e-05, "loss": 0.3131, "step": 9762 }, { "epoch": 1.9246845425867507, "grad_norm": 0.4448969256754839, "learning_rate": 1.0605689206966145e-05, "loss": 0.3046, "step": 9763 }, { "epoch": 1.9248817034700316, "grad_norm": 0.44398749817741634, "learning_rate": 1.0604142019538135e-05, "loss": 0.3036, "step": 9764 }, { "epoch": 1.9250788643533123, "grad_norm": 0.4662281069913097, "learning_rate": 1.0602594817595126e-05, "loss": 0.3327, "step": 9765 }, { "epoch": 1.925276025236593, "grad_norm": 0.6123896643925617, "learning_rate": 1.060104760117429e-05, "loss": 0.2994, "step": 9766 }, { "epoch": 1.925473186119874, "grad_norm": 0.4800059909800257, "learning_rate": 1.0599500370312805e-05, "loss": 0.332, "step": 9767 }, { "epoch": 1.9256703470031544, "grad_norm": 0.47073997673558676, "learning_rate": 1.0597953125047839e-05, "loss": 0.3173, "step": 9768 }, { "epoch": 1.9258675078864353, "grad_norm": 0.486613027526376, "learning_rate": 1.0596405865416569e-05, "loss": 0.3397, "step": 9769 }, { "epoch": 1.926064668769716, "grad_norm": 0.4412020035394467, "learning_rate": 1.0594858591456166e-05, "loss": 0.3088, "step": 9770 }, { "epoch": 1.9262618296529967, "grad_norm": 0.4645088549635914, "learning_rate": 1.0593311303203806e-05, "loss": 0.3399, "step": 9771 }, { "epoch": 1.9264589905362777, "grad_norm": 0.5045226039115558, "learning_rate": 1.0591764000696665e-05, "loss": 0.3695, "step": 9772 }, { "epoch": 1.9266561514195584, "grad_norm": 0.4783611486444141, "learning_rate": 1.0590216683971915e-05, "loss": 0.334, "step": 9773 }, { "epoch": 1.926853312302839, "grad_norm": 0.47330458094671435, "learning_rate": 1.0588669353066739e-05, "loss": 0.3499, "step": 9774 }, { "epoch": 1.92705047318612, "grad_norm": 0.4625886872385459, "learning_rate": 1.0587122008018303e-05, "loss": 0.3267, "step": 9775 }, { "epoch": 1.9272476340694005, "grad_norm": 0.4856730238683575, "learning_rate": 1.058557464886379e-05, "loss": 0.3312, "step": 9776 }, { "epoch": 1.9274447949526814, "grad_norm": 0.46470351748252176, "learning_rate": 1.0584027275640372e-05, "loss": 0.3281, "step": 9777 }, { "epoch": 1.927641955835962, "grad_norm": 0.4805088560192612, "learning_rate": 1.0582479888385233e-05, "loss": 0.3373, "step": 9778 }, { "epoch": 1.9278391167192428, "grad_norm": 0.4876012478543451, "learning_rate": 1.0580932487135541e-05, "loss": 0.3425, "step": 9779 }, { "epoch": 1.9280362776025237, "grad_norm": 0.5473660280771162, "learning_rate": 1.057938507192848e-05, "loss": 0.351, "step": 9780 }, { "epoch": 1.9282334384858044, "grad_norm": 0.4684135144874953, "learning_rate": 1.0577837642801227e-05, "loss": 0.338, "step": 9781 }, { "epoch": 1.9284305993690851, "grad_norm": 0.48445875938977345, "learning_rate": 1.0576290199790959e-05, "loss": 0.3315, "step": 9782 }, { "epoch": 1.928627760252366, "grad_norm": 0.5000034658687103, "learning_rate": 1.0574742742934853e-05, "loss": 0.3623, "step": 9783 }, { "epoch": 1.9288249211356467, "grad_norm": 0.4536757092944629, "learning_rate": 1.0573195272270091e-05, "loss": 0.3083, "step": 9784 }, { "epoch": 1.9290220820189274, "grad_norm": 0.47051487862294444, "learning_rate": 1.0571647787833853e-05, "loss": 0.3294, "step": 9785 }, { "epoch": 1.9292192429022084, "grad_norm": 0.48487850881869465, "learning_rate": 1.0570100289663314e-05, "loss": 0.3444, "step": 9786 }, { "epoch": 1.9294164037854888, "grad_norm": 0.4587817967805745, "learning_rate": 1.0568552777795657e-05, "loss": 0.3369, "step": 9787 }, { "epoch": 1.9296135646687698, "grad_norm": 0.4489966707521766, "learning_rate": 1.0567005252268063e-05, "loss": 0.3023, "step": 9788 }, { "epoch": 1.9298107255520505, "grad_norm": 0.5396452051256785, "learning_rate": 1.056545771311771e-05, "loss": 0.3587, "step": 9789 }, { "epoch": 1.9300078864353312, "grad_norm": 0.45895425892398584, "learning_rate": 1.056391016038178e-05, "loss": 0.3135, "step": 9790 }, { "epoch": 1.930205047318612, "grad_norm": 0.45902487475066067, "learning_rate": 1.0562362594097456e-05, "loss": 0.3256, "step": 9791 }, { "epoch": 1.9304022082018928, "grad_norm": 0.48459698795680317, "learning_rate": 1.0560815014301916e-05, "loss": 0.3315, "step": 9792 }, { "epoch": 1.9305993690851735, "grad_norm": 0.48790563977946905, "learning_rate": 1.0559267421032345e-05, "loss": 0.3464, "step": 9793 }, { "epoch": 1.9307965299684544, "grad_norm": 0.45850742759324065, "learning_rate": 1.055771981432592e-05, "loss": 0.3014, "step": 9794 }, { "epoch": 1.930993690851735, "grad_norm": 0.5039083825372765, "learning_rate": 1.0556172194219831e-05, "loss": 0.3541, "step": 9795 }, { "epoch": 1.9311908517350158, "grad_norm": 0.48911414228696054, "learning_rate": 1.0554624560751254e-05, "loss": 0.3467, "step": 9796 }, { "epoch": 1.9313880126182965, "grad_norm": 0.4929694442801919, "learning_rate": 1.0553076913957381e-05, "loss": 0.3387, "step": 9797 }, { "epoch": 1.9315851735015772, "grad_norm": 0.47700645459297586, "learning_rate": 1.0551529253875383e-05, "loss": 0.3645, "step": 9798 }, { "epoch": 1.9317823343848581, "grad_norm": 0.45061951666064487, "learning_rate": 1.0549981580542457e-05, "loss": 0.2962, "step": 9799 }, { "epoch": 1.9319794952681388, "grad_norm": 0.47286203795175763, "learning_rate": 1.0548433893995775e-05, "loss": 0.3213, "step": 9800 }, { "epoch": 1.9321766561514195, "grad_norm": 0.486492356786951, "learning_rate": 1.054688619427253e-05, "loss": 0.3414, "step": 9801 }, { "epoch": 1.9323738170347005, "grad_norm": 0.48260028863528787, "learning_rate": 1.0545338481409903e-05, "loss": 0.3489, "step": 9802 }, { "epoch": 1.932570977917981, "grad_norm": 0.6276466016632191, "learning_rate": 1.054379075544508e-05, "loss": 0.3597, "step": 9803 }, { "epoch": 1.9327681388012619, "grad_norm": 0.44166593615676375, "learning_rate": 1.0542243016415248e-05, "loss": 0.3116, "step": 9804 }, { "epoch": 1.9329652996845426, "grad_norm": 6.057606271885772, "learning_rate": 1.0540695264357587e-05, "loss": 0.3798, "step": 9805 }, { "epoch": 1.9331624605678233, "grad_norm": 0.4682081351995988, "learning_rate": 1.053914749930929e-05, "loss": 0.3357, "step": 9806 }, { "epoch": 1.9333596214511042, "grad_norm": 0.45711029196728215, "learning_rate": 1.0537599721307538e-05, "loss": 0.3182, "step": 9807 }, { "epoch": 1.9335567823343849, "grad_norm": 0.477630148881728, "learning_rate": 1.0536051930389522e-05, "loss": 0.3151, "step": 9808 }, { "epoch": 1.9337539432176656, "grad_norm": 0.45025864365539675, "learning_rate": 1.0534504126592426e-05, "loss": 0.321, "step": 9809 }, { "epoch": 1.9339511041009465, "grad_norm": 0.45003521165597976, "learning_rate": 1.0532956309953437e-05, "loss": 0.3104, "step": 9810 }, { "epoch": 1.934148264984227, "grad_norm": 0.5519090372934141, "learning_rate": 1.0531408480509744e-05, "loss": 0.3511, "step": 9811 }, { "epoch": 1.934345425867508, "grad_norm": 0.45552387144099366, "learning_rate": 1.0529860638298535e-05, "loss": 0.3169, "step": 9812 }, { "epoch": 1.9345425867507886, "grad_norm": 0.48604649269003025, "learning_rate": 1.0528312783356998e-05, "loss": 0.3348, "step": 9813 }, { "epoch": 1.9347397476340693, "grad_norm": 0.48778714572885906, "learning_rate": 1.0526764915722319e-05, "loss": 0.3564, "step": 9814 }, { "epoch": 1.9349369085173502, "grad_norm": 0.46707720374812933, "learning_rate": 1.0525217035431687e-05, "loss": 0.3261, "step": 9815 }, { "epoch": 1.935134069400631, "grad_norm": 0.5104817179313064, "learning_rate": 1.0523669142522296e-05, "loss": 0.3757, "step": 9816 }, { "epoch": 1.9353312302839116, "grad_norm": 0.4760419839422579, "learning_rate": 1.0522121237031331e-05, "loss": 0.3202, "step": 9817 }, { "epoch": 1.9355283911671926, "grad_norm": 0.44494383811455845, "learning_rate": 1.0520573318995986e-05, "loss": 0.32, "step": 9818 }, { "epoch": 1.935725552050473, "grad_norm": 0.46156901553051594, "learning_rate": 1.051902538845344e-05, "loss": 0.3212, "step": 9819 }, { "epoch": 1.935922712933754, "grad_norm": 0.4658238149819304, "learning_rate": 1.0517477445440898e-05, "loss": 0.321, "step": 9820 }, { "epoch": 1.9361198738170347, "grad_norm": 0.44452316579492956, "learning_rate": 1.0515929489995544e-05, "loss": 0.3043, "step": 9821 }, { "epoch": 1.9363170347003154, "grad_norm": 0.45081267801444497, "learning_rate": 1.0514381522154563e-05, "loss": 0.3149, "step": 9822 }, { "epoch": 1.9365141955835963, "grad_norm": 0.5245331794257556, "learning_rate": 1.0512833541955158e-05, "loss": 0.3809, "step": 9823 }, { "epoch": 1.936711356466877, "grad_norm": 0.4930837487343131, "learning_rate": 1.0511285549434509e-05, "loss": 0.3181, "step": 9824 }, { "epoch": 1.9369085173501577, "grad_norm": 0.4545151283658732, "learning_rate": 1.0509737544629817e-05, "loss": 0.3117, "step": 9825 }, { "epoch": 1.9371056782334386, "grad_norm": 0.47452766613882447, "learning_rate": 1.0508189527578268e-05, "loss": 0.3187, "step": 9826 }, { "epoch": 1.937302839116719, "grad_norm": 0.44914864022298373, "learning_rate": 1.0506641498317056e-05, "loss": 0.3108, "step": 9827 }, { "epoch": 1.9375, "grad_norm": 6.606550108751509, "learning_rate": 1.0505093456883373e-05, "loss": 0.3925, "step": 9828 }, { "epoch": 1.937697160883281, "grad_norm": 0.48300218271218115, "learning_rate": 1.0503545403314414e-05, "loss": 0.3362, "step": 9829 }, { "epoch": 1.9378943217665614, "grad_norm": 0.4709845468244718, "learning_rate": 1.0501997337647372e-05, "loss": 0.3292, "step": 9830 }, { "epoch": 1.9380914826498423, "grad_norm": 0.4845184212039963, "learning_rate": 1.050044925991944e-05, "loss": 0.3554, "step": 9831 }, { "epoch": 1.938288643533123, "grad_norm": 0.4879693795170068, "learning_rate": 1.049890117016781e-05, "loss": 0.3615, "step": 9832 }, { "epoch": 1.9384858044164037, "grad_norm": 0.46307402255271723, "learning_rate": 1.0497353068429678e-05, "loss": 0.3401, "step": 9833 }, { "epoch": 1.9386829652996846, "grad_norm": 0.5531223341102023, "learning_rate": 1.049580495474224e-05, "loss": 0.3652, "step": 9834 }, { "epoch": 1.9388801261829653, "grad_norm": 0.4687758624727122, "learning_rate": 1.0494256829142687e-05, "loss": 0.3233, "step": 9835 }, { "epoch": 1.939077287066246, "grad_norm": 0.4923244502736517, "learning_rate": 1.0492708691668216e-05, "loss": 0.3246, "step": 9836 }, { "epoch": 1.939274447949527, "grad_norm": 0.44535709840076704, "learning_rate": 1.049116054235602e-05, "loss": 0.3239, "step": 9837 }, { "epoch": 1.9394716088328074, "grad_norm": 0.473698052381602, "learning_rate": 1.0489612381243299e-05, "loss": 0.3464, "step": 9838 }, { "epoch": 1.9396687697160884, "grad_norm": 0.4778541195055896, "learning_rate": 1.0488064208367246e-05, "loss": 0.3427, "step": 9839 }, { "epoch": 1.939865930599369, "grad_norm": 0.46660859837931956, "learning_rate": 1.0486516023765057e-05, "loss": 0.3334, "step": 9840 }, { "epoch": 1.9400630914826498, "grad_norm": 0.4609030291063781, "learning_rate": 1.0484967827473927e-05, "loss": 0.3233, "step": 9841 }, { "epoch": 1.9402602523659307, "grad_norm": 0.46978565070370426, "learning_rate": 1.0483419619531057e-05, "loss": 0.3397, "step": 9842 }, { "epoch": 1.9404574132492114, "grad_norm": 0.446991858909842, "learning_rate": 1.0481871399973638e-05, "loss": 0.3048, "step": 9843 }, { "epoch": 1.940654574132492, "grad_norm": 0.4538700492840836, "learning_rate": 1.0480323168838876e-05, "loss": 0.3104, "step": 9844 }, { "epoch": 1.940851735015773, "grad_norm": 0.4799559035841734, "learning_rate": 1.0478774926163957e-05, "loss": 0.3292, "step": 9845 }, { "epoch": 1.9410488958990535, "grad_norm": 0.49610549652225705, "learning_rate": 1.0477226671986089e-05, "loss": 0.3407, "step": 9846 }, { "epoch": 1.9412460567823344, "grad_norm": 0.4691565602936203, "learning_rate": 1.0475678406342462e-05, "loss": 0.3249, "step": 9847 }, { "epoch": 1.9414432176656151, "grad_norm": 0.4548964647773779, "learning_rate": 1.0474130129270281e-05, "loss": 0.3158, "step": 9848 }, { "epoch": 1.9416403785488958, "grad_norm": 0.46442569328638555, "learning_rate": 1.0472581840806742e-05, "loss": 0.3275, "step": 9849 }, { "epoch": 1.9418375394321767, "grad_norm": 0.47409430097779603, "learning_rate": 1.0471033540989044e-05, "loss": 0.3487, "step": 9850 }, { "epoch": 1.9420347003154574, "grad_norm": 0.4714341339076567, "learning_rate": 1.0469485229854383e-05, "loss": 0.3247, "step": 9851 }, { "epoch": 1.9422318611987381, "grad_norm": 0.4471157743782902, "learning_rate": 1.0467936907439966e-05, "loss": 0.3133, "step": 9852 }, { "epoch": 1.942429022082019, "grad_norm": 0.5320875040440245, "learning_rate": 1.0466388573782984e-05, "loss": 0.3648, "step": 9853 }, { "epoch": 1.9426261829652995, "grad_norm": 0.4714349073262386, "learning_rate": 1.0464840228920643e-05, "loss": 0.3159, "step": 9854 }, { "epoch": 1.9428233438485805, "grad_norm": 0.4907557101702487, "learning_rate": 1.046329187289014e-05, "loss": 0.3466, "step": 9855 }, { "epoch": 1.9430205047318612, "grad_norm": 0.4838478550153892, "learning_rate": 1.046174350572868e-05, "loss": 0.3141, "step": 9856 }, { "epoch": 1.9432176656151419, "grad_norm": 0.49718799827485166, "learning_rate": 1.0460195127473456e-05, "loss": 0.3363, "step": 9857 }, { "epoch": 1.9434148264984228, "grad_norm": 0.4538672203134272, "learning_rate": 1.0458646738161676e-05, "loss": 0.318, "step": 9858 }, { "epoch": 1.9436119873817035, "grad_norm": 0.4718009026198612, "learning_rate": 1.0457098337830536e-05, "loss": 0.3121, "step": 9859 }, { "epoch": 1.9438091482649842, "grad_norm": 0.4863092292989189, "learning_rate": 1.0455549926517243e-05, "loss": 0.3575, "step": 9860 }, { "epoch": 1.944006309148265, "grad_norm": 0.45207391928372037, "learning_rate": 1.0454001504258994e-05, "loss": 0.3208, "step": 9861 }, { "epoch": 1.9442034700315456, "grad_norm": 0.49540411046222155, "learning_rate": 1.0452453071092993e-05, "loss": 0.3148, "step": 9862 }, { "epoch": 1.9444006309148265, "grad_norm": 0.4902146796712335, "learning_rate": 1.0450904627056446e-05, "loss": 0.354, "step": 9863 }, { "epoch": 1.9445977917981072, "grad_norm": 0.4302996025002257, "learning_rate": 1.0449356172186548e-05, "loss": 0.2963, "step": 9864 }, { "epoch": 1.944794952681388, "grad_norm": 0.47578474259947834, "learning_rate": 1.0447807706520513e-05, "loss": 0.3117, "step": 9865 }, { "epoch": 1.9449921135646688, "grad_norm": 0.4504023483184343, "learning_rate": 1.0446259230095531e-05, "loss": 0.3401, "step": 9866 }, { "epoch": 1.9451892744479495, "grad_norm": 0.4757062498349321, "learning_rate": 1.0444710742948814e-05, "loss": 0.3366, "step": 9867 }, { "epoch": 1.9453864353312302, "grad_norm": 0.4819233484036111, "learning_rate": 1.0443162245117562e-05, "loss": 0.3342, "step": 9868 }, { "epoch": 1.9455835962145112, "grad_norm": 0.5213874575567724, "learning_rate": 1.044161373663898e-05, "loss": 0.3172, "step": 9869 }, { "epoch": 1.9457807570977916, "grad_norm": 0.44488837497482253, "learning_rate": 1.0440065217550273e-05, "loss": 0.3125, "step": 9870 }, { "epoch": 1.9459779179810726, "grad_norm": 0.460970808844026, "learning_rate": 1.0438516687888645e-05, "loss": 0.3258, "step": 9871 }, { "epoch": 1.9461750788643533, "grad_norm": 0.4810006442638309, "learning_rate": 1.04369681476913e-05, "loss": 0.3219, "step": 9872 }, { "epoch": 1.946372239747634, "grad_norm": 0.4838450735864227, "learning_rate": 1.0435419596995444e-05, "loss": 0.3655, "step": 9873 }, { "epoch": 1.9465694006309149, "grad_norm": 0.47505078585700605, "learning_rate": 1.0433871035838283e-05, "loss": 0.3257, "step": 9874 }, { "epoch": 1.9467665615141956, "grad_norm": 0.47990929946296423, "learning_rate": 1.0432322464257019e-05, "loss": 0.3165, "step": 9875 }, { "epoch": 1.9469637223974763, "grad_norm": 0.4824430346339747, "learning_rate": 1.0430773882288859e-05, "loss": 0.3493, "step": 9876 }, { "epoch": 1.9471608832807572, "grad_norm": 0.47127696634444705, "learning_rate": 1.042922528997101e-05, "loss": 0.3265, "step": 9877 }, { "epoch": 1.947358044164038, "grad_norm": 0.45182006106255146, "learning_rate": 1.0427676687340678e-05, "loss": 0.3219, "step": 9878 }, { "epoch": 1.9475552050473186, "grad_norm": 0.46337044334398714, "learning_rate": 1.0426128074435068e-05, "loss": 0.3319, "step": 9879 }, { "epoch": 1.9477523659305995, "grad_norm": 0.4555312238661632, "learning_rate": 1.0424579451291393e-05, "loss": 0.3119, "step": 9880 }, { "epoch": 1.94794952681388, "grad_norm": 0.5027627443388722, "learning_rate": 1.042303081794685e-05, "loss": 0.3372, "step": 9881 }, { "epoch": 1.948146687697161, "grad_norm": 0.46830494008681434, "learning_rate": 1.042148217443865e-05, "loss": 0.3491, "step": 9882 }, { "epoch": 1.9483438485804416, "grad_norm": 0.4738873885085205, "learning_rate": 1.0419933520804002e-05, "loss": 0.3493, "step": 9883 }, { "epoch": 1.9485410094637223, "grad_norm": 0.4965510696033795, "learning_rate": 1.0418384857080118e-05, "loss": 0.336, "step": 9884 }, { "epoch": 1.9487381703470033, "grad_norm": 0.4779020902575847, "learning_rate": 1.0416836183304198e-05, "loss": 0.3348, "step": 9885 }, { "epoch": 1.948935331230284, "grad_norm": 0.4533557633530483, "learning_rate": 1.0415287499513452e-05, "loss": 0.3072, "step": 9886 }, { "epoch": 1.9491324921135647, "grad_norm": 0.46478354483577006, "learning_rate": 1.0413738805745089e-05, "loss": 0.3269, "step": 9887 }, { "epoch": 1.9493296529968456, "grad_norm": 0.4553262713575033, "learning_rate": 1.0412190102036317e-05, "loss": 0.329, "step": 9888 }, { "epoch": 1.949526813880126, "grad_norm": 0.5236120156557359, "learning_rate": 1.041064138842435e-05, "loss": 0.3589, "step": 9889 }, { "epoch": 1.949723974763407, "grad_norm": 0.46404257823966893, "learning_rate": 1.0409092664946388e-05, "loss": 0.3165, "step": 9890 }, { "epoch": 1.9499211356466877, "grad_norm": 0.46119544702732324, "learning_rate": 1.040754393163965e-05, "loss": 0.3198, "step": 9891 }, { "epoch": 1.9501182965299684, "grad_norm": 0.48159542350926215, "learning_rate": 1.0405995188541336e-05, "loss": 0.3373, "step": 9892 }, { "epoch": 1.9503154574132493, "grad_norm": 0.4397391980222927, "learning_rate": 1.0404446435688665e-05, "loss": 0.3109, "step": 9893 }, { "epoch": 1.95051261829653, "grad_norm": 0.4668066514467705, "learning_rate": 1.040289767311884e-05, "loss": 0.345, "step": 9894 }, { "epoch": 1.9507097791798107, "grad_norm": 0.4880251587331396, "learning_rate": 1.0401348900869073e-05, "loss": 0.3402, "step": 9895 }, { "epoch": 1.9509069400630916, "grad_norm": 0.4909340031989673, "learning_rate": 1.0399800118976577e-05, "loss": 0.3561, "step": 9896 }, { "epoch": 1.951104100946372, "grad_norm": 0.46507313249997145, "learning_rate": 1.0398251327478561e-05, "loss": 0.3226, "step": 9897 }, { "epoch": 1.951301261829653, "grad_norm": 0.45585618774916764, "learning_rate": 1.0396702526412237e-05, "loss": 0.2961, "step": 9898 }, { "epoch": 1.9514984227129337, "grad_norm": 0.5000463843480156, "learning_rate": 1.0395153715814816e-05, "loss": 0.3413, "step": 9899 }, { "epoch": 1.9516955835962144, "grad_norm": 0.45225941628046795, "learning_rate": 1.0393604895723509e-05, "loss": 0.3243, "step": 9900 }, { "epoch": 1.9518927444794953, "grad_norm": 0.4853464256835092, "learning_rate": 1.0392056066175524e-05, "loss": 0.3294, "step": 9901 }, { "epoch": 1.952089905362776, "grad_norm": 0.48505362928524964, "learning_rate": 1.039050722720808e-05, "loss": 0.3592, "step": 9902 }, { "epoch": 1.9522870662460567, "grad_norm": 0.48062154733831464, "learning_rate": 1.0388958378858383e-05, "loss": 0.3211, "step": 9903 }, { "epoch": 1.9524842271293377, "grad_norm": 0.5171535755363212, "learning_rate": 1.038740952116365e-05, "loss": 0.3546, "step": 9904 }, { "epoch": 1.9526813880126181, "grad_norm": 0.46818506771167984, "learning_rate": 1.0385860654161088e-05, "loss": 0.3284, "step": 9905 }, { "epoch": 1.952878548895899, "grad_norm": 0.4915580603272551, "learning_rate": 1.0384311777887916e-05, "loss": 0.3658, "step": 9906 }, { "epoch": 1.9530757097791798, "grad_norm": 0.46935304552685453, "learning_rate": 1.0382762892381342e-05, "loss": 0.3332, "step": 9907 }, { "epoch": 1.9532728706624605, "grad_norm": 0.45363492810935857, "learning_rate": 1.0381213997678582e-05, "loss": 0.3388, "step": 9908 }, { "epoch": 1.9534700315457414, "grad_norm": 0.44641220154213385, "learning_rate": 1.0379665093816848e-05, "loss": 0.3035, "step": 9909 }, { "epoch": 1.953667192429022, "grad_norm": 0.45206940818234764, "learning_rate": 1.0378116180833357e-05, "loss": 0.3222, "step": 9910 }, { "epoch": 1.9538643533123028, "grad_norm": 0.4575547617482853, "learning_rate": 1.0376567258765316e-05, "loss": 0.3193, "step": 9911 }, { "epoch": 1.9540615141955837, "grad_norm": 0.4590418732960005, "learning_rate": 1.0375018327649948e-05, "loss": 0.3023, "step": 9912 }, { "epoch": 1.9542586750788642, "grad_norm": 0.4543796513395991, "learning_rate": 1.037346938752446e-05, "loss": 0.2872, "step": 9913 }, { "epoch": 1.9544558359621451, "grad_norm": 0.4805196349855143, "learning_rate": 1.037192043842607e-05, "loss": 0.325, "step": 9914 }, { "epoch": 1.9546529968454258, "grad_norm": 0.46446662799687205, "learning_rate": 1.037037148039199e-05, "loss": 0.3449, "step": 9915 }, { "epoch": 1.9548501577287065, "grad_norm": 0.7024562863012095, "learning_rate": 1.036882251345944e-05, "loss": 0.3346, "step": 9916 }, { "epoch": 1.9550473186119874, "grad_norm": 0.4792762668817157, "learning_rate": 1.036727353766563e-05, "loss": 0.3535, "step": 9917 }, { "epoch": 1.9552444794952681, "grad_norm": 0.46573884017043454, "learning_rate": 1.0365724553047778e-05, "loss": 0.3114, "step": 9918 }, { "epoch": 1.9554416403785488, "grad_norm": 0.45556820668060394, "learning_rate": 1.03641755596431e-05, "loss": 0.3101, "step": 9919 }, { "epoch": 1.9556388012618298, "grad_norm": 0.5061371270375018, "learning_rate": 1.0362626557488811e-05, "loss": 0.3451, "step": 9920 }, { "epoch": 1.9558359621451105, "grad_norm": 0.45544222128631295, "learning_rate": 1.0361077546622125e-05, "loss": 0.3414, "step": 9921 }, { "epoch": 1.9560331230283912, "grad_norm": 0.49708322112827474, "learning_rate": 1.0359528527080263e-05, "loss": 0.3501, "step": 9922 }, { "epoch": 1.956230283911672, "grad_norm": 0.4777705719722766, "learning_rate": 1.0357979498900436e-05, "loss": 0.3216, "step": 9923 }, { "epoch": 1.9564274447949526, "grad_norm": 0.4774834671431984, "learning_rate": 1.0356430462119865e-05, "loss": 0.3045, "step": 9924 }, { "epoch": 1.9566246056782335, "grad_norm": 0.47057412024018935, "learning_rate": 1.0354881416775765e-05, "loss": 0.3218, "step": 9925 }, { "epoch": 1.9568217665615142, "grad_norm": 0.4991382437521349, "learning_rate": 1.0353332362905351e-05, "loss": 0.3485, "step": 9926 }, { "epoch": 1.9570189274447949, "grad_norm": 0.4561117034939663, "learning_rate": 1.0351783300545843e-05, "loss": 0.333, "step": 9927 }, { "epoch": 1.9572160883280758, "grad_norm": 0.47236910408393734, "learning_rate": 1.0350234229734459e-05, "loss": 0.341, "step": 9928 }, { "epoch": 1.9574132492113565, "grad_norm": 0.46739069562356134, "learning_rate": 1.0348685150508417e-05, "loss": 0.316, "step": 9929 }, { "epoch": 1.9576104100946372, "grad_norm": 0.4726019967619267, "learning_rate": 1.034713606290493e-05, "loss": 0.3164, "step": 9930 }, { "epoch": 1.9578075709779181, "grad_norm": 0.46230340086353033, "learning_rate": 1.0345586966961223e-05, "loss": 0.3286, "step": 9931 }, { "epoch": 1.9580047318611986, "grad_norm": 0.4729940247692541, "learning_rate": 1.0344037862714506e-05, "loss": 0.3364, "step": 9932 }, { "epoch": 1.9582018927444795, "grad_norm": 0.49856429925552365, "learning_rate": 1.034248875020201e-05, "loss": 0.3196, "step": 9933 }, { "epoch": 1.9583990536277602, "grad_norm": 0.4701007265414603, "learning_rate": 1.0340939629460938e-05, "loss": 0.3181, "step": 9934 }, { "epoch": 1.958596214511041, "grad_norm": 0.4658639064265051, "learning_rate": 1.0339390500528523e-05, "loss": 0.3145, "step": 9935 }, { "epoch": 1.9587933753943219, "grad_norm": 0.49004170327794266, "learning_rate": 1.0337841363441973e-05, "loss": 0.336, "step": 9936 }, { "epoch": 1.9589905362776026, "grad_norm": 0.44921738293967783, "learning_rate": 1.0336292218238514e-05, "loss": 0.3133, "step": 9937 }, { "epoch": 1.9591876971608833, "grad_norm": 0.4823452316355001, "learning_rate": 1.0334743064955367e-05, "loss": 0.3471, "step": 9938 }, { "epoch": 1.9593848580441642, "grad_norm": 0.4507059368884933, "learning_rate": 1.0333193903629743e-05, "loss": 0.295, "step": 9939 }, { "epoch": 1.9595820189274447, "grad_norm": 0.46683494943121584, "learning_rate": 1.0331644734298874e-05, "loss": 0.3325, "step": 9940 }, { "epoch": 1.9597791798107256, "grad_norm": 0.4744162384642641, "learning_rate": 1.0330095556999966e-05, "loss": 0.32, "step": 9941 }, { "epoch": 1.9599763406940063, "grad_norm": 0.4617197712445625, "learning_rate": 1.0328546371770249e-05, "loss": 0.3618, "step": 9942 }, { "epoch": 1.960173501577287, "grad_norm": 0.5244555576526214, "learning_rate": 1.0326997178646941e-05, "loss": 0.3248, "step": 9943 }, { "epoch": 1.960370662460568, "grad_norm": 0.4799968339049716, "learning_rate": 1.0325447977667262e-05, "loss": 0.3388, "step": 9944 }, { "epoch": 1.9605678233438486, "grad_norm": 0.44066352020859745, "learning_rate": 1.0323898768868434e-05, "loss": 0.3199, "step": 9945 }, { "epoch": 1.9607649842271293, "grad_norm": 0.5245090776857998, "learning_rate": 1.0322349552287676e-05, "loss": 0.3169, "step": 9946 }, { "epoch": 1.9609621451104102, "grad_norm": 0.4716152433206715, "learning_rate": 1.0320800327962212e-05, "loss": 0.3217, "step": 9947 }, { "epoch": 1.9611593059936907, "grad_norm": 0.4712223481043858, "learning_rate": 1.0319251095929262e-05, "loss": 0.3397, "step": 9948 }, { "epoch": 1.9613564668769716, "grad_norm": 0.43783005573851125, "learning_rate": 1.0317701856226045e-05, "loss": 0.3187, "step": 9949 }, { "epoch": 1.9615536277602523, "grad_norm": 0.4490127963028393, "learning_rate": 1.0316152608889787e-05, "loss": 0.3312, "step": 9950 }, { "epoch": 1.961750788643533, "grad_norm": 0.47018160199287945, "learning_rate": 1.0314603353957709e-05, "loss": 0.3378, "step": 9951 }, { "epoch": 1.961947949526814, "grad_norm": 0.45982647728871295, "learning_rate": 1.031305409146703e-05, "loss": 0.3053, "step": 9952 }, { "epoch": 1.9621451104100947, "grad_norm": 0.47793598964818645, "learning_rate": 1.0311504821454973e-05, "loss": 0.3528, "step": 9953 }, { "epoch": 1.9623422712933754, "grad_norm": 0.5116582725353143, "learning_rate": 1.0309955543958765e-05, "loss": 0.3343, "step": 9954 }, { "epoch": 1.9625394321766563, "grad_norm": 0.4623753000651675, "learning_rate": 1.0308406259015624e-05, "loss": 0.345, "step": 9955 }, { "epoch": 1.9627365930599368, "grad_norm": 0.45920787227072735, "learning_rate": 1.0306856966662776e-05, "loss": 0.3179, "step": 9956 }, { "epoch": 1.9629337539432177, "grad_norm": 0.48812471718237876, "learning_rate": 1.0305307666937441e-05, "loss": 0.3162, "step": 9957 }, { "epoch": 1.9631309148264984, "grad_norm": 0.4525568703068494, "learning_rate": 1.0303758359876841e-05, "loss": 0.3335, "step": 9958 }, { "epoch": 1.963328075709779, "grad_norm": 0.4797900640643521, "learning_rate": 1.0302209045518206e-05, "loss": 0.3557, "step": 9959 }, { "epoch": 1.96352523659306, "grad_norm": 0.46653885535240164, "learning_rate": 1.0300659723898752e-05, "loss": 0.3313, "step": 9960 }, { "epoch": 1.9637223974763407, "grad_norm": 0.5002009677410206, "learning_rate": 1.029911039505571e-05, "loss": 0.3303, "step": 9961 }, { "epoch": 1.9639195583596214, "grad_norm": 0.4853690102944126, "learning_rate": 1.0297561059026293e-05, "loss": 0.3165, "step": 9962 }, { "epoch": 1.9641167192429023, "grad_norm": 0.4855838797298784, "learning_rate": 1.0296011715847738e-05, "loss": 0.3383, "step": 9963 }, { "epoch": 1.964313880126183, "grad_norm": 0.48686860386638087, "learning_rate": 1.029446236555726e-05, "loss": 0.3265, "step": 9964 }, { "epoch": 1.9645110410094637, "grad_norm": 0.519576709109696, "learning_rate": 1.0292913008192088e-05, "loss": 0.3623, "step": 9965 }, { "epoch": 1.9647082018927446, "grad_norm": 0.5048904976816195, "learning_rate": 1.0291363643789445e-05, "loss": 0.3572, "step": 9966 }, { "epoch": 1.9649053627760251, "grad_norm": 0.42587004814801377, "learning_rate": 1.0289814272386556e-05, "loss": 0.292, "step": 9967 }, { "epoch": 1.965102523659306, "grad_norm": 0.500466873158178, "learning_rate": 1.0288264894020646e-05, "loss": 0.333, "step": 9968 }, { "epoch": 1.9652996845425867, "grad_norm": 0.465182567762671, "learning_rate": 1.0286715508728937e-05, "loss": 0.3157, "step": 9969 }, { "epoch": 1.9654968454258674, "grad_norm": 0.4617031591725659, "learning_rate": 1.0285166116548662e-05, "loss": 0.3202, "step": 9970 }, { "epoch": 1.9656940063091484, "grad_norm": 0.47956736857907367, "learning_rate": 1.0283616717517037e-05, "loss": 0.3549, "step": 9971 }, { "epoch": 1.965891167192429, "grad_norm": 0.45146021950685644, "learning_rate": 1.0282067311671293e-05, "loss": 0.3241, "step": 9972 }, { "epoch": 1.9660883280757098, "grad_norm": 0.4674887113548766, "learning_rate": 1.0280517899048657e-05, "loss": 0.3395, "step": 9973 }, { "epoch": 1.9662854889589907, "grad_norm": 0.47145112139932266, "learning_rate": 1.027896847968635e-05, "loss": 0.3287, "step": 9974 }, { "epoch": 1.9664826498422712, "grad_norm": 0.4565740445948791, "learning_rate": 1.0277419053621602e-05, "loss": 0.3196, "step": 9975 }, { "epoch": 1.966679810725552, "grad_norm": 0.48138321162232844, "learning_rate": 1.0275869620891637e-05, "loss": 0.3403, "step": 9976 }, { "epoch": 1.9668769716088328, "grad_norm": 0.484975445459746, "learning_rate": 1.0274320181533681e-05, "loss": 0.3283, "step": 9977 }, { "epoch": 1.9670741324921135, "grad_norm": 0.4387256747272943, "learning_rate": 1.0272770735584966e-05, "loss": 0.3262, "step": 9978 }, { "epoch": 1.9672712933753944, "grad_norm": 0.471596315685171, "learning_rate": 1.0271221283082709e-05, "loss": 0.3276, "step": 9979 }, { "epoch": 1.9674684542586751, "grad_norm": 0.4545253122593284, "learning_rate": 1.0269671824064146e-05, "loss": 0.3234, "step": 9980 }, { "epoch": 1.9676656151419558, "grad_norm": 0.45641111253411415, "learning_rate": 1.0268122358566496e-05, "loss": 0.3175, "step": 9981 }, { "epoch": 1.9678627760252367, "grad_norm": 0.44841650898946805, "learning_rate": 1.0266572886626997e-05, "loss": 0.2985, "step": 9982 }, { "epoch": 1.9680599369085172, "grad_norm": 0.43192291651712367, "learning_rate": 1.0265023408282866e-05, "loss": 0.2932, "step": 9983 }, { "epoch": 1.9682570977917981, "grad_norm": 0.5012210582692301, "learning_rate": 1.0263473923571334e-05, "loss": 0.329, "step": 9984 }, { "epoch": 1.9684542586750788, "grad_norm": 1.1443330318609166, "learning_rate": 1.0261924432529629e-05, "loss": 0.3461, "step": 9985 }, { "epoch": 1.9686514195583595, "grad_norm": 0.5004253208337783, "learning_rate": 1.0260374935194979e-05, "loss": 0.3069, "step": 9986 }, { "epoch": 1.9688485804416405, "grad_norm": 0.4696360012631041, "learning_rate": 1.025882543160461e-05, "loss": 0.3157, "step": 9987 }, { "epoch": 1.9690457413249212, "grad_norm": 0.4826746914030759, "learning_rate": 1.0257275921795756e-05, "loss": 0.326, "step": 9988 }, { "epoch": 1.9692429022082019, "grad_norm": 1.3597036785857965, "learning_rate": 1.0255726405805637e-05, "loss": 0.3169, "step": 9989 }, { "epoch": 1.9694400630914828, "grad_norm": 0.4967882690519761, "learning_rate": 1.0254176883671485e-05, "loss": 0.335, "step": 9990 }, { "epoch": 1.9696372239747633, "grad_norm": 0.502620609744957, "learning_rate": 1.0252627355430532e-05, "loss": 0.3477, "step": 9991 }, { "epoch": 1.9698343848580442, "grad_norm": 0.4484287142620489, "learning_rate": 1.0251077821119998e-05, "loss": 0.3114, "step": 9992 }, { "epoch": 1.9700315457413249, "grad_norm": 0.4747578081774963, "learning_rate": 1.0249528280777121e-05, "loss": 0.3072, "step": 9993 }, { "epoch": 1.9702287066246056, "grad_norm": 0.4973932701653613, "learning_rate": 1.0247978734439127e-05, "loss": 0.3364, "step": 9994 }, { "epoch": 1.9704258675078865, "grad_norm": 0.5356785346512383, "learning_rate": 1.0246429182143241e-05, "loss": 0.325, "step": 9995 }, { "epoch": 1.9706230283911672, "grad_norm": 0.46472104839260125, "learning_rate": 1.0244879623926698e-05, "loss": 0.3337, "step": 9996 }, { "epoch": 1.970820189274448, "grad_norm": 0.47337341572225305, "learning_rate": 1.0243330059826724e-05, "loss": 0.3333, "step": 9997 }, { "epoch": 1.9710173501577288, "grad_norm": 0.48176261552874655, "learning_rate": 1.0241780489880546e-05, "loss": 0.338, "step": 9998 }, { "epoch": 1.9712145110410093, "grad_norm": 0.4816284206757202, "learning_rate": 1.0240230914125401e-05, "loss": 0.3312, "step": 9999 }, { "epoch": 1.9714116719242902, "grad_norm": 0.4632673744732919, "learning_rate": 1.0238681332598512e-05, "loss": 0.3249, "step": 10000 }, { "epoch": 1.971608832807571, "grad_norm": 0.48018702662774365, "learning_rate": 1.0237131745337117e-05, "loss": 0.3549, "step": 10001 }, { "epoch": 1.9718059936908516, "grad_norm": 0.468079428043231, "learning_rate": 1.0235582152378435e-05, "loss": 0.3089, "step": 10002 }, { "epoch": 1.9720031545741326, "grad_norm": 0.4966800301788038, "learning_rate": 1.0234032553759707e-05, "loss": 0.3486, "step": 10003 }, { "epoch": 1.9722003154574133, "grad_norm": 0.44884608245777396, "learning_rate": 1.0232482949518157e-05, "loss": 0.2966, "step": 10004 }, { "epoch": 1.972397476340694, "grad_norm": 8.64090741552038, "learning_rate": 1.0230933339691014e-05, "loss": 0.3888, "step": 10005 }, { "epoch": 1.9725946372239749, "grad_norm": 0.48715703305183966, "learning_rate": 1.0229383724315516e-05, "loss": 0.3216, "step": 10006 }, { "epoch": 1.9727917981072554, "grad_norm": 0.44760735526085277, "learning_rate": 1.0227834103428884e-05, "loss": 0.3165, "step": 10007 }, { "epoch": 1.9729889589905363, "grad_norm": 0.46751672698359104, "learning_rate": 1.022628447706836e-05, "loss": 0.3247, "step": 10008 }, { "epoch": 1.973186119873817, "grad_norm": 0.5038928323160687, "learning_rate": 1.0224734845271163e-05, "loss": 0.3713, "step": 10009 }, { "epoch": 1.9733832807570977, "grad_norm": 0.7157608221471825, "learning_rate": 1.0223185208074538e-05, "loss": 0.3501, "step": 10010 }, { "epoch": 1.9735804416403786, "grad_norm": 0.46267508667900453, "learning_rate": 1.0221635565515699e-05, "loss": 0.3268, "step": 10011 }, { "epoch": 1.9737776025236593, "grad_norm": 0.47469784854807734, "learning_rate": 1.0220085917631894e-05, "loss": 0.3323, "step": 10012 }, { "epoch": 1.97397476340694, "grad_norm": 0.4807980595553942, "learning_rate": 1.0218536264460346e-05, "loss": 0.3215, "step": 10013 }, { "epoch": 1.974171924290221, "grad_norm": 0.4659523222268142, "learning_rate": 1.0216986606038288e-05, "loss": 0.3288, "step": 10014 }, { "epoch": 1.9743690851735016, "grad_norm": 0.46518600281925643, "learning_rate": 1.0215436942402952e-05, "loss": 0.3337, "step": 10015 }, { "epoch": 1.9745662460567823, "grad_norm": 0.49142919292843934, "learning_rate": 1.0213887273591573e-05, "loss": 0.3318, "step": 10016 }, { "epoch": 1.9747634069400632, "grad_norm": 0.46949418334164145, "learning_rate": 1.0212337599641376e-05, "loss": 0.3245, "step": 10017 }, { "epoch": 1.9749605678233437, "grad_norm": 0.4508227196884441, "learning_rate": 1.0210787920589598e-05, "loss": 0.3187, "step": 10018 }, { "epoch": 1.9751577287066246, "grad_norm": 0.48656630600989037, "learning_rate": 1.0209238236473472e-05, "loss": 0.3362, "step": 10019 }, { "epoch": 1.9753548895899053, "grad_norm": 0.4759349272246813, "learning_rate": 1.0207688547330225e-05, "loss": 0.3247, "step": 10020 }, { "epoch": 1.975552050473186, "grad_norm": 0.5742771214058191, "learning_rate": 1.0206138853197098e-05, "loss": 0.3278, "step": 10021 }, { "epoch": 1.975749211356467, "grad_norm": 0.5031639152587618, "learning_rate": 1.0204589154111318e-05, "loss": 0.3604, "step": 10022 }, { "epoch": 1.9759463722397477, "grad_norm": 0.48247181846875936, "learning_rate": 1.0203039450110117e-05, "loss": 0.3441, "step": 10023 }, { "epoch": 1.9761435331230284, "grad_norm": 0.43458711087785906, "learning_rate": 1.020148974123073e-05, "loss": 0.2844, "step": 10024 }, { "epoch": 1.9763406940063093, "grad_norm": 0.47496966219956804, "learning_rate": 1.0199940027510392e-05, "loss": 0.3473, "step": 10025 }, { "epoch": 1.9765378548895898, "grad_norm": 0.5215472264487935, "learning_rate": 1.0198390308986328e-05, "loss": 0.3459, "step": 10026 }, { "epoch": 1.9767350157728707, "grad_norm": 0.44281471612530954, "learning_rate": 1.0196840585695785e-05, "loss": 0.3057, "step": 10027 }, { "epoch": 1.9769321766561514, "grad_norm": 0.4818228906154598, "learning_rate": 1.0195290857675982e-05, "loss": 0.3293, "step": 10028 }, { "epoch": 1.977129337539432, "grad_norm": 0.460334518749255, "learning_rate": 1.0193741124964164e-05, "loss": 0.3286, "step": 10029 }, { "epoch": 1.977326498422713, "grad_norm": 0.46235057626123255, "learning_rate": 1.0192191387597554e-05, "loss": 0.3202, "step": 10030 }, { "epoch": 1.9775236593059937, "grad_norm": 0.44454174849526384, "learning_rate": 1.0190641645613397e-05, "loss": 0.3184, "step": 10031 }, { "epoch": 1.9777208201892744, "grad_norm": 0.46226996398858433, "learning_rate": 1.0189091899048914e-05, "loss": 0.3438, "step": 10032 }, { "epoch": 1.9779179810725553, "grad_norm": 0.5082875526155569, "learning_rate": 1.0187542147941352e-05, "loss": 0.3517, "step": 10033 }, { "epoch": 1.9781151419558358, "grad_norm": 0.43241743937186994, "learning_rate": 1.0185992392327936e-05, "loss": 0.3188, "step": 10034 }, { "epoch": 1.9783123028391167, "grad_norm": 0.4476815196740319, "learning_rate": 1.0184442632245905e-05, "loss": 0.3252, "step": 10035 }, { "epoch": 1.9785094637223974, "grad_norm": 0.45862822746483506, "learning_rate": 1.018289286773249e-05, "loss": 0.325, "step": 10036 }, { "epoch": 1.9787066246056781, "grad_norm": 0.4483776581846178, "learning_rate": 1.0181343098824928e-05, "loss": 0.3299, "step": 10037 }, { "epoch": 1.978903785488959, "grad_norm": 0.46878007068368455, "learning_rate": 1.017979332556045e-05, "loss": 0.3291, "step": 10038 }, { "epoch": 1.9791009463722398, "grad_norm": 0.4916705031341869, "learning_rate": 1.0178243547976293e-05, "loss": 0.3253, "step": 10039 }, { "epoch": 1.9792981072555205, "grad_norm": 0.5025149474174093, "learning_rate": 1.017669376610969e-05, "loss": 0.3469, "step": 10040 }, { "epoch": 1.9794952681388014, "grad_norm": 0.4592700151321055, "learning_rate": 1.0175143979997878e-05, "loss": 0.3341, "step": 10041 }, { "epoch": 1.9796924290220819, "grad_norm": 0.4918000568041209, "learning_rate": 1.0173594189678093e-05, "loss": 0.3598, "step": 10042 }, { "epoch": 1.9798895899053628, "grad_norm": 0.4479364044121577, "learning_rate": 1.0172044395187566e-05, "loss": 0.3027, "step": 10043 }, { "epoch": 1.9800867507886435, "grad_norm": 0.47799619307194813, "learning_rate": 1.0170494596563533e-05, "loss": 0.3433, "step": 10044 }, { "epoch": 1.9802839116719242, "grad_norm": 0.4523062427056764, "learning_rate": 1.016894479384323e-05, "loss": 0.305, "step": 10045 }, { "epoch": 1.9804810725552051, "grad_norm": 0.4759998481341186, "learning_rate": 1.0167394987063894e-05, "loss": 0.3446, "step": 10046 }, { "epoch": 1.9806782334384858, "grad_norm": 0.4869635858753714, "learning_rate": 1.0165845176262757e-05, "loss": 0.3478, "step": 10047 }, { "epoch": 1.9808753943217665, "grad_norm": 0.45645594907618603, "learning_rate": 1.016429536147706e-05, "loss": 0.3201, "step": 10048 }, { "epoch": 1.9810725552050474, "grad_norm": 0.46823112725914234, "learning_rate": 1.0162745542744028e-05, "loss": 0.305, "step": 10049 }, { "epoch": 1.981269716088328, "grad_norm": 0.46841106024358004, "learning_rate": 1.016119572010091e-05, "loss": 0.3323, "step": 10050 }, { "epoch": 1.9814668769716088, "grad_norm": 0.44805736427909854, "learning_rate": 1.015964589358493e-05, "loss": 0.3241, "step": 10051 }, { "epoch": 1.9816640378548895, "grad_norm": 0.46092527228178226, "learning_rate": 1.015809606323333e-05, "loss": 0.325, "step": 10052 }, { "epoch": 1.9818611987381702, "grad_norm": 0.4813889305251375, "learning_rate": 1.0156546229083346e-05, "loss": 0.3346, "step": 10053 }, { "epoch": 1.9820583596214512, "grad_norm": 0.48424701481984905, "learning_rate": 1.0154996391172211e-05, "loss": 0.3176, "step": 10054 }, { "epoch": 1.9822555205047319, "grad_norm": 0.5052195601955368, "learning_rate": 1.0153446549537164e-05, "loss": 0.3738, "step": 10055 }, { "epoch": 1.9824526813880126, "grad_norm": 0.5264958898669171, "learning_rate": 1.0151896704215441e-05, "loss": 0.3509, "step": 10056 }, { "epoch": 1.9826498422712935, "grad_norm": 0.4424729969934259, "learning_rate": 1.015034685524428e-05, "loss": 0.3016, "step": 10057 }, { "epoch": 1.9828470031545742, "grad_norm": 0.45230293654878495, "learning_rate": 1.0148797002660909e-05, "loss": 0.3415, "step": 10058 }, { "epoch": 1.9830441640378549, "grad_norm": 0.4296621647967024, "learning_rate": 1.0147247146502573e-05, "loss": 0.3229, "step": 10059 }, { "epoch": 1.9832413249211358, "grad_norm": 0.4736201795007952, "learning_rate": 1.0145697286806505e-05, "loss": 0.3193, "step": 10060 }, { "epoch": 1.9834384858044163, "grad_norm": 0.4483471743167527, "learning_rate": 1.0144147423609942e-05, "loss": 0.3078, "step": 10061 }, { "epoch": 1.9836356466876972, "grad_norm": 0.4795452273103116, "learning_rate": 1.0142597556950123e-05, "loss": 0.314, "step": 10062 }, { "epoch": 1.983832807570978, "grad_norm": 0.4651739245868516, "learning_rate": 1.014104768686428e-05, "loss": 0.3245, "step": 10063 }, { "epoch": 1.9840299684542586, "grad_norm": 0.4678656723303721, "learning_rate": 1.0139497813389654e-05, "loss": 0.3262, "step": 10064 }, { "epoch": 1.9842271293375395, "grad_norm": 0.4663884375595269, "learning_rate": 1.0137947936563481e-05, "loss": 0.3314, "step": 10065 }, { "epoch": 1.9844242902208202, "grad_norm": 0.4991990370561572, "learning_rate": 1.0136398056422995e-05, "loss": 0.3662, "step": 10066 }, { "epoch": 1.984621451104101, "grad_norm": 0.4630237349751292, "learning_rate": 1.013484817300544e-05, "loss": 0.3131, "step": 10067 }, { "epoch": 1.9848186119873819, "grad_norm": 0.4760005186639651, "learning_rate": 1.0133298286348046e-05, "loss": 0.3309, "step": 10068 }, { "epoch": 1.9850157728706623, "grad_norm": 0.47490621093107355, "learning_rate": 1.0131748396488057e-05, "loss": 0.3426, "step": 10069 }, { "epoch": 1.9852129337539433, "grad_norm": 0.4490036193130323, "learning_rate": 1.0130198503462705e-05, "loss": 0.3244, "step": 10070 }, { "epoch": 1.985410094637224, "grad_norm": 0.43921214974034223, "learning_rate": 1.0128648607309228e-05, "loss": 0.3108, "step": 10071 }, { "epoch": 1.9856072555205047, "grad_norm": 0.47746525475723234, "learning_rate": 1.0127098708064866e-05, "loss": 0.3631, "step": 10072 }, { "epoch": 1.9858044164037856, "grad_norm": 0.44863839819394496, "learning_rate": 1.0125548805766852e-05, "loss": 0.3203, "step": 10073 }, { "epoch": 1.9860015772870663, "grad_norm": 0.43643215846559164, "learning_rate": 1.0123998900452431e-05, "loss": 0.3055, "step": 10074 }, { "epoch": 1.986198738170347, "grad_norm": 0.4722620454489846, "learning_rate": 1.0122448992158834e-05, "loss": 0.3485, "step": 10075 }, { "epoch": 1.986395899053628, "grad_norm": 0.4686910381456083, "learning_rate": 1.0120899080923306e-05, "loss": 0.3494, "step": 10076 }, { "epoch": 1.9865930599369084, "grad_norm": 0.5235470374582382, "learning_rate": 1.0119349166783073e-05, "loss": 0.3158, "step": 10077 }, { "epoch": 1.9867902208201893, "grad_norm": 0.47775678927346366, "learning_rate": 1.0117799249775387e-05, "loss": 0.3274, "step": 10078 }, { "epoch": 1.98698738170347, "grad_norm": 0.4703367415865138, "learning_rate": 1.0116249329937474e-05, "loss": 0.3429, "step": 10079 }, { "epoch": 1.9871845425867507, "grad_norm": 0.4475956538792856, "learning_rate": 1.0114699407306576e-05, "loss": 0.3144, "step": 10080 }, { "epoch": 1.9873817034700316, "grad_norm": 0.4401013553494955, "learning_rate": 1.0113149481919938e-05, "loss": 0.3181, "step": 10081 }, { "epoch": 1.9875788643533123, "grad_norm": 0.4383130137426036, "learning_rate": 1.0111599553814788e-05, "loss": 0.3046, "step": 10082 }, { "epoch": 1.987776025236593, "grad_norm": 0.4776571024905826, "learning_rate": 1.0110049623028371e-05, "loss": 0.3395, "step": 10083 }, { "epoch": 1.987973186119874, "grad_norm": 0.4672666220627933, "learning_rate": 1.0108499689597924e-05, "loss": 0.3152, "step": 10084 }, { "epoch": 1.9881703470031544, "grad_norm": 0.45203351381336665, "learning_rate": 1.0106949753560682e-05, "loss": 0.312, "step": 10085 }, { "epoch": 1.9883675078864353, "grad_norm": 0.4774637834527684, "learning_rate": 1.0105399814953889e-05, "loss": 0.3212, "step": 10086 }, { "epoch": 1.988564668769716, "grad_norm": 0.4517907516822674, "learning_rate": 1.010384987381478e-05, "loss": 0.2959, "step": 10087 }, { "epoch": 1.9887618296529967, "grad_norm": 0.47886943448952884, "learning_rate": 1.0102299930180592e-05, "loss": 0.3485, "step": 10088 }, { "epoch": 1.9889589905362777, "grad_norm": 0.6789498721589519, "learning_rate": 1.0100749984088567e-05, "loss": 0.3312, "step": 10089 }, { "epoch": 1.9891561514195584, "grad_norm": 0.48039651499274616, "learning_rate": 1.0099200035575943e-05, "loss": 0.3376, "step": 10090 }, { "epoch": 1.989353312302839, "grad_norm": 0.49344206103676497, "learning_rate": 1.0097650084679957e-05, "loss": 0.3566, "step": 10091 }, { "epoch": 1.98955047318612, "grad_norm": 0.455626086749079, "learning_rate": 1.0096100131437851e-05, "loss": 0.3167, "step": 10092 }, { "epoch": 1.9897476340694005, "grad_norm": 0.471900481186439, "learning_rate": 1.009455017588686e-05, "loss": 0.3215, "step": 10093 }, { "epoch": 1.9899447949526814, "grad_norm": 0.43095008957712944, "learning_rate": 1.0093000218064224e-05, "loss": 0.3152, "step": 10094 }, { "epoch": 1.990141955835962, "grad_norm": 0.47850763704179666, "learning_rate": 1.0091450258007188e-05, "loss": 0.3285, "step": 10095 }, { "epoch": 1.9903391167192428, "grad_norm": 0.46902329428111766, "learning_rate": 1.008990029575298e-05, "loss": 0.327, "step": 10096 }, { "epoch": 1.9905362776025237, "grad_norm": 0.4635116114920501, "learning_rate": 1.008835033133885e-05, "loss": 0.3245, "step": 10097 }, { "epoch": 1.9907334384858044, "grad_norm": 0.4899631136805948, "learning_rate": 1.0086800364802028e-05, "loss": 0.3454, "step": 10098 }, { "epoch": 1.9909305993690851, "grad_norm": 0.4424518951849392, "learning_rate": 1.008525039617976e-05, "loss": 0.2982, "step": 10099 }, { "epoch": 1.991127760252366, "grad_norm": 0.4843796306436317, "learning_rate": 1.008370042550928e-05, "loss": 0.3226, "step": 10100 }, { "epoch": 1.9913249211356467, "grad_norm": 0.47515439701906687, "learning_rate": 1.0082150452827832e-05, "loss": 0.3647, "step": 10101 }, { "epoch": 1.9915220820189274, "grad_norm": 0.4984861179086406, "learning_rate": 1.0080600478172653e-05, "loss": 0.3689, "step": 10102 }, { "epoch": 1.9917192429022084, "grad_norm": 0.46711955198863625, "learning_rate": 1.0079050501580983e-05, "loss": 0.3347, "step": 10103 }, { "epoch": 1.9919164037854888, "grad_norm": 0.5227568346359791, "learning_rate": 1.0077500523090058e-05, "loss": 0.3125, "step": 10104 }, { "epoch": 1.9921135646687698, "grad_norm": 0.4491627976049431, "learning_rate": 1.0075950542737123e-05, "loss": 0.3253, "step": 10105 }, { "epoch": 1.9923107255520505, "grad_norm": 0.47884782845375357, "learning_rate": 1.0074400560559416e-05, "loss": 0.3402, "step": 10106 }, { "epoch": 1.9925078864353312, "grad_norm": 0.45076597168659827, "learning_rate": 1.0072850576594175e-05, "loss": 0.321, "step": 10107 }, { "epoch": 1.992705047318612, "grad_norm": 0.45577225933101717, "learning_rate": 1.0071300590878639e-05, "loss": 0.3216, "step": 10108 }, { "epoch": 1.9929022082018928, "grad_norm": 0.4654018460101875, "learning_rate": 1.006975060345005e-05, "loss": 0.3336, "step": 10109 }, { "epoch": 1.9930993690851735, "grad_norm": 0.4833827723808277, "learning_rate": 1.0068200614345647e-05, "loss": 0.3326, "step": 10110 }, { "epoch": 1.9932965299684544, "grad_norm": 0.45506045092199265, "learning_rate": 1.0066650623602667e-05, "loss": 0.2972, "step": 10111 }, { "epoch": 1.993493690851735, "grad_norm": 0.47117744890482555, "learning_rate": 1.0065100631258356e-05, "loss": 0.3359, "step": 10112 }, { "epoch": 1.9936908517350158, "grad_norm": 0.4155693551302444, "learning_rate": 1.0063550637349946e-05, "loss": 0.3016, "step": 10113 }, { "epoch": 1.9938880126182965, "grad_norm": 0.4796712431778576, "learning_rate": 1.0062000641914683e-05, "loss": 0.3038, "step": 10114 }, { "epoch": 1.9940851735015772, "grad_norm": 0.4460608776935147, "learning_rate": 1.00604506449898e-05, "loss": 0.3156, "step": 10115 }, { "epoch": 1.9942823343848581, "grad_norm": 0.4636295632848529, "learning_rate": 1.0058900646612548e-05, "loss": 0.34, "step": 10116 }, { "epoch": 1.9944794952681388, "grad_norm": 0.4419540655820689, "learning_rate": 1.0057350646820157e-05, "loss": 0.3153, "step": 10117 }, { "epoch": 1.9946766561514195, "grad_norm": 0.4595608276786376, "learning_rate": 1.0055800645649874e-05, "loss": 0.3308, "step": 10118 }, { "epoch": 1.9948738170347005, "grad_norm": 0.46822641410388405, "learning_rate": 1.0054250643138931e-05, "loss": 0.3351, "step": 10119 }, { "epoch": 1.995070977917981, "grad_norm": 0.4571949677373604, "learning_rate": 1.0052700639324574e-05, "loss": 0.3409, "step": 10120 }, { "epoch": 1.9952681388012619, "grad_norm": 0.4565357948126349, "learning_rate": 1.0051150634244042e-05, "loss": 0.3246, "step": 10121 }, { "epoch": 1.9954652996845426, "grad_norm": 0.4715824788374287, "learning_rate": 1.0049600627934576e-05, "loss": 0.3266, "step": 10122 }, { "epoch": 1.9956624605678233, "grad_norm": 36.0684477965843, "learning_rate": 1.0048050620433415e-05, "loss": 0.53, "step": 10123 }, { "epoch": 1.9958596214511042, "grad_norm": 0.4920636243774622, "learning_rate": 1.0046500611777799e-05, "loss": 0.3162, "step": 10124 }, { "epoch": 1.9960567823343849, "grad_norm": 0.4865650474889048, "learning_rate": 1.004495060200497e-05, "loss": 0.3395, "step": 10125 }, { "epoch": 1.9962539432176656, "grad_norm": 0.4730171891025844, "learning_rate": 1.0043400591152162e-05, "loss": 0.3264, "step": 10126 }, { "epoch": 1.9964511041009465, "grad_norm": 0.49951458846201335, "learning_rate": 1.0041850579256623e-05, "loss": 0.3605, "step": 10127 }, { "epoch": 1.996648264984227, "grad_norm": 0.4595162916836803, "learning_rate": 1.0040300566355588e-05, "loss": 0.3143, "step": 10128 }, { "epoch": 1.996845425867508, "grad_norm": 0.48676470289418966, "learning_rate": 1.00387505524863e-05, "loss": 0.3381, "step": 10129 }, { "epoch": 1.9970425867507886, "grad_norm": 0.4953287911408293, "learning_rate": 1.0037200537686001e-05, "loss": 0.3395, "step": 10130 }, { "epoch": 1.9972397476340693, "grad_norm": 0.4507924402166945, "learning_rate": 1.0035650521991927e-05, "loss": 0.3162, "step": 10131 }, { "epoch": 1.9974369085173502, "grad_norm": 0.4651561295109272, "learning_rate": 1.0034100505441322e-05, "loss": 0.3225, "step": 10132 }, { "epoch": 1.997634069400631, "grad_norm": 0.45225944154125214, "learning_rate": 1.0032550488071424e-05, "loss": 0.3214, "step": 10133 }, { "epoch": 1.9978312302839116, "grad_norm": 0.46650198815431315, "learning_rate": 1.0031000469919474e-05, "loss": 0.3524, "step": 10134 }, { "epoch": 1.9980283911671926, "grad_norm": 0.4685090100993953, "learning_rate": 1.0029450451022713e-05, "loss": 0.3366, "step": 10135 }, { "epoch": 1.998225552050473, "grad_norm": 0.46959485471009443, "learning_rate": 1.002790043141838e-05, "loss": 0.3227, "step": 10136 }, { "epoch": 1.998422712933754, "grad_norm": 0.4875933386597677, "learning_rate": 1.0026350411143719e-05, "loss": 0.337, "step": 10137 }, { "epoch": 1.9986198738170347, "grad_norm": 0.4640353105289818, "learning_rate": 1.0024800390235967e-05, "loss": 0.3296, "step": 10138 }, { "epoch": 1.9988170347003154, "grad_norm": 0.4365597644010865, "learning_rate": 1.0023250368732367e-05, "loss": 0.3246, "step": 10139 }, { "epoch": 1.9990141955835963, "grad_norm": 0.463634992887384, "learning_rate": 1.0021700346670156e-05, "loss": 0.3476, "step": 10140 }, { "epoch": 1.999211356466877, "grad_norm": 0.4488592770922772, "learning_rate": 1.0020150324086575e-05, "loss": 0.3288, "step": 10141 }, { "epoch": 1.9994085173501577, "grad_norm": 0.4474284368558747, "learning_rate": 1.0018600301018873e-05, "loss": 0.3236, "step": 10142 }, { "epoch": 1.9996056782334386, "grad_norm": 0.47845080896837267, "learning_rate": 1.0017050277504276e-05, "loss": 0.3214, "step": 10143 }, { "epoch": 1.999802839116719, "grad_norm": 0.47724597287669274, "learning_rate": 1.0015500253580039e-05, "loss": 0.3339, "step": 10144 }, { "epoch": 1.999802839116719, "eval_loss": 0.41522932052612305, "eval_runtime": 344.5021, "eval_samples_per_second": 23.599, "eval_steps_per_second": 1.477, "step": 10144 } ], "logging_steps": 1, "max_steps": 20288, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 5072, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.323382503112704e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }