{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.8300418558038083, "eval_steps": 160, "global_step": 5280, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00015720489693253945, "grad_norm": 1.3751904964447021, "learning_rate": 0.0, "loss": 3.5741, "step": 1 }, { "epoch": 0.00015720489693253945, "eval_loss": 3.4173049926757812, "eval_runtime": 2315.7248, "eval_samples_per_second": 3.998, "eval_steps_per_second": 1.999, "step": 1 }, { "epoch": 0.0003144097938650789, "grad_norm": 1.231239676475525, "learning_rate": 5e-06, "loss": 3.3021, "step": 2 }, { "epoch": 0.00047161469079761836, "grad_norm": 1.3657807111740112, "learning_rate": 1e-05, "loss": 3.6333, "step": 3 }, { "epoch": 0.0006288195877301578, "grad_norm": 1.3117496967315674, "learning_rate": 1.5e-05, "loss": 3.3731, "step": 4 }, { "epoch": 0.0007860244846626972, "grad_norm": 1.4118576049804688, "learning_rate": 2e-05, "loss": 3.612, "step": 5 }, { "epoch": 0.0009432293815952367, "grad_norm": 1.3155895471572876, "learning_rate": 2.5e-05, "loss": 3.3296, "step": 6 }, { "epoch": 0.001100434278527776, "grad_norm": 1.2847192287445068, "learning_rate": 3e-05, "loss": 3.2168, "step": 7 }, { "epoch": 0.0012576391754603156, "grad_norm": 1.1421078443527222, "learning_rate": 3.5e-05, "loss": 3.085, "step": 8 }, { "epoch": 0.0014148440723928551, "grad_norm": 0.9923035502433777, "learning_rate": 4e-05, "loss": 3.0472, "step": 9 }, { "epoch": 0.0015720489693253944, "grad_norm": 0.795043408870697, "learning_rate": 4.5e-05, "loss": 2.6666, "step": 10 }, { "epoch": 0.001729253866257934, "grad_norm": 0.5987974405288696, "learning_rate": 5e-05, "loss": 2.473, "step": 11 }, { "epoch": 0.0018864587631904734, "grad_norm": 0.4488905668258667, "learning_rate": 4.9999999236547564e-05, "loss": 2.3731, "step": 12 }, { "epoch": 0.002043663660123013, "grad_norm": 0.3517301380634308, "learning_rate": 4.999999694619029e-05, "loss": 2.2158, "step": 13 }, { "epoch": 0.002200868557055552, "grad_norm": 0.3045121431350708, "learning_rate": 4.999999312892831e-05, "loss": 2.3351, "step": 14 }, { "epoch": 0.002358073453988092, "grad_norm": 0.24488244950771332, "learning_rate": 4.9999987784761884e-05, "loss": 2.2693, "step": 15 }, { "epoch": 0.0025152783509206312, "grad_norm": 0.22892728447914124, "learning_rate": 4.999998091369132e-05, "loss": 2.1006, "step": 16 }, { "epoch": 0.0026724832478531705, "grad_norm": 0.23219206929206848, "learning_rate": 4.999997251571704e-05, "loss": 2.215, "step": 17 }, { "epoch": 0.0028296881447857102, "grad_norm": 0.24427154660224915, "learning_rate": 4.999996259083956e-05, "loss": 2.1708, "step": 18 }, { "epoch": 0.0029868930417182495, "grad_norm": 0.2640205919742584, "learning_rate": 4.999995113905947e-05, "loss": 2.1709, "step": 19 }, { "epoch": 0.003144097938650789, "grad_norm": 0.26644033193588257, "learning_rate": 4.999993816037749e-05, "loss": 2.1733, "step": 20 }, { "epoch": 0.0033013028355833285, "grad_norm": 0.2621535062789917, "learning_rate": 4.9999923654794414e-05, "loss": 2.0059, "step": 21 }, { "epoch": 0.003458507732515868, "grad_norm": 0.2586187422275543, "learning_rate": 4.999990762231111e-05, "loss": 2.0336, "step": 22 }, { "epoch": 0.003615712629448407, "grad_norm": 0.26732271909713745, "learning_rate": 4.9999890062928566e-05, "loss": 2.0566, "step": 23 }, { "epoch": 0.003772917526380947, "grad_norm": 0.2357867807149887, "learning_rate": 4.999987097664787e-05, "loss": 1.9529, "step": 24 }, { "epoch": 0.003930122423313486, "grad_norm": 0.2297009825706482, "learning_rate": 4.999985036347016e-05, "loss": 2.0369, "step": 25 }, { "epoch": 0.004087327320246026, "grad_norm": 0.20529747009277344, "learning_rate": 4.9999828223396705e-05, "loss": 1.9781, "step": 26 }, { "epoch": 0.004244532217178565, "grad_norm": 0.18342873454093933, "learning_rate": 4.999980455642887e-05, "loss": 1.9986, "step": 27 }, { "epoch": 0.004401737114111104, "grad_norm": 0.16487397253513336, "learning_rate": 4.999977936256809e-05, "loss": 1.9063, "step": 28 }, { "epoch": 0.004558942011043644, "grad_norm": 0.1762266606092453, "learning_rate": 4.99997526418159e-05, "loss": 1.9517, "step": 29 }, { "epoch": 0.004716146907976184, "grad_norm": 0.16371938586235046, "learning_rate": 4.999972439417394e-05, "loss": 1.7734, "step": 30 }, { "epoch": 0.004873351804908723, "grad_norm": 0.17309769988059998, "learning_rate": 4.999969461964392e-05, "loss": 1.8732, "step": 31 }, { "epoch": 0.0050305567018412625, "grad_norm": 0.15772338211536407, "learning_rate": 4.9999663318227683e-05, "loss": 1.7537, "step": 32 }, { "epoch": 0.005187761598773802, "grad_norm": 0.17521986365318298, "learning_rate": 4.9999630489927126e-05, "loss": 2.0077, "step": 33 }, { "epoch": 0.005344966495706341, "grad_norm": 0.15462292730808258, "learning_rate": 4.999959613474425e-05, "loss": 1.8576, "step": 34 }, { "epoch": 0.005502171392638881, "grad_norm": 0.15280336141586304, "learning_rate": 4.999956025268117e-05, "loss": 1.862, "step": 35 }, { "epoch": 0.0056593762895714205, "grad_norm": 0.14518432319164276, "learning_rate": 4.999952284374006e-05, "loss": 1.8893, "step": 36 }, { "epoch": 0.005816581186503959, "grad_norm": 0.16087624430656433, "learning_rate": 4.999948390792321e-05, "loss": 1.8658, "step": 37 }, { "epoch": 0.005973786083436499, "grad_norm": 0.17504698038101196, "learning_rate": 4.999944344523301e-05, "loss": 1.7647, "step": 38 }, { "epoch": 0.006130990980369039, "grad_norm": 0.17786233127117157, "learning_rate": 4.999940145567191e-05, "loss": 1.8133, "step": 39 }, { "epoch": 0.006288195877301578, "grad_norm": 0.1628972887992859, "learning_rate": 4.999935793924249e-05, "loss": 1.7731, "step": 40 }, { "epoch": 0.006445400774234117, "grad_norm": 0.13461466133594513, "learning_rate": 4.9999312895947406e-05, "loss": 1.7558, "step": 41 }, { "epoch": 0.006602605671166657, "grad_norm": 0.12960125505924225, "learning_rate": 4.99992663257894e-05, "loss": 1.7639, "step": 42 }, { "epoch": 0.006759810568099196, "grad_norm": 0.10991287231445312, "learning_rate": 4.9999218228771324e-05, "loss": 1.7538, "step": 43 }, { "epoch": 0.006917015465031736, "grad_norm": 0.11583230644464493, "learning_rate": 4.999916860489612e-05, "loss": 1.715, "step": 44 }, { "epoch": 0.007074220361964275, "grad_norm": 0.10344280302524567, "learning_rate": 4.999911745416681e-05, "loss": 1.6907, "step": 45 }, { "epoch": 0.007231425258896814, "grad_norm": 0.10546118766069412, "learning_rate": 4.999906477658651e-05, "loss": 1.7294, "step": 46 }, { "epoch": 0.007388630155829354, "grad_norm": 0.11775675415992737, "learning_rate": 4.9999010572158465e-05, "loss": 1.7146, "step": 47 }, { "epoch": 0.007545835052761894, "grad_norm": 0.11109112203121185, "learning_rate": 4.999895484088596e-05, "loss": 1.6939, "step": 48 }, { "epoch": 0.007703039949694433, "grad_norm": 0.1116517186164856, "learning_rate": 4.999889758277242e-05, "loss": 1.7271, "step": 49 }, { "epoch": 0.007860244846626972, "grad_norm": 0.11245547980070114, "learning_rate": 4.999883879782132e-05, "loss": 1.7333, "step": 50 }, { "epoch": 0.008017449743559512, "grad_norm": 0.1150551363825798, "learning_rate": 4.999877848603626e-05, "loss": 1.7036, "step": 51 }, { "epoch": 0.008174654640492052, "grad_norm": 0.10856381803750992, "learning_rate": 4.999871664742093e-05, "loss": 1.7493, "step": 52 }, { "epoch": 0.008331859537424591, "grad_norm": 0.10760089010000229, "learning_rate": 4.9998653281979095e-05, "loss": 1.6292, "step": 53 }, { "epoch": 0.00848906443435713, "grad_norm": 0.0932115837931633, "learning_rate": 4.9998588389714634e-05, "loss": 1.6608, "step": 54 }, { "epoch": 0.00864626933128967, "grad_norm": 0.09837482124567032, "learning_rate": 4.9998521970631504e-05, "loss": 1.7834, "step": 55 }, { "epoch": 0.008803474228222209, "grad_norm": 0.08872833847999573, "learning_rate": 4.9998454024733775e-05, "loss": 1.6484, "step": 56 }, { "epoch": 0.008960679125154749, "grad_norm": 0.08829163759946823, "learning_rate": 4.9998384552025577e-05, "loss": 1.5913, "step": 57 }, { "epoch": 0.009117884022087288, "grad_norm": 0.09087682515382767, "learning_rate": 4.999831355251117e-05, "loss": 1.6809, "step": 58 }, { "epoch": 0.009275088919019828, "grad_norm": 0.08675853163003922, "learning_rate": 4.9998241026194884e-05, "loss": 1.6519, "step": 59 }, { "epoch": 0.009432293815952368, "grad_norm": 0.08463481813669205, "learning_rate": 4.999816697308114e-05, "loss": 1.6234, "step": 60 }, { "epoch": 0.009589498712884906, "grad_norm": 0.08403950184583664, "learning_rate": 4.999809139317448e-05, "loss": 1.6533, "step": 61 }, { "epoch": 0.009746703609817445, "grad_norm": 0.08155622333288193, "learning_rate": 4.99980142864795e-05, "loss": 1.6726, "step": 62 }, { "epoch": 0.009903908506749985, "grad_norm": 0.08056480437517166, "learning_rate": 4.999793565300093e-05, "loss": 1.5881, "step": 63 }, { "epoch": 0.010061113403682525, "grad_norm": 0.07879023998975754, "learning_rate": 4.999785549274355e-05, "loss": 1.5568, "step": 64 }, { "epoch": 0.010218318300615065, "grad_norm": 0.07828455418348312, "learning_rate": 4.9997773805712265e-05, "loss": 1.6464, "step": 65 }, { "epoch": 0.010375523197547604, "grad_norm": 0.08054805546998978, "learning_rate": 4.9997690591912075e-05, "loss": 1.6213, "step": 66 }, { "epoch": 0.010532728094480142, "grad_norm": 0.07610727101564407, "learning_rate": 4.999760585134805e-05, "loss": 1.5729, "step": 67 }, { "epoch": 0.010689932991412682, "grad_norm": 0.07693428546190262, "learning_rate": 4.999751958402537e-05, "loss": 1.5444, "step": 68 }, { "epoch": 0.010847137888345222, "grad_norm": 0.0810319185256958, "learning_rate": 4.99974317899493e-05, "loss": 1.7045, "step": 69 }, { "epoch": 0.011004342785277762, "grad_norm": 0.07729896157979965, "learning_rate": 4.9997342469125205e-05, "loss": 1.6268, "step": 70 }, { "epoch": 0.011161547682210301, "grad_norm": 0.07730107754468918, "learning_rate": 4.999725162155855e-05, "loss": 1.658, "step": 71 }, { "epoch": 0.011318752579142841, "grad_norm": 0.08072328567504883, "learning_rate": 4.9997159247254864e-05, "loss": 1.5045, "step": 72 }, { "epoch": 0.011475957476075379, "grad_norm": 0.08120577782392502, "learning_rate": 4.9997065346219805e-05, "loss": 1.568, "step": 73 }, { "epoch": 0.011633162373007919, "grad_norm": 0.08131498098373413, "learning_rate": 4.99969699184591e-05, "loss": 1.6035, "step": 74 }, { "epoch": 0.011790367269940458, "grad_norm": 0.08395873010158539, "learning_rate": 4.9996872963978584e-05, "loss": 1.5844, "step": 75 }, { "epoch": 0.011947572166872998, "grad_norm": 0.08502068370580673, "learning_rate": 4.999677448278417e-05, "loss": 1.6661, "step": 76 }, { "epoch": 0.012104777063805538, "grad_norm": 0.08467952907085419, "learning_rate": 4.999667447488188e-05, "loss": 1.5537, "step": 77 }, { "epoch": 0.012261981960738078, "grad_norm": 0.19682182371616364, "learning_rate": 4.999657294027782e-05, "loss": 1.5051, "step": 78 }, { "epoch": 0.012419186857670617, "grad_norm": 0.08586428314447403, "learning_rate": 4.999646987897818e-05, "loss": 1.565, "step": 79 }, { "epoch": 0.012576391754603155, "grad_norm": 0.08156823366880417, "learning_rate": 4.999636529098928e-05, "loss": 1.6627, "step": 80 }, { "epoch": 0.012733596651535695, "grad_norm": 0.08715341240167618, "learning_rate": 4.9996259176317486e-05, "loss": 1.5862, "step": 81 }, { "epoch": 0.012890801548468235, "grad_norm": 0.09664586186408997, "learning_rate": 4.999615153496928e-05, "loss": 1.5741, "step": 82 }, { "epoch": 0.013048006445400774, "grad_norm": 0.08438891172409058, "learning_rate": 4.999604236695125e-05, "loss": 1.5933, "step": 83 }, { "epoch": 0.013205211342333314, "grad_norm": 0.08333732932806015, "learning_rate": 4.999593167227006e-05, "loss": 1.5904, "step": 84 }, { "epoch": 0.013362416239265854, "grad_norm": 0.07945791631937027, "learning_rate": 4.9995819450932455e-05, "loss": 1.5763, "step": 85 }, { "epoch": 0.013519621136198392, "grad_norm": 0.07682961225509644, "learning_rate": 4.9995705702945304e-05, "loss": 1.5197, "step": 86 }, { "epoch": 0.013676826033130932, "grad_norm": 0.07547677308320999, "learning_rate": 4.999559042831555e-05, "loss": 1.6825, "step": 87 }, { "epoch": 0.013834030930063471, "grad_norm": 0.07293456047773361, "learning_rate": 4.999547362705025e-05, "loss": 1.5466, "step": 88 }, { "epoch": 0.013991235826996011, "grad_norm": 0.07730914652347565, "learning_rate": 4.999535529915651e-05, "loss": 1.5775, "step": 89 }, { "epoch": 0.01414844072392855, "grad_norm": 0.07689664512872696, "learning_rate": 4.9995235444641565e-05, "loss": 1.5881, "step": 90 }, { "epoch": 0.01430564562086109, "grad_norm": 0.07754997909069061, "learning_rate": 4.999511406351275e-05, "loss": 1.5037, "step": 91 }, { "epoch": 0.014462850517793628, "grad_norm": 0.07229866087436676, "learning_rate": 4.999499115577746e-05, "loss": 1.5077, "step": 92 }, { "epoch": 0.014620055414726168, "grad_norm": 0.07491567730903625, "learning_rate": 4.9994866721443215e-05, "loss": 1.5461, "step": 93 }, { "epoch": 0.014777260311658708, "grad_norm": 0.07258685678243637, "learning_rate": 4.9994740760517605e-05, "loss": 1.5516, "step": 94 }, { "epoch": 0.014934465208591248, "grad_norm": 0.07643327116966248, "learning_rate": 4.9994613273008334e-05, "loss": 1.6223, "step": 95 }, { "epoch": 0.015091670105523787, "grad_norm": 0.0740588903427124, "learning_rate": 4.999448425892318e-05, "loss": 1.5322, "step": 96 }, { "epoch": 0.015248875002456327, "grad_norm": 0.44172239303588867, "learning_rate": 4.999435371827003e-05, "loss": 1.5498, "step": 97 }, { "epoch": 0.015406079899388867, "grad_norm": 0.0756363570690155, "learning_rate": 4.999422165105684e-05, "loss": 1.559, "step": 98 }, { "epoch": 0.015563284796321405, "grad_norm": 0.07251248508691788, "learning_rate": 4.99940880572917e-05, "loss": 1.5903, "step": 99 }, { "epoch": 0.015720489693253945, "grad_norm": 0.06931837648153305, "learning_rate": 4.999395293698275e-05, "loss": 1.4849, "step": 100 }, { "epoch": 0.015877694590186484, "grad_norm": 0.07403590530157089, "learning_rate": 4.9993816290138254e-05, "loss": 1.5191, "step": 101 }, { "epoch": 0.016034899487119024, "grad_norm": 0.07027724385261536, "learning_rate": 4.999367811676655e-05, "loss": 1.5655, "step": 102 }, { "epoch": 0.016192104384051564, "grad_norm": 0.07320379465818405, "learning_rate": 4.9993538416876093e-05, "loss": 1.4869, "step": 103 }, { "epoch": 0.016349309280984103, "grad_norm": 0.0726180374622345, "learning_rate": 4.9993397190475396e-05, "loss": 1.4629, "step": 104 }, { "epoch": 0.016506514177916643, "grad_norm": 0.07542011886835098, "learning_rate": 4.999325443757309e-05, "loss": 1.5976, "step": 105 }, { "epoch": 0.016663719074849183, "grad_norm": 0.07440067082643509, "learning_rate": 4.9993110158177895e-05, "loss": 1.5469, "step": 106 }, { "epoch": 0.016820923971781723, "grad_norm": 0.07547372579574585, "learning_rate": 4.999296435229863e-05, "loss": 1.5328, "step": 107 }, { "epoch": 0.01697812886871426, "grad_norm": 0.07532137632369995, "learning_rate": 4.999281701994419e-05, "loss": 1.6742, "step": 108 }, { "epoch": 0.0171353337656468, "grad_norm": 0.07249438762664795, "learning_rate": 4.999266816112358e-05, "loss": 1.4799, "step": 109 }, { "epoch": 0.01729253866257934, "grad_norm": 0.07399806380271912, "learning_rate": 4.999251777584589e-05, "loss": 1.5438, "step": 110 }, { "epoch": 0.017449743559511878, "grad_norm": 0.08135057240724564, "learning_rate": 4.99923658641203e-05, "loss": 1.5608, "step": 111 }, { "epoch": 0.017606948456444418, "grad_norm": 0.07508935779333115, "learning_rate": 4.99922124259561e-05, "loss": 1.5894, "step": 112 }, { "epoch": 0.017764153353376957, "grad_norm": 0.07432372123003006, "learning_rate": 4.999205746136265e-05, "loss": 1.4818, "step": 113 }, { "epoch": 0.017921358250309497, "grad_norm": 0.07694194465875626, "learning_rate": 4.999190097034942e-05, "loss": 1.5629, "step": 114 }, { "epoch": 0.018078563147242037, "grad_norm": 0.07384433597326279, "learning_rate": 4.999174295292597e-05, "loss": 1.4829, "step": 115 }, { "epoch": 0.018235768044174577, "grad_norm": 0.07152919471263885, "learning_rate": 4.999158340910195e-05, "loss": 1.4748, "step": 116 }, { "epoch": 0.018392972941107116, "grad_norm": 0.07719701528549194, "learning_rate": 4.999142233888709e-05, "loss": 1.5524, "step": 117 }, { "epoch": 0.018550177838039656, "grad_norm": 0.07540587335824966, "learning_rate": 4.999125974229125e-05, "loss": 1.4661, "step": 118 }, { "epoch": 0.018707382734972196, "grad_norm": 0.0787581130862236, "learning_rate": 4.9991095619324344e-05, "loss": 1.6455, "step": 119 }, { "epoch": 0.018864587631904736, "grad_norm": 0.07454577833414078, "learning_rate": 4.999092996999641e-05, "loss": 1.5083, "step": 120 }, { "epoch": 0.019021792528837272, "grad_norm": 0.0751076266169548, "learning_rate": 4.9990762794317545e-05, "loss": 1.4874, "step": 121 }, { "epoch": 0.01917899742576981, "grad_norm": 0.07733119279146194, "learning_rate": 4.999059409229798e-05, "loss": 1.6308, "step": 122 }, { "epoch": 0.01933620232270235, "grad_norm": 0.07897089421749115, "learning_rate": 4.999042386394802e-05, "loss": 1.5906, "step": 123 }, { "epoch": 0.01949340721963489, "grad_norm": 0.07758141309022903, "learning_rate": 4.999025210927804e-05, "loss": 1.5604, "step": 124 }, { "epoch": 0.01965061211656743, "grad_norm": 0.07845707982778549, "learning_rate": 4.9990078828298544e-05, "loss": 1.5901, "step": 125 }, { "epoch": 0.01980781701349997, "grad_norm": 0.0772818773984909, "learning_rate": 4.998990402102012e-05, "loss": 1.4516, "step": 126 }, { "epoch": 0.01996502191043251, "grad_norm": 0.07795504480600357, "learning_rate": 4.998972768745344e-05, "loss": 1.4642, "step": 127 }, { "epoch": 0.02012222680736505, "grad_norm": 0.0784008800983429, "learning_rate": 4.998954982760926e-05, "loss": 1.5936, "step": 128 }, { "epoch": 0.02027943170429759, "grad_norm": 0.07791212201118469, "learning_rate": 4.9989370441498465e-05, "loss": 1.4705, "step": 129 }, { "epoch": 0.02043663660123013, "grad_norm": 0.07785367220640182, "learning_rate": 4.9989189529132004e-05, "loss": 1.5085, "step": 130 }, { "epoch": 0.02059384149816267, "grad_norm": 0.07916689664125443, "learning_rate": 4.9989007090520925e-05, "loss": 1.5365, "step": 131 }, { "epoch": 0.02075104639509521, "grad_norm": 0.0775083601474762, "learning_rate": 4.9988823125676367e-05, "loss": 1.5286, "step": 132 }, { "epoch": 0.020908251292027745, "grad_norm": 0.08110442757606506, "learning_rate": 4.998863763460956e-05, "loss": 1.5779, "step": 133 }, { "epoch": 0.021065456188960285, "grad_norm": 0.0814640000462532, "learning_rate": 4.998845061733185e-05, "loss": 1.4778, "step": 134 }, { "epoch": 0.021222661085892824, "grad_norm": 0.08069492131471634, "learning_rate": 4.998826207385465e-05, "loss": 1.5317, "step": 135 }, { "epoch": 0.021379865982825364, "grad_norm": 0.07377774268388748, "learning_rate": 4.998807200418948e-05, "loss": 1.5258, "step": 136 }, { "epoch": 0.021537070879757904, "grad_norm": 0.0787922590970993, "learning_rate": 4.9987880408347945e-05, "loss": 1.5185, "step": 137 }, { "epoch": 0.021694275776690444, "grad_norm": 0.07662995159626007, "learning_rate": 4.9987687286341745e-05, "loss": 1.4637, "step": 138 }, { "epoch": 0.021851480673622983, "grad_norm": 0.08528955280780792, "learning_rate": 4.9987492638182676e-05, "loss": 1.4776, "step": 139 }, { "epoch": 0.022008685570555523, "grad_norm": 0.08089053630828857, "learning_rate": 4.9987296463882626e-05, "loss": 1.5885, "step": 140 }, { "epoch": 0.022165890467488063, "grad_norm": 0.08029694855213165, "learning_rate": 4.998709876345358e-05, "loss": 1.4557, "step": 141 }, { "epoch": 0.022323095364420602, "grad_norm": 0.07918502390384674, "learning_rate": 4.9986899536907614e-05, "loss": 1.4285, "step": 142 }, { "epoch": 0.022480300261353142, "grad_norm": 0.0813126415014267, "learning_rate": 4.998669878425689e-05, "loss": 1.5958, "step": 143 }, { "epoch": 0.022637505158285682, "grad_norm": 0.07935188710689545, "learning_rate": 4.998649650551368e-05, "loss": 1.5249, "step": 144 }, { "epoch": 0.02279471005521822, "grad_norm": 0.08163304626941681, "learning_rate": 4.9986292700690324e-05, "loss": 1.483, "step": 145 }, { "epoch": 0.022951914952150758, "grad_norm": 0.08277447521686554, "learning_rate": 4.998608736979928e-05, "loss": 1.6212, "step": 146 }, { "epoch": 0.023109119849083298, "grad_norm": 0.08285827934741974, "learning_rate": 4.9985880512853076e-05, "loss": 1.4495, "step": 147 }, { "epoch": 0.023266324746015837, "grad_norm": 0.082750603556633, "learning_rate": 4.998567212986437e-05, "loss": 1.4335, "step": 148 }, { "epoch": 0.023423529642948377, "grad_norm": 0.07986058294773102, "learning_rate": 4.998546222084587e-05, "loss": 1.4704, "step": 149 }, { "epoch": 0.023580734539880917, "grad_norm": 0.08105576783418655, "learning_rate": 4.9985250785810396e-05, "loss": 1.5183, "step": 150 }, { "epoch": 0.023737939436813457, "grad_norm": 0.08202917128801346, "learning_rate": 4.9985037824770866e-05, "loss": 1.5423, "step": 151 }, { "epoch": 0.023895144333745996, "grad_norm": 0.08937894552946091, "learning_rate": 4.998482333774029e-05, "loss": 1.5731, "step": 152 }, { "epoch": 0.024052349230678536, "grad_norm": 0.08333728462457657, "learning_rate": 4.9984607324731766e-05, "loss": 1.5133, "step": 153 }, { "epoch": 0.024209554127611076, "grad_norm": 0.08529175072908401, "learning_rate": 4.998438978575849e-05, "loss": 1.516, "step": 154 }, { "epoch": 0.024366759024543615, "grad_norm": 0.08508963882923126, "learning_rate": 4.998417072083374e-05, "loss": 1.5646, "step": 155 }, { "epoch": 0.024523963921476155, "grad_norm": 0.08971578627824783, "learning_rate": 4.99839501299709e-05, "loss": 1.4714, "step": 156 }, { "epoch": 0.024681168818408695, "grad_norm": 0.08380109816789627, "learning_rate": 4.998372801318345e-05, "loss": 1.4476, "step": 157 }, { "epoch": 0.024838373715341235, "grad_norm": 0.08533143252134323, "learning_rate": 4.9983504370484945e-05, "loss": 1.4866, "step": 158 }, { "epoch": 0.02499557861227377, "grad_norm": 0.08318709582090378, "learning_rate": 4.998327920188905e-05, "loss": 1.5274, "step": 159 }, { "epoch": 0.02515278350920631, "grad_norm": 0.08486370742321014, "learning_rate": 4.9983052507409525e-05, "loss": 1.4713, "step": 160 }, { "epoch": 0.02515278350920631, "eval_loss": 1.5136528015136719, "eval_runtime": 2318.8971, "eval_samples_per_second": 3.992, "eval_steps_per_second": 1.996, "step": 160 }, { "epoch": 0.02530998840613885, "grad_norm": 0.08242359757423401, "learning_rate": 4.9982824287060195e-05, "loss": 1.5069, "step": 161 }, { "epoch": 0.02546719330307139, "grad_norm": 0.08547423779964447, "learning_rate": 4.9982594540855014e-05, "loss": 1.4973, "step": 162 }, { "epoch": 0.02562439820000393, "grad_norm": 0.08345580101013184, "learning_rate": 4.9982363268808016e-05, "loss": 1.5078, "step": 163 }, { "epoch": 0.02578160309693647, "grad_norm": 0.0830339640378952, "learning_rate": 4.9982130470933316e-05, "loss": 1.4098, "step": 164 }, { "epoch": 0.02593880799386901, "grad_norm": 0.08568515628576279, "learning_rate": 4.998189614724514e-05, "loss": 1.4628, "step": 165 }, { "epoch": 0.02609601289080155, "grad_norm": 0.08261829614639282, "learning_rate": 4.998166029775779e-05, "loss": 1.4492, "step": 166 }, { "epoch": 0.02625321778773409, "grad_norm": 0.08944887667894363, "learning_rate": 4.998142292248569e-05, "loss": 1.5633, "step": 167 }, { "epoch": 0.02641042268466663, "grad_norm": 0.08632911741733551, "learning_rate": 4.998118402144332e-05, "loss": 1.5106, "step": 168 }, { "epoch": 0.026567627581599168, "grad_norm": 0.08733859658241272, "learning_rate": 4.998094359464528e-05, "loss": 1.5607, "step": 169 }, { "epoch": 0.026724832478531708, "grad_norm": 0.08667927235364914, "learning_rate": 4.9980701642106245e-05, "loss": 1.4544, "step": 170 }, { "epoch": 0.026882037375464244, "grad_norm": 0.08655022084712982, "learning_rate": 4.9980458163841006e-05, "loss": 1.5264, "step": 171 }, { "epoch": 0.027039242272396784, "grad_norm": 0.08899988234043121, "learning_rate": 4.9980213159864426e-05, "loss": 1.4778, "step": 172 }, { "epoch": 0.027196447169329323, "grad_norm": 0.09411856532096863, "learning_rate": 4.997996663019147e-05, "loss": 1.5269, "step": 173 }, { "epoch": 0.027353652066261863, "grad_norm": 0.087191641330719, "learning_rate": 4.997971857483719e-05, "loss": 1.5166, "step": 174 }, { "epoch": 0.027510856963194403, "grad_norm": 0.08959636092185974, "learning_rate": 4.997946899381675e-05, "loss": 1.5503, "step": 175 }, { "epoch": 0.027668061860126943, "grad_norm": 0.0951187014579773, "learning_rate": 4.997921788714537e-05, "loss": 1.4879, "step": 176 }, { "epoch": 0.027825266757059482, "grad_norm": 0.09324768930673599, "learning_rate": 4.997896525483841e-05, "loss": 1.5714, "step": 177 }, { "epoch": 0.027982471653992022, "grad_norm": 0.08633986115455627, "learning_rate": 4.997871109691129e-05, "loss": 1.4198, "step": 178 }, { "epoch": 0.028139676550924562, "grad_norm": 0.08947525173425674, "learning_rate": 4.9978455413379535e-05, "loss": 1.4702, "step": 179 }, { "epoch": 0.0282968814478571, "grad_norm": 0.09275490790605545, "learning_rate": 4.9978198204258766e-05, "loss": 1.5252, "step": 180 }, { "epoch": 0.02845408634478964, "grad_norm": 0.08761609345674515, "learning_rate": 4.9977939469564676e-05, "loss": 1.505, "step": 181 }, { "epoch": 0.02861129124172218, "grad_norm": 0.08683087676763535, "learning_rate": 4.997767920931308e-05, "loss": 1.5059, "step": 182 }, { "epoch": 0.02876849613865472, "grad_norm": 0.08931361883878708, "learning_rate": 4.997741742351988e-05, "loss": 1.5003, "step": 183 }, { "epoch": 0.028925701035587257, "grad_norm": 0.08820109069347382, "learning_rate": 4.997715411220105e-05, "loss": 1.5132, "step": 184 }, { "epoch": 0.029082905932519797, "grad_norm": 0.09284964948892593, "learning_rate": 4.997688927537268e-05, "loss": 1.4561, "step": 185 }, { "epoch": 0.029240110829452336, "grad_norm": 0.09472864121198654, "learning_rate": 4.997662291305094e-05, "loss": 1.4729, "step": 186 }, { "epoch": 0.029397315726384876, "grad_norm": 0.08725330233573914, "learning_rate": 4.997635502525211e-05, "loss": 1.3994, "step": 187 }, { "epoch": 0.029554520623317416, "grad_norm": 0.09085626900196075, "learning_rate": 4.9976085611992536e-05, "loss": 1.4695, "step": 188 }, { "epoch": 0.029711725520249956, "grad_norm": 0.09322400391101837, "learning_rate": 4.9975814673288684e-05, "loss": 1.4753, "step": 189 }, { "epoch": 0.029868930417182495, "grad_norm": 0.08927160501480103, "learning_rate": 4.99755422091571e-05, "loss": 1.4465, "step": 190 }, { "epoch": 0.030026135314115035, "grad_norm": 0.09317070990800858, "learning_rate": 4.997526821961442e-05, "loss": 1.5124, "step": 191 }, { "epoch": 0.030183340211047575, "grad_norm": 0.08911167085170746, "learning_rate": 4.9974992704677385e-05, "loss": 1.4515, "step": 192 }, { "epoch": 0.030340545107980114, "grad_norm": 0.09432853013277054, "learning_rate": 4.997471566436282e-05, "loss": 1.4623, "step": 193 }, { "epoch": 0.030497750004912654, "grad_norm": 0.09417332708835602, "learning_rate": 4.997443709868764e-05, "loss": 1.5103, "step": 194 }, { "epoch": 0.030654954901845194, "grad_norm": 0.09564542025327682, "learning_rate": 4.997415700766887e-05, "loss": 1.4929, "step": 195 }, { "epoch": 0.030812159798777734, "grad_norm": 0.09101004898548126, "learning_rate": 4.997387539132361e-05, "loss": 1.4225, "step": 196 }, { "epoch": 0.03096936469571027, "grad_norm": 0.09196274727582932, "learning_rate": 4.997359224966906e-05, "loss": 1.4701, "step": 197 }, { "epoch": 0.03112656959264281, "grad_norm": 0.09573279321193695, "learning_rate": 4.997330758272251e-05, "loss": 1.4425, "step": 198 }, { "epoch": 0.03128377448957535, "grad_norm": 0.09180758893489838, "learning_rate": 4.9973021390501354e-05, "loss": 1.4426, "step": 199 }, { "epoch": 0.03144097938650789, "grad_norm": 0.09583238512277603, "learning_rate": 4.997273367302306e-05, "loss": 1.5158, "step": 200 }, { "epoch": 0.03159818428344043, "grad_norm": 0.09394747018814087, "learning_rate": 4.997244443030521e-05, "loss": 1.4306, "step": 201 }, { "epoch": 0.03175538918037297, "grad_norm": 0.09470199793577194, "learning_rate": 4.9972153662365474e-05, "loss": 1.5286, "step": 202 }, { "epoch": 0.031912594077305505, "grad_norm": 0.09274959564208984, "learning_rate": 4.997186136922161e-05, "loss": 1.4803, "step": 203 }, { "epoch": 0.03206979897423805, "grad_norm": 0.09344369918107986, "learning_rate": 4.997156755089145e-05, "loss": 1.5449, "step": 204 }, { "epoch": 0.032227003871170584, "grad_norm": 0.09794919937849045, "learning_rate": 4.997127220739296e-05, "loss": 1.4383, "step": 205 }, { "epoch": 0.03238420876810313, "grad_norm": 0.09698093682527542, "learning_rate": 4.997097533874418e-05, "loss": 1.4462, "step": 206 }, { "epoch": 0.032541413665035664, "grad_norm": 0.09690559655427933, "learning_rate": 4.997067694496323e-05, "loss": 1.4735, "step": 207 }, { "epoch": 0.03269861856196821, "grad_norm": 0.09657544642686844, "learning_rate": 4.9970377026068336e-05, "loss": 1.5672, "step": 208 }, { "epoch": 0.03285582345890074, "grad_norm": 0.09483659267425537, "learning_rate": 4.9970075582077825e-05, "loss": 1.4931, "step": 209 }, { "epoch": 0.033013028355833286, "grad_norm": 0.09744243323802948, "learning_rate": 4.9969772613010104e-05, "loss": 1.4638, "step": 210 }, { "epoch": 0.03317023325276582, "grad_norm": 0.09521006047725677, "learning_rate": 4.9969468118883665e-05, "loss": 1.4127, "step": 211 }, { "epoch": 0.033327438149698366, "grad_norm": 0.09646004438400269, "learning_rate": 4.996916209971713e-05, "loss": 1.5139, "step": 212 }, { "epoch": 0.0334846430466309, "grad_norm": 0.09292810410261154, "learning_rate": 4.996885455552916e-05, "loss": 1.4399, "step": 213 }, { "epoch": 0.033641847943563445, "grad_norm": 0.09986516088247299, "learning_rate": 4.996854548633857e-05, "loss": 1.4637, "step": 214 }, { "epoch": 0.03379905284049598, "grad_norm": 0.09723702073097229, "learning_rate": 4.996823489216421e-05, "loss": 1.5673, "step": 215 }, { "epoch": 0.03395625773742852, "grad_norm": 0.09608977288007736, "learning_rate": 4.996792277302507e-05, "loss": 1.4428, "step": 216 }, { "epoch": 0.03411346263436106, "grad_norm": 0.09329380095005035, "learning_rate": 4.99676091289402e-05, "loss": 1.3892, "step": 217 }, { "epoch": 0.0342706675312936, "grad_norm": 0.0959913358092308, "learning_rate": 4.996729395992875e-05, "loss": 1.5219, "step": 218 }, { "epoch": 0.03442787242822614, "grad_norm": 0.09832671284675598, "learning_rate": 4.996697726600999e-05, "loss": 1.5259, "step": 219 }, { "epoch": 0.03458507732515868, "grad_norm": 0.10061636567115784, "learning_rate": 4.996665904720325e-05, "loss": 1.5216, "step": 220 }, { "epoch": 0.03474228222209122, "grad_norm": 0.09742400050163269, "learning_rate": 4.9966339303527965e-05, "loss": 1.3819, "step": 221 }, { "epoch": 0.034899487119023756, "grad_norm": 0.09629969298839569, "learning_rate": 4.996601803500367e-05, "loss": 1.5341, "step": 222 }, { "epoch": 0.0350566920159563, "grad_norm": 0.09776200354099274, "learning_rate": 4.996569524164998e-05, "loss": 1.5054, "step": 223 }, { "epoch": 0.035213896912888835, "grad_norm": 0.1008530780673027, "learning_rate": 4.996537092348661e-05, "loss": 1.5333, "step": 224 }, { "epoch": 0.03537110180982138, "grad_norm": 0.09749735891819, "learning_rate": 4.996504508053338e-05, "loss": 1.3899, "step": 225 }, { "epoch": 0.035528306706753915, "grad_norm": 0.10522401332855225, "learning_rate": 4.9964717712810175e-05, "loss": 1.5413, "step": 226 }, { "epoch": 0.03568551160368646, "grad_norm": 0.09566272795200348, "learning_rate": 4.9964388820336996e-05, "loss": 1.435, "step": 227 }, { "epoch": 0.035842716500618994, "grad_norm": 0.10133984684944153, "learning_rate": 4.996405840313393e-05, "loss": 1.445, "step": 228 }, { "epoch": 0.03599992139755153, "grad_norm": 0.09702739119529724, "learning_rate": 4.996372646122116e-05, "loss": 1.4287, "step": 229 }, { "epoch": 0.036157126294484074, "grad_norm": 0.1012992411851883, "learning_rate": 4.996339299461896e-05, "loss": 1.382, "step": 230 }, { "epoch": 0.03631433119141661, "grad_norm": 0.09877166152000427, "learning_rate": 4.99630580033477e-05, "loss": 1.5729, "step": 231 }, { "epoch": 0.03647153608834915, "grad_norm": 0.1033129170536995, "learning_rate": 4.996272148742783e-05, "loss": 1.4754, "step": 232 }, { "epoch": 0.03662874098528169, "grad_norm": 0.09901215881109238, "learning_rate": 4.9962383446879914e-05, "loss": 1.5153, "step": 233 }, { "epoch": 0.03678594588221423, "grad_norm": 0.10241983830928802, "learning_rate": 4.996204388172458e-05, "loss": 1.5131, "step": 234 }, { "epoch": 0.03694315077914677, "grad_norm": 0.09574593603610992, "learning_rate": 4.9961702791982594e-05, "loss": 1.5285, "step": 235 }, { "epoch": 0.03710035567607931, "grad_norm": 0.10309838503599167, "learning_rate": 4.996136017767477e-05, "loss": 1.5751, "step": 236 }, { "epoch": 0.03725756057301185, "grad_norm": 0.09928470849990845, "learning_rate": 4.996101603882204e-05, "loss": 1.5108, "step": 237 }, { "epoch": 0.03741476546994439, "grad_norm": 0.10514767467975616, "learning_rate": 4.996067037544542e-05, "loss": 1.4206, "step": 238 }, { "epoch": 0.03757197036687693, "grad_norm": 0.10411518812179565, "learning_rate": 4.996032318756601e-05, "loss": 1.5628, "step": 239 }, { "epoch": 0.03772917526380947, "grad_norm": 0.0989808738231659, "learning_rate": 4.9959974475205045e-05, "loss": 1.4444, "step": 240 }, { "epoch": 0.03788638016074201, "grad_norm": 0.10069911926984787, "learning_rate": 4.9959624238383804e-05, "loss": 1.4805, "step": 241 }, { "epoch": 0.038043585057674544, "grad_norm": 0.10637518763542175, "learning_rate": 4.995927247712367e-05, "loss": 1.5289, "step": 242 }, { "epoch": 0.03820078995460709, "grad_norm": 0.10085684061050415, "learning_rate": 4.995891919144614e-05, "loss": 1.5288, "step": 243 }, { "epoch": 0.03835799485153962, "grad_norm": 0.09989017248153687, "learning_rate": 4.995856438137279e-05, "loss": 1.5444, "step": 244 }, { "epoch": 0.038515199748472166, "grad_norm": 0.10382463037967682, "learning_rate": 4.9958208046925294e-05, "loss": 1.4621, "step": 245 }, { "epoch": 0.0386724046454047, "grad_norm": 0.10208063572645187, "learning_rate": 4.99578501881254e-05, "loss": 1.5003, "step": 246 }, { "epoch": 0.038829609542337246, "grad_norm": 0.1028011366724968, "learning_rate": 4.9957490804994977e-05, "loss": 1.516, "step": 247 }, { "epoch": 0.03898681443926978, "grad_norm": 0.10475701838731766, "learning_rate": 4.995712989755598e-05, "loss": 1.5333, "step": 248 }, { "epoch": 0.039144019336202325, "grad_norm": 0.1038154736161232, "learning_rate": 4.995676746583044e-05, "loss": 1.4779, "step": 249 }, { "epoch": 0.03930122423313486, "grad_norm": 0.10413440316915512, "learning_rate": 4.99564035098405e-05, "loss": 1.5241, "step": 250 }, { "epoch": 0.039458429130067404, "grad_norm": 0.09869382530450821, "learning_rate": 4.995603802960838e-05, "loss": 1.442, "step": 251 }, { "epoch": 0.03961563402699994, "grad_norm": 0.10138234496116638, "learning_rate": 4.995567102515641e-05, "loss": 1.5393, "step": 252 }, { "epoch": 0.039772838923932484, "grad_norm": 0.10225867480039597, "learning_rate": 4.995530249650701e-05, "loss": 1.4516, "step": 253 }, { "epoch": 0.03993004382086502, "grad_norm": 0.09942895174026489, "learning_rate": 4.995493244368268e-05, "loss": 1.4543, "step": 254 }, { "epoch": 0.040087248717797556, "grad_norm": 0.11218860745429993, "learning_rate": 4.995456086670602e-05, "loss": 1.4985, "step": 255 }, { "epoch": 0.0402444536147301, "grad_norm": 0.10839337855577469, "learning_rate": 4.9954187765599736e-05, "loss": 1.4805, "step": 256 }, { "epoch": 0.040401658511662636, "grad_norm": 0.10317599028348923, "learning_rate": 4.9953813140386595e-05, "loss": 1.4412, "step": 257 }, { "epoch": 0.04055886340859518, "grad_norm": 0.10285656154155731, "learning_rate": 4.99534369910895e-05, "loss": 1.476, "step": 258 }, { "epoch": 0.040716068305527715, "grad_norm": 0.10330680012702942, "learning_rate": 4.995305931773141e-05, "loss": 1.5157, "step": 259 }, { "epoch": 0.04087327320246026, "grad_norm": 0.1086694598197937, "learning_rate": 4.99526801203354e-05, "loss": 1.4999, "step": 260 }, { "epoch": 0.041030478099392795, "grad_norm": 0.10800144821405411, "learning_rate": 4.995229939892464e-05, "loss": 1.4764, "step": 261 }, { "epoch": 0.04118768299632534, "grad_norm": 0.10645303875207901, "learning_rate": 4.9951917153522355e-05, "loss": 1.4404, "step": 262 }, { "epoch": 0.041344887893257874, "grad_norm": 0.10440964996814728, "learning_rate": 4.9951533384151906e-05, "loss": 1.3678, "step": 263 }, { "epoch": 0.04150209279019042, "grad_norm": 0.10993078351020813, "learning_rate": 4.995114809083673e-05, "loss": 1.5064, "step": 264 }, { "epoch": 0.041659297687122954, "grad_norm": 0.10710245370864868, "learning_rate": 4.9950761273600366e-05, "loss": 1.4134, "step": 265 }, { "epoch": 0.04181650258405549, "grad_norm": 0.11030582338571548, "learning_rate": 4.995037293246644e-05, "loss": 1.5299, "step": 266 }, { "epoch": 0.04197370748098803, "grad_norm": 0.1058267131447792, "learning_rate": 4.994998306745866e-05, "loss": 1.3654, "step": 267 }, { "epoch": 0.04213091237792057, "grad_norm": 0.10541702806949615, "learning_rate": 4.994959167860084e-05, "loss": 1.4297, "step": 268 }, { "epoch": 0.04228811727485311, "grad_norm": 0.11085420846939087, "learning_rate": 4.994919876591689e-05, "loss": 1.4876, "step": 269 }, { "epoch": 0.04244532217178565, "grad_norm": 0.11054470390081406, "learning_rate": 4.994880432943081e-05, "loss": 1.574, "step": 270 }, { "epoch": 0.04260252706871819, "grad_norm": 0.11234510689973831, "learning_rate": 4.994840836916668e-05, "loss": 1.5079, "step": 271 }, { "epoch": 0.04275973196565073, "grad_norm": 0.11040106415748596, "learning_rate": 4.994801088514869e-05, "loss": 1.5091, "step": 272 }, { "epoch": 0.04291693686258327, "grad_norm": 0.10639887303113937, "learning_rate": 4.994761187740111e-05, "loss": 1.4495, "step": 273 }, { "epoch": 0.04307414175951581, "grad_norm": 0.11268071085214615, "learning_rate": 4.994721134594833e-05, "loss": 1.5057, "step": 274 }, { "epoch": 0.04323134665644835, "grad_norm": 0.10079260170459747, "learning_rate": 4.994680929081479e-05, "loss": 1.4145, "step": 275 }, { "epoch": 0.04338855155338089, "grad_norm": 0.11474710702896118, "learning_rate": 4.994640571202506e-05, "loss": 1.5061, "step": 276 }, { "epoch": 0.04354575645031343, "grad_norm": 0.10946876555681229, "learning_rate": 4.994600060960377e-05, "loss": 1.5306, "step": 277 }, { "epoch": 0.04370296134724597, "grad_norm": 0.11192137002944946, "learning_rate": 4.994559398357569e-05, "loss": 1.5347, "step": 278 }, { "epoch": 0.0438601662441785, "grad_norm": 0.10744784027338028, "learning_rate": 4.994518583396564e-05, "loss": 1.4686, "step": 279 }, { "epoch": 0.044017371141111046, "grad_norm": 0.11113352328538895, "learning_rate": 4.9944776160798544e-05, "loss": 1.4101, "step": 280 }, { "epoch": 0.04417457603804358, "grad_norm": 0.11456230282783508, "learning_rate": 4.994436496409943e-05, "loss": 1.4036, "step": 281 }, { "epoch": 0.044331780934976125, "grad_norm": 0.11608672887086868, "learning_rate": 4.994395224389342e-05, "loss": 1.4949, "step": 282 }, { "epoch": 0.04448898583190866, "grad_norm": 0.1232326403260231, "learning_rate": 4.9943538000205705e-05, "loss": 1.5501, "step": 283 }, { "epoch": 0.044646190728841205, "grad_norm": 0.11791515350341797, "learning_rate": 4.994312223306159e-05, "loss": 1.4542, "step": 284 }, { "epoch": 0.04480339562577374, "grad_norm": 0.11657550930976868, "learning_rate": 4.9942704942486476e-05, "loss": 1.4724, "step": 285 }, { "epoch": 0.044960600522706284, "grad_norm": 0.11560262739658356, "learning_rate": 4.994228612850584e-05, "loss": 1.4036, "step": 286 }, { "epoch": 0.04511780541963882, "grad_norm": 0.10999175906181335, "learning_rate": 4.994186579114527e-05, "loss": 1.4489, "step": 287 }, { "epoch": 0.045275010316571364, "grad_norm": 0.11586826294660568, "learning_rate": 4.9941443930430436e-05, "loss": 1.5486, "step": 288 }, { "epoch": 0.0454322152135039, "grad_norm": 0.11349951475858688, "learning_rate": 4.994102054638711e-05, "loss": 1.5698, "step": 289 }, { "epoch": 0.04558942011043644, "grad_norm": 0.11978698521852493, "learning_rate": 4.9940595639041134e-05, "loss": 1.3933, "step": 290 }, { "epoch": 0.04574662500736898, "grad_norm": 0.11438622325658798, "learning_rate": 4.994016920841846e-05, "loss": 1.5005, "step": 291 }, { "epoch": 0.045903829904301516, "grad_norm": 0.11395915597677231, "learning_rate": 4.9939741254545155e-05, "loss": 1.4521, "step": 292 }, { "epoch": 0.04606103480123406, "grad_norm": 0.11659599095582962, "learning_rate": 4.993931177744734e-05, "loss": 1.5166, "step": 293 }, { "epoch": 0.046218239698166595, "grad_norm": 0.11053171753883362, "learning_rate": 4.9938880777151254e-05, "loss": 1.4459, "step": 294 }, { "epoch": 0.04637544459509914, "grad_norm": 0.11428084224462509, "learning_rate": 4.993844825368321e-05, "loss": 1.4448, "step": 295 }, { "epoch": 0.046532649492031675, "grad_norm": 0.10734150558710098, "learning_rate": 4.993801420706964e-05, "loss": 1.3388, "step": 296 }, { "epoch": 0.04668985438896422, "grad_norm": 0.11137369275093079, "learning_rate": 4.993757863733703e-05, "loss": 1.4155, "step": 297 }, { "epoch": 0.046847059285896754, "grad_norm": 0.1221408098936081, "learning_rate": 4.993714154451202e-05, "loss": 1.4884, "step": 298 }, { "epoch": 0.0470042641828293, "grad_norm": 0.11707969009876251, "learning_rate": 4.993670292862127e-05, "loss": 1.4605, "step": 299 }, { "epoch": 0.047161469079761834, "grad_norm": 0.11751751601696014, "learning_rate": 4.993626278969158e-05, "loss": 1.5538, "step": 300 }, { "epoch": 0.04731867397669438, "grad_norm": 0.11617731302976608, "learning_rate": 4.993582112774984e-05, "loss": 1.438, "step": 301 }, { "epoch": 0.04747587887362691, "grad_norm": 0.15164637565612793, "learning_rate": 4.993537794282302e-05, "loss": 1.4607, "step": 302 }, { "epoch": 0.047633083770559456, "grad_norm": 0.12434446811676025, "learning_rate": 4.9934933234938193e-05, "loss": 1.4167, "step": 303 }, { "epoch": 0.04779028866749199, "grad_norm": 0.12518739700317383, "learning_rate": 4.993448700412251e-05, "loss": 1.4003, "step": 304 }, { "epoch": 0.04794749356442453, "grad_norm": 0.11146944761276245, "learning_rate": 4.993403925040323e-05, "loss": 1.3913, "step": 305 }, { "epoch": 0.04810469846135707, "grad_norm": 0.11682326346635818, "learning_rate": 4.993358997380771e-05, "loss": 1.3415, "step": 306 }, { "epoch": 0.04826190335828961, "grad_norm": 0.1197504773736, "learning_rate": 4.993313917436336e-05, "loss": 1.515, "step": 307 }, { "epoch": 0.04841910825522215, "grad_norm": 0.14647473394870758, "learning_rate": 4.993268685209775e-05, "loss": 1.4529, "step": 308 }, { "epoch": 0.04857631315215469, "grad_norm": 0.12431525439023972, "learning_rate": 4.9932233007038484e-05, "loss": 1.5426, "step": 309 }, { "epoch": 0.04873351804908723, "grad_norm": 0.11715538799762726, "learning_rate": 4.9931777639213284e-05, "loss": 1.4615, "step": 310 }, { "epoch": 0.04889072294601977, "grad_norm": 0.12391876429319382, "learning_rate": 4.993132074864997e-05, "loss": 1.4138, "step": 311 }, { "epoch": 0.04904792784295231, "grad_norm": 0.11894181370735168, "learning_rate": 4.9930862335376444e-05, "loss": 1.4383, "step": 312 }, { "epoch": 0.049205132739884846, "grad_norm": 0.1225295439362526, "learning_rate": 4.9930402399420695e-05, "loss": 1.3847, "step": 313 }, { "epoch": 0.04936233763681739, "grad_norm": 0.11435995995998383, "learning_rate": 4.9929940940810825e-05, "loss": 1.4254, "step": 314 }, { "epoch": 0.049519542533749926, "grad_norm": 0.11988761276006699, "learning_rate": 4.9929477959575024e-05, "loss": 1.4787, "step": 315 }, { "epoch": 0.04967674743068247, "grad_norm": 0.11983373016119003, "learning_rate": 4.992901345574155e-05, "loss": 1.4341, "step": 316 }, { "epoch": 0.049833952327615005, "grad_norm": 0.13395054638385773, "learning_rate": 4.992854742933878e-05, "loss": 1.4315, "step": 317 }, { "epoch": 0.04999115722454754, "grad_norm": 0.12578143179416656, "learning_rate": 4.9928079880395186e-05, "loss": 1.4143, "step": 318 }, { "epoch": 0.050148362121480085, "grad_norm": 0.1401878446340561, "learning_rate": 4.992761080893932e-05, "loss": 1.4665, "step": 319 }, { "epoch": 0.05030556701841262, "grad_norm": 0.13048145174980164, "learning_rate": 4.9927140214999826e-05, "loss": 1.4266, "step": 320 }, { "epoch": 0.05030556701841262, "eval_loss": 1.450086236000061, "eval_runtime": 2316.1877, "eval_samples_per_second": 3.997, "eval_steps_per_second": 1.999, "step": 320 }, { "epoch": 0.050462771915345164, "grad_norm": 0.13121232390403748, "learning_rate": 4.992666809860545e-05, "loss": 1.4946, "step": 321 }, { "epoch": 0.0506199768122777, "grad_norm": 0.13547195494174957, "learning_rate": 4.9926194459785015e-05, "loss": 1.5532, "step": 322 }, { "epoch": 0.050777181709210244, "grad_norm": 0.11797169595956802, "learning_rate": 4.992571929856747e-05, "loss": 1.4118, "step": 323 }, { "epoch": 0.05093438660614278, "grad_norm": 0.12734922766685486, "learning_rate": 4.992524261498183e-05, "loss": 1.4427, "step": 324 }, { "epoch": 0.05109159150307532, "grad_norm": 0.12444902211427689, "learning_rate": 4.99247644090572e-05, "loss": 1.4369, "step": 325 }, { "epoch": 0.05124879640000786, "grad_norm": 0.12244518846273422, "learning_rate": 4.99242846808228e-05, "loss": 1.4587, "step": 326 }, { "epoch": 0.0514060012969404, "grad_norm": 0.12424397468566895, "learning_rate": 4.9923803430307916e-05, "loss": 1.3949, "step": 327 }, { "epoch": 0.05156320619387294, "grad_norm": 0.1352718621492386, "learning_rate": 4.9923320657541944e-05, "loss": 1.504, "step": 328 }, { "epoch": 0.05172041109080548, "grad_norm": 0.12855666875839233, "learning_rate": 4.992283636255438e-05, "loss": 1.4271, "step": 329 }, { "epoch": 0.05187761598773802, "grad_norm": 0.129829540848732, "learning_rate": 4.99223505453748e-05, "loss": 1.455, "step": 330 }, { "epoch": 0.052034820884670555, "grad_norm": 0.12780050933361053, "learning_rate": 4.992186320603286e-05, "loss": 1.4045, "step": 331 }, { "epoch": 0.0521920257816031, "grad_norm": 0.13515712320804596, "learning_rate": 4.992137434455834e-05, "loss": 1.4335, "step": 332 }, { "epoch": 0.052349230678535634, "grad_norm": 0.15026766061782837, "learning_rate": 4.99208839609811e-05, "loss": 1.5386, "step": 333 }, { "epoch": 0.05250643557546818, "grad_norm": 0.13422101736068726, "learning_rate": 4.992039205533108e-05, "loss": 1.454, "step": 334 }, { "epoch": 0.05266364047240071, "grad_norm": 0.13735777139663696, "learning_rate": 4.991989862763833e-05, "loss": 1.4415, "step": 335 }, { "epoch": 0.05282084536933326, "grad_norm": 0.12985137104988098, "learning_rate": 4.9919403677932994e-05, "loss": 1.385, "step": 336 }, { "epoch": 0.05297805026626579, "grad_norm": 0.1301167607307434, "learning_rate": 4.9918907206245285e-05, "loss": 1.4364, "step": 337 }, { "epoch": 0.053135255163198336, "grad_norm": 0.1407599002122879, "learning_rate": 4.991840921260553e-05, "loss": 1.4454, "step": 338 }, { "epoch": 0.05329246006013087, "grad_norm": 0.12763133645057678, "learning_rate": 4.9917909697044164e-05, "loss": 1.4008, "step": 339 }, { "epoch": 0.053449664957063416, "grad_norm": 0.1443052589893341, "learning_rate": 4.991740865959167e-05, "loss": 1.5184, "step": 340 }, { "epoch": 0.05360686985399595, "grad_norm": 0.13496418297290802, "learning_rate": 4.991690610027866e-05, "loss": 1.3888, "step": 341 }, { "epoch": 0.05376407475092849, "grad_norm": 0.12681293487548828, "learning_rate": 4.991640201913583e-05, "loss": 1.42, "step": 342 }, { "epoch": 0.05392127964786103, "grad_norm": 0.13178062438964844, "learning_rate": 4.9915896416193965e-05, "loss": 1.4178, "step": 343 }, { "epoch": 0.05407848454479357, "grad_norm": 0.14452503621578217, "learning_rate": 4.991538929148394e-05, "loss": 1.4248, "step": 344 }, { "epoch": 0.05423568944172611, "grad_norm": 0.1352955400943756, "learning_rate": 4.991488064503674e-05, "loss": 1.4304, "step": 345 }, { "epoch": 0.05439289433865865, "grad_norm": 0.14846469461917877, "learning_rate": 4.991437047688343e-05, "loss": 1.4784, "step": 346 }, { "epoch": 0.05455009923559119, "grad_norm": 0.12475849688053131, "learning_rate": 4.9913858787055156e-05, "loss": 1.4131, "step": 347 }, { "epoch": 0.054707304132523726, "grad_norm": 0.13835409283638, "learning_rate": 4.991334557558318e-05, "loss": 1.4913, "step": 348 }, { "epoch": 0.05486450902945627, "grad_norm": 0.13921529054641724, "learning_rate": 4.991283084249885e-05, "loss": 1.3713, "step": 349 }, { "epoch": 0.055021713926388806, "grad_norm": 0.13188250362873077, "learning_rate": 4.9912314587833586e-05, "loss": 1.3608, "step": 350 }, { "epoch": 0.05517891882332135, "grad_norm": 0.12457428872585297, "learning_rate": 4.991179681161895e-05, "loss": 1.4427, "step": 351 }, { "epoch": 0.055336123720253885, "grad_norm": 0.12452542781829834, "learning_rate": 4.9911277513886535e-05, "loss": 1.4179, "step": 352 }, { "epoch": 0.05549332861718643, "grad_norm": 0.14799195528030396, "learning_rate": 4.9910756694668074e-05, "loss": 1.4532, "step": 353 }, { "epoch": 0.055650533514118965, "grad_norm": 0.13485541939735413, "learning_rate": 4.991023435399538e-05, "loss": 1.4114, "step": 354 }, { "epoch": 0.0558077384110515, "grad_norm": 0.1422443389892578, "learning_rate": 4.990971049190034e-05, "loss": 1.377, "step": 355 }, { "epoch": 0.055964943307984044, "grad_norm": 0.12994804978370667, "learning_rate": 4.990918510841496e-05, "loss": 1.4474, "step": 356 }, { "epoch": 0.05612214820491658, "grad_norm": 0.1429785192012787, "learning_rate": 4.990865820357133e-05, "loss": 1.4435, "step": 357 }, { "epoch": 0.056279353101849124, "grad_norm": 0.12979790568351746, "learning_rate": 4.9908129777401625e-05, "loss": 1.4039, "step": 358 }, { "epoch": 0.05643655799878166, "grad_norm": 0.1332644671201706, "learning_rate": 4.990759982993812e-05, "loss": 1.4377, "step": 359 }, { "epoch": 0.0565937628957142, "grad_norm": 0.13796579837799072, "learning_rate": 4.99070683612132e-05, "loss": 1.3951, "step": 360 }, { "epoch": 0.05675096779264674, "grad_norm": 0.14315246045589447, "learning_rate": 4.9906535371259294e-05, "loss": 1.4042, "step": 361 }, { "epoch": 0.05690817268957928, "grad_norm": 0.1463768631219864, "learning_rate": 4.9906000860108974e-05, "loss": 1.461, "step": 362 }, { "epoch": 0.05706537758651182, "grad_norm": 0.14041170477867126, "learning_rate": 4.9905464827794884e-05, "loss": 1.4147, "step": 363 }, { "epoch": 0.05722258248344436, "grad_norm": 0.19242697954177856, "learning_rate": 4.990492727434976e-05, "loss": 1.3435, "step": 364 }, { "epoch": 0.0573797873803769, "grad_norm": 0.1556611955165863, "learning_rate": 4.990438819980644e-05, "loss": 1.4075, "step": 365 }, { "epoch": 0.05753699227730944, "grad_norm": 0.13157570362091064, "learning_rate": 4.990384760419784e-05, "loss": 1.3334, "step": 366 }, { "epoch": 0.05769419717424198, "grad_norm": 0.17953743040561676, "learning_rate": 4.990330548755698e-05, "loss": 1.4609, "step": 367 }, { "epoch": 0.057851402071174514, "grad_norm": 0.14179491996765137, "learning_rate": 4.990276184991697e-05, "loss": 1.4344, "step": 368 }, { "epoch": 0.05800860696810706, "grad_norm": 0.16522593796253204, "learning_rate": 4.9902216691311024e-05, "loss": 1.3794, "step": 369 }, { "epoch": 0.05816581186503959, "grad_norm": 0.12736016511917114, "learning_rate": 4.9901670011772425e-05, "loss": 1.4167, "step": 370 }, { "epoch": 0.058323016761972137, "grad_norm": 0.15869787335395813, "learning_rate": 4.990112181133456e-05, "loss": 1.4293, "step": 371 }, { "epoch": 0.05848022165890467, "grad_norm": 0.14410504698753357, "learning_rate": 4.990057209003093e-05, "loss": 1.4357, "step": 372 }, { "epoch": 0.058637426555837216, "grad_norm": 0.1567080020904541, "learning_rate": 4.9900020847895086e-05, "loss": 1.4146, "step": 373 }, { "epoch": 0.05879463145276975, "grad_norm": 0.1430107057094574, "learning_rate": 4.989946808496071e-05, "loss": 1.3415, "step": 374 }, { "epoch": 0.058951836349702295, "grad_norm": 0.146332785487175, "learning_rate": 4.989891380126156e-05, "loss": 1.496, "step": 375 }, { "epoch": 0.05910904124663483, "grad_norm": 0.13674487173557281, "learning_rate": 4.989835799683149e-05, "loss": 1.3611, "step": 376 }, { "epoch": 0.059266246143567375, "grad_norm": 0.1321984827518463, "learning_rate": 4.989780067170444e-05, "loss": 1.4695, "step": 377 }, { "epoch": 0.05942345104049991, "grad_norm": 0.1535942554473877, "learning_rate": 4.9897241825914464e-05, "loss": 1.3564, "step": 378 }, { "epoch": 0.059580655937432454, "grad_norm": 0.1538037806749344, "learning_rate": 4.989668145949568e-05, "loss": 1.3502, "step": 379 }, { "epoch": 0.05973786083436499, "grad_norm": 0.15744829177856445, "learning_rate": 4.989611957248232e-05, "loss": 1.4318, "step": 380 }, { "epoch": 0.05989506573129753, "grad_norm": 0.17178332805633545, "learning_rate": 4.98955561649087e-05, "loss": 1.4643, "step": 381 }, { "epoch": 0.06005227062823007, "grad_norm": 0.15913072228431702, "learning_rate": 4.989499123680923e-05, "loss": 1.487, "step": 382 }, { "epoch": 0.060209475525162606, "grad_norm": 0.15134060382843018, "learning_rate": 4.9894424788218415e-05, "loss": 1.4705, "step": 383 }, { "epoch": 0.06036668042209515, "grad_norm": 0.13704389333724976, "learning_rate": 4.989385681917085e-05, "loss": 1.4756, "step": 384 }, { "epoch": 0.060523885319027686, "grad_norm": 0.14025503396987915, "learning_rate": 4.989328732970122e-05, "loss": 1.443, "step": 385 }, { "epoch": 0.06068109021596023, "grad_norm": 0.1822325438261032, "learning_rate": 4.9892716319844325e-05, "loss": 1.3996, "step": 386 }, { "epoch": 0.060838295112892765, "grad_norm": 0.15639656782150269, "learning_rate": 4.989214378963502e-05, "loss": 1.3656, "step": 387 }, { "epoch": 0.06099550000982531, "grad_norm": 0.15097728371620178, "learning_rate": 4.989156973910828e-05, "loss": 1.4055, "step": 388 }, { "epoch": 0.061152704906757845, "grad_norm": 0.18977142870426178, "learning_rate": 4.989099416829917e-05, "loss": 1.4472, "step": 389 }, { "epoch": 0.06130990980369039, "grad_norm": 0.1596304178237915, "learning_rate": 4.989041707724284e-05, "loss": 1.4373, "step": 390 }, { "epoch": 0.061467114700622924, "grad_norm": 0.171820729970932, "learning_rate": 4.988983846597454e-05, "loss": 1.468, "step": 391 }, { "epoch": 0.06162431959755547, "grad_norm": 0.14266176521778107, "learning_rate": 4.98892583345296e-05, "loss": 1.4037, "step": 392 }, { "epoch": 0.061781524494488003, "grad_norm": 0.13375528156757355, "learning_rate": 4.988867668294346e-05, "loss": 1.437, "step": 393 }, { "epoch": 0.06193872939142054, "grad_norm": 0.13332228362560272, "learning_rate": 4.988809351125165e-05, "loss": 1.3892, "step": 394 }, { "epoch": 0.06209593428835308, "grad_norm": 0.17180980741977692, "learning_rate": 4.988750881948977e-05, "loss": 1.3494, "step": 395 }, { "epoch": 0.06225313918528562, "grad_norm": 0.1419111043214798, "learning_rate": 4.988692260769355e-05, "loss": 1.3748, "step": 396 }, { "epoch": 0.06241034408221816, "grad_norm": 0.17256620526313782, "learning_rate": 4.9886334875898776e-05, "loss": 1.3549, "step": 397 }, { "epoch": 0.0625675489791507, "grad_norm": 0.2243422418832779, "learning_rate": 4.988574562414137e-05, "loss": 1.4465, "step": 398 }, { "epoch": 0.06272475387608324, "grad_norm": 0.15700723230838776, "learning_rate": 4.9885154852457294e-05, "loss": 1.4477, "step": 399 }, { "epoch": 0.06288195877301578, "grad_norm": 0.14497259259223938, "learning_rate": 4.988456256088264e-05, "loss": 1.3861, "step": 400 }, { "epoch": 0.06303916366994831, "grad_norm": 0.14747034013271332, "learning_rate": 4.988396874945359e-05, "loss": 1.4206, "step": 401 }, { "epoch": 0.06319636856688086, "grad_norm": 0.17671054601669312, "learning_rate": 4.98833734182064e-05, "loss": 1.2475, "step": 402 }, { "epoch": 0.0633535734638134, "grad_norm": 0.16974316537380219, "learning_rate": 4.9882776567177446e-05, "loss": 1.4955, "step": 403 }, { "epoch": 0.06351077836074594, "grad_norm": 0.15419775247573853, "learning_rate": 4.988217819640317e-05, "loss": 1.4209, "step": 404 }, { "epoch": 0.06366798325767847, "grad_norm": 0.13987664878368378, "learning_rate": 4.988157830592012e-05, "loss": 1.456, "step": 405 }, { "epoch": 0.06382518815461101, "grad_norm": 0.24560455977916718, "learning_rate": 4.988097689576493e-05, "loss": 1.3567, "step": 406 }, { "epoch": 0.06398239305154356, "grad_norm": 0.13870076835155487, "learning_rate": 4.9880373965974334e-05, "loss": 1.3752, "step": 407 }, { "epoch": 0.0641395979484761, "grad_norm": 0.16167718172073364, "learning_rate": 4.987976951658517e-05, "loss": 1.4766, "step": 408 }, { "epoch": 0.06429680284540863, "grad_norm": 0.1700398474931717, "learning_rate": 4.9879163547634346e-05, "loss": 1.427, "step": 409 }, { "epoch": 0.06445400774234117, "grad_norm": 0.15502458810806274, "learning_rate": 4.987855605915887e-05, "loss": 1.3965, "step": 410 }, { "epoch": 0.06461121263927372, "grad_norm": 0.14834032952785492, "learning_rate": 4.987794705119584e-05, "loss": 1.4399, "step": 411 }, { "epoch": 0.06476841753620625, "grad_norm": 0.22443649172782898, "learning_rate": 4.987733652378246e-05, "loss": 1.3736, "step": 412 }, { "epoch": 0.06492562243313879, "grad_norm": 0.14396560192108154, "learning_rate": 4.9876724476956015e-05, "loss": 1.4648, "step": 413 }, { "epoch": 0.06508282733007133, "grad_norm": 0.15352006256580353, "learning_rate": 4.987611091075389e-05, "loss": 1.4988, "step": 414 }, { "epoch": 0.06524003222700388, "grad_norm": 0.13210074603557587, "learning_rate": 4.987549582521356e-05, "loss": 1.3705, "step": 415 }, { "epoch": 0.06539723712393641, "grad_norm": 0.16056782007217407, "learning_rate": 4.98748792203726e-05, "loss": 1.3388, "step": 416 }, { "epoch": 0.06555444202086895, "grad_norm": 0.18992343544960022, "learning_rate": 4.9874261096268647e-05, "loss": 1.3842, "step": 417 }, { "epoch": 0.06571164691780149, "grad_norm": 0.1789916455745697, "learning_rate": 4.9873641452939466e-05, "loss": 1.3622, "step": 418 }, { "epoch": 0.06586885181473402, "grad_norm": 0.21043789386749268, "learning_rate": 4.9873020290422915e-05, "loss": 1.3477, "step": 419 }, { "epoch": 0.06602605671166657, "grad_norm": 0.15355254709720612, "learning_rate": 4.987239760875691e-05, "loss": 1.3643, "step": 420 }, { "epoch": 0.06618326160859911, "grad_norm": 0.1433190107345581, "learning_rate": 4.9871773407979496e-05, "loss": 1.3753, "step": 421 }, { "epoch": 0.06634046650553165, "grad_norm": 0.17479249835014343, "learning_rate": 4.987114768812879e-05, "loss": 1.3809, "step": 422 }, { "epoch": 0.06649767140246418, "grad_norm": 0.186944842338562, "learning_rate": 4.987052044924302e-05, "loss": 1.3616, "step": 423 }, { "epoch": 0.06665487629939673, "grad_norm": 0.15202952921390533, "learning_rate": 4.986989169136048e-05, "loss": 1.4479, "step": 424 }, { "epoch": 0.06681208119632927, "grad_norm": 0.16295532882213593, "learning_rate": 4.9869261414519575e-05, "loss": 1.3713, "step": 425 }, { "epoch": 0.0669692860932618, "grad_norm": 0.19577625393867493, "learning_rate": 4.986862961875881e-05, "loss": 1.4199, "step": 426 }, { "epoch": 0.06712649099019434, "grad_norm": 0.22768542170524597, "learning_rate": 4.986799630411677e-05, "loss": 1.3529, "step": 427 }, { "epoch": 0.06728369588712689, "grad_norm": 0.25184011459350586, "learning_rate": 4.986736147063212e-05, "loss": 1.3944, "step": 428 }, { "epoch": 0.06744090078405943, "grad_norm": 0.15565118193626404, "learning_rate": 4.986672511834366e-05, "loss": 1.4505, "step": 429 }, { "epoch": 0.06759810568099196, "grad_norm": 0.16559922695159912, "learning_rate": 4.986608724729024e-05, "loss": 1.3742, "step": 430 }, { "epoch": 0.0677553105779245, "grad_norm": 0.14826242625713348, "learning_rate": 4.986544785751081e-05, "loss": 1.4008, "step": 431 }, { "epoch": 0.06791251547485704, "grad_norm": 0.16543184220790863, "learning_rate": 4.986480694904444e-05, "loss": 1.3433, "step": 432 }, { "epoch": 0.06806972037178959, "grad_norm": 0.15332931280136108, "learning_rate": 4.986416452193027e-05, "loss": 1.4459, "step": 433 }, { "epoch": 0.06822692526872212, "grad_norm": 0.18880733847618103, "learning_rate": 4.986352057620752e-05, "loss": 1.3902, "step": 434 }, { "epoch": 0.06838413016565466, "grad_norm": 0.1513829231262207, "learning_rate": 4.986287511191554e-05, "loss": 1.3485, "step": 435 }, { "epoch": 0.0685413350625872, "grad_norm": 0.15241704881191254, "learning_rate": 4.9862228129093745e-05, "loss": 1.3051, "step": 436 }, { "epoch": 0.06869853995951974, "grad_norm": 0.1956702321767807, "learning_rate": 4.986157962778165e-05, "loss": 1.4647, "step": 437 }, { "epoch": 0.06885574485645228, "grad_norm": 0.2027936428785324, "learning_rate": 4.9860929608018866e-05, "loss": 1.3602, "step": 438 }, { "epoch": 0.06901294975338482, "grad_norm": 0.1623186320066452, "learning_rate": 4.986027806984509e-05, "loss": 1.4154, "step": 439 }, { "epoch": 0.06917015465031735, "grad_norm": 0.16111283004283905, "learning_rate": 4.985962501330011e-05, "loss": 1.4311, "step": 440 }, { "epoch": 0.0693273595472499, "grad_norm": 0.16754299402236938, "learning_rate": 4.985897043842382e-05, "loss": 1.349, "step": 441 }, { "epoch": 0.06948456444418244, "grad_norm": 0.1766330897808075, "learning_rate": 4.985831434525621e-05, "loss": 1.3714, "step": 442 }, { "epoch": 0.06964176934111498, "grad_norm": 0.1742810308933258, "learning_rate": 4.985765673383733e-05, "loss": 1.4161, "step": 443 }, { "epoch": 0.06979897423804751, "grad_norm": 0.17025281488895416, "learning_rate": 4.985699760420736e-05, "loss": 1.3925, "step": 444 }, { "epoch": 0.06995617913498005, "grad_norm": 0.19201375544071198, "learning_rate": 4.985633695640655e-05, "loss": 1.4158, "step": 445 }, { "epoch": 0.0701133840319126, "grad_norm": 0.1636267751455307, "learning_rate": 4.985567479047524e-05, "loss": 1.4071, "step": 446 }, { "epoch": 0.07027058892884513, "grad_norm": 0.19676333665847778, "learning_rate": 4.9855011106453894e-05, "loss": 1.3449, "step": 447 }, { "epoch": 0.07042779382577767, "grad_norm": 0.17712907493114471, "learning_rate": 4.985434590438303e-05, "loss": 1.3421, "step": 448 }, { "epoch": 0.07058499872271021, "grad_norm": 0.18515101075172424, "learning_rate": 4.985367918430329e-05, "loss": 1.4051, "step": 449 }, { "epoch": 0.07074220361964276, "grad_norm": 0.17168915271759033, "learning_rate": 4.985301094625538e-05, "loss": 1.3093, "step": 450 }, { "epoch": 0.0708994085165753, "grad_norm": 0.1891397386789322, "learning_rate": 4.9852341190280127e-05, "loss": 1.3075, "step": 451 }, { "epoch": 0.07105661341350783, "grad_norm": 0.17731457948684692, "learning_rate": 4.985166991641843e-05, "loss": 1.3986, "step": 452 }, { "epoch": 0.07121381831044037, "grad_norm": 0.18817296624183655, "learning_rate": 4.985099712471129e-05, "loss": 1.3531, "step": 453 }, { "epoch": 0.07137102320737292, "grad_norm": 0.1782791018486023, "learning_rate": 4.9850322815199795e-05, "loss": 1.4064, "step": 454 }, { "epoch": 0.07152822810430545, "grad_norm": 0.18053874373435974, "learning_rate": 4.984964698792514e-05, "loss": 1.4607, "step": 455 }, { "epoch": 0.07168543300123799, "grad_norm": 0.286338746547699, "learning_rate": 4.984896964292858e-05, "loss": 1.3036, "step": 456 }, { "epoch": 0.07184263789817052, "grad_norm": 0.2560707926750183, "learning_rate": 4.98482907802515e-05, "loss": 1.3428, "step": 457 }, { "epoch": 0.07199984279510306, "grad_norm": 0.19296897947788239, "learning_rate": 4.984761039993537e-05, "loss": 1.3502, "step": 458 }, { "epoch": 0.07215704769203561, "grad_norm": 0.19685949385166168, "learning_rate": 4.9846928502021725e-05, "loss": 1.4015, "step": 459 }, { "epoch": 0.07231425258896815, "grad_norm": 0.1548481583595276, "learning_rate": 4.984624508655223e-05, "loss": 1.3698, "step": 460 }, { "epoch": 0.07247145748590068, "grad_norm": 0.16076034307479858, "learning_rate": 4.984556015356862e-05, "loss": 1.3627, "step": 461 }, { "epoch": 0.07262866238283322, "grad_norm": 0.18571603298187256, "learning_rate": 4.9844873703112726e-05, "loss": 1.3506, "step": 462 }, { "epoch": 0.07278586727976577, "grad_norm": 0.1540035605430603, "learning_rate": 4.984418573522648e-05, "loss": 1.4483, "step": 463 }, { "epoch": 0.0729430721766983, "grad_norm": 0.1730145364999771, "learning_rate": 4.984349624995188e-05, "loss": 1.3678, "step": 464 }, { "epoch": 0.07310027707363084, "grad_norm": 0.26254212856292725, "learning_rate": 4.984280524733107e-05, "loss": 1.401, "step": 465 }, { "epoch": 0.07325748197056338, "grad_norm": 0.2079063057899475, "learning_rate": 4.984211272740623e-05, "loss": 1.3655, "step": 466 }, { "epoch": 0.07341468686749593, "grad_norm": 0.21711499989032745, "learning_rate": 4.9841418690219653e-05, "loss": 1.4011, "step": 467 }, { "epoch": 0.07357189176442847, "grad_norm": 0.18226252496242523, "learning_rate": 4.984072313581375e-05, "loss": 1.4213, "step": 468 }, { "epoch": 0.073729096661361, "grad_norm": 0.1463780552148819, "learning_rate": 4.9840026064230984e-05, "loss": 1.4519, "step": 469 }, { "epoch": 0.07388630155829354, "grad_norm": 0.18232892453670502, "learning_rate": 4.983932747551394e-05, "loss": 1.3657, "step": 470 }, { "epoch": 0.07404350645522607, "grad_norm": 0.19644559919834137, "learning_rate": 4.9838627369705285e-05, "loss": 1.3988, "step": 471 }, { "epoch": 0.07420071135215862, "grad_norm": 0.16292576491832733, "learning_rate": 4.983792574684776e-05, "loss": 1.4369, "step": 472 }, { "epoch": 0.07435791624909116, "grad_norm": 0.2244543433189392, "learning_rate": 4.983722260698425e-05, "loss": 1.4269, "step": 473 }, { "epoch": 0.0745151211460237, "grad_norm": 0.2582489848136902, "learning_rate": 4.9836517950157666e-05, "loss": 1.3986, "step": 474 }, { "epoch": 0.07467232604295623, "grad_norm": 0.15564194321632385, "learning_rate": 4.983581177641108e-05, "loss": 1.3871, "step": 475 }, { "epoch": 0.07482953093988878, "grad_norm": 0.2301008552312851, "learning_rate": 4.9835104085787596e-05, "loss": 1.3572, "step": 476 }, { "epoch": 0.07498673583682132, "grad_norm": 0.21603424847126007, "learning_rate": 4.9834394878330444e-05, "loss": 1.3803, "step": 477 }, { "epoch": 0.07514394073375386, "grad_norm": 0.16744717955589294, "learning_rate": 4.9833684154082937e-05, "loss": 1.4233, "step": 478 }, { "epoch": 0.07530114563068639, "grad_norm": 0.23016415536403656, "learning_rate": 4.98329719130885e-05, "loss": 1.3962, "step": 479 }, { "epoch": 0.07545835052761894, "grad_norm": 0.19687114655971527, "learning_rate": 4.983225815539061e-05, "loss": 1.3667, "step": 480 }, { "epoch": 0.07545835052761894, "eval_loss": 1.3748993873596191, "eval_runtime": 2315.5952, "eval_samples_per_second": 3.998, "eval_steps_per_second": 1.999, "step": 480 }, { "epoch": 0.07561555542455148, "grad_norm": 0.1833205670118332, "learning_rate": 4.9831542881032884e-05, "loss": 1.4365, "step": 481 }, { "epoch": 0.07577276032148401, "grad_norm": 0.17124423384666443, "learning_rate": 4.983082609005899e-05, "loss": 1.3641, "step": 482 }, { "epoch": 0.07592996521841655, "grad_norm": 0.17352670431137085, "learning_rate": 4.9830107782512715e-05, "loss": 1.3415, "step": 483 }, { "epoch": 0.07608717011534909, "grad_norm": 0.20768220722675323, "learning_rate": 4.982938795843793e-05, "loss": 1.3261, "step": 484 }, { "epoch": 0.07624437501228164, "grad_norm": 0.21459853649139404, "learning_rate": 4.982866661787859e-05, "loss": 1.4185, "step": 485 }, { "epoch": 0.07640157990921417, "grad_norm": 0.26912233233451843, "learning_rate": 4.982794376087877e-05, "loss": 1.3941, "step": 486 }, { "epoch": 0.07655878480614671, "grad_norm": 0.28497114777565, "learning_rate": 4.982721938748261e-05, "loss": 1.3201, "step": 487 }, { "epoch": 0.07671598970307925, "grad_norm": 0.15378472208976746, "learning_rate": 4.982649349773435e-05, "loss": 1.3615, "step": 488 }, { "epoch": 0.0768731946000118, "grad_norm": 0.16169893741607666, "learning_rate": 4.982576609167831e-05, "loss": 1.3342, "step": 489 }, { "epoch": 0.07703039949694433, "grad_norm": 0.24693650007247925, "learning_rate": 4.982503716935896e-05, "loss": 1.3788, "step": 490 }, { "epoch": 0.07718760439387687, "grad_norm": 0.1769181787967682, "learning_rate": 4.982430673082077e-05, "loss": 1.3664, "step": 491 }, { "epoch": 0.0773448092908094, "grad_norm": 0.26325106620788574, "learning_rate": 4.982357477610839e-05, "loss": 1.3173, "step": 492 }, { "epoch": 0.07750201418774195, "grad_norm": 0.2063319832086563, "learning_rate": 4.9822841305266506e-05, "loss": 1.4125, "step": 493 }, { "epoch": 0.07765921908467449, "grad_norm": 0.29141879081726074, "learning_rate": 4.982210631833992e-05, "loss": 1.3596, "step": 494 }, { "epoch": 0.07781642398160703, "grad_norm": 0.18967591226100922, "learning_rate": 4.982136981537352e-05, "loss": 1.4128, "step": 495 }, { "epoch": 0.07797362887853956, "grad_norm": 0.2291795313358307, "learning_rate": 4.9820631796412287e-05, "loss": 1.3772, "step": 496 }, { "epoch": 0.0781308337754721, "grad_norm": 0.200834721326828, "learning_rate": 4.98198922615013e-05, "loss": 1.369, "step": 497 }, { "epoch": 0.07828803867240465, "grad_norm": 0.22960609197616577, "learning_rate": 4.9819151210685736e-05, "loss": 1.3979, "step": 498 }, { "epoch": 0.07844524356933719, "grad_norm": 0.17247427999973297, "learning_rate": 4.981840864401084e-05, "loss": 1.3927, "step": 499 }, { "epoch": 0.07860244846626972, "grad_norm": 0.2623608112335205, "learning_rate": 4.981766456152198e-05, "loss": 1.3919, "step": 500 }, { "epoch": 0.07875965336320226, "grad_norm": 0.19911788403987885, "learning_rate": 4.981691896326459e-05, "loss": 1.3925, "step": 501 }, { "epoch": 0.07891685826013481, "grad_norm": 0.24869734048843384, "learning_rate": 4.9816171849284205e-05, "loss": 1.3562, "step": 502 }, { "epoch": 0.07907406315706735, "grad_norm": 0.31372350454330444, "learning_rate": 4.981542321962647e-05, "loss": 1.3211, "step": 503 }, { "epoch": 0.07923126805399988, "grad_norm": 0.21760910749435425, "learning_rate": 4.981467307433709e-05, "loss": 1.3042, "step": 504 }, { "epoch": 0.07938847295093242, "grad_norm": 0.2469843477010727, "learning_rate": 4.9813921413461906e-05, "loss": 1.2831, "step": 505 }, { "epoch": 0.07954567784786497, "grad_norm": 0.24319148063659668, "learning_rate": 4.981316823704681e-05, "loss": 1.2703, "step": 506 }, { "epoch": 0.0797028827447975, "grad_norm": 0.19718031585216522, "learning_rate": 4.98124135451378e-05, "loss": 1.3258, "step": 507 }, { "epoch": 0.07986008764173004, "grad_norm": 0.17459236085414886, "learning_rate": 4.981165733778098e-05, "loss": 1.4248, "step": 508 }, { "epoch": 0.08001729253866258, "grad_norm": 0.17684616148471832, "learning_rate": 4.981089961502253e-05, "loss": 1.3939, "step": 509 }, { "epoch": 0.08017449743559511, "grad_norm": 0.17499729990959167, "learning_rate": 4.981014037690874e-05, "loss": 1.4156, "step": 510 }, { "epoch": 0.08033170233252766, "grad_norm": 0.1901170015335083, "learning_rate": 4.9809379623485964e-05, "loss": 1.4209, "step": 511 }, { "epoch": 0.0804889072294602, "grad_norm": 0.18230682611465454, "learning_rate": 4.980861735480067e-05, "loss": 1.4607, "step": 512 }, { "epoch": 0.08064611212639274, "grad_norm": 0.22843636572360992, "learning_rate": 4.9807853570899427e-05, "loss": 1.3671, "step": 513 }, { "epoch": 0.08080331702332527, "grad_norm": 0.2288489192724228, "learning_rate": 4.980708827182887e-05, "loss": 1.3657, "step": 514 }, { "epoch": 0.08096052192025782, "grad_norm": 0.19647593796253204, "learning_rate": 4.980632145763575e-05, "loss": 1.4079, "step": 515 }, { "epoch": 0.08111772681719036, "grad_norm": 0.20980435609817505, "learning_rate": 4.98055531283669e-05, "loss": 1.3746, "step": 516 }, { "epoch": 0.0812749317141229, "grad_norm": 0.19381123781204224, "learning_rate": 4.980478328406923e-05, "loss": 1.3986, "step": 517 }, { "epoch": 0.08143213661105543, "grad_norm": 0.2224361151456833, "learning_rate": 4.980401192478979e-05, "loss": 1.3082, "step": 518 }, { "epoch": 0.08158934150798797, "grad_norm": 0.20567384362220764, "learning_rate": 4.9803239050575664e-05, "loss": 1.4417, "step": 519 }, { "epoch": 0.08174654640492052, "grad_norm": 0.22890503704547882, "learning_rate": 4.9802464661474074e-05, "loss": 1.3034, "step": 520 }, { "epoch": 0.08190375130185305, "grad_norm": 0.23220910131931305, "learning_rate": 4.9801688757532304e-05, "loss": 1.3705, "step": 521 }, { "epoch": 0.08206095619878559, "grad_norm": 0.29084959626197815, "learning_rate": 4.980091133879775e-05, "loss": 1.3246, "step": 522 }, { "epoch": 0.08221816109571813, "grad_norm": 0.15776456892490387, "learning_rate": 4.9800132405317895e-05, "loss": 1.4311, "step": 523 }, { "epoch": 0.08237536599265068, "grad_norm": 0.2636071443557739, "learning_rate": 4.9799351957140314e-05, "loss": 1.3265, "step": 524 }, { "epoch": 0.08253257088958321, "grad_norm": 0.20042134821414948, "learning_rate": 4.979856999431266e-05, "loss": 1.3257, "step": 525 }, { "epoch": 0.08268977578651575, "grad_norm": 0.24039289355278015, "learning_rate": 4.9797786516882714e-05, "loss": 1.3999, "step": 526 }, { "epoch": 0.08284698068344828, "grad_norm": 0.16932524740695953, "learning_rate": 4.9797001524898315e-05, "loss": 1.4113, "step": 527 }, { "epoch": 0.08300418558038083, "grad_norm": 0.2101370096206665, "learning_rate": 4.97962150184074e-05, "loss": 1.3973, "step": 528 }, { "epoch": 0.08316139047731337, "grad_norm": 0.20983585715293884, "learning_rate": 4.979542699745803e-05, "loss": 1.3255, "step": 529 }, { "epoch": 0.08331859537424591, "grad_norm": 0.20477800071239471, "learning_rate": 4.97946374620983e-05, "loss": 1.4349, "step": 530 }, { "epoch": 0.08347580027117844, "grad_norm": 0.22637289762496948, "learning_rate": 4.979384641237647e-05, "loss": 1.3263, "step": 531 }, { "epoch": 0.08363300516811098, "grad_norm": 0.20332221686840057, "learning_rate": 4.9793053848340835e-05, "loss": 1.3411, "step": 532 }, { "epoch": 0.08379021006504353, "grad_norm": 0.22744616866111755, "learning_rate": 4.979225977003979e-05, "loss": 1.4042, "step": 533 }, { "epoch": 0.08394741496197607, "grad_norm": 0.20091576874256134, "learning_rate": 4.979146417752185e-05, "loss": 1.3218, "step": 534 }, { "epoch": 0.0841046198589086, "grad_norm": 0.2225920408964157, "learning_rate": 4.9790667070835604e-05, "loss": 1.4223, "step": 535 }, { "epoch": 0.08426182475584114, "grad_norm": 0.20447570085525513, "learning_rate": 4.9789868450029745e-05, "loss": 1.3884, "step": 536 }, { "epoch": 0.08441902965277369, "grad_norm": 0.22765719890594482, "learning_rate": 4.9789068315153035e-05, "loss": 1.3575, "step": 537 }, { "epoch": 0.08457623454970623, "grad_norm": 0.18886259198188782, "learning_rate": 4.9788266666254343e-05, "loss": 1.2737, "step": 538 }, { "epoch": 0.08473343944663876, "grad_norm": 0.26551586389541626, "learning_rate": 4.978746350338264e-05, "loss": 1.3867, "step": 539 }, { "epoch": 0.0848906443435713, "grad_norm": 0.29268744587898254, "learning_rate": 4.9786658826586975e-05, "loss": 1.4266, "step": 540 }, { "epoch": 0.08504784924050385, "grad_norm": 0.2537211775779724, "learning_rate": 4.97858526359165e-05, "loss": 1.3402, "step": 541 }, { "epoch": 0.08520505413743638, "grad_norm": 0.20287925004959106, "learning_rate": 4.978504493142045e-05, "loss": 1.3148, "step": 542 }, { "epoch": 0.08536225903436892, "grad_norm": 0.18584851920604706, "learning_rate": 4.978423571314814e-05, "loss": 1.3293, "step": 543 }, { "epoch": 0.08551946393130146, "grad_norm": 0.1944153755903244, "learning_rate": 4.978342498114903e-05, "loss": 1.4084, "step": 544 }, { "epoch": 0.08567666882823399, "grad_norm": 0.18139739334583282, "learning_rate": 4.978261273547261e-05, "loss": 1.2734, "step": 545 }, { "epoch": 0.08583387372516654, "grad_norm": 0.20824116468429565, "learning_rate": 4.97817989761685e-05, "loss": 1.3346, "step": 546 }, { "epoch": 0.08599107862209908, "grad_norm": 0.16180047392845154, "learning_rate": 4.978098370328639e-05, "loss": 1.4547, "step": 547 }, { "epoch": 0.08614828351903162, "grad_norm": 0.17156392335891724, "learning_rate": 4.978016691687609e-05, "loss": 1.366, "step": 548 }, { "epoch": 0.08630548841596415, "grad_norm": 0.17913401126861572, "learning_rate": 4.977934861698746e-05, "loss": 1.2771, "step": 549 }, { "epoch": 0.0864626933128967, "grad_norm": 0.17393502593040466, "learning_rate": 4.977852880367051e-05, "loss": 1.3061, "step": 550 }, { "epoch": 0.08661989820982924, "grad_norm": 0.21741637587547302, "learning_rate": 4.97777074769753e-05, "loss": 1.3232, "step": 551 }, { "epoch": 0.08677710310676177, "grad_norm": 0.26123344898223877, "learning_rate": 4.977688463695198e-05, "loss": 1.2678, "step": 552 }, { "epoch": 0.08693430800369431, "grad_norm": 0.2508600354194641, "learning_rate": 4.9776060283650826e-05, "loss": 1.4543, "step": 553 }, { "epoch": 0.08709151290062686, "grad_norm": 0.18527132272720337, "learning_rate": 4.977523441712217e-05, "loss": 1.3359, "step": 554 }, { "epoch": 0.0872487177975594, "grad_norm": 0.24495406448841095, "learning_rate": 4.977440703741646e-05, "loss": 1.2892, "step": 555 }, { "epoch": 0.08740592269449193, "grad_norm": 0.22759339213371277, "learning_rate": 4.9773578144584235e-05, "loss": 1.2212, "step": 556 }, { "epoch": 0.08756312759142447, "grad_norm": 0.1627693474292755, "learning_rate": 4.977274773867611e-05, "loss": 1.3461, "step": 557 }, { "epoch": 0.087720332488357, "grad_norm": 0.2068985551595688, "learning_rate": 4.9771915819742804e-05, "loss": 1.3348, "step": 558 }, { "epoch": 0.08787753738528956, "grad_norm": 0.19731195271015167, "learning_rate": 4.9771082387835135e-05, "loss": 1.3727, "step": 559 }, { "epoch": 0.08803474228222209, "grad_norm": 0.26571184396743774, "learning_rate": 4.977024744300399e-05, "loss": 1.3911, "step": 560 }, { "epoch": 0.08819194717915463, "grad_norm": 0.23141519725322723, "learning_rate": 4.976941098530039e-05, "loss": 1.3978, "step": 561 }, { "epoch": 0.08834915207608716, "grad_norm": 0.2507224380970001, "learning_rate": 4.97685730147754e-05, "loss": 1.3017, "step": 562 }, { "epoch": 0.08850635697301971, "grad_norm": 0.2453109323978424, "learning_rate": 4.976773353148022e-05, "loss": 1.2977, "step": 563 }, { "epoch": 0.08866356186995225, "grad_norm": 0.2600953280925751, "learning_rate": 4.9766892535466105e-05, "loss": 1.4015, "step": 564 }, { "epoch": 0.08882076676688479, "grad_norm": 0.19863371551036835, "learning_rate": 4.9766050026784416e-05, "loss": 1.3593, "step": 565 }, { "epoch": 0.08897797166381732, "grad_norm": 0.2115338146686554, "learning_rate": 4.976520600548663e-05, "loss": 1.2928, "step": 566 }, { "epoch": 0.08913517656074987, "grad_norm": 0.18994684517383575, "learning_rate": 4.976436047162429e-05, "loss": 1.3506, "step": 567 }, { "epoch": 0.08929238145768241, "grad_norm": 0.22891771793365479, "learning_rate": 4.976351342524903e-05, "loss": 1.4449, "step": 568 }, { "epoch": 0.08944958635461495, "grad_norm": 0.19313135743141174, "learning_rate": 4.976266486641259e-05, "loss": 1.2916, "step": 569 }, { "epoch": 0.08960679125154748, "grad_norm": 0.17697346210479736, "learning_rate": 4.976181479516679e-05, "loss": 1.3696, "step": 570 }, { "epoch": 0.08976399614848002, "grad_norm": 0.22902925312519073, "learning_rate": 4.976096321156356e-05, "loss": 1.3688, "step": 571 }, { "epoch": 0.08992120104541257, "grad_norm": 0.25305554270744324, "learning_rate": 4.97601101156549e-05, "loss": 1.3057, "step": 572 }, { "epoch": 0.0900784059423451, "grad_norm": 0.23255370557308197, "learning_rate": 4.975925550749293e-05, "loss": 1.3571, "step": 573 }, { "epoch": 0.09023561083927764, "grad_norm": 0.25259101390838623, "learning_rate": 4.9758399387129834e-05, "loss": 1.3152, "step": 574 }, { "epoch": 0.09039281573621018, "grad_norm": 0.26062390208244324, "learning_rate": 4.97575417546179e-05, "loss": 1.3042, "step": 575 }, { "epoch": 0.09055002063314273, "grad_norm": 0.16536732017993927, "learning_rate": 4.9756682610009515e-05, "loss": 1.2797, "step": 576 }, { "epoch": 0.09070722553007526, "grad_norm": 0.19088499248027802, "learning_rate": 4.9755821953357144e-05, "loss": 1.3774, "step": 577 }, { "epoch": 0.0908644304270078, "grad_norm": 0.2181147336959839, "learning_rate": 4.975495978471336e-05, "loss": 1.3364, "step": 578 }, { "epoch": 0.09102163532394034, "grad_norm": 0.18012750148773193, "learning_rate": 4.975409610413082e-05, "loss": 1.3852, "step": 579 }, { "epoch": 0.09117884022087289, "grad_norm": 0.18108834326267242, "learning_rate": 4.975323091166227e-05, "loss": 1.3214, "step": 580 }, { "epoch": 0.09133604511780542, "grad_norm": 0.25102898478507996, "learning_rate": 4.975236420736056e-05, "loss": 1.3199, "step": 581 }, { "epoch": 0.09149325001473796, "grad_norm": 0.20121383666992188, "learning_rate": 4.9751495991278626e-05, "loss": 1.3328, "step": 582 }, { "epoch": 0.0916504549116705, "grad_norm": 0.24183815717697144, "learning_rate": 4.975062626346948e-05, "loss": 1.3881, "step": 583 }, { "epoch": 0.09180765980860303, "grad_norm": 0.23274902999401093, "learning_rate": 4.974975502398626e-05, "loss": 1.3674, "step": 584 }, { "epoch": 0.09196486470553558, "grad_norm": 0.224375382065773, "learning_rate": 4.9748882272882165e-05, "loss": 1.362, "step": 585 }, { "epoch": 0.09212206960246812, "grad_norm": 0.2743482291698456, "learning_rate": 4.97480080102105e-05, "loss": 1.3028, "step": 586 }, { "epoch": 0.09227927449940065, "grad_norm": 0.30631452798843384, "learning_rate": 4.974713223602467e-05, "loss": 1.3541, "step": 587 }, { "epoch": 0.09243647939633319, "grad_norm": 0.1999395489692688, "learning_rate": 4.9746254950378166e-05, "loss": 1.3515, "step": 588 }, { "epoch": 0.09259368429326574, "grad_norm": 0.3005799353122711, "learning_rate": 4.974537615332455e-05, "loss": 1.3872, "step": 589 }, { "epoch": 0.09275088919019828, "grad_norm": 0.21795117855072021, "learning_rate": 4.9744495844917524e-05, "loss": 1.2804, "step": 590 }, { "epoch": 0.09290809408713081, "grad_norm": 0.2832283675670624, "learning_rate": 4.9743614025210825e-05, "loss": 1.3209, "step": 591 }, { "epoch": 0.09306529898406335, "grad_norm": 0.21391350030899048, "learning_rate": 4.9742730694258334e-05, "loss": 1.3041, "step": 592 }, { "epoch": 0.0932225038809959, "grad_norm": 0.21651242673397064, "learning_rate": 4.974184585211399e-05, "loss": 1.2529, "step": 593 }, { "epoch": 0.09337970877792844, "grad_norm": 0.22796374559402466, "learning_rate": 4.974095949883183e-05, "loss": 1.3999, "step": 594 }, { "epoch": 0.09353691367486097, "grad_norm": 0.21013247966766357, "learning_rate": 4.9740071634466e-05, "loss": 1.3626, "step": 595 }, { "epoch": 0.09369411857179351, "grad_norm": 0.31589969992637634, "learning_rate": 4.973918225907073e-05, "loss": 1.4096, "step": 596 }, { "epoch": 0.09385132346872604, "grad_norm": 0.2923184931278229, "learning_rate": 4.973829137270033e-05, "loss": 1.2116, "step": 597 }, { "epoch": 0.0940085283656586, "grad_norm": 0.2147187739610672, "learning_rate": 4.9737398975409224e-05, "loss": 1.3909, "step": 598 }, { "epoch": 0.09416573326259113, "grad_norm": 0.20287127792835236, "learning_rate": 4.9736505067251896e-05, "loss": 1.3621, "step": 599 }, { "epoch": 0.09432293815952367, "grad_norm": 0.24703876674175262, "learning_rate": 4.9735609648282965e-05, "loss": 1.3525, "step": 600 }, { "epoch": 0.0944801430564562, "grad_norm": 0.25060412287712097, "learning_rate": 4.97347127185571e-05, "loss": 1.36, "step": 601 }, { "epoch": 0.09463734795338875, "grad_norm": 0.214557945728302, "learning_rate": 4.9733814278129096e-05, "loss": 1.4372, "step": 602 }, { "epoch": 0.09479455285032129, "grad_norm": 0.1984785795211792, "learning_rate": 4.9732914327053825e-05, "loss": 1.3191, "step": 603 }, { "epoch": 0.09495175774725383, "grad_norm": 0.2099440097808838, "learning_rate": 4.9732012865386244e-05, "loss": 1.313, "step": 604 }, { "epoch": 0.09510896264418636, "grad_norm": 0.20393683016300201, "learning_rate": 4.9731109893181423e-05, "loss": 1.3465, "step": 605 }, { "epoch": 0.09526616754111891, "grad_norm": 0.25346165895462036, "learning_rate": 4.97302054104945e-05, "loss": 1.3379, "step": 606 }, { "epoch": 0.09542337243805145, "grad_norm": 0.21876423060894012, "learning_rate": 4.9729299417380725e-05, "loss": 1.2746, "step": 607 }, { "epoch": 0.09558057733498398, "grad_norm": 0.21032990515232086, "learning_rate": 4.9728391913895436e-05, "loss": 1.3215, "step": 608 }, { "epoch": 0.09573778223191652, "grad_norm": 0.2550762891769409, "learning_rate": 4.9727482900094044e-05, "loss": 1.3239, "step": 609 }, { "epoch": 0.09589498712884906, "grad_norm": 0.31706327199935913, "learning_rate": 4.972657237603208e-05, "loss": 1.3467, "step": 610 }, { "epoch": 0.09605219202578161, "grad_norm": 0.17176879942417145, "learning_rate": 4.972566034176516e-05, "loss": 1.3815, "step": 611 }, { "epoch": 0.09620939692271414, "grad_norm": 0.22620820999145508, "learning_rate": 4.972474679734898e-05, "loss": 1.2593, "step": 612 }, { "epoch": 0.09636660181964668, "grad_norm": 0.18735802173614502, "learning_rate": 4.9723831742839334e-05, "loss": 1.424, "step": 613 }, { "epoch": 0.09652380671657922, "grad_norm": 0.2582910656929016, "learning_rate": 4.972291517829211e-05, "loss": 1.2741, "step": 614 }, { "epoch": 0.09668101161351177, "grad_norm": 0.19907522201538086, "learning_rate": 4.97219971037633e-05, "loss": 1.2045, "step": 615 }, { "epoch": 0.0968382165104443, "grad_norm": 0.20451949536800385, "learning_rate": 4.972107751930896e-05, "loss": 1.3026, "step": 616 }, { "epoch": 0.09699542140737684, "grad_norm": 0.29682090878486633, "learning_rate": 4.972015642498527e-05, "loss": 1.3789, "step": 617 }, { "epoch": 0.09715262630430938, "grad_norm": 0.27210530638694763, "learning_rate": 4.9719233820848476e-05, "loss": 1.3968, "step": 618 }, { "epoch": 0.09730983120124193, "grad_norm": 0.24241842329502106, "learning_rate": 4.971830970695493e-05, "loss": 1.2763, "step": 619 }, { "epoch": 0.09746703609817446, "grad_norm": 0.2535828649997711, "learning_rate": 4.9717384083361075e-05, "loss": 1.3463, "step": 620 }, { "epoch": 0.097624240995107, "grad_norm": 0.22121217846870422, "learning_rate": 4.971645695012344e-05, "loss": 1.3384, "step": 621 }, { "epoch": 0.09778144589203953, "grad_norm": 0.28840744495391846, "learning_rate": 4.971552830729866e-05, "loss": 1.2418, "step": 622 }, { "epoch": 0.09793865078897207, "grad_norm": 0.1682664453983307, "learning_rate": 4.971459815494345e-05, "loss": 1.3658, "step": 623 }, { "epoch": 0.09809585568590462, "grad_norm": 0.24955761432647705, "learning_rate": 4.971366649311461e-05, "loss": 1.2372, "step": 624 }, { "epoch": 0.09825306058283716, "grad_norm": 0.2756117582321167, "learning_rate": 4.971273332186906e-05, "loss": 1.3212, "step": 625 }, { "epoch": 0.09841026547976969, "grad_norm": 0.2370867133140564, "learning_rate": 4.971179864126377e-05, "loss": 1.2879, "step": 626 }, { "epoch": 0.09856747037670223, "grad_norm": 0.20566895604133606, "learning_rate": 4.9710862451355846e-05, "loss": 1.4243, "step": 627 }, { "epoch": 0.09872467527363478, "grad_norm": 0.1923399120569229, "learning_rate": 4.970992475220246e-05, "loss": 1.2639, "step": 628 }, { "epoch": 0.09888188017056732, "grad_norm": 0.17972147464752197, "learning_rate": 4.9708985543860896e-05, "loss": 1.3366, "step": 629 }, { "epoch": 0.09903908506749985, "grad_norm": 0.1936875432729721, "learning_rate": 4.97080448263885e-05, "loss": 1.3496, "step": 630 }, { "epoch": 0.09919628996443239, "grad_norm": 0.24409984052181244, "learning_rate": 4.9707102599842735e-05, "loss": 1.3268, "step": 631 }, { "epoch": 0.09935349486136494, "grad_norm": 0.21084928512573242, "learning_rate": 4.970615886428115e-05, "loss": 1.3421, "step": 632 }, { "epoch": 0.09951069975829747, "grad_norm": 0.21201804280281067, "learning_rate": 4.970521361976138e-05, "loss": 1.3189, "step": 633 }, { "epoch": 0.09966790465523001, "grad_norm": 0.2698107063770294, "learning_rate": 4.9704266866341156e-05, "loss": 1.2193, "step": 634 }, { "epoch": 0.09982510955216255, "grad_norm": 0.27072674036026, "learning_rate": 4.970331860407831e-05, "loss": 1.2694, "step": 635 }, { "epoch": 0.09998231444909508, "grad_norm": 0.26514896750450134, "learning_rate": 4.9702368833030754e-05, "loss": 1.2175, "step": 636 }, { "epoch": 0.10013951934602763, "grad_norm": 0.21645940840244293, "learning_rate": 4.970141755325649e-05, "loss": 1.3099, "step": 637 }, { "epoch": 0.10029672424296017, "grad_norm": 0.27035385370254517, "learning_rate": 4.970046476481363e-05, "loss": 1.2723, "step": 638 }, { "epoch": 0.1004539291398927, "grad_norm": 0.20999298989772797, "learning_rate": 4.969951046776036e-05, "loss": 1.369, "step": 639 }, { "epoch": 0.10061113403682524, "grad_norm": 0.18554192781448364, "learning_rate": 4.969855466215497e-05, "loss": 1.3483, "step": 640 }, { "epoch": 0.10061113403682524, "eval_loss": 1.314468502998352, "eval_runtime": 2275.7115, "eval_samples_per_second": 4.068, "eval_steps_per_second": 2.034, "step": 640 }, { "epoch": 0.10076833893375779, "grad_norm": 0.19117292761802673, "learning_rate": 4.969759734805582e-05, "loss": 1.3538, "step": 641 }, { "epoch": 0.10092554383069033, "grad_norm": 0.21971918642520905, "learning_rate": 4.969663852552141e-05, "loss": 1.2827, "step": 642 }, { "epoch": 0.10108274872762286, "grad_norm": 0.2663845121860504, "learning_rate": 4.969567819461027e-05, "loss": 1.3332, "step": 643 }, { "epoch": 0.1012399536245554, "grad_norm": 0.23752686381340027, "learning_rate": 4.9694716355381076e-05, "loss": 1.2675, "step": 644 }, { "epoch": 0.10139715852148795, "grad_norm": 0.1558876782655716, "learning_rate": 4.9693753007892565e-05, "loss": 1.3356, "step": 645 }, { "epoch": 0.10155436341842049, "grad_norm": 0.2064114212989807, "learning_rate": 4.969278815220356e-05, "loss": 1.3261, "step": 646 }, { "epoch": 0.10171156831535302, "grad_norm": 0.2371819168329239, "learning_rate": 4.969182178837302e-05, "loss": 1.2706, "step": 647 }, { "epoch": 0.10186877321228556, "grad_norm": 0.22757107019424438, "learning_rate": 4.969085391645994e-05, "loss": 1.4035, "step": 648 }, { "epoch": 0.1020259781092181, "grad_norm": 0.16831007599830627, "learning_rate": 4.968988453652345e-05, "loss": 1.3006, "step": 649 }, { "epoch": 0.10218318300615065, "grad_norm": 0.1719575822353363, "learning_rate": 4.968891364862275e-05, "loss": 1.2439, "step": 650 }, { "epoch": 0.10234038790308318, "grad_norm": 0.27235090732574463, "learning_rate": 4.9687941252817144e-05, "loss": 1.3065, "step": 651 }, { "epoch": 0.10249759280001572, "grad_norm": 0.25622984766960144, "learning_rate": 4.968696734916601e-05, "loss": 1.2908, "step": 652 }, { "epoch": 0.10265479769694826, "grad_norm": 0.22526390850543976, "learning_rate": 4.968599193772885e-05, "loss": 1.3081, "step": 653 }, { "epoch": 0.1028120025938808, "grad_norm": 0.2552133798599243, "learning_rate": 4.968501501856522e-05, "loss": 1.3292, "step": 654 }, { "epoch": 0.10296920749081334, "grad_norm": 0.26533272862434387, "learning_rate": 4.96840365917348e-05, "loss": 1.3571, "step": 655 }, { "epoch": 0.10312641238774588, "grad_norm": 0.29065170884132385, "learning_rate": 4.968305665729732e-05, "loss": 1.2799, "step": 656 }, { "epoch": 0.10328361728467841, "grad_norm": 0.27552661299705505, "learning_rate": 4.968207521531267e-05, "loss": 1.2262, "step": 657 }, { "epoch": 0.10344082218161096, "grad_norm": 1.929308533668518, "learning_rate": 4.9681092265840775e-05, "loss": 1.2027, "step": 658 }, { "epoch": 0.1035980270785435, "grad_norm": 0.2610799968242645, "learning_rate": 4.968010780894167e-05, "loss": 1.3527, "step": 659 }, { "epoch": 0.10375523197547604, "grad_norm": 0.28388604521751404, "learning_rate": 4.967912184467547e-05, "loss": 1.2989, "step": 660 }, { "epoch": 0.10391243687240857, "grad_norm": 0.21056891977787018, "learning_rate": 4.9678134373102415e-05, "loss": 1.2748, "step": 661 }, { "epoch": 0.10406964176934111, "grad_norm": 0.268331378698349, "learning_rate": 4.967714539428281e-05, "loss": 1.3712, "step": 662 }, { "epoch": 0.10422684666627366, "grad_norm": 0.28430554270744324, "learning_rate": 4.967615490827705e-05, "loss": 1.3641, "step": 663 }, { "epoch": 0.1043840515632062, "grad_norm": 0.254165917634964, "learning_rate": 4.9675162915145636e-05, "loss": 1.3042, "step": 664 }, { "epoch": 0.10454125646013873, "grad_norm": 0.19123367965221405, "learning_rate": 4.967416941494914e-05, "loss": 1.3613, "step": 665 }, { "epoch": 0.10469846135707127, "grad_norm": 0.20710323750972748, "learning_rate": 4.967317440774828e-05, "loss": 1.2815, "step": 666 }, { "epoch": 0.10485566625400382, "grad_norm": 0.2143716812133789, "learning_rate": 4.967217789360379e-05, "loss": 1.3136, "step": 667 }, { "epoch": 0.10501287115093635, "grad_norm": 0.2556392550468445, "learning_rate": 4.967117987257654e-05, "loss": 1.384, "step": 668 }, { "epoch": 0.10517007604786889, "grad_norm": 0.28254854679107666, "learning_rate": 4.9670180344727505e-05, "loss": 1.3218, "step": 669 }, { "epoch": 0.10532728094480143, "grad_norm": 0.24643027782440186, "learning_rate": 4.9669179310117706e-05, "loss": 1.278, "step": 670 }, { "epoch": 0.10548448584173396, "grad_norm": 0.34323665499687195, "learning_rate": 4.9668176768808304e-05, "loss": 1.2511, "step": 671 }, { "epoch": 0.10564169073866651, "grad_norm": 0.2499508410692215, "learning_rate": 4.966717272086052e-05, "loss": 1.338, "step": 672 }, { "epoch": 0.10579889563559905, "grad_norm": 0.2145325094461441, "learning_rate": 4.966616716633567e-05, "loss": 1.3304, "step": 673 }, { "epoch": 0.10595610053253159, "grad_norm": 0.19230923056602478, "learning_rate": 4.9665160105295185e-05, "loss": 1.3535, "step": 674 }, { "epoch": 0.10611330542946412, "grad_norm": 0.20243465900421143, "learning_rate": 4.966415153780056e-05, "loss": 1.3118, "step": 675 }, { "epoch": 0.10627051032639667, "grad_norm": 0.24927914142608643, "learning_rate": 4.966314146391341e-05, "loss": 1.3136, "step": 676 }, { "epoch": 0.10642771522332921, "grad_norm": 0.21791934967041016, "learning_rate": 4.9662129883695406e-05, "loss": 1.3314, "step": 677 }, { "epoch": 0.10658492012026174, "grad_norm": 0.24318841099739075, "learning_rate": 4.966111679720835e-05, "loss": 1.3929, "step": 678 }, { "epoch": 0.10674212501719428, "grad_norm": 0.2829376757144928, "learning_rate": 4.966010220451411e-05, "loss": 1.3232, "step": 679 }, { "epoch": 0.10689932991412683, "grad_norm": 0.2353716641664505, "learning_rate": 4.965908610567465e-05, "loss": 1.2851, "step": 680 }, { "epoch": 0.10705653481105937, "grad_norm": 0.2615984380245209, "learning_rate": 4.965806850075203e-05, "loss": 1.2552, "step": 681 }, { "epoch": 0.1072137397079919, "grad_norm": 0.23773109912872314, "learning_rate": 4.965704938980841e-05, "loss": 1.2961, "step": 682 }, { "epoch": 0.10737094460492444, "grad_norm": 0.2622957229614258, "learning_rate": 4.9656028772906014e-05, "loss": 1.3073, "step": 683 }, { "epoch": 0.10752814950185698, "grad_norm": 0.24974018335342407, "learning_rate": 4.965500665010721e-05, "loss": 1.2774, "step": 684 }, { "epoch": 0.10768535439878953, "grad_norm": 0.17124338448047638, "learning_rate": 4.9653983021474395e-05, "loss": 1.4159, "step": 685 }, { "epoch": 0.10784255929572206, "grad_norm": 0.16673363745212555, "learning_rate": 4.96529578870701e-05, "loss": 1.3748, "step": 686 }, { "epoch": 0.1079997641926546, "grad_norm": 0.25368422269821167, "learning_rate": 4.965193124695693e-05, "loss": 1.3958, "step": 687 }, { "epoch": 0.10815696908958713, "grad_norm": 0.22910015285015106, "learning_rate": 4.96509031011976e-05, "loss": 1.3259, "step": 688 }, { "epoch": 0.10831417398651969, "grad_norm": 0.277851939201355, "learning_rate": 4.96498734498549e-05, "loss": 1.3254, "step": 689 }, { "epoch": 0.10847137888345222, "grad_norm": 0.32443082332611084, "learning_rate": 4.964884229299172e-05, "loss": 1.3007, "step": 690 }, { "epoch": 0.10862858378038476, "grad_norm": 0.20710885524749756, "learning_rate": 4.964780963067102e-05, "loss": 1.3297, "step": 691 }, { "epoch": 0.1087857886773173, "grad_norm": 0.25522252917289734, "learning_rate": 4.96467754629559e-05, "loss": 1.2487, "step": 692 }, { "epoch": 0.10894299357424984, "grad_norm": 0.3286147713661194, "learning_rate": 4.9645739789909504e-05, "loss": 1.2255, "step": 693 }, { "epoch": 0.10910019847118238, "grad_norm": 0.3795601725578308, "learning_rate": 4.964470261159509e-05, "loss": 1.2725, "step": 694 }, { "epoch": 0.10925740336811492, "grad_norm": 0.3112131655216217, "learning_rate": 4.964366392807602e-05, "loss": 1.252, "step": 695 }, { "epoch": 0.10941460826504745, "grad_norm": 0.2891729176044464, "learning_rate": 4.964262373941571e-05, "loss": 1.3377, "step": 696 }, { "epoch": 0.10957181316197999, "grad_norm": 0.26973745226860046, "learning_rate": 4.96415820456777e-05, "loss": 1.3186, "step": 697 }, { "epoch": 0.10972901805891254, "grad_norm": 0.2832094430923462, "learning_rate": 4.964053884692562e-05, "loss": 1.3248, "step": 698 }, { "epoch": 0.10988622295584508, "grad_norm": 0.2840999960899353, "learning_rate": 4.963949414322318e-05, "loss": 1.2677, "step": 699 }, { "epoch": 0.11004342785277761, "grad_norm": 0.2891542911529541, "learning_rate": 4.963844793463418e-05, "loss": 1.3274, "step": 700 }, { "epoch": 0.11020063274971015, "grad_norm": 0.23569005727767944, "learning_rate": 4.963740022122252e-05, "loss": 1.2259, "step": 701 }, { "epoch": 0.1103578376466427, "grad_norm": 0.2174285501241684, "learning_rate": 4.963635100305221e-05, "loss": 1.2785, "step": 702 }, { "epoch": 0.11051504254357523, "grad_norm": 0.2753438651561737, "learning_rate": 4.96353002801873e-05, "loss": 1.3263, "step": 703 }, { "epoch": 0.11067224744050777, "grad_norm": 0.21094419062137604, "learning_rate": 4.963424805269198e-05, "loss": 1.2439, "step": 704 }, { "epoch": 0.1108294523374403, "grad_norm": 0.20501388609409332, "learning_rate": 4.963319432063052e-05, "loss": 1.3091, "step": 705 }, { "epoch": 0.11098665723437286, "grad_norm": 0.2041424810886383, "learning_rate": 4.963213908406728e-05, "loss": 1.2951, "step": 706 }, { "epoch": 0.1111438621313054, "grad_norm": 0.24955442547798157, "learning_rate": 4.963108234306669e-05, "loss": 1.2208, "step": 707 }, { "epoch": 0.11130106702823793, "grad_norm": 0.39431118965148926, "learning_rate": 4.9630024097693314e-05, "loss": 1.306, "step": 708 }, { "epoch": 0.11145827192517047, "grad_norm": 0.24803434312343597, "learning_rate": 4.962896434801178e-05, "loss": 1.2951, "step": 709 }, { "epoch": 0.111615476822103, "grad_norm": 0.2736116349697113, "learning_rate": 4.962790309408681e-05, "loss": 1.3245, "step": 710 }, { "epoch": 0.11177268171903555, "grad_norm": 0.24502034485340118, "learning_rate": 4.9626840335983215e-05, "loss": 1.2961, "step": 711 }, { "epoch": 0.11192988661596809, "grad_norm": 0.24158692359924316, "learning_rate": 4.962577607376592e-05, "loss": 1.2387, "step": 712 }, { "epoch": 0.11208709151290062, "grad_norm": 0.24977251887321472, "learning_rate": 4.962471030749991e-05, "loss": 1.2976, "step": 713 }, { "epoch": 0.11224429640983316, "grad_norm": 0.15401019155979156, "learning_rate": 4.962364303725029e-05, "loss": 1.2684, "step": 714 }, { "epoch": 0.11240150130676571, "grad_norm": 0.2611544132232666, "learning_rate": 4.962257426308224e-05, "loss": 1.2928, "step": 715 }, { "epoch": 0.11255870620369825, "grad_norm": 0.434600830078125, "learning_rate": 4.962150398506103e-05, "loss": 1.3657, "step": 716 }, { "epoch": 0.11271591110063078, "grad_norm": 0.2896519601345062, "learning_rate": 4.9620432203252045e-05, "loss": 1.3055, "step": 717 }, { "epoch": 0.11287311599756332, "grad_norm": 0.1891547590494156, "learning_rate": 4.961935891772073e-05, "loss": 1.3355, "step": 718 }, { "epoch": 0.11303032089449587, "grad_norm": 0.2223133146762848, "learning_rate": 4.9618284128532644e-05, "loss": 1.2939, "step": 719 }, { "epoch": 0.1131875257914284, "grad_norm": 0.27313077449798584, "learning_rate": 4.961720783575343e-05, "loss": 1.2596, "step": 720 }, { "epoch": 0.11334473068836094, "grad_norm": 0.24807053804397583, "learning_rate": 4.961613003944883e-05, "loss": 1.2851, "step": 721 }, { "epoch": 0.11350193558529348, "grad_norm": 0.2343195378780365, "learning_rate": 4.9615050739684656e-05, "loss": 1.2899, "step": 722 }, { "epoch": 0.11365914048222601, "grad_norm": 0.229730024933815, "learning_rate": 4.961396993652684e-05, "loss": 1.3118, "step": 723 }, { "epoch": 0.11381634537915856, "grad_norm": 0.2397170215845108, "learning_rate": 4.9612887630041394e-05, "loss": 1.2148, "step": 724 }, { "epoch": 0.1139735502760911, "grad_norm": 0.2167958915233612, "learning_rate": 4.9611803820294414e-05, "loss": 1.2597, "step": 725 }, { "epoch": 0.11413075517302364, "grad_norm": 0.21318721771240234, "learning_rate": 4.961071850735209e-05, "loss": 1.3949, "step": 726 }, { "epoch": 0.11428796006995617, "grad_norm": 0.21988382935523987, "learning_rate": 4.960963169128073e-05, "loss": 1.3196, "step": 727 }, { "epoch": 0.11444516496688872, "grad_norm": 0.17555692791938782, "learning_rate": 4.96085433721467e-05, "loss": 1.3661, "step": 728 }, { "epoch": 0.11460236986382126, "grad_norm": 0.3545222282409668, "learning_rate": 4.960745355001647e-05, "loss": 1.2659, "step": 729 }, { "epoch": 0.1147595747607538, "grad_norm": 0.3196569085121155, "learning_rate": 4.960636222495659e-05, "loss": 1.2893, "step": 730 }, { "epoch": 0.11491677965768633, "grad_norm": 0.2241334766149521, "learning_rate": 4.960526939703374e-05, "loss": 1.2155, "step": 731 }, { "epoch": 0.11507398455461888, "grad_norm": 0.26543980836868286, "learning_rate": 4.960417506631465e-05, "loss": 1.3615, "step": 732 }, { "epoch": 0.11523118945155142, "grad_norm": 0.21146585047245026, "learning_rate": 4.960307923286616e-05, "loss": 1.3516, "step": 733 }, { "epoch": 0.11538839434848396, "grad_norm": 0.18095079064369202, "learning_rate": 4.960198189675519e-05, "loss": 1.3581, "step": 734 }, { "epoch": 0.11554559924541649, "grad_norm": 0.26687100529670715, "learning_rate": 4.9600883058048775e-05, "loss": 1.1971, "step": 735 }, { "epoch": 0.11570280414234903, "grad_norm": 0.2271047830581665, "learning_rate": 4.959978271681402e-05, "loss": 1.1867, "step": 736 }, { "epoch": 0.11586000903928158, "grad_norm": 0.2102867215871811, "learning_rate": 4.959868087311814e-05, "loss": 1.2749, "step": 737 }, { "epoch": 0.11601721393621411, "grad_norm": 0.2752761244773865, "learning_rate": 4.9597577527028424e-05, "loss": 1.1753, "step": 738 }, { "epoch": 0.11617441883314665, "grad_norm": 0.22385725378990173, "learning_rate": 4.959647267861226e-05, "loss": 1.343, "step": 739 }, { "epoch": 0.11633162373007919, "grad_norm": 0.2597412168979645, "learning_rate": 4.959536632793712e-05, "loss": 1.2539, "step": 740 }, { "epoch": 0.11648882862701174, "grad_norm": 0.27975237369537354, "learning_rate": 4.959425847507059e-05, "loss": 1.2883, "step": 741 }, { "epoch": 0.11664603352394427, "grad_norm": 0.29127049446105957, "learning_rate": 4.959314912008033e-05, "loss": 1.3139, "step": 742 }, { "epoch": 0.11680323842087681, "grad_norm": 0.19929318130016327, "learning_rate": 4.9592038263034094e-05, "loss": 1.271, "step": 743 }, { "epoch": 0.11696044331780935, "grad_norm": 0.23164550960063934, "learning_rate": 4.9590925903999716e-05, "loss": 1.3359, "step": 744 }, { "epoch": 0.1171176482147419, "grad_norm": 0.27876612544059753, "learning_rate": 4.958981204304516e-05, "loss": 1.2568, "step": 745 }, { "epoch": 0.11727485311167443, "grad_norm": 0.2459796965122223, "learning_rate": 4.9588696680238435e-05, "loss": 1.2426, "step": 746 }, { "epoch": 0.11743205800860697, "grad_norm": 0.2039456069469452, "learning_rate": 4.958757981564767e-05, "loss": 1.2681, "step": 747 }, { "epoch": 0.1175892629055395, "grad_norm": 0.24796408414840698, "learning_rate": 4.958646144934108e-05, "loss": 1.257, "step": 748 }, { "epoch": 0.11774646780247204, "grad_norm": 0.2779620289802551, "learning_rate": 4.958534158138697e-05, "loss": 1.2933, "step": 749 }, { "epoch": 0.11790367269940459, "grad_norm": 0.20878851413726807, "learning_rate": 4.9584220211853735e-05, "loss": 1.2902, "step": 750 }, { "epoch": 0.11806087759633713, "grad_norm": 0.24720412492752075, "learning_rate": 4.958309734080987e-05, "loss": 1.203, "step": 751 }, { "epoch": 0.11821808249326966, "grad_norm": 0.287654846906662, "learning_rate": 4.9581972968323956e-05, "loss": 1.3141, "step": 752 }, { "epoch": 0.1183752873902022, "grad_norm": 0.23071719706058502, "learning_rate": 4.958084709446466e-05, "loss": 1.3145, "step": 753 }, { "epoch": 0.11853249228713475, "grad_norm": 0.21027110517024994, "learning_rate": 4.9579719719300746e-05, "loss": 1.2893, "step": 754 }, { "epoch": 0.11868969718406729, "grad_norm": 0.17173202335834503, "learning_rate": 4.9578590842901066e-05, "loss": 1.2618, "step": 755 }, { "epoch": 0.11884690208099982, "grad_norm": 0.24606984853744507, "learning_rate": 4.957746046533457e-05, "loss": 1.1904, "step": 756 }, { "epoch": 0.11900410697793236, "grad_norm": 0.248653382062912, "learning_rate": 4.957632858667031e-05, "loss": 1.331, "step": 757 }, { "epoch": 0.11916131187486491, "grad_norm": 0.1904144436120987, "learning_rate": 4.9575195206977406e-05, "loss": 1.3303, "step": 758 }, { "epoch": 0.11931851677179744, "grad_norm": 0.39540621638298035, "learning_rate": 4.9574060326325075e-05, "loss": 1.3455, "step": 759 }, { "epoch": 0.11947572166872998, "grad_norm": 0.20992301404476166, "learning_rate": 4.957292394478265e-05, "loss": 1.2911, "step": 760 }, { "epoch": 0.11963292656566252, "grad_norm": 0.23418502509593964, "learning_rate": 4.957178606241951e-05, "loss": 1.35, "step": 761 }, { "epoch": 0.11979013146259505, "grad_norm": 0.24480225145816803, "learning_rate": 4.957064667930517e-05, "loss": 1.2138, "step": 762 }, { "epoch": 0.1199473363595276, "grad_norm": 0.22909322381019592, "learning_rate": 4.956950579550922e-05, "loss": 1.1915, "step": 763 }, { "epoch": 0.12010454125646014, "grad_norm": 0.16839763522148132, "learning_rate": 4.956836341110134e-05, "loss": 1.234, "step": 764 }, { "epoch": 0.12026174615339268, "grad_norm": 0.2291131466627121, "learning_rate": 4.956721952615129e-05, "loss": 1.2964, "step": 765 }, { "epoch": 0.12041895105032521, "grad_norm": 0.2606765329837799, "learning_rate": 4.956607414072895e-05, "loss": 1.2785, "step": 766 }, { "epoch": 0.12057615594725776, "grad_norm": 0.24100011587142944, "learning_rate": 4.956492725490426e-05, "loss": 1.2389, "step": 767 }, { "epoch": 0.1207333608441903, "grad_norm": 0.2868693172931671, "learning_rate": 4.956377886874729e-05, "loss": 1.3852, "step": 768 }, { "epoch": 0.12089056574112284, "grad_norm": 0.29049259424209595, "learning_rate": 4.956262898232816e-05, "loss": 1.1511, "step": 769 }, { "epoch": 0.12104777063805537, "grad_norm": 0.31396448612213135, "learning_rate": 4.9561477595717106e-05, "loss": 1.2687, "step": 770 }, { "epoch": 0.12120497553498792, "grad_norm": 0.3348733186721802, "learning_rate": 4.956032470898445e-05, "loss": 1.1933, "step": 771 }, { "epoch": 0.12136218043192046, "grad_norm": 0.2009342461824417, "learning_rate": 4.955917032220061e-05, "loss": 1.3299, "step": 772 }, { "epoch": 0.121519385328853, "grad_norm": 0.2037377655506134, "learning_rate": 4.9558014435436084e-05, "loss": 1.3208, "step": 773 }, { "epoch": 0.12167659022578553, "grad_norm": 0.3118877410888672, "learning_rate": 4.955685704876147e-05, "loss": 1.1927, "step": 774 }, { "epoch": 0.12183379512271807, "grad_norm": 0.21884632110595703, "learning_rate": 4.955569816224747e-05, "loss": 1.2661, "step": 775 }, { "epoch": 0.12199100001965062, "grad_norm": 0.25817862153053284, "learning_rate": 4.9554537775964846e-05, "loss": 1.3077, "step": 776 }, { "epoch": 0.12214820491658315, "grad_norm": 0.27827751636505127, "learning_rate": 4.955337588998449e-05, "loss": 1.2709, "step": 777 }, { "epoch": 0.12230540981351569, "grad_norm": 0.30520737171173096, "learning_rate": 4.955221250437735e-05, "loss": 1.2407, "step": 778 }, { "epoch": 0.12246261471044823, "grad_norm": 0.21729423105716705, "learning_rate": 4.9551047619214473e-05, "loss": 1.3392, "step": 779 }, { "epoch": 0.12261981960738078, "grad_norm": 0.2408866286277771, "learning_rate": 4.954988123456703e-05, "loss": 1.215, "step": 780 }, { "epoch": 0.12277702450431331, "grad_norm": 0.23833869397640228, "learning_rate": 4.954871335050625e-05, "loss": 1.3607, "step": 781 }, { "epoch": 0.12293422940124585, "grad_norm": 0.27017349004745483, "learning_rate": 4.954754396710345e-05, "loss": 1.2662, "step": 782 }, { "epoch": 0.12309143429817838, "grad_norm": 0.21869684755802155, "learning_rate": 4.954637308443007e-05, "loss": 1.2384, "step": 783 }, { "epoch": 0.12324863919511093, "grad_norm": 0.18912911415100098, "learning_rate": 4.9545200702557615e-05, "loss": 1.2958, "step": 784 }, { "epoch": 0.12340584409204347, "grad_norm": 0.27320876717567444, "learning_rate": 4.954402682155768e-05, "loss": 1.2546, "step": 785 }, { "epoch": 0.12356304898897601, "grad_norm": 0.2938046455383301, "learning_rate": 4.954285144150198e-05, "loss": 1.3451, "step": 786 }, { "epoch": 0.12372025388590854, "grad_norm": 0.18271508812904358, "learning_rate": 4.954167456246229e-05, "loss": 1.2239, "step": 787 }, { "epoch": 0.12387745878284108, "grad_norm": 0.21799346804618835, "learning_rate": 4.9540496184510495e-05, "loss": 1.2471, "step": 788 }, { "epoch": 0.12403466367977363, "grad_norm": 0.21574997901916504, "learning_rate": 4.9539316307718564e-05, "loss": 1.3137, "step": 789 }, { "epoch": 0.12419186857670617, "grad_norm": 0.21586358547210693, "learning_rate": 4.953813493215855e-05, "loss": 1.2763, "step": 790 }, { "epoch": 0.1243490734736387, "grad_norm": 0.2723408043384552, "learning_rate": 4.953695205790262e-05, "loss": 1.4148, "step": 791 }, { "epoch": 0.12450627837057124, "grad_norm": 0.29501527547836304, "learning_rate": 4.9535767685023026e-05, "loss": 1.3093, "step": 792 }, { "epoch": 0.12466348326750379, "grad_norm": 0.2884112000465393, "learning_rate": 4.9534581813592086e-05, "loss": 1.3276, "step": 793 }, { "epoch": 0.12482068816443632, "grad_norm": 0.24246759712696075, "learning_rate": 4.9533394443682234e-05, "loss": 1.3203, "step": 794 }, { "epoch": 0.12497789306136886, "grad_norm": 0.23493270576000214, "learning_rate": 4.9532205575365995e-05, "loss": 1.2567, "step": 795 }, { "epoch": 0.1251350979583014, "grad_norm": 0.26456305384635925, "learning_rate": 4.953101520871598e-05, "loss": 1.3194, "step": 796 }, { "epoch": 0.12529230285523393, "grad_norm": 0.18891221284866333, "learning_rate": 4.952982334380489e-05, "loss": 1.3041, "step": 797 }, { "epoch": 0.12544950775216648, "grad_norm": 0.21460258960723877, "learning_rate": 4.952862998070552e-05, "loss": 1.2274, "step": 798 }, { "epoch": 0.125606712649099, "grad_norm": 0.2832646667957306, "learning_rate": 4.9527435119490753e-05, "loss": 1.2009, "step": 799 }, { "epoch": 0.12576391754603156, "grad_norm": 0.22183702886104584, "learning_rate": 4.9526238760233576e-05, "loss": 1.31, "step": 800 }, { "epoch": 0.12576391754603156, "eval_loss": 1.2745521068572998, "eval_runtime": 2292.1003, "eval_samples_per_second": 4.039, "eval_steps_per_second": 2.02, "step": 800 }, { "epoch": 0.1259211224429641, "grad_norm": 0.21697020530700684, "learning_rate": 4.9525040903007046e-05, "loss": 1.3197, "step": 801 }, { "epoch": 0.12607832733989663, "grad_norm": 0.36354196071624756, "learning_rate": 4.952384154788433e-05, "loss": 1.1926, "step": 802 }, { "epoch": 0.12623553223682918, "grad_norm": 0.27054232358932495, "learning_rate": 4.952264069493868e-05, "loss": 1.3199, "step": 803 }, { "epoch": 0.12639273713376173, "grad_norm": 0.2425469011068344, "learning_rate": 4.952143834424344e-05, "loss": 1.2906, "step": 804 }, { "epoch": 0.12654994203069425, "grad_norm": 0.1988941729068756, "learning_rate": 4.952023449587205e-05, "loss": 1.3183, "step": 805 }, { "epoch": 0.1267071469276268, "grad_norm": 0.2429157942533493, "learning_rate": 4.951902914989802e-05, "loss": 1.2497, "step": 806 }, { "epoch": 0.12686435182455932, "grad_norm": 0.2704293727874756, "learning_rate": 4.951782230639499e-05, "loss": 1.3113, "step": 807 }, { "epoch": 0.12702155672149187, "grad_norm": 0.31801360845565796, "learning_rate": 4.951661396543664e-05, "loss": 1.2354, "step": 808 }, { "epoch": 0.12717876161842442, "grad_norm": 0.21358463168144226, "learning_rate": 4.951540412709681e-05, "loss": 1.3512, "step": 809 }, { "epoch": 0.12733596651535695, "grad_norm": 0.24300484359264374, "learning_rate": 4.951419279144936e-05, "loss": 1.213, "step": 810 }, { "epoch": 0.1274931714122895, "grad_norm": 0.39550015330314636, "learning_rate": 4.951297995856828e-05, "loss": 1.1872, "step": 811 }, { "epoch": 0.12765037630922202, "grad_norm": 0.20150414109230042, "learning_rate": 4.951176562852765e-05, "loss": 1.3469, "step": 812 }, { "epoch": 0.12780758120615457, "grad_norm": 0.2050725519657135, "learning_rate": 4.951054980140164e-05, "loss": 1.259, "step": 813 }, { "epoch": 0.12796478610308712, "grad_norm": 0.23815183341503143, "learning_rate": 4.950933247726451e-05, "loss": 1.2961, "step": 814 }, { "epoch": 0.12812199100001964, "grad_norm": 0.28224676847457886, "learning_rate": 4.95081136561906e-05, "loss": 1.2631, "step": 815 }, { "epoch": 0.1282791958969522, "grad_norm": 0.294791042804718, "learning_rate": 4.9506893338254353e-05, "loss": 1.1834, "step": 816 }, { "epoch": 0.12843640079388474, "grad_norm": 0.29148972034454346, "learning_rate": 4.9505671523530306e-05, "loss": 1.2573, "step": 817 }, { "epoch": 0.12859360569081726, "grad_norm": 0.29371243715286255, "learning_rate": 4.950444821209308e-05, "loss": 1.4532, "step": 818 }, { "epoch": 0.12875081058774981, "grad_norm": 0.2303713709115982, "learning_rate": 4.9503223404017396e-05, "loss": 1.2828, "step": 819 }, { "epoch": 0.12890801548468234, "grad_norm": 0.24906295537948608, "learning_rate": 4.9501997099378046e-05, "loss": 1.2759, "step": 820 }, { "epoch": 0.1290652203816149, "grad_norm": 0.1983998864889145, "learning_rate": 4.950076929824994e-05, "loss": 1.3111, "step": 821 }, { "epoch": 0.12922242527854744, "grad_norm": 0.2079075276851654, "learning_rate": 4.9499540000708064e-05, "loss": 1.3416, "step": 822 }, { "epoch": 0.12937963017547996, "grad_norm": 0.22548237442970276, "learning_rate": 4.94983092068275e-05, "loss": 1.3879, "step": 823 }, { "epoch": 0.1295368350724125, "grad_norm": 0.2052278220653534, "learning_rate": 4.949707691668343e-05, "loss": 1.3347, "step": 824 }, { "epoch": 0.12969403996934503, "grad_norm": 0.21978795528411865, "learning_rate": 4.949584313035109e-05, "loss": 1.1345, "step": 825 }, { "epoch": 0.12985124486627758, "grad_norm": 0.18930193781852722, "learning_rate": 4.9494607847905863e-05, "loss": 1.319, "step": 826 }, { "epoch": 0.13000844976321013, "grad_norm": 0.24538543820381165, "learning_rate": 4.9493371069423176e-05, "loss": 1.3103, "step": 827 }, { "epoch": 0.13016565466014265, "grad_norm": 0.2874930799007416, "learning_rate": 4.9492132794978586e-05, "loss": 1.3388, "step": 828 }, { "epoch": 0.1303228595570752, "grad_norm": 0.23338377475738525, "learning_rate": 4.949089302464771e-05, "loss": 1.2793, "step": 829 }, { "epoch": 0.13048006445400775, "grad_norm": 0.23670902848243713, "learning_rate": 4.948965175850626e-05, "loss": 1.2808, "step": 830 }, { "epoch": 0.13063726935094028, "grad_norm": 0.2617732584476471, "learning_rate": 4.9488408996630066e-05, "loss": 1.2641, "step": 831 }, { "epoch": 0.13079447424787283, "grad_norm": 0.24584044516086578, "learning_rate": 4.948716473909502e-05, "loss": 1.2462, "step": 832 }, { "epoch": 0.13095167914480535, "grad_norm": 0.2507297098636627, "learning_rate": 4.948591898597712e-05, "loss": 1.2211, "step": 833 }, { "epoch": 0.1311088840417379, "grad_norm": 0.25439611077308655, "learning_rate": 4.948467173735245e-05, "loss": 1.2762, "step": 834 }, { "epoch": 0.13126608893867045, "grad_norm": 0.19934779405593872, "learning_rate": 4.948342299329719e-05, "loss": 1.1798, "step": 835 }, { "epoch": 0.13142329383560297, "grad_norm": 0.24154123663902283, "learning_rate": 4.948217275388761e-05, "loss": 1.2608, "step": 836 }, { "epoch": 0.13158049873253552, "grad_norm": 0.2484877109527588, "learning_rate": 4.948092101920006e-05, "loss": 1.2466, "step": 837 }, { "epoch": 0.13173770362946804, "grad_norm": 0.28683343529701233, "learning_rate": 4.9479667789311e-05, "loss": 1.1915, "step": 838 }, { "epoch": 0.1318949085264006, "grad_norm": 0.21289369463920593, "learning_rate": 4.9478413064296976e-05, "loss": 1.2642, "step": 839 }, { "epoch": 0.13205211342333315, "grad_norm": 0.22933778166770935, "learning_rate": 4.947715684423461e-05, "loss": 1.2182, "step": 840 }, { "epoch": 0.13220931832026567, "grad_norm": 0.2507724463939667, "learning_rate": 4.9475899129200635e-05, "loss": 1.3089, "step": 841 }, { "epoch": 0.13236652321719822, "grad_norm": 0.251770943403244, "learning_rate": 4.947463991927187e-05, "loss": 1.3194, "step": 842 }, { "epoch": 0.13252372811413077, "grad_norm": 0.2533280849456787, "learning_rate": 4.947337921452521e-05, "loss": 1.2141, "step": 843 }, { "epoch": 0.1326809330110633, "grad_norm": 0.26309993863105774, "learning_rate": 4.9472117015037664e-05, "loss": 1.2265, "step": 844 }, { "epoch": 0.13283813790799584, "grad_norm": 0.29711806774139404, "learning_rate": 4.9470853320886335e-05, "loss": 1.2538, "step": 845 }, { "epoch": 0.13299534280492836, "grad_norm": 0.24551883339881897, "learning_rate": 4.9469588132148373e-05, "loss": 1.2927, "step": 846 }, { "epoch": 0.1331525477018609, "grad_norm": 0.28027257323265076, "learning_rate": 4.946832144890108e-05, "loss": 1.2712, "step": 847 }, { "epoch": 0.13330975259879346, "grad_norm": 0.22099149227142334, "learning_rate": 4.9467053271221804e-05, "loss": 1.2095, "step": 848 }, { "epoch": 0.13346695749572599, "grad_norm": 0.19661381840705872, "learning_rate": 4.946578359918801e-05, "loss": 1.2855, "step": 849 }, { "epoch": 0.13362416239265854, "grad_norm": 0.22767631709575653, "learning_rate": 4.946451243287723e-05, "loss": 1.2932, "step": 850 }, { "epoch": 0.13378136728959106, "grad_norm": 0.28008589148521423, "learning_rate": 4.946323977236712e-05, "loss": 1.2335, "step": 851 }, { "epoch": 0.1339385721865236, "grad_norm": 0.2091825157403946, "learning_rate": 4.94619656177354e-05, "loss": 1.3151, "step": 852 }, { "epoch": 0.13409577708345616, "grad_norm": 0.1978277713060379, "learning_rate": 4.946068996905989e-05, "loss": 1.3359, "step": 853 }, { "epoch": 0.13425298198038868, "grad_norm": 0.21397674083709717, "learning_rate": 4.9459412826418505e-05, "loss": 1.2998, "step": 854 }, { "epoch": 0.13441018687732123, "grad_norm": 0.30490776896476746, "learning_rate": 4.945813418988925e-05, "loss": 1.2607, "step": 855 }, { "epoch": 0.13456739177425378, "grad_norm": 0.2896914780139923, "learning_rate": 4.945685405955021e-05, "loss": 1.2329, "step": 856 }, { "epoch": 0.1347245966711863, "grad_norm": 0.1988048106431961, "learning_rate": 4.945557243547958e-05, "loss": 1.2877, "step": 857 }, { "epoch": 0.13488180156811885, "grad_norm": 0.17888212203979492, "learning_rate": 4.945428931775563e-05, "loss": 1.2543, "step": 858 }, { "epoch": 0.13503900646505138, "grad_norm": 0.2748056650161743, "learning_rate": 4.945300470645673e-05, "loss": 1.3461, "step": 859 }, { "epoch": 0.13519621136198393, "grad_norm": 0.23218591511249542, "learning_rate": 4.945171860166135e-05, "loss": 1.2878, "step": 860 }, { "epoch": 0.13535341625891648, "grad_norm": 0.33142325282096863, "learning_rate": 4.9450431003448015e-05, "loss": 1.294, "step": 861 }, { "epoch": 0.135510621155849, "grad_norm": 0.2330816686153412, "learning_rate": 4.944914191189539e-05, "loss": 1.3593, "step": 862 }, { "epoch": 0.13566782605278155, "grad_norm": 0.23989921808242798, "learning_rate": 4.9447851327082204e-05, "loss": 1.2879, "step": 863 }, { "epoch": 0.13582503094971407, "grad_norm": 0.21358944475650787, "learning_rate": 4.944655924908727e-05, "loss": 1.222, "step": 864 }, { "epoch": 0.13598223584664662, "grad_norm": 0.30434924364089966, "learning_rate": 4.9445265677989515e-05, "loss": 1.273, "step": 865 }, { "epoch": 0.13613944074357917, "grad_norm": 0.22028383612632751, "learning_rate": 4.944397061386794e-05, "loss": 1.2494, "step": 866 }, { "epoch": 0.1362966456405117, "grad_norm": 0.2354927659034729, "learning_rate": 4.944267405680164e-05, "loss": 1.1469, "step": 867 }, { "epoch": 0.13645385053744424, "grad_norm": 0.28941988945007324, "learning_rate": 4.944137600686981e-05, "loss": 1.1678, "step": 868 }, { "epoch": 0.1366110554343768, "grad_norm": 0.2538214325904846, "learning_rate": 4.944007646415172e-05, "loss": 1.2636, "step": 869 }, { "epoch": 0.13676826033130932, "grad_norm": 0.3719157576560974, "learning_rate": 4.943877542872676e-05, "loss": 1.2901, "step": 870 }, { "epoch": 0.13692546522824187, "grad_norm": 0.2994091212749481, "learning_rate": 4.943747290067438e-05, "loss": 1.2209, "step": 871 }, { "epoch": 0.1370826701251744, "grad_norm": 0.23586580157279968, "learning_rate": 4.9436168880074115e-05, "loss": 1.2989, "step": 872 }, { "epoch": 0.13723987502210694, "grad_norm": 0.193126380443573, "learning_rate": 4.943486336700564e-05, "loss": 1.204, "step": 873 }, { "epoch": 0.1373970799190395, "grad_norm": 0.18505080044269562, "learning_rate": 4.943355636154868e-05, "loss": 1.3247, "step": 874 }, { "epoch": 0.137554284815972, "grad_norm": 0.2586881220340729, "learning_rate": 4.9432247863783064e-05, "loss": 1.3315, "step": 875 }, { "epoch": 0.13771148971290456, "grad_norm": 0.2904506027698517, "learning_rate": 4.943093787378871e-05, "loss": 1.2593, "step": 876 }, { "epoch": 0.13786869460983708, "grad_norm": 0.2971174120903015, "learning_rate": 4.9429626391645615e-05, "loss": 1.2241, "step": 877 }, { "epoch": 0.13802589950676963, "grad_norm": 0.42521703243255615, "learning_rate": 4.9428313417433894e-05, "loss": 1.2638, "step": 878 }, { "epoch": 0.13818310440370218, "grad_norm": 0.2515777349472046, "learning_rate": 4.9426998951233735e-05, "loss": 1.3111, "step": 879 }, { "epoch": 0.1383403093006347, "grad_norm": 0.25959545373916626, "learning_rate": 4.942568299312541e-05, "loss": 1.2505, "step": 880 }, { "epoch": 0.13849751419756726, "grad_norm": 0.28090932965278625, "learning_rate": 4.942436554318931e-05, "loss": 1.1604, "step": 881 }, { "epoch": 0.1386547190944998, "grad_norm": 0.21833541989326477, "learning_rate": 4.942304660150588e-05, "loss": 1.2246, "step": 882 }, { "epoch": 0.13881192399143233, "grad_norm": 0.26167765259742737, "learning_rate": 4.9421726168155704e-05, "loss": 1.2399, "step": 883 }, { "epoch": 0.13896912888836488, "grad_norm": 0.23778817057609558, "learning_rate": 4.9420404243219395e-05, "loss": 1.2692, "step": 884 }, { "epoch": 0.1391263337852974, "grad_norm": 0.43253734707832336, "learning_rate": 4.941908082677773e-05, "loss": 1.2302, "step": 885 }, { "epoch": 0.13928353868222995, "grad_norm": 0.2448786050081253, "learning_rate": 4.94177559189115e-05, "loss": 1.3163, "step": 886 }, { "epoch": 0.1394407435791625, "grad_norm": 0.24711786210536957, "learning_rate": 4.941642951970165e-05, "loss": 1.2756, "step": 887 }, { "epoch": 0.13959794847609502, "grad_norm": 0.22932004928588867, "learning_rate": 4.941510162922917e-05, "loss": 1.3087, "step": 888 }, { "epoch": 0.13975515337302757, "grad_norm": 0.24999158084392548, "learning_rate": 4.941377224757518e-05, "loss": 1.3328, "step": 889 }, { "epoch": 0.1399123582699601, "grad_norm": 0.21222981810569763, "learning_rate": 4.941244137482088e-05, "loss": 1.3177, "step": 890 }, { "epoch": 0.14006956316689265, "grad_norm": 0.22691656649112701, "learning_rate": 4.941110901104754e-05, "loss": 1.2937, "step": 891 }, { "epoch": 0.1402267680638252, "grad_norm": 0.3120933771133423, "learning_rate": 4.940977515633653e-05, "loss": 1.1604, "step": 892 }, { "epoch": 0.14038397296075772, "grad_norm": 0.24279998242855072, "learning_rate": 4.940843981076934e-05, "loss": 1.3234, "step": 893 }, { "epoch": 0.14054117785769027, "grad_norm": 0.25406959652900696, "learning_rate": 4.940710297442751e-05, "loss": 1.3216, "step": 894 }, { "epoch": 0.14069838275462282, "grad_norm": 0.29678472876548767, "learning_rate": 4.940576464739269e-05, "loss": 1.2706, "step": 895 }, { "epoch": 0.14085558765155534, "grad_norm": 0.25185081362724304, "learning_rate": 4.9404424829746634e-05, "loss": 1.2456, "step": 896 }, { "epoch": 0.1410127925484879, "grad_norm": 0.2171952873468399, "learning_rate": 4.940308352157115e-05, "loss": 1.2943, "step": 897 }, { "epoch": 0.14116999744542041, "grad_norm": 0.21498677134513855, "learning_rate": 4.940174072294818e-05, "loss": 1.3466, "step": 898 }, { "epoch": 0.14132720234235296, "grad_norm": 0.2881999611854553, "learning_rate": 4.940039643395972e-05, "loss": 1.2322, "step": 899 }, { "epoch": 0.14148440723928551, "grad_norm": 0.2709384858608246, "learning_rate": 4.939905065468789e-05, "loss": 1.2228, "step": 900 }, { "epoch": 0.14164161213621804, "grad_norm": 0.2723088562488556, "learning_rate": 4.9397703385214875e-05, "loss": 1.1937, "step": 901 }, { "epoch": 0.1417988170331506, "grad_norm": 0.4296363294124603, "learning_rate": 4.939635462562297e-05, "loss": 1.2043, "step": 902 }, { "epoch": 0.1419560219300831, "grad_norm": 0.3255182206630707, "learning_rate": 4.939500437599454e-05, "loss": 1.1563, "step": 903 }, { "epoch": 0.14211322682701566, "grad_norm": 0.33772897720336914, "learning_rate": 4.939365263641206e-05, "loss": 1.3019, "step": 904 }, { "epoch": 0.1422704317239482, "grad_norm": 0.18991219997406006, "learning_rate": 4.93922994069581e-05, "loss": 1.3575, "step": 905 }, { "epoch": 0.14242763662088073, "grad_norm": 0.23950403928756714, "learning_rate": 4.939094468771529e-05, "loss": 1.2512, "step": 906 }, { "epoch": 0.14258484151781328, "grad_norm": 0.29783302545547485, "learning_rate": 4.938958847876637e-05, "loss": 1.3033, "step": 907 }, { "epoch": 0.14274204641474583, "grad_norm": 0.3168744742870331, "learning_rate": 4.93882307801942e-05, "loss": 1.1934, "step": 908 }, { "epoch": 0.14289925131167835, "grad_norm": 0.22578391432762146, "learning_rate": 4.9386871592081675e-05, "loss": 1.3307, "step": 909 }, { "epoch": 0.1430564562086109, "grad_norm": 0.32671108841896057, "learning_rate": 4.9385510914511824e-05, "loss": 1.2436, "step": 910 }, { "epoch": 0.14321366110554343, "grad_norm": 0.2524665296077728, "learning_rate": 4.938414874756774e-05, "loss": 1.2611, "step": 911 }, { "epoch": 0.14337086600247598, "grad_norm": 0.3576960563659668, "learning_rate": 4.9382785091332625e-05, "loss": 1.3721, "step": 912 }, { "epoch": 0.14352807089940853, "grad_norm": 0.2915900945663452, "learning_rate": 4.9381419945889776e-05, "loss": 1.3539, "step": 913 }, { "epoch": 0.14368527579634105, "grad_norm": 0.3168608844280243, "learning_rate": 4.938005331132256e-05, "loss": 1.224, "step": 914 }, { "epoch": 0.1438424806932736, "grad_norm": 0.24886426329612732, "learning_rate": 4.937868518771445e-05, "loss": 1.2299, "step": 915 }, { "epoch": 0.14399968559020612, "grad_norm": 0.26588642597198486, "learning_rate": 4.9377315575149e-05, "loss": 1.1947, "step": 916 }, { "epoch": 0.14415689048713867, "grad_norm": 0.28032201528549194, "learning_rate": 4.937594447370986e-05, "loss": 1.3756, "step": 917 }, { "epoch": 0.14431409538407122, "grad_norm": 0.3017072081565857, "learning_rate": 4.937457188348078e-05, "loss": 1.2723, "step": 918 }, { "epoch": 0.14447130028100374, "grad_norm": 0.2926197648048401, "learning_rate": 4.937319780454559e-05, "loss": 1.2716, "step": 919 }, { "epoch": 0.1446285051779363, "grad_norm": 0.24066713452339172, "learning_rate": 4.937182223698821e-05, "loss": 1.2828, "step": 920 }, { "epoch": 0.14478571007486885, "grad_norm": 0.30001577734947205, "learning_rate": 4.937044518089266e-05, "loss": 1.2407, "step": 921 }, { "epoch": 0.14494291497180137, "grad_norm": 0.25927406549453735, "learning_rate": 4.9369066636343044e-05, "loss": 1.3004, "step": 922 }, { "epoch": 0.14510011986873392, "grad_norm": 0.2542930543422699, "learning_rate": 4.936768660342355e-05, "loss": 1.3312, "step": 923 }, { "epoch": 0.14525732476566644, "grad_norm": 0.25233832001686096, "learning_rate": 4.936630508221847e-05, "loss": 1.1879, "step": 924 }, { "epoch": 0.145414529662599, "grad_norm": 0.22136953473091125, "learning_rate": 4.9364922072812185e-05, "loss": 1.2649, "step": 925 }, { "epoch": 0.14557173455953154, "grad_norm": 0.21759863197803497, "learning_rate": 4.936353757528916e-05, "loss": 1.2467, "step": 926 }, { "epoch": 0.14572893945646406, "grad_norm": 0.27614825963974, "learning_rate": 4.936215158973396e-05, "loss": 1.1901, "step": 927 }, { "epoch": 0.1458861443533966, "grad_norm": 0.2502923309803009, "learning_rate": 4.936076411623124e-05, "loss": 1.3358, "step": 928 }, { "epoch": 0.14604334925032914, "grad_norm": 0.2419285923242569, "learning_rate": 4.935937515486573e-05, "loss": 1.24, "step": 929 }, { "epoch": 0.14620055414726169, "grad_norm": 0.35315272212028503, "learning_rate": 4.935798470572226e-05, "loss": 1.2452, "step": 930 }, { "epoch": 0.14635775904419424, "grad_norm": 0.28915464878082275, "learning_rate": 4.935659276888577e-05, "loss": 1.3369, "step": 931 }, { "epoch": 0.14651496394112676, "grad_norm": 0.23898139595985413, "learning_rate": 4.9355199344441254e-05, "loss": 1.2328, "step": 932 }, { "epoch": 0.1466721688380593, "grad_norm": 0.25197896361351013, "learning_rate": 4.935380443247384e-05, "loss": 1.2826, "step": 933 }, { "epoch": 0.14682937373499186, "grad_norm": 0.26547369360923767, "learning_rate": 4.9352408033068695e-05, "loss": 1.2284, "step": 934 }, { "epoch": 0.14698657863192438, "grad_norm": 0.22031289339065552, "learning_rate": 4.935101014631114e-05, "loss": 1.2918, "step": 935 }, { "epoch": 0.14714378352885693, "grad_norm": 0.2603214979171753, "learning_rate": 4.9349610772286525e-05, "loss": 1.1767, "step": 936 }, { "epoch": 0.14730098842578945, "grad_norm": 0.29469192028045654, "learning_rate": 4.934820991108032e-05, "loss": 1.2845, "step": 937 }, { "epoch": 0.147458193322722, "grad_norm": 0.30825692415237427, "learning_rate": 4.934680756277811e-05, "loss": 1.1999, "step": 938 }, { "epoch": 0.14761539821965455, "grad_norm": 0.25342094898223877, "learning_rate": 4.934540372746552e-05, "loss": 1.2285, "step": 939 }, { "epoch": 0.14777260311658708, "grad_norm": 0.26036733388900757, "learning_rate": 4.9343998405228295e-05, "loss": 1.2367, "step": 940 }, { "epoch": 0.14792980801351963, "grad_norm": 0.27401411533355713, "learning_rate": 4.934259159615228e-05, "loss": 1.1985, "step": 941 }, { "epoch": 0.14808701291045215, "grad_norm": 0.23039095103740692, "learning_rate": 4.934118330032338e-05, "loss": 1.2649, "step": 942 }, { "epoch": 0.1482442178073847, "grad_norm": 0.29547953605651855, "learning_rate": 4.933977351782761e-05, "loss": 1.1345, "step": 943 }, { "epoch": 0.14840142270431725, "grad_norm": 0.22598884999752045, "learning_rate": 4.933836224875109e-05, "loss": 1.2965, "step": 944 }, { "epoch": 0.14855862760124977, "grad_norm": 0.31008240580558777, "learning_rate": 4.9336949493180006e-05, "loss": 1.1144, "step": 945 }, { "epoch": 0.14871583249818232, "grad_norm": 0.28397658467292786, "learning_rate": 4.9335535251200636e-05, "loss": 1.266, "step": 946 }, { "epoch": 0.14887303739511487, "grad_norm": 0.2284776121377945, "learning_rate": 4.933411952289937e-05, "loss": 1.2164, "step": 947 }, { "epoch": 0.1490302422920474, "grad_norm": 0.2107551246881485, "learning_rate": 4.9332702308362665e-05, "loss": 1.2719, "step": 948 }, { "epoch": 0.14918744718897994, "grad_norm": 0.26652616262435913, "learning_rate": 4.933128360767709e-05, "loss": 1.2304, "step": 949 }, { "epoch": 0.14934465208591247, "grad_norm": 0.22624680399894714, "learning_rate": 4.932986342092928e-05, "loss": 1.2999, "step": 950 }, { "epoch": 0.14950185698284502, "grad_norm": 0.20410288870334625, "learning_rate": 4.932844174820598e-05, "loss": 1.2269, "step": 951 }, { "epoch": 0.14965906187977757, "grad_norm": 0.24987919628620148, "learning_rate": 4.932701858959403e-05, "loss": 1.3042, "step": 952 }, { "epoch": 0.1498162667767101, "grad_norm": 0.191947340965271, "learning_rate": 4.932559394518033e-05, "loss": 1.2803, "step": 953 }, { "epoch": 0.14997347167364264, "grad_norm": 0.3396085798740387, "learning_rate": 4.932416781505191e-05, "loss": 1.2014, "step": 954 }, { "epoch": 0.15013067657057516, "grad_norm": 0.22375932335853577, "learning_rate": 4.932274019929587e-05, "loss": 1.242, "step": 955 }, { "epoch": 0.1502878814675077, "grad_norm": 0.281097412109375, "learning_rate": 4.93213110979994e-05, "loss": 1.2742, "step": 956 }, { "epoch": 0.15044508636444026, "grad_norm": 0.24049919843673706, "learning_rate": 4.931988051124979e-05, "loss": 1.3166, "step": 957 }, { "epoch": 0.15060229126137278, "grad_norm": 0.24433936178684235, "learning_rate": 4.93184484391344e-05, "loss": 1.2933, "step": 958 }, { "epoch": 0.15075949615830533, "grad_norm": 0.3671477138996124, "learning_rate": 4.9317014881740706e-05, "loss": 1.1731, "step": 959 }, { "epoch": 0.15091670105523788, "grad_norm": 0.22575189173221588, "learning_rate": 4.931557983915627e-05, "loss": 1.2509, "step": 960 }, { "epoch": 0.15091670105523788, "eval_loss": 1.2455451488494873, "eval_runtime": 2308.563, "eval_samples_per_second": 4.01, "eval_steps_per_second": 2.005, "step": 960 }, { "epoch": 0.1510739059521704, "grad_norm": 0.25838157534599304, "learning_rate": 4.931414331146873e-05, "loss": 1.3554, "step": 961 }, { "epoch": 0.15123111084910296, "grad_norm": 0.3163435459136963, "learning_rate": 4.931270529876583e-05, "loss": 1.3133, "step": 962 }, { "epoch": 0.15138831574603548, "grad_norm": 0.30024880170822144, "learning_rate": 4.9311265801135384e-05, "loss": 1.2303, "step": 963 }, { "epoch": 0.15154552064296803, "grad_norm": 0.3424816429615021, "learning_rate": 4.9309824818665325e-05, "loss": 1.1929, "step": 964 }, { "epoch": 0.15170272553990058, "grad_norm": 0.27401861548423767, "learning_rate": 4.930838235144366e-05, "loss": 1.2198, "step": 965 }, { "epoch": 0.1518599304368331, "grad_norm": 0.24110247194766998, "learning_rate": 4.930693839955848e-05, "loss": 1.2381, "step": 966 }, { "epoch": 0.15201713533376565, "grad_norm": 0.159100741147995, "learning_rate": 4.9305492963098e-05, "loss": 1.2666, "step": 967 }, { "epoch": 0.15217434023069817, "grad_norm": 0.23810729384422302, "learning_rate": 4.9304046042150474e-05, "loss": 1.2592, "step": 968 }, { "epoch": 0.15233154512763072, "grad_norm": 0.19887159764766693, "learning_rate": 4.930259763680429e-05, "loss": 1.3179, "step": 969 }, { "epoch": 0.15248875002456327, "grad_norm": 0.22060149908065796, "learning_rate": 4.930114774714791e-05, "loss": 1.2712, "step": 970 }, { "epoch": 0.1526459549214958, "grad_norm": 0.2443406730890274, "learning_rate": 4.929969637326989e-05, "loss": 1.2021, "step": 971 }, { "epoch": 0.15280315981842835, "grad_norm": 0.2185499370098114, "learning_rate": 4.9298243515258855e-05, "loss": 1.3017, "step": 972 }, { "epoch": 0.1529603647153609, "grad_norm": 0.23159849643707275, "learning_rate": 4.929678917320357e-05, "loss": 1.2122, "step": 973 }, { "epoch": 0.15311756961229342, "grad_norm": 0.3281627595424652, "learning_rate": 4.929533334719284e-05, "loss": 1.1151, "step": 974 }, { "epoch": 0.15327477450922597, "grad_norm": 0.24456332623958588, "learning_rate": 4.929387603731558e-05, "loss": 1.2107, "step": 975 }, { "epoch": 0.1534319794061585, "grad_norm": 0.3623602092266083, "learning_rate": 4.9292417243660814e-05, "loss": 1.3041, "step": 976 }, { "epoch": 0.15358918430309104, "grad_norm": 0.2319325953722, "learning_rate": 4.929095696631763e-05, "loss": 1.2331, "step": 977 }, { "epoch": 0.1537463892000236, "grad_norm": 0.324660986661911, "learning_rate": 4.92894952053752e-05, "loss": 1.2511, "step": 978 }, { "epoch": 0.15390359409695611, "grad_norm": 0.23866824805736542, "learning_rate": 4.9288031960922834e-05, "loss": 1.2709, "step": 979 }, { "epoch": 0.15406079899388866, "grad_norm": 0.21922123432159424, "learning_rate": 4.928656723304989e-05, "loss": 1.3108, "step": 980 }, { "epoch": 0.1542180038908212, "grad_norm": 0.2919687032699585, "learning_rate": 4.92851010218458e-05, "loss": 1.0465, "step": 981 }, { "epoch": 0.15437520878775374, "grad_norm": 0.26380711793899536, "learning_rate": 4.9283633327400156e-05, "loss": 1.2066, "step": 982 }, { "epoch": 0.1545324136846863, "grad_norm": 0.2607104182243347, "learning_rate": 4.9282164149802576e-05, "loss": 1.1227, "step": 983 }, { "epoch": 0.1546896185816188, "grad_norm": 0.32302606105804443, "learning_rate": 4.92806934891428e-05, "loss": 1.2066, "step": 984 }, { "epoch": 0.15484682347855136, "grad_norm": 0.28476232290267944, "learning_rate": 4.927922134551065e-05, "loss": 1.1447, "step": 985 }, { "epoch": 0.1550040283754839, "grad_norm": 0.253738671541214, "learning_rate": 4.9277747718996036e-05, "loss": 1.2438, "step": 986 }, { "epoch": 0.15516123327241643, "grad_norm": 0.29659610986709595, "learning_rate": 4.927627260968896e-05, "loss": 1.1946, "step": 987 }, { "epoch": 0.15531843816934898, "grad_norm": 0.27436473965644836, "learning_rate": 4.927479601767952e-05, "loss": 1.2783, "step": 988 }, { "epoch": 0.1554756430662815, "grad_norm": 0.21257497370243073, "learning_rate": 4.9273317943057896e-05, "loss": 1.204, "step": 989 }, { "epoch": 0.15563284796321405, "grad_norm": 0.2546120584011078, "learning_rate": 4.927183838591437e-05, "loss": 1.2224, "step": 990 }, { "epoch": 0.1557900528601466, "grad_norm": 0.2336018979549408, "learning_rate": 4.92703573463393e-05, "loss": 1.2903, "step": 991 }, { "epoch": 0.15594725775707913, "grad_norm": 0.2048567533493042, "learning_rate": 4.926887482442315e-05, "loss": 1.2529, "step": 992 }, { "epoch": 0.15610446265401168, "grad_norm": 0.20950450003147125, "learning_rate": 4.926739082025646e-05, "loss": 1.4247, "step": 993 }, { "epoch": 0.1562616675509442, "grad_norm": 0.22965438663959503, "learning_rate": 4.926590533392987e-05, "loss": 1.2573, "step": 994 }, { "epoch": 0.15641887244787675, "grad_norm": 0.25654879212379456, "learning_rate": 4.9264418365534105e-05, "loss": 1.2115, "step": 995 }, { "epoch": 0.1565760773448093, "grad_norm": 0.26419493556022644, "learning_rate": 4.9262929915159995e-05, "loss": 1.2994, "step": 996 }, { "epoch": 0.15673328224174182, "grad_norm": 0.3215327560901642, "learning_rate": 4.926143998289843e-05, "loss": 1.2268, "step": 997 }, { "epoch": 0.15689048713867437, "grad_norm": 0.3190024495124817, "learning_rate": 4.925994856884042e-05, "loss": 1.2747, "step": 998 }, { "epoch": 0.15704769203560692, "grad_norm": 0.26742681860923767, "learning_rate": 4.9258455673077065e-05, "loss": 1.2791, "step": 999 }, { "epoch": 0.15720489693253945, "grad_norm": 0.21026930212974548, "learning_rate": 4.925696129569953e-05, "loss": 1.3694, "step": 1000 }, { "epoch": 0.157362101829472, "grad_norm": 0.2838928699493408, "learning_rate": 4.925546543679909e-05, "loss": 1.2589, "step": 1001 }, { "epoch": 0.15751930672640452, "grad_norm": 0.3007844090461731, "learning_rate": 4.9253968096467104e-05, "loss": 1.2488, "step": 1002 }, { "epoch": 0.15767651162333707, "grad_norm": 0.2473154515028, "learning_rate": 4.925246927479503e-05, "loss": 1.2288, "step": 1003 }, { "epoch": 0.15783371652026962, "grad_norm": 0.2360457479953766, "learning_rate": 4.925096897187441e-05, "loss": 1.3148, "step": 1004 }, { "epoch": 0.15799092141720214, "grad_norm": 0.3671962320804596, "learning_rate": 4.924946718779687e-05, "loss": 1.1999, "step": 1005 }, { "epoch": 0.1581481263141347, "grad_norm": 0.28391456604003906, "learning_rate": 4.924796392265414e-05, "loss": 1.2296, "step": 1006 }, { "epoch": 0.1583053312110672, "grad_norm": 0.2550790011882782, "learning_rate": 4.924645917653802e-05, "loss": 1.1923, "step": 1007 }, { "epoch": 0.15846253610799976, "grad_norm": 0.301740825176239, "learning_rate": 4.924495294954044e-05, "loss": 1.2363, "step": 1008 }, { "epoch": 0.1586197410049323, "grad_norm": 0.2608097195625305, "learning_rate": 4.9243445241753374e-05, "loss": 1.298, "step": 1009 }, { "epoch": 0.15877694590186484, "grad_norm": 0.31052571535110474, "learning_rate": 4.924193605326891e-05, "loss": 1.3037, "step": 1010 }, { "epoch": 0.15893415079879739, "grad_norm": 0.2420274317264557, "learning_rate": 4.924042538417923e-05, "loss": 1.2541, "step": 1011 }, { "epoch": 0.15909135569572994, "grad_norm": 0.21056459844112396, "learning_rate": 4.92389132345766e-05, "loss": 1.3229, "step": 1012 }, { "epoch": 0.15924856059266246, "grad_norm": 0.3283076286315918, "learning_rate": 4.923739960455337e-05, "loss": 1.1036, "step": 1013 }, { "epoch": 0.159405765489595, "grad_norm": 0.2461448460817337, "learning_rate": 4.9235884494201987e-05, "loss": 1.2503, "step": 1014 }, { "epoch": 0.15956297038652753, "grad_norm": 0.2919829189777374, "learning_rate": 4.923436790361499e-05, "loss": 1.2108, "step": 1015 }, { "epoch": 0.15972017528346008, "grad_norm": 0.22029368579387665, "learning_rate": 4.923284983288501e-05, "loss": 1.2369, "step": 1016 }, { "epoch": 0.15987738018039263, "grad_norm": 0.27863630652427673, "learning_rate": 4.9231330282104756e-05, "loss": 1.2555, "step": 1017 }, { "epoch": 0.16003458507732515, "grad_norm": 0.28123825788497925, "learning_rate": 4.9229809251367055e-05, "loss": 1.1519, "step": 1018 }, { "epoch": 0.1601917899742577, "grad_norm": 0.231231227517128, "learning_rate": 4.922828674076478e-05, "loss": 1.3488, "step": 1019 }, { "epoch": 0.16034899487119023, "grad_norm": 0.2704828977584839, "learning_rate": 4.9226762750390944e-05, "loss": 1.1102, "step": 1020 }, { "epoch": 0.16050619976812278, "grad_norm": 0.17970627546310425, "learning_rate": 4.922523728033861e-05, "loss": 1.2237, "step": 1021 }, { "epoch": 0.16066340466505533, "grad_norm": 0.22386445105075836, "learning_rate": 4.9223710330700956e-05, "loss": 1.2564, "step": 1022 }, { "epoch": 0.16082060956198785, "grad_norm": 0.21347114443778992, "learning_rate": 4.922218190157124e-05, "loss": 1.2433, "step": 1023 }, { "epoch": 0.1609778144589204, "grad_norm": 0.23873676359653473, "learning_rate": 4.9220651993042813e-05, "loss": 1.2018, "step": 1024 }, { "epoch": 0.16113501935585295, "grad_norm": 0.26954975724220276, "learning_rate": 4.921912060520912e-05, "loss": 1.2118, "step": 1025 }, { "epoch": 0.16129222425278547, "grad_norm": 0.3023718595504761, "learning_rate": 4.9217587738163686e-05, "loss": 1.2717, "step": 1026 }, { "epoch": 0.16144942914971802, "grad_norm": 0.31107062101364136, "learning_rate": 4.921605339200013e-05, "loss": 1.2017, "step": 1027 }, { "epoch": 0.16160663404665054, "grad_norm": 0.2795855402946472, "learning_rate": 4.921451756681217e-05, "loss": 1.3492, "step": 1028 }, { "epoch": 0.1617638389435831, "grad_norm": 0.24515774846076965, "learning_rate": 4.921298026269361e-05, "loss": 1.2763, "step": 1029 }, { "epoch": 0.16192104384051564, "grad_norm": 0.2603877782821655, "learning_rate": 4.921144147973834e-05, "loss": 1.3196, "step": 1030 }, { "epoch": 0.16207824873744817, "grad_norm": 0.265697181224823, "learning_rate": 4.9209901218040335e-05, "loss": 1.3067, "step": 1031 }, { "epoch": 0.16223545363438072, "grad_norm": 0.2554314136505127, "learning_rate": 4.9208359477693686e-05, "loss": 1.2347, "step": 1032 }, { "epoch": 0.16239265853131324, "grad_norm": 0.3272973597049713, "learning_rate": 4.920681625879254e-05, "loss": 1.2104, "step": 1033 }, { "epoch": 0.1625498634282458, "grad_norm": 0.21259547770023346, "learning_rate": 4.9205271561431166e-05, "loss": 1.2857, "step": 1034 }, { "epoch": 0.16270706832517834, "grad_norm": 0.2505529820919037, "learning_rate": 4.92037253857039e-05, "loss": 1.1988, "step": 1035 }, { "epoch": 0.16286427322211086, "grad_norm": 0.2369750738143921, "learning_rate": 4.920217773170517e-05, "loss": 1.2384, "step": 1036 }, { "epoch": 0.1630214781190434, "grad_norm": 0.26577889919281006, "learning_rate": 4.920062859952951e-05, "loss": 1.1521, "step": 1037 }, { "epoch": 0.16317868301597593, "grad_norm": 0.2224215418100357, "learning_rate": 4.919907798927153e-05, "loss": 1.3065, "step": 1038 }, { "epoch": 0.16333588791290848, "grad_norm": 0.1882622092962265, "learning_rate": 4.9197525901025944e-05, "loss": 1.2472, "step": 1039 }, { "epoch": 0.16349309280984103, "grad_norm": 0.25392916798591614, "learning_rate": 4.919597233488754e-05, "loss": 1.3387, "step": 1040 }, { "epoch": 0.16365029770677356, "grad_norm": 0.3185995817184448, "learning_rate": 4.91944172909512e-05, "loss": 1.161, "step": 1041 }, { "epoch": 0.1638075026037061, "grad_norm": 0.26924118399620056, "learning_rate": 4.919286076931191e-05, "loss": 1.0659, "step": 1042 }, { "epoch": 0.16396470750063866, "grad_norm": 0.2224770486354828, "learning_rate": 4.919130277006473e-05, "loss": 1.2303, "step": 1043 }, { "epoch": 0.16412191239757118, "grad_norm": 0.24008037149906158, "learning_rate": 4.918974329330482e-05, "loss": 1.2762, "step": 1044 }, { "epoch": 0.16427911729450373, "grad_norm": 0.2728358209133148, "learning_rate": 4.918818233912742e-05, "loss": 1.1582, "step": 1045 }, { "epoch": 0.16443632219143625, "grad_norm": 0.24911800026893616, "learning_rate": 4.918661990762788e-05, "loss": 1.2155, "step": 1046 }, { "epoch": 0.1645935270883688, "grad_norm": 0.2444472759962082, "learning_rate": 4.918505599890162e-05, "loss": 1.2838, "step": 1047 }, { "epoch": 0.16475073198530135, "grad_norm": 0.2379113882780075, "learning_rate": 4.918349061304416e-05, "loss": 1.3043, "step": 1048 }, { "epoch": 0.16490793688223387, "grad_norm": 0.3085183799266815, "learning_rate": 4.9181923750151095e-05, "loss": 1.2568, "step": 1049 }, { "epoch": 0.16506514177916642, "grad_norm": 0.2629674971103668, "learning_rate": 4.918035541031814e-05, "loss": 1.2171, "step": 1050 }, { "epoch": 0.16522234667609895, "grad_norm": 0.2707282602787018, "learning_rate": 4.917878559364107e-05, "loss": 1.1597, "step": 1051 }, { "epoch": 0.1653795515730315, "grad_norm": 0.27305370569229126, "learning_rate": 4.9177214300215784e-05, "loss": 1.36, "step": 1052 }, { "epoch": 0.16553675646996405, "grad_norm": 0.20558474957942963, "learning_rate": 4.9175641530138226e-05, "loss": 1.2225, "step": 1053 }, { "epoch": 0.16569396136689657, "grad_norm": 0.23680521547794342, "learning_rate": 4.917406728350448e-05, "loss": 1.2149, "step": 1054 }, { "epoch": 0.16585116626382912, "grad_norm": 0.2101297229528427, "learning_rate": 4.917249156041066e-05, "loss": 1.2313, "step": 1055 }, { "epoch": 0.16600837116076167, "grad_norm": 0.2601447105407715, "learning_rate": 4.917091436095304e-05, "loss": 1.1907, "step": 1056 }, { "epoch": 0.1661655760576942, "grad_norm": 0.2113189995288849, "learning_rate": 4.916933568522793e-05, "loss": 1.2852, "step": 1057 }, { "epoch": 0.16632278095462674, "grad_norm": 0.21135227382183075, "learning_rate": 4.916775553333176e-05, "loss": 1.2852, "step": 1058 }, { "epoch": 0.16647998585155926, "grad_norm": 0.2743116617202759, "learning_rate": 4.916617390536102e-05, "loss": 1.2032, "step": 1059 }, { "epoch": 0.16663719074849181, "grad_norm": 0.2520056664943695, "learning_rate": 4.916459080141234e-05, "loss": 1.3038, "step": 1060 }, { "epoch": 0.16679439564542436, "grad_norm": 0.21614307165145874, "learning_rate": 4.916300622158239e-05, "loss": 1.2216, "step": 1061 }, { "epoch": 0.1669516005423569, "grad_norm": 0.28615444898605347, "learning_rate": 4.9161420165967956e-05, "loss": 1.2162, "step": 1062 }, { "epoch": 0.16710880543928944, "grad_norm": 0.33522137999534607, "learning_rate": 4.91598326346659e-05, "loss": 1.1458, "step": 1063 }, { "epoch": 0.16726601033622196, "grad_norm": 0.30597957968711853, "learning_rate": 4.9158243627773194e-05, "loss": 1.2623, "step": 1064 }, { "epoch": 0.1674232152331545, "grad_norm": 0.2643260061740875, "learning_rate": 4.915665314538688e-05, "loss": 1.2092, "step": 1065 }, { "epoch": 0.16758042013008706, "grad_norm": 0.3190208673477173, "learning_rate": 4.91550611876041e-05, "loss": 1.0543, "step": 1066 }, { "epoch": 0.16773762502701958, "grad_norm": 0.3194860816001892, "learning_rate": 4.9153467754522095e-05, "loss": 1.1393, "step": 1067 }, { "epoch": 0.16789482992395213, "grad_norm": 0.2031661570072174, "learning_rate": 4.915187284623817e-05, "loss": 1.2136, "step": 1068 }, { "epoch": 0.16805203482088468, "grad_norm": 0.2047189325094223, "learning_rate": 4.915027646284974e-05, "loss": 1.2962, "step": 1069 }, { "epoch": 0.1682092397178172, "grad_norm": 0.31565096974372864, "learning_rate": 4.9148678604454325e-05, "loss": 1.1979, "step": 1070 }, { "epoch": 0.16836644461474976, "grad_norm": 0.23875312507152557, "learning_rate": 4.914707927114949e-05, "loss": 1.3002, "step": 1071 }, { "epoch": 0.16852364951168228, "grad_norm": 0.24424909055233002, "learning_rate": 4.9145478463032924e-05, "loss": 1.1491, "step": 1072 }, { "epoch": 0.16868085440861483, "grad_norm": 0.28339752554893494, "learning_rate": 4.91438761802024e-05, "loss": 1.2398, "step": 1073 }, { "epoch": 0.16883805930554738, "grad_norm": 0.2435888648033142, "learning_rate": 4.9142272422755786e-05, "loss": 1.3292, "step": 1074 }, { "epoch": 0.1689952642024799, "grad_norm": 0.21540984511375427, "learning_rate": 4.9140667190791026e-05, "loss": 1.3665, "step": 1075 }, { "epoch": 0.16915246909941245, "grad_norm": 0.2556820809841156, "learning_rate": 4.913906048440617e-05, "loss": 1.2557, "step": 1076 }, { "epoch": 0.16930967399634497, "grad_norm": 0.23769475519657135, "learning_rate": 4.913745230369934e-05, "loss": 1.2163, "step": 1077 }, { "epoch": 0.16946687889327752, "grad_norm": 0.31578120589256287, "learning_rate": 4.913584264876875e-05, "loss": 1.3176, "step": 1078 }, { "epoch": 0.16962408379021007, "grad_norm": 0.22278232872486115, "learning_rate": 4.913423151971273e-05, "loss": 1.2206, "step": 1079 }, { "epoch": 0.1697812886871426, "grad_norm": 0.31810736656188965, "learning_rate": 4.913261891662967e-05, "loss": 1.2254, "step": 1080 }, { "epoch": 0.16993849358407515, "grad_norm": 0.22623823583126068, "learning_rate": 4.913100483961807e-05, "loss": 1.208, "step": 1081 }, { "epoch": 0.1700956984810077, "grad_norm": 0.27108776569366455, "learning_rate": 4.9129389288776504e-05, "loss": 1.2989, "step": 1082 }, { "epoch": 0.17025290337794022, "grad_norm": 0.194550558924675, "learning_rate": 4.912777226420365e-05, "loss": 1.3849, "step": 1083 }, { "epoch": 0.17041010827487277, "grad_norm": 0.20856213569641113, "learning_rate": 4.912615376599826e-05, "loss": 1.2736, "step": 1084 }, { "epoch": 0.1705673131718053, "grad_norm": 0.19355203211307526, "learning_rate": 4.91245337942592e-05, "loss": 1.256, "step": 1085 }, { "epoch": 0.17072451806873784, "grad_norm": 0.21832303702831268, "learning_rate": 4.9122912349085395e-05, "loss": 1.1987, "step": 1086 }, { "epoch": 0.1708817229656704, "grad_norm": 0.22642913460731506, "learning_rate": 4.912128943057589e-05, "loss": 1.3043, "step": 1087 }, { "epoch": 0.1710389278626029, "grad_norm": 0.22713351249694824, "learning_rate": 4.911966503882981e-05, "loss": 1.1951, "step": 1088 }, { "epoch": 0.17119613275953546, "grad_norm": 0.29707837104797363, "learning_rate": 4.911803917394634e-05, "loss": 1.2674, "step": 1089 }, { "epoch": 0.17135333765646799, "grad_norm": 0.27017104625701904, "learning_rate": 4.911641183602481e-05, "loss": 1.1727, "step": 1090 }, { "epoch": 0.17151054255340054, "grad_norm": 0.23867738246917725, "learning_rate": 4.911478302516461e-05, "loss": 1.2061, "step": 1091 }, { "epoch": 0.17166774745033309, "grad_norm": 0.26270055770874023, "learning_rate": 4.911315274146521e-05, "loss": 1.2735, "step": 1092 }, { "epoch": 0.1718249523472656, "grad_norm": 0.25167661905288696, "learning_rate": 4.911152098502617e-05, "loss": 1.2643, "step": 1093 }, { "epoch": 0.17198215724419816, "grad_norm": 0.46500882506370544, "learning_rate": 4.9109887755947185e-05, "loss": 1.1743, "step": 1094 }, { "epoch": 0.1721393621411307, "grad_norm": 0.2189512848854065, "learning_rate": 4.910825305432798e-05, "loss": 1.1232, "step": 1095 }, { "epoch": 0.17229656703806323, "grad_norm": 0.21188926696777344, "learning_rate": 4.9106616880268405e-05, "loss": 1.2031, "step": 1096 }, { "epoch": 0.17245377193499578, "grad_norm": 0.2135314792394638, "learning_rate": 4.910497923386839e-05, "loss": 1.2547, "step": 1097 }, { "epoch": 0.1726109768319283, "grad_norm": 0.27527931332588196, "learning_rate": 4.910334011522796e-05, "loss": 1.119, "step": 1098 }, { "epoch": 0.17276818172886085, "grad_norm": 0.23516559600830078, "learning_rate": 4.910169952444722e-05, "loss": 1.3006, "step": 1099 }, { "epoch": 0.1729253866257934, "grad_norm": 0.23166057467460632, "learning_rate": 4.910005746162637e-05, "loss": 1.281, "step": 1100 }, { "epoch": 0.17308259152272593, "grad_norm": 0.22281822562217712, "learning_rate": 4.9098413926865714e-05, "loss": 1.1526, "step": 1101 }, { "epoch": 0.17323979641965848, "grad_norm": 0.32062655687332153, "learning_rate": 4.909676892026563e-05, "loss": 1.2388, "step": 1102 }, { "epoch": 0.173397001316591, "grad_norm": 0.29868969321250916, "learning_rate": 4.909512244192657e-05, "loss": 1.2303, "step": 1103 }, { "epoch": 0.17355420621352355, "grad_norm": 0.23143270611763, "learning_rate": 4.90934744919491e-05, "loss": 1.2137, "step": 1104 }, { "epoch": 0.1737114111104561, "grad_norm": 0.2830474376678467, "learning_rate": 4.909182507043389e-05, "loss": 1.2178, "step": 1105 }, { "epoch": 0.17386861600738862, "grad_norm": 0.22427986562252045, "learning_rate": 4.909017417748166e-05, "loss": 1.3153, "step": 1106 }, { "epoch": 0.17402582090432117, "grad_norm": 0.2587423622608185, "learning_rate": 4.908852181319326e-05, "loss": 1.2669, "step": 1107 }, { "epoch": 0.17418302580125372, "grad_norm": 0.24905993044376373, "learning_rate": 4.9086867977669594e-05, "loss": 1.2549, "step": 1108 }, { "epoch": 0.17434023069818624, "grad_norm": 0.26877379417419434, "learning_rate": 4.908521267101167e-05, "loss": 1.2694, "step": 1109 }, { "epoch": 0.1744974355951188, "grad_norm": 0.2501152753829956, "learning_rate": 4.9083555893320596e-05, "loss": 1.2241, "step": 1110 }, { "epoch": 0.17465464049205132, "grad_norm": 0.27815014123916626, "learning_rate": 4.908189764469757e-05, "loss": 1.2152, "step": 1111 }, { "epoch": 0.17481184538898387, "grad_norm": 0.32891881465911865, "learning_rate": 4.9080237925243856e-05, "loss": 1.2638, "step": 1112 }, { "epoch": 0.17496905028591642, "grad_norm": 0.2137015461921692, "learning_rate": 4.9078576735060825e-05, "loss": 1.2041, "step": 1113 }, { "epoch": 0.17512625518284894, "grad_norm": 0.17862486839294434, "learning_rate": 4.907691407424995e-05, "loss": 1.3349, "step": 1114 }, { "epoch": 0.1752834600797815, "grad_norm": 0.25791284441947937, "learning_rate": 4.907524994291276e-05, "loss": 1.2337, "step": 1115 }, { "epoch": 0.175440664976714, "grad_norm": 0.24266491830348969, "learning_rate": 4.90735843411509e-05, "loss": 1.0939, "step": 1116 }, { "epoch": 0.17559786987364656, "grad_norm": 0.2618250250816345, "learning_rate": 4.9071917269066114e-05, "loss": 1.2855, "step": 1117 }, { "epoch": 0.1757550747705791, "grad_norm": 0.2477390021085739, "learning_rate": 4.9070248726760206e-05, "loss": 1.1675, "step": 1118 }, { "epoch": 0.17591227966751163, "grad_norm": 0.29105502367019653, "learning_rate": 4.906857871433508e-05, "loss": 1.183, "step": 1119 }, { "epoch": 0.17606948456444418, "grad_norm": 0.2923283874988556, "learning_rate": 4.906690723189275e-05, "loss": 1.1386, "step": 1120 }, { "epoch": 0.17606948456444418, "eval_loss": 1.219694972038269, "eval_runtime": 2300.2931, "eval_samples_per_second": 4.025, "eval_steps_per_second": 2.012, "step": 1120 }, { "epoch": 0.17622668946137673, "grad_norm": 0.3278633952140808, "learning_rate": 4.906523427953529e-05, "loss": 1.1738, "step": 1121 }, { "epoch": 0.17638389435830926, "grad_norm": 0.31546783447265625, "learning_rate": 4.906355985736488e-05, "loss": 1.0894, "step": 1122 }, { "epoch": 0.1765410992552418, "grad_norm": 0.28350481390953064, "learning_rate": 4.906188396548379e-05, "loss": 1.2774, "step": 1123 }, { "epoch": 0.17669830415217433, "grad_norm": 0.21374982595443726, "learning_rate": 4.9060206603994385e-05, "loss": 1.37, "step": 1124 }, { "epoch": 0.17685550904910688, "grad_norm": 0.2343566119670868, "learning_rate": 4.9058527772999095e-05, "loss": 1.2065, "step": 1125 }, { "epoch": 0.17701271394603943, "grad_norm": 0.29571887850761414, "learning_rate": 4.905684747260047e-05, "loss": 1.1967, "step": 1126 }, { "epoch": 0.17716991884297195, "grad_norm": 0.2689303457736969, "learning_rate": 4.905516570290113e-05, "loss": 1.2337, "step": 1127 }, { "epoch": 0.1773271237399045, "grad_norm": 0.22743673622608185, "learning_rate": 4.90534824640038e-05, "loss": 1.1673, "step": 1128 }, { "epoch": 0.17748432863683702, "grad_norm": 0.36731019616127014, "learning_rate": 4.905179775601126e-05, "loss": 1.1397, "step": 1129 }, { "epoch": 0.17764153353376957, "grad_norm": 0.2571149468421936, "learning_rate": 4.905011157902645e-05, "loss": 1.1166, "step": 1130 }, { "epoch": 0.17779873843070212, "grad_norm": 0.2615256905555725, "learning_rate": 4.904842393315231e-05, "loss": 1.2095, "step": 1131 }, { "epoch": 0.17795594332763465, "grad_norm": 0.28919360041618347, "learning_rate": 4.904673481849194e-05, "loss": 1.0976, "step": 1132 }, { "epoch": 0.1781131482245672, "grad_norm": 0.3858489990234375, "learning_rate": 4.90450442351485e-05, "loss": 1.1934, "step": 1133 }, { "epoch": 0.17827035312149975, "grad_norm": 0.2448245733976364, "learning_rate": 4.904335218322524e-05, "loss": 1.1604, "step": 1134 }, { "epoch": 0.17842755801843227, "grad_norm": 0.2626294195652008, "learning_rate": 4.9041658662825514e-05, "loss": 1.1301, "step": 1135 }, { "epoch": 0.17858476291536482, "grad_norm": 0.3016091287136078, "learning_rate": 4.903996367405275e-05, "loss": 1.2579, "step": 1136 }, { "epoch": 0.17874196781229734, "grad_norm": 0.28168612718582153, "learning_rate": 4.9038267217010455e-05, "loss": 1.1471, "step": 1137 }, { "epoch": 0.1788991727092299, "grad_norm": 0.29256439208984375, "learning_rate": 4.903656929180228e-05, "loss": 1.1598, "step": 1138 }, { "epoch": 0.17905637760616244, "grad_norm": 0.19786624610424042, "learning_rate": 4.9034869898531895e-05, "loss": 1.2115, "step": 1139 }, { "epoch": 0.17921358250309496, "grad_norm": 0.17216260731220245, "learning_rate": 4.9033169037303106e-05, "loss": 1.2471, "step": 1140 }, { "epoch": 0.17937078740002751, "grad_norm": 0.22571730613708496, "learning_rate": 4.9031466708219785e-05, "loss": 1.2226, "step": 1141 }, { "epoch": 0.17952799229696004, "grad_norm": 0.25510528683662415, "learning_rate": 4.9029762911385915e-05, "loss": 1.1428, "step": 1142 }, { "epoch": 0.1796851971938926, "grad_norm": 0.19014020264148712, "learning_rate": 4.902805764690556e-05, "loss": 1.2268, "step": 1143 }, { "epoch": 0.17984240209082514, "grad_norm": 0.25155729055404663, "learning_rate": 4.902635091488286e-05, "loss": 1.1943, "step": 1144 }, { "epoch": 0.17999960698775766, "grad_norm": 0.3109387159347534, "learning_rate": 4.902464271542206e-05, "loss": 1.176, "step": 1145 }, { "epoch": 0.1801568118846902, "grad_norm": 0.2269504815340042, "learning_rate": 4.9022933048627496e-05, "loss": 1.2166, "step": 1146 }, { "epoch": 0.18031401678162276, "grad_norm": 0.20270425081253052, "learning_rate": 4.902122191460358e-05, "loss": 1.235, "step": 1147 }, { "epoch": 0.18047122167855528, "grad_norm": 0.2519841194152832, "learning_rate": 4.901950931345481e-05, "loss": 1.2418, "step": 1148 }, { "epoch": 0.18062842657548783, "grad_norm": 0.1967516988515854, "learning_rate": 4.901779524528582e-05, "loss": 1.2979, "step": 1149 }, { "epoch": 0.18078563147242036, "grad_norm": 0.21120384335517883, "learning_rate": 4.901607971020127e-05, "loss": 1.1557, "step": 1150 }, { "epoch": 0.1809428363693529, "grad_norm": 0.31649792194366455, "learning_rate": 4.9014362708305944e-05, "loss": 1.3237, "step": 1151 }, { "epoch": 0.18110004126628546, "grad_norm": 0.24945318698883057, "learning_rate": 4.901264423970471e-05, "loss": 1.2099, "step": 1152 }, { "epoch": 0.18125724616321798, "grad_norm": 0.30652904510498047, "learning_rate": 4.901092430450254e-05, "loss": 1.1918, "step": 1153 }, { "epoch": 0.18141445106015053, "grad_norm": 0.2480253279209137, "learning_rate": 4.900920290280446e-05, "loss": 1.2675, "step": 1154 }, { "epoch": 0.18157165595708305, "grad_norm": 0.3034304976463318, "learning_rate": 4.900748003471561e-05, "loss": 1.2012, "step": 1155 }, { "epoch": 0.1817288608540156, "grad_norm": 0.2113679200410843, "learning_rate": 4.900575570034124e-05, "loss": 1.2824, "step": 1156 }, { "epoch": 0.18188606575094815, "grad_norm": 0.34726831316947937, "learning_rate": 4.9004029899786627e-05, "loss": 1.1426, "step": 1157 }, { "epoch": 0.18204327064788067, "grad_norm": 0.20344194769859314, "learning_rate": 4.900230263315722e-05, "loss": 1.2096, "step": 1158 }, { "epoch": 0.18220047554481322, "grad_norm": 0.28635072708129883, "learning_rate": 4.900057390055847e-05, "loss": 1.166, "step": 1159 }, { "epoch": 0.18235768044174577, "grad_norm": 0.21670344471931458, "learning_rate": 4.8998843702095995e-05, "loss": 1.2103, "step": 1160 }, { "epoch": 0.1825148853386783, "grad_norm": 0.31661516427993774, "learning_rate": 4.899711203787545e-05, "loss": 1.2345, "step": 1161 }, { "epoch": 0.18267209023561085, "grad_norm": 0.30255556106567383, "learning_rate": 4.899537890800261e-05, "loss": 1.2342, "step": 1162 }, { "epoch": 0.18282929513254337, "grad_norm": 0.23636944591999054, "learning_rate": 4.899364431258332e-05, "loss": 1.1685, "step": 1163 }, { "epoch": 0.18298650002947592, "grad_norm": 0.27452319860458374, "learning_rate": 4.8991908251723524e-05, "loss": 1.1263, "step": 1164 }, { "epoch": 0.18314370492640847, "grad_norm": 0.28636041283607483, "learning_rate": 4.899017072552926e-05, "loss": 1.1961, "step": 1165 }, { "epoch": 0.183300909823341, "grad_norm": 0.29220953583717346, "learning_rate": 4.8988431734106635e-05, "loss": 1.2414, "step": 1166 }, { "epoch": 0.18345811472027354, "grad_norm": 0.20738068222999573, "learning_rate": 4.898669127756188e-05, "loss": 1.1499, "step": 1167 }, { "epoch": 0.18361531961720606, "grad_norm": 0.19913551211357117, "learning_rate": 4.898494935600127e-05, "loss": 1.3538, "step": 1168 }, { "epoch": 0.1837725245141386, "grad_norm": 0.256979763507843, "learning_rate": 4.8983205969531234e-05, "loss": 1.1979, "step": 1169 }, { "epoch": 0.18392972941107116, "grad_norm": 0.26307129859924316, "learning_rate": 4.898146111825821e-05, "loss": 1.2054, "step": 1170 }, { "epoch": 0.18408693430800369, "grad_norm": 0.2451772540807724, "learning_rate": 4.897971480228879e-05, "loss": 1.1901, "step": 1171 }, { "epoch": 0.18424413920493624, "grad_norm": 0.3223975896835327, "learning_rate": 4.897796702172962e-05, "loss": 1.1825, "step": 1172 }, { "epoch": 0.18440134410186879, "grad_norm": 0.34991317987442017, "learning_rate": 4.897621777668746e-05, "loss": 1.1371, "step": 1173 }, { "epoch": 0.1845585489988013, "grad_norm": 0.2680002748966217, "learning_rate": 4.897446706726915e-05, "loss": 1.2179, "step": 1174 }, { "epoch": 0.18471575389573386, "grad_norm": 0.21509090065956116, "learning_rate": 4.897271489358159e-05, "loss": 1.1284, "step": 1175 }, { "epoch": 0.18487295879266638, "grad_norm": 0.20545831322669983, "learning_rate": 4.8970961255731826e-05, "loss": 1.2188, "step": 1176 }, { "epoch": 0.18503016368959893, "grad_norm": 0.23479585349559784, "learning_rate": 4.896920615382695e-05, "loss": 1.2947, "step": 1177 }, { "epoch": 0.18518736858653148, "grad_norm": 0.2880757749080658, "learning_rate": 4.896744958797417e-05, "loss": 1.1443, "step": 1178 }, { "epoch": 0.185344573483464, "grad_norm": 0.2431318610906601, "learning_rate": 4.8965691558280744e-05, "loss": 1.1123, "step": 1179 }, { "epoch": 0.18550177838039655, "grad_norm": 0.21252453327178955, "learning_rate": 4.896393206485407e-05, "loss": 1.326, "step": 1180 }, { "epoch": 0.18565898327732908, "grad_norm": 0.28821709752082825, "learning_rate": 4.8962171107801596e-05, "loss": 1.1508, "step": 1181 }, { "epoch": 0.18581618817426163, "grad_norm": 0.2636358141899109, "learning_rate": 4.8960408687230886e-05, "loss": 1.1061, "step": 1182 }, { "epoch": 0.18597339307119418, "grad_norm": 0.23121225833892822, "learning_rate": 4.895864480324957e-05, "loss": 1.2486, "step": 1183 }, { "epoch": 0.1861305979681267, "grad_norm": 0.29034245014190674, "learning_rate": 4.895687945596539e-05, "loss": 1.186, "step": 1184 }, { "epoch": 0.18628780286505925, "grad_norm": 0.3220363259315491, "learning_rate": 4.895511264548617e-05, "loss": 1.1727, "step": 1185 }, { "epoch": 0.1864450077619918, "grad_norm": 0.2863159477710724, "learning_rate": 4.89533443719198e-05, "loss": 1.1946, "step": 1186 }, { "epoch": 0.18660221265892432, "grad_norm": 0.27671483159065247, "learning_rate": 4.89515746353743e-05, "loss": 1.2271, "step": 1187 }, { "epoch": 0.18675941755585687, "grad_norm": 0.2535041570663452, "learning_rate": 4.894980343595775e-05, "loss": 1.2437, "step": 1188 }, { "epoch": 0.1869166224527894, "grad_norm": 0.34405645728111267, "learning_rate": 4.894803077377833e-05, "loss": 1.1397, "step": 1189 }, { "epoch": 0.18707382734972194, "grad_norm": 0.28299692273139954, "learning_rate": 4.8946256648944307e-05, "loss": 1.1215, "step": 1190 }, { "epoch": 0.1872310322466545, "grad_norm": 0.1962118297815323, "learning_rate": 4.8944481061564035e-05, "loss": 1.1908, "step": 1191 }, { "epoch": 0.18738823714358702, "grad_norm": 0.24563154578208923, "learning_rate": 4.894270401174597e-05, "loss": 1.2265, "step": 1192 }, { "epoch": 0.18754544204051957, "grad_norm": 0.22452424466609955, "learning_rate": 4.894092549959862e-05, "loss": 1.1673, "step": 1193 }, { "epoch": 0.1877026469374521, "grad_norm": 0.1847248673439026, "learning_rate": 4.8939145525230646e-05, "loss": 1.2706, "step": 1194 }, { "epoch": 0.18785985183438464, "grad_norm": 0.2578265964984894, "learning_rate": 4.893736408875075e-05, "loss": 1.2011, "step": 1195 }, { "epoch": 0.1880170567313172, "grad_norm": 0.2686786353588104, "learning_rate": 4.893558119026772e-05, "loss": 1.3191, "step": 1196 }, { "epoch": 0.1881742616282497, "grad_norm": 0.27492383122444153, "learning_rate": 4.893379682989047e-05, "loss": 1.1755, "step": 1197 }, { "epoch": 0.18833146652518226, "grad_norm": 0.2544412612915039, "learning_rate": 4.8932011007727965e-05, "loss": 1.1842, "step": 1198 }, { "epoch": 0.1884886714221148, "grad_norm": 0.24790935218334198, "learning_rate": 4.893022372388928e-05, "loss": 1.2408, "step": 1199 }, { "epoch": 0.18864587631904733, "grad_norm": 0.2788006067276001, "learning_rate": 4.892843497848358e-05, "loss": 1.2671, "step": 1200 }, { "epoch": 0.18880308121597988, "grad_norm": 0.2571476101875305, "learning_rate": 4.892664477162012e-05, "loss": 1.1894, "step": 1201 }, { "epoch": 0.1889602861129124, "grad_norm": 0.22788426280021667, "learning_rate": 4.892485310340822e-05, "loss": 1.2261, "step": 1202 }, { "epoch": 0.18911749100984496, "grad_norm": 0.2010507732629776, "learning_rate": 4.892305997395733e-05, "loss": 1.2399, "step": 1203 }, { "epoch": 0.1892746959067775, "grad_norm": 0.23946425318717957, "learning_rate": 4.892126538337696e-05, "loss": 1.2727, "step": 1204 }, { "epoch": 0.18943190080371003, "grad_norm": 0.2885929346084595, "learning_rate": 4.8919469331776714e-05, "loss": 1.2376, "step": 1205 }, { "epoch": 0.18958910570064258, "grad_norm": 0.31879860162734985, "learning_rate": 4.891767181926629e-05, "loss": 1.22, "step": 1206 }, { "epoch": 0.1897463105975751, "grad_norm": 0.2895459532737732, "learning_rate": 4.891587284595546e-05, "loss": 1.2387, "step": 1207 }, { "epoch": 0.18990351549450765, "grad_norm": 0.27507272362709045, "learning_rate": 4.891407241195412e-05, "loss": 1.1723, "step": 1208 }, { "epoch": 0.1900607203914402, "grad_norm": 0.26780039072036743, "learning_rate": 4.8912270517372224e-05, "loss": 1.1549, "step": 1209 }, { "epoch": 0.19021792528837272, "grad_norm": 0.1915176510810852, "learning_rate": 4.8910467162319826e-05, "loss": 1.109, "step": 1210 }, { "epoch": 0.19037513018530527, "grad_norm": 0.25054261088371277, "learning_rate": 4.8908662346907064e-05, "loss": 1.1197, "step": 1211 }, { "epoch": 0.19053233508223782, "grad_norm": 0.24239963293075562, "learning_rate": 4.8906856071244176e-05, "loss": 1.2614, "step": 1212 }, { "epoch": 0.19068953997917035, "grad_norm": 0.21543578803539276, "learning_rate": 4.890504833544147e-05, "loss": 1.3804, "step": 1213 }, { "epoch": 0.1908467448761029, "grad_norm": 0.2045610100030899, "learning_rate": 4.8903239139609376e-05, "loss": 1.2108, "step": 1214 }, { "epoch": 0.19100394977303542, "grad_norm": 0.2209930568933487, "learning_rate": 4.890142848385838e-05, "loss": 1.2329, "step": 1215 }, { "epoch": 0.19116115466996797, "grad_norm": 0.24921675026416779, "learning_rate": 4.889961636829906e-05, "loss": 1.2009, "step": 1216 }, { "epoch": 0.19131835956690052, "grad_norm": 0.2356979250907898, "learning_rate": 4.8897802793042115e-05, "loss": 1.211, "step": 1217 }, { "epoch": 0.19147556446383304, "grad_norm": 0.20199252665042877, "learning_rate": 4.88959877581983e-05, "loss": 1.18, "step": 1218 }, { "epoch": 0.1916327693607656, "grad_norm": 0.24907195568084717, "learning_rate": 4.889417126387846e-05, "loss": 1.2438, "step": 1219 }, { "epoch": 0.19178997425769811, "grad_norm": 0.2976427674293518, "learning_rate": 4.889235331019356e-05, "loss": 1.1526, "step": 1220 }, { "epoch": 0.19194717915463066, "grad_norm": 0.25074872374534607, "learning_rate": 4.889053389725463e-05, "loss": 1.1805, "step": 1221 }, { "epoch": 0.19210438405156322, "grad_norm": 0.2157672792673111, "learning_rate": 4.8888713025172776e-05, "loss": 1.2103, "step": 1222 }, { "epoch": 0.19226158894849574, "grad_norm": 0.24573171138763428, "learning_rate": 4.888689069405923e-05, "loss": 1.1981, "step": 1223 }, { "epoch": 0.1924187938454283, "grad_norm": 0.294160932302475, "learning_rate": 4.888506690402528e-05, "loss": 1.2667, "step": 1224 }, { "epoch": 0.19257599874236084, "grad_norm": 0.8444136381149292, "learning_rate": 4.8883241655182314e-05, "loss": 1.1977, "step": 1225 }, { "epoch": 0.19273320363929336, "grad_norm": 0.4191160798072815, "learning_rate": 4.888141494764182e-05, "loss": 1.1981, "step": 1226 }, { "epoch": 0.1928904085362259, "grad_norm": 0.31621554493904114, "learning_rate": 4.8879586781515376e-05, "loss": 1.2224, "step": 1227 }, { "epoch": 0.19304761343315843, "grad_norm": 0.2715776860713959, "learning_rate": 4.887775715691462e-05, "loss": 1.1029, "step": 1228 }, { "epoch": 0.19320481833009098, "grad_norm": 0.2641848623752594, "learning_rate": 4.88759260739513e-05, "loss": 1.1738, "step": 1229 }, { "epoch": 0.19336202322702353, "grad_norm": 0.2537270188331604, "learning_rate": 4.887409353273727e-05, "loss": 1.2847, "step": 1230 }, { "epoch": 0.19351922812395606, "grad_norm": 0.2998782694339752, "learning_rate": 4.8872259533384423e-05, "loss": 1.1814, "step": 1231 }, { "epoch": 0.1936764330208886, "grad_norm": 0.2254815697669983, "learning_rate": 4.8870424076004806e-05, "loss": 1.2004, "step": 1232 }, { "epoch": 0.19383363791782113, "grad_norm": 0.3711993396282196, "learning_rate": 4.88685871607105e-05, "loss": 1.1502, "step": 1233 }, { "epoch": 0.19399084281475368, "grad_norm": 0.24783778190612793, "learning_rate": 4.886674878761371e-05, "loss": 1.1185, "step": 1234 }, { "epoch": 0.19414804771168623, "grad_norm": 0.1896362453699112, "learning_rate": 4.88649089568267e-05, "loss": 1.1856, "step": 1235 }, { "epoch": 0.19430525260861875, "grad_norm": 0.28106558322906494, "learning_rate": 4.886306766846187e-05, "loss": 1.2196, "step": 1236 }, { "epoch": 0.1944624575055513, "grad_norm": 0.3023208975791931, "learning_rate": 4.8861224922631645e-05, "loss": 1.1836, "step": 1237 }, { "epoch": 0.19461966240248385, "grad_norm": 0.36752450466156006, "learning_rate": 4.8859380719448596e-05, "loss": 1.1831, "step": 1238 }, { "epoch": 0.19477686729941637, "grad_norm": 0.2593975365161896, "learning_rate": 4.885753505902535e-05, "loss": 1.1955, "step": 1239 }, { "epoch": 0.19493407219634892, "grad_norm": 0.2952882647514343, "learning_rate": 4.885568794147463e-05, "loss": 1.108, "step": 1240 }, { "epoch": 0.19509127709328145, "grad_norm": 0.2335767149925232, "learning_rate": 4.885383936690926e-05, "loss": 1.2389, "step": 1241 }, { "epoch": 0.195248481990214, "grad_norm": 0.3618619441986084, "learning_rate": 4.885198933544214e-05, "loss": 1.0247, "step": 1242 }, { "epoch": 0.19540568688714655, "grad_norm": 0.26691627502441406, "learning_rate": 4.885013784718626e-05, "loss": 1.1516, "step": 1243 }, { "epoch": 0.19556289178407907, "grad_norm": 0.2977723777294159, "learning_rate": 4.8848284902254705e-05, "loss": 1.1617, "step": 1244 }, { "epoch": 0.19572009668101162, "grad_norm": 0.33515632152557373, "learning_rate": 4.884643050076064e-05, "loss": 1.1789, "step": 1245 }, { "epoch": 0.19587730157794414, "grad_norm": 0.275840163230896, "learning_rate": 4.8844574642817334e-05, "loss": 1.1103, "step": 1246 }, { "epoch": 0.1960345064748767, "grad_norm": 0.26756566762924194, "learning_rate": 4.884271732853813e-05, "loss": 1.2101, "step": 1247 }, { "epoch": 0.19619171137180924, "grad_norm": 0.20770548284053802, "learning_rate": 4.884085855803647e-05, "loss": 1.2506, "step": 1248 }, { "epoch": 0.19634891626874176, "grad_norm": 0.2700664699077606, "learning_rate": 4.883899833142588e-05, "loss": 1.2034, "step": 1249 }, { "epoch": 0.1965061211656743, "grad_norm": 0.2403496950864792, "learning_rate": 4.883713664881997e-05, "loss": 1.1622, "step": 1250 }, { "epoch": 0.19666332606260686, "grad_norm": 0.2710270881652832, "learning_rate": 4.883527351033245e-05, "loss": 1.0679, "step": 1251 }, { "epoch": 0.19682053095953939, "grad_norm": 0.2600773870944977, "learning_rate": 4.8833408916077104e-05, "loss": 1.3343, "step": 1252 }, { "epoch": 0.19697773585647194, "grad_norm": 0.25740665197372437, "learning_rate": 4.883154286616783e-05, "loss": 1.2206, "step": 1253 }, { "epoch": 0.19713494075340446, "grad_norm": 0.3393601179122925, "learning_rate": 4.8829675360718585e-05, "loss": 1.1518, "step": 1254 }, { "epoch": 0.197292145650337, "grad_norm": 0.2968616783618927, "learning_rate": 4.8827806399843444e-05, "loss": 1.2547, "step": 1255 }, { "epoch": 0.19744935054726956, "grad_norm": 0.24990178644657135, "learning_rate": 4.8825935983656535e-05, "loss": 1.2733, "step": 1256 }, { "epoch": 0.19760655544420208, "grad_norm": 0.31955957412719727, "learning_rate": 4.882406411227212e-05, "loss": 1.2138, "step": 1257 }, { "epoch": 0.19776376034113463, "grad_norm": 0.22445374727249146, "learning_rate": 4.88221907858045e-05, "loss": 1.1845, "step": 1258 }, { "epoch": 0.19792096523806715, "grad_norm": 0.32888510823249817, "learning_rate": 4.8820316004368116e-05, "loss": 1.2339, "step": 1259 }, { "epoch": 0.1980781701349997, "grad_norm": 0.29760921001434326, "learning_rate": 4.8818439768077456e-05, "loss": 1.2216, "step": 1260 }, { "epoch": 0.19823537503193225, "grad_norm": 0.19965974986553192, "learning_rate": 4.881656207704712e-05, "loss": 1.2608, "step": 1261 }, { "epoch": 0.19839257992886478, "grad_norm": 0.2538587749004364, "learning_rate": 4.881468293139179e-05, "loss": 1.1989, "step": 1262 }, { "epoch": 0.19854978482579733, "grad_norm": 0.35299167037010193, "learning_rate": 4.8812802331226224e-05, "loss": 1.1426, "step": 1263 }, { "epoch": 0.19870698972272988, "grad_norm": 0.3230816423892975, "learning_rate": 4.8810920276665306e-05, "loss": 1.2546, "step": 1264 }, { "epoch": 0.1988641946196624, "grad_norm": 0.3077559769153595, "learning_rate": 4.880903676782397e-05, "loss": 1.1661, "step": 1265 }, { "epoch": 0.19902139951659495, "grad_norm": 0.32157936692237854, "learning_rate": 4.8807151804817254e-05, "loss": 1.2141, "step": 1266 }, { "epoch": 0.19917860441352747, "grad_norm": 0.32653504610061646, "learning_rate": 4.880526538776029e-05, "loss": 1.0623, "step": 1267 }, { "epoch": 0.19933580931046002, "grad_norm": 0.2675210237503052, "learning_rate": 4.880337751676828e-05, "loss": 1.1408, "step": 1268 }, { "epoch": 0.19949301420739257, "grad_norm": 0.28380653262138367, "learning_rate": 4.880148819195654e-05, "loss": 1.223, "step": 1269 }, { "epoch": 0.1996502191043251, "grad_norm": 0.2532847821712494, "learning_rate": 4.8799597413440466e-05, "loss": 1.2133, "step": 1270 }, { "epoch": 0.19980742400125764, "grad_norm": 0.2972438633441925, "learning_rate": 4.8797705181335526e-05, "loss": 1.2806, "step": 1271 }, { "epoch": 0.19996462889819017, "grad_norm": 0.2725450098514557, "learning_rate": 4.8795811495757306e-05, "loss": 1.1627, "step": 1272 }, { "epoch": 0.20012183379512272, "grad_norm": 0.2451506108045578, "learning_rate": 4.879391635682145e-05, "loss": 1.3242, "step": 1273 }, { "epoch": 0.20027903869205527, "grad_norm": 0.22880415618419647, "learning_rate": 4.8792019764643714e-05, "loss": 1.1535, "step": 1274 }, { "epoch": 0.2004362435889878, "grad_norm": 0.22470681369304657, "learning_rate": 4.8790121719339935e-05, "loss": 1.268, "step": 1275 }, { "epoch": 0.20059344848592034, "grad_norm": 0.2413133829832077, "learning_rate": 4.878822222102604e-05, "loss": 1.2291, "step": 1276 }, { "epoch": 0.2007506533828529, "grad_norm": 0.23373375833034515, "learning_rate": 4.878632126981804e-05, "loss": 1.1007, "step": 1277 }, { "epoch": 0.2009078582797854, "grad_norm": 0.3018023371696472, "learning_rate": 4.878441886583203e-05, "loss": 1.2393, "step": 1278 }, { "epoch": 0.20106506317671796, "grad_norm": 0.2107972353696823, "learning_rate": 4.878251500918421e-05, "loss": 1.3164, "step": 1279 }, { "epoch": 0.20122226807365048, "grad_norm": 0.24787524342536926, "learning_rate": 4.878060969999087e-05, "loss": 1.217, "step": 1280 }, { "epoch": 0.20122226807365048, "eval_loss": 1.2021143436431885, "eval_runtime": 2276.1827, "eval_samples_per_second": 4.067, "eval_steps_per_second": 2.034, "step": 1280 }, { "epoch": 0.20137947297058303, "grad_norm": 0.22325897216796875, "learning_rate": 4.877870293836837e-05, "loss": 1.2739, "step": 1281 }, { "epoch": 0.20153667786751558, "grad_norm": 0.20739248394966125, "learning_rate": 4.877679472443315e-05, "loss": 1.2458, "step": 1282 }, { "epoch": 0.2016938827644481, "grad_norm": 0.38787081837654114, "learning_rate": 4.877488505830179e-05, "loss": 1.2039, "step": 1283 }, { "epoch": 0.20185108766138066, "grad_norm": 0.2838675379753113, "learning_rate": 4.8772973940090895e-05, "loss": 1.1647, "step": 1284 }, { "epoch": 0.20200829255831318, "grad_norm": 0.2516341805458069, "learning_rate": 4.877106136991721e-05, "loss": 1.1952, "step": 1285 }, { "epoch": 0.20216549745524573, "grad_norm": 0.24181115627288818, "learning_rate": 4.8769147347897535e-05, "loss": 1.1822, "step": 1286 }, { "epoch": 0.20232270235217828, "grad_norm": 0.3560396730899811, "learning_rate": 4.876723187414878e-05, "loss": 1.1863, "step": 1287 }, { "epoch": 0.2024799072491108, "grad_norm": 0.41868817806243896, "learning_rate": 4.8765314948787934e-05, "loss": 1.1446, "step": 1288 }, { "epoch": 0.20263711214604335, "grad_norm": 0.21799515187740326, "learning_rate": 4.8763396571932066e-05, "loss": 1.155, "step": 1289 }, { "epoch": 0.2027943170429759, "grad_norm": 0.24254827201366425, "learning_rate": 4.876147674369834e-05, "loss": 1.1363, "step": 1290 }, { "epoch": 0.20295152193990842, "grad_norm": 0.24996933341026306, "learning_rate": 4.875955546420404e-05, "loss": 1.1773, "step": 1291 }, { "epoch": 0.20310872683684097, "grad_norm": 0.33059096336364746, "learning_rate": 4.8757632733566484e-05, "loss": 1.2217, "step": 1292 }, { "epoch": 0.2032659317337735, "grad_norm": 0.3126905858516693, "learning_rate": 4.875570855190311e-05, "loss": 1.2031, "step": 1293 }, { "epoch": 0.20342313663070605, "grad_norm": 0.26368340849876404, "learning_rate": 4.8753782919331436e-05, "loss": 1.2348, "step": 1294 }, { "epoch": 0.2035803415276386, "grad_norm": 0.2591840922832489, "learning_rate": 4.875185583596909e-05, "loss": 1.2303, "step": 1295 }, { "epoch": 0.20373754642457112, "grad_norm": 0.3512473702430725, "learning_rate": 4.874992730193375e-05, "loss": 1.149, "step": 1296 }, { "epoch": 0.20389475132150367, "grad_norm": 0.28538331389427185, "learning_rate": 4.874799731734322e-05, "loss": 1.2177, "step": 1297 }, { "epoch": 0.2040519562184362, "grad_norm": 0.27414706349372864, "learning_rate": 4.8746065882315375e-05, "loss": 1.1767, "step": 1298 }, { "epoch": 0.20420916111536874, "grad_norm": 0.2481449991464615, "learning_rate": 4.874413299696816e-05, "loss": 1.1928, "step": 1299 }, { "epoch": 0.2043663660123013, "grad_norm": 0.26623374223709106, "learning_rate": 4.8742198661419646e-05, "loss": 1.2455, "step": 1300 }, { "epoch": 0.20452357090923381, "grad_norm": 0.22189855575561523, "learning_rate": 4.874026287578798e-05, "loss": 1.1934, "step": 1301 }, { "epoch": 0.20468077580616637, "grad_norm": 0.2549070119857788, "learning_rate": 4.873832564019137e-05, "loss": 1.0798, "step": 1302 }, { "epoch": 0.20483798070309892, "grad_norm": 0.2875777781009674, "learning_rate": 4.873638695474816e-05, "loss": 1.2025, "step": 1303 }, { "epoch": 0.20499518560003144, "grad_norm": 0.26281726360321045, "learning_rate": 4.873444681957674e-05, "loss": 1.2533, "step": 1304 }, { "epoch": 0.205152390496964, "grad_norm": 0.294112890958786, "learning_rate": 4.873250523479561e-05, "loss": 1.177, "step": 1305 }, { "epoch": 0.2053095953938965, "grad_norm": 0.24901710450649261, "learning_rate": 4.873056220052336e-05, "loss": 1.2624, "step": 1306 }, { "epoch": 0.20546680029082906, "grad_norm": 0.2856816351413727, "learning_rate": 4.8728617716878664e-05, "loss": 1.0575, "step": 1307 }, { "epoch": 0.2056240051877616, "grad_norm": 0.34814971685409546, "learning_rate": 4.872667178398027e-05, "loss": 1.1829, "step": 1308 }, { "epoch": 0.20578121008469413, "grad_norm": 0.3625463545322418, "learning_rate": 4.872472440194704e-05, "loss": 1.1988, "step": 1309 }, { "epoch": 0.20593841498162668, "grad_norm": 0.2561318576335907, "learning_rate": 4.8722775570897915e-05, "loss": 1.185, "step": 1310 }, { "epoch": 0.2060956198785592, "grad_norm": 0.25931569933891296, "learning_rate": 4.872082529095191e-05, "loss": 1.15, "step": 1311 }, { "epoch": 0.20625282477549176, "grad_norm": 0.21291913092136383, "learning_rate": 4.871887356222815e-05, "loss": 1.2019, "step": 1312 }, { "epoch": 0.2064100296724243, "grad_norm": 0.21931886672973633, "learning_rate": 4.8716920384845844e-05, "loss": 1.3054, "step": 1313 }, { "epoch": 0.20656723456935683, "grad_norm": 0.22694003582000732, "learning_rate": 4.8714965758924276e-05, "loss": 1.1884, "step": 1314 }, { "epoch": 0.20672443946628938, "grad_norm": 0.25568726658821106, "learning_rate": 4.871300968458282e-05, "loss": 1.2516, "step": 1315 }, { "epoch": 0.20688164436322193, "grad_norm": 0.24557702243328094, "learning_rate": 4.871105216194096e-05, "loss": 1.3418, "step": 1316 }, { "epoch": 0.20703884926015445, "grad_norm": 0.2368367463350296, "learning_rate": 4.870909319111825e-05, "loss": 1.2088, "step": 1317 }, { "epoch": 0.207196054157087, "grad_norm": 0.3659820556640625, "learning_rate": 4.870713277223434e-05, "loss": 1.0446, "step": 1318 }, { "epoch": 0.20735325905401952, "grad_norm": 0.2582785189151764, "learning_rate": 4.870517090540896e-05, "loss": 1.3072, "step": 1319 }, { "epoch": 0.20751046395095207, "grad_norm": 0.27062729001045227, "learning_rate": 4.870320759076192e-05, "loss": 1.2977, "step": 1320 }, { "epoch": 0.20766766884788462, "grad_norm": 0.2483299970626831, "learning_rate": 4.870124282841316e-05, "loss": 1.2001, "step": 1321 }, { "epoch": 0.20782487374481715, "grad_norm": 0.17987532913684845, "learning_rate": 4.869927661848266e-05, "loss": 1.253, "step": 1322 }, { "epoch": 0.2079820786417497, "grad_norm": 0.24004274606704712, "learning_rate": 4.869730896109051e-05, "loss": 1.2242, "step": 1323 }, { "epoch": 0.20813928353868222, "grad_norm": 0.259755402803421, "learning_rate": 4.869533985635689e-05, "loss": 1.2338, "step": 1324 }, { "epoch": 0.20829648843561477, "grad_norm": 0.2481742948293686, "learning_rate": 4.869336930440207e-05, "loss": 1.0983, "step": 1325 }, { "epoch": 0.20845369333254732, "grad_norm": 0.2746635675430298, "learning_rate": 4.8691397305346404e-05, "loss": 1.2491, "step": 1326 }, { "epoch": 0.20861089822947984, "grad_norm": 0.3291498124599457, "learning_rate": 4.868942385931032e-05, "loss": 1.2045, "step": 1327 }, { "epoch": 0.2087681031264124, "grad_norm": 0.2693649232387543, "learning_rate": 4.8687448966414376e-05, "loss": 1.2367, "step": 1328 }, { "epoch": 0.2089253080233449, "grad_norm": 0.3101809620857239, "learning_rate": 4.868547262677916e-05, "loss": 1.1759, "step": 1329 }, { "epoch": 0.20908251292027746, "grad_norm": 0.22869786620140076, "learning_rate": 4.86834948405254e-05, "loss": 1.2219, "step": 1330 }, { "epoch": 0.20923971781721, "grad_norm": 0.32914999127388, "learning_rate": 4.868151560777388e-05, "loss": 1.1465, "step": 1331 }, { "epoch": 0.20939692271414254, "grad_norm": 0.2585611343383789, "learning_rate": 4.867953492864549e-05, "loss": 1.202, "step": 1332 }, { "epoch": 0.2095541276110751, "grad_norm": 0.22913898527622223, "learning_rate": 4.8677552803261203e-05, "loss": 1.1182, "step": 1333 }, { "epoch": 0.20971133250800764, "grad_norm": 0.19015748798847198, "learning_rate": 4.867556923174208e-05, "loss": 1.2997, "step": 1334 }, { "epoch": 0.20986853740494016, "grad_norm": 0.28327012062072754, "learning_rate": 4.867358421420927e-05, "loss": 1.2135, "step": 1335 }, { "epoch": 0.2100257423018727, "grad_norm": 0.20216205716133118, "learning_rate": 4.8671597750784006e-05, "loss": 1.0898, "step": 1336 }, { "epoch": 0.21018294719880523, "grad_norm": 0.23876157402992249, "learning_rate": 4.8669609841587607e-05, "loss": 1.1954, "step": 1337 }, { "epoch": 0.21034015209573778, "grad_norm": 0.22727316617965698, "learning_rate": 4.86676204867415e-05, "loss": 1.1578, "step": 1338 }, { "epoch": 0.21049735699267033, "grad_norm": 0.23092162609100342, "learning_rate": 4.8665629686367185e-05, "loss": 1.2883, "step": 1339 }, { "epoch": 0.21065456188960285, "grad_norm": 0.5870768427848816, "learning_rate": 4.8663637440586255e-05, "loss": 1.1947, "step": 1340 }, { "epoch": 0.2108117667865354, "grad_norm": 0.20369422435760498, "learning_rate": 4.866164374952038e-05, "loss": 1.1551, "step": 1341 }, { "epoch": 0.21096897168346793, "grad_norm": 0.21872729063034058, "learning_rate": 4.865964861329133e-05, "loss": 1.195, "step": 1342 }, { "epoch": 0.21112617658040048, "grad_norm": 0.23386871814727783, "learning_rate": 4.8657652032020965e-05, "loss": 1.1417, "step": 1343 }, { "epoch": 0.21128338147733303, "grad_norm": 0.25174540281295776, "learning_rate": 4.865565400583123e-05, "loss": 1.2826, "step": 1344 }, { "epoch": 0.21144058637426555, "grad_norm": 0.2607194781303406, "learning_rate": 4.865365453484415e-05, "loss": 1.1529, "step": 1345 }, { "epoch": 0.2115977912711981, "grad_norm": 0.28137052059173584, "learning_rate": 4.8651653619181835e-05, "loss": 1.2041, "step": 1346 }, { "epoch": 0.21175499616813065, "grad_norm": 0.3423405587673187, "learning_rate": 4.864965125896652e-05, "loss": 1.217, "step": 1347 }, { "epoch": 0.21191220106506317, "grad_norm": 0.35149091482162476, "learning_rate": 4.864764745432048e-05, "loss": 1.234, "step": 1348 }, { "epoch": 0.21206940596199572, "grad_norm": 0.19579124450683594, "learning_rate": 4.864564220536611e-05, "loss": 1.1335, "step": 1349 }, { "epoch": 0.21222661085892824, "grad_norm": 0.2690495550632477, "learning_rate": 4.8643635512225874e-05, "loss": 1.2253, "step": 1350 }, { "epoch": 0.2123838157558608, "grad_norm": 0.2389325350522995, "learning_rate": 4.8641627375022346e-05, "loss": 1.2362, "step": 1351 }, { "epoch": 0.21254102065279334, "grad_norm": 0.2795645296573639, "learning_rate": 4.863961779387817e-05, "loss": 1.0509, "step": 1352 }, { "epoch": 0.21269822554972587, "grad_norm": 0.1987501084804535, "learning_rate": 4.863760676891608e-05, "loss": 1.0825, "step": 1353 }, { "epoch": 0.21285543044665842, "grad_norm": 0.2520771324634552, "learning_rate": 4.8635594300258905e-05, "loss": 1.2879, "step": 1354 }, { "epoch": 0.21301263534359094, "grad_norm": 0.3053792119026184, "learning_rate": 4.863358038802955e-05, "loss": 1.1737, "step": 1355 }, { "epoch": 0.2131698402405235, "grad_norm": 0.24323543906211853, "learning_rate": 4.863156503235102e-05, "loss": 1.3105, "step": 1356 }, { "epoch": 0.21332704513745604, "grad_norm": 0.2622387111186981, "learning_rate": 4.862954823334643e-05, "loss": 1.1817, "step": 1357 }, { "epoch": 0.21348425003438856, "grad_norm": 0.28524765372276306, "learning_rate": 4.862752999113893e-05, "loss": 1.1191, "step": 1358 }, { "epoch": 0.2136414549313211, "grad_norm": 0.2917231619358063, "learning_rate": 4.8625510305851784e-05, "loss": 1.1717, "step": 1359 }, { "epoch": 0.21379865982825366, "grad_norm": 0.23248536884784698, "learning_rate": 4.862348917760837e-05, "loss": 1.1472, "step": 1360 }, { "epoch": 0.21395586472518618, "grad_norm": 0.30648741126060486, "learning_rate": 4.862146660653212e-05, "loss": 1.1325, "step": 1361 }, { "epoch": 0.21411306962211873, "grad_norm": 0.26287996768951416, "learning_rate": 4.8619442592746554e-05, "loss": 1.1891, "step": 1362 }, { "epoch": 0.21427027451905126, "grad_norm": 0.2846413254737854, "learning_rate": 4.861741713637531e-05, "loss": 1.2429, "step": 1363 }, { "epoch": 0.2144274794159838, "grad_norm": 0.26600465178489685, "learning_rate": 4.861539023754208e-05, "loss": 1.2825, "step": 1364 }, { "epoch": 0.21458468431291636, "grad_norm": 0.23721352219581604, "learning_rate": 4.861336189637066e-05, "loss": 1.16, "step": 1365 }, { "epoch": 0.21474188920984888, "grad_norm": 0.21250367164611816, "learning_rate": 4.8611332112984946e-05, "loss": 1.1917, "step": 1366 }, { "epoch": 0.21489909410678143, "grad_norm": 0.2471015751361847, "learning_rate": 4.86093008875089e-05, "loss": 1.2403, "step": 1367 }, { "epoch": 0.21505629900371395, "grad_norm": 0.2969186007976532, "learning_rate": 4.860726822006659e-05, "loss": 1.2443, "step": 1368 }, { "epoch": 0.2152135039006465, "grad_norm": 0.24633657932281494, "learning_rate": 4.860523411078215e-05, "loss": 1.2468, "step": 1369 }, { "epoch": 0.21537070879757905, "grad_norm": 0.24867349863052368, "learning_rate": 4.860319855977982e-05, "loss": 1.194, "step": 1370 }, { "epoch": 0.21552791369451157, "grad_norm": 0.27883896231651306, "learning_rate": 4.8601161567183925e-05, "loss": 1.2181, "step": 1371 }, { "epoch": 0.21568511859144412, "grad_norm": 0.20830753445625305, "learning_rate": 4.859912313311888e-05, "loss": 1.1579, "step": 1372 }, { "epoch": 0.21584232348837668, "grad_norm": 0.25265562534332275, "learning_rate": 4.8597083257709194e-05, "loss": 1.1455, "step": 1373 }, { "epoch": 0.2159995283853092, "grad_norm": 0.23922070860862732, "learning_rate": 4.859504194107943e-05, "loss": 1.1434, "step": 1374 }, { "epoch": 0.21615673328224175, "grad_norm": 0.1661684215068817, "learning_rate": 4.859299918335428e-05, "loss": 1.2426, "step": 1375 }, { "epoch": 0.21631393817917427, "grad_norm": 0.20215146243572235, "learning_rate": 4.859095498465851e-05, "loss": 1.2485, "step": 1376 }, { "epoch": 0.21647114307610682, "grad_norm": 0.2150086909532547, "learning_rate": 4.858890934511697e-05, "loss": 1.2034, "step": 1377 }, { "epoch": 0.21662834797303937, "grad_norm": 0.24220801889896393, "learning_rate": 4.8586862264854595e-05, "loss": 1.2063, "step": 1378 }, { "epoch": 0.2167855528699719, "grad_norm": 0.3015029728412628, "learning_rate": 4.85848137439964e-05, "loss": 1.2371, "step": 1379 }, { "epoch": 0.21694275776690444, "grad_norm": 0.32718077301979065, "learning_rate": 4.8582763782667534e-05, "loss": 1.1915, "step": 1380 }, { "epoch": 0.21709996266383697, "grad_norm": 0.2721043527126312, "learning_rate": 4.858071238099318e-05, "loss": 1.1692, "step": 1381 }, { "epoch": 0.21725716756076952, "grad_norm": 0.21820539236068726, "learning_rate": 4.857865953909862e-05, "loss": 1.2529, "step": 1382 }, { "epoch": 0.21741437245770207, "grad_norm": 0.24213218688964844, "learning_rate": 4.857660525710927e-05, "loss": 1.2273, "step": 1383 }, { "epoch": 0.2175715773546346, "grad_norm": 0.25169044733047485, "learning_rate": 4.857454953515055e-05, "loss": 1.1917, "step": 1384 }, { "epoch": 0.21772878225156714, "grad_norm": 0.1900857836008072, "learning_rate": 4.8572492373348055e-05, "loss": 1.1924, "step": 1385 }, { "epoch": 0.2178859871484997, "grad_norm": 0.27372950315475464, "learning_rate": 4.857043377182741e-05, "loss": 1.2062, "step": 1386 }, { "epoch": 0.2180431920454322, "grad_norm": 0.2589268386363983, "learning_rate": 4.8568373730714344e-05, "loss": 1.1294, "step": 1387 }, { "epoch": 0.21820039694236476, "grad_norm": 0.32234373688697815, "learning_rate": 4.856631225013468e-05, "loss": 1.0772, "step": 1388 }, { "epoch": 0.21835760183929728, "grad_norm": 0.34723344445228577, "learning_rate": 4.8564249330214337e-05, "loss": 1.1989, "step": 1389 }, { "epoch": 0.21851480673622983, "grad_norm": 0.23347902297973633, "learning_rate": 4.85621849710793e-05, "loss": 1.3154, "step": 1390 }, { "epoch": 0.21867201163316238, "grad_norm": 0.19097594916820526, "learning_rate": 4.856011917285565e-05, "loss": 1.241, "step": 1391 }, { "epoch": 0.2188292165300949, "grad_norm": 0.17387251555919647, "learning_rate": 4.855805193566956e-05, "loss": 1.2396, "step": 1392 }, { "epoch": 0.21898642142702746, "grad_norm": 0.25310957431793213, "learning_rate": 4.85559832596473e-05, "loss": 1.1169, "step": 1393 }, { "epoch": 0.21914362632395998, "grad_norm": 0.20254233479499817, "learning_rate": 4.85539131449152e-05, "loss": 1.2233, "step": 1394 }, { "epoch": 0.21930083122089253, "grad_norm": 0.22399556636810303, "learning_rate": 4.8551841591599696e-05, "loss": 1.1257, "step": 1395 }, { "epoch": 0.21945803611782508, "grad_norm": 0.24220795929431915, "learning_rate": 4.854976859982732e-05, "loss": 1.1364, "step": 1396 }, { "epoch": 0.2196152410147576, "grad_norm": 0.2996869683265686, "learning_rate": 4.854769416972468e-05, "loss": 1.277, "step": 1397 }, { "epoch": 0.21977244591169015, "grad_norm": 0.22607775032520294, "learning_rate": 4.854561830141848e-05, "loss": 1.1538, "step": 1398 }, { "epoch": 0.2199296508086227, "grad_norm": 0.3184898793697357, "learning_rate": 4.854354099503549e-05, "loss": 1.2933, "step": 1399 }, { "epoch": 0.22008685570555522, "grad_norm": 0.26133978366851807, "learning_rate": 4.8541462250702595e-05, "loss": 1.1539, "step": 1400 }, { "epoch": 0.22024406060248777, "grad_norm": 0.2325352430343628, "learning_rate": 4.853938206854676e-05, "loss": 1.1242, "step": 1401 }, { "epoch": 0.2204012654994203, "grad_norm": 0.20906803011894226, "learning_rate": 4.853730044869503e-05, "loss": 1.1166, "step": 1402 }, { "epoch": 0.22055847039635285, "grad_norm": 0.27185148000717163, "learning_rate": 4.853521739127453e-05, "loss": 1.1067, "step": 1403 }, { "epoch": 0.2207156752932854, "grad_norm": 0.2759348452091217, "learning_rate": 4.8533132896412514e-05, "loss": 1.1237, "step": 1404 }, { "epoch": 0.22087288019021792, "grad_norm": 0.21420961618423462, "learning_rate": 4.853104696423627e-05, "loss": 1.1362, "step": 1405 }, { "epoch": 0.22103008508715047, "grad_norm": 0.22797361016273499, "learning_rate": 4.852895959487321e-05, "loss": 1.1114, "step": 1406 }, { "epoch": 0.221187289984083, "grad_norm": 0.303501695394516, "learning_rate": 4.8526870788450816e-05, "loss": 1.1836, "step": 1407 }, { "epoch": 0.22134449488101554, "grad_norm": 0.22205421328544617, "learning_rate": 4.852478054509667e-05, "loss": 1.2281, "step": 1408 }, { "epoch": 0.2215016997779481, "grad_norm": 0.31351590156555176, "learning_rate": 4.852268886493844e-05, "loss": 1.161, "step": 1409 }, { "epoch": 0.2216589046748806, "grad_norm": 0.2639506757259369, "learning_rate": 4.852059574810386e-05, "loss": 1.1397, "step": 1410 }, { "epoch": 0.22181610957181316, "grad_norm": 0.2192612588405609, "learning_rate": 4.851850119472079e-05, "loss": 1.1476, "step": 1411 }, { "epoch": 0.2219733144687457, "grad_norm": 0.27032819390296936, "learning_rate": 4.851640520491715e-05, "loss": 1.2416, "step": 1412 }, { "epoch": 0.22213051936567824, "grad_norm": 0.3159463405609131, "learning_rate": 4.851430777882095e-05, "loss": 1.1131, "step": 1413 }, { "epoch": 0.2222877242626108, "grad_norm": 0.20352302491664886, "learning_rate": 4.85122089165603e-05, "loss": 1.2406, "step": 1414 }, { "epoch": 0.2224449291595433, "grad_norm": 0.2544812858104706, "learning_rate": 4.8510108618263385e-05, "loss": 1.2213, "step": 1415 }, { "epoch": 0.22260213405647586, "grad_norm": 0.38868314027786255, "learning_rate": 4.8508006884058485e-05, "loss": 1.1821, "step": 1416 }, { "epoch": 0.2227593389534084, "grad_norm": 0.293169766664505, "learning_rate": 4.850590371407397e-05, "loss": 1.2964, "step": 1417 }, { "epoch": 0.22291654385034093, "grad_norm": 0.282044917345047, "learning_rate": 4.850379910843829e-05, "loss": 1.2502, "step": 1418 }, { "epoch": 0.22307374874727348, "grad_norm": 0.2392999529838562, "learning_rate": 4.850169306727999e-05, "loss": 1.1388, "step": 1419 }, { "epoch": 0.223230953644206, "grad_norm": 0.18563714623451233, "learning_rate": 4.849958559072768e-05, "loss": 1.2458, "step": 1420 }, { "epoch": 0.22338815854113855, "grad_norm": 0.21055997908115387, "learning_rate": 4.84974766789101e-05, "loss": 1.1713, "step": 1421 }, { "epoch": 0.2235453634380711, "grad_norm": 0.21380756795406342, "learning_rate": 4.849536633195606e-05, "loss": 1.2113, "step": 1422 }, { "epoch": 0.22370256833500363, "grad_norm": 0.2441507875919342, "learning_rate": 4.849325454999443e-05, "loss": 1.275, "step": 1423 }, { "epoch": 0.22385977323193618, "grad_norm": 0.2958972454071045, "learning_rate": 4.849114133315419e-05, "loss": 1.1454, "step": 1424 }, { "epoch": 0.22401697812886873, "grad_norm": 0.30271342396736145, "learning_rate": 4.848902668156442e-05, "loss": 1.1151, "step": 1425 }, { "epoch": 0.22417418302580125, "grad_norm": 0.2217937856912613, "learning_rate": 4.848691059535427e-05, "loss": 1.2406, "step": 1426 }, { "epoch": 0.2243313879227338, "grad_norm": 0.22862625122070312, "learning_rate": 4.848479307465299e-05, "loss": 1.1971, "step": 1427 }, { "epoch": 0.22448859281966632, "grad_norm": 0.2594766616821289, "learning_rate": 4.8482674119589896e-05, "loss": 1.1845, "step": 1428 }, { "epoch": 0.22464579771659887, "grad_norm": 0.21012337505817413, "learning_rate": 4.848055373029441e-05, "loss": 1.1644, "step": 1429 }, { "epoch": 0.22480300261353142, "grad_norm": 0.3061239421367645, "learning_rate": 4.847843190689605e-05, "loss": 1.1177, "step": 1430 }, { "epoch": 0.22496020751046394, "grad_norm": 0.27125445008277893, "learning_rate": 4.847630864952439e-05, "loss": 1.2566, "step": 1431 }, { "epoch": 0.2251174124073965, "grad_norm": 0.16465838253498077, "learning_rate": 4.847418395830911e-05, "loss": 1.2477, "step": 1432 }, { "epoch": 0.22527461730432902, "grad_norm": 0.2064242959022522, "learning_rate": 4.8472057833380005e-05, "loss": 1.2777, "step": 1433 }, { "epoch": 0.22543182220126157, "grad_norm": 0.1920788437128067, "learning_rate": 4.846993027486691e-05, "loss": 1.0682, "step": 1434 }, { "epoch": 0.22558902709819412, "grad_norm": 0.2445499151945114, "learning_rate": 4.8467801282899775e-05, "loss": 1.1468, "step": 1435 }, { "epoch": 0.22574623199512664, "grad_norm": 0.2358809858560562, "learning_rate": 4.846567085760861e-05, "loss": 1.2001, "step": 1436 }, { "epoch": 0.2259034368920592, "grad_norm": 0.2340790331363678, "learning_rate": 4.846353899912356e-05, "loss": 1.1401, "step": 1437 }, { "epoch": 0.22606064178899174, "grad_norm": 0.23034712672233582, "learning_rate": 4.8461405707574824e-05, "loss": 1.1617, "step": 1438 }, { "epoch": 0.22621784668592426, "grad_norm": 0.2453261762857437, "learning_rate": 4.8459270983092686e-05, "loss": 1.2889, "step": 1439 }, { "epoch": 0.2263750515828568, "grad_norm": 0.23590409755706787, "learning_rate": 4.8457134825807535e-05, "loss": 1.1807, "step": 1440 }, { "epoch": 0.2263750515828568, "eval_loss": 1.1835801601409912, "eval_runtime": 2320.0135, "eval_samples_per_second": 3.99, "eval_steps_per_second": 1.995, "step": 1440 }, { "epoch": 0.22653225647978933, "grad_norm": 0.2932775318622589, "learning_rate": 4.845499723584984e-05, "loss": 1.1899, "step": 1441 }, { "epoch": 0.22668946137672188, "grad_norm": 0.23739026486873627, "learning_rate": 4.845285821335015e-05, "loss": 1.2037, "step": 1442 }, { "epoch": 0.22684666627365443, "grad_norm": 0.1993498057126999, "learning_rate": 4.8450717758439115e-05, "loss": 1.1681, "step": 1443 }, { "epoch": 0.22700387117058696, "grad_norm": 0.23352783918380737, "learning_rate": 4.8448575871247465e-05, "loss": 1.2268, "step": 1444 }, { "epoch": 0.2271610760675195, "grad_norm": 0.22201289236545563, "learning_rate": 4.844643255190602e-05, "loss": 1.2883, "step": 1445 }, { "epoch": 0.22731828096445203, "grad_norm": 0.2467186152935028, "learning_rate": 4.8444287800545676e-05, "loss": 1.1602, "step": 1446 }, { "epoch": 0.22747548586138458, "grad_norm": 0.20474953949451447, "learning_rate": 4.844214161729743e-05, "loss": 1.3125, "step": 1447 }, { "epoch": 0.22763269075831713, "grad_norm": 0.1738600730895996, "learning_rate": 4.843999400229238e-05, "loss": 1.275, "step": 1448 }, { "epoch": 0.22778989565524965, "grad_norm": 0.23264817893505096, "learning_rate": 4.843784495566166e-05, "loss": 1.2151, "step": 1449 }, { "epoch": 0.2279471005521822, "grad_norm": 0.2865363657474518, "learning_rate": 4.843569447753656e-05, "loss": 1.2071, "step": 1450 }, { "epoch": 0.22810430544911475, "grad_norm": 0.2803146243095398, "learning_rate": 4.84335425680484e-05, "loss": 1.207, "step": 1451 }, { "epoch": 0.22826151034604727, "grad_norm": 0.22393058240413666, "learning_rate": 4.843138922732863e-05, "loss": 1.1584, "step": 1452 }, { "epoch": 0.22841871524297983, "grad_norm": 0.22326746582984924, "learning_rate": 4.8429234455508746e-05, "loss": 1.2231, "step": 1453 }, { "epoch": 0.22857592013991235, "grad_norm": 0.21326550841331482, "learning_rate": 4.8427078252720366e-05, "loss": 1.138, "step": 1454 }, { "epoch": 0.2287331250368449, "grad_norm": 0.19931691884994507, "learning_rate": 4.842492061909518e-05, "loss": 1.2628, "step": 1455 }, { "epoch": 0.22889032993377745, "grad_norm": 0.4217057228088379, "learning_rate": 4.8422761554764974e-05, "loss": 1.0622, "step": 1456 }, { "epoch": 0.22904753483070997, "grad_norm": 0.2697439193725586, "learning_rate": 4.8420601059861605e-05, "loss": 1.1374, "step": 1457 }, { "epoch": 0.22920473972764252, "grad_norm": 0.25051379203796387, "learning_rate": 4.841843913451703e-05, "loss": 1.1349, "step": 1458 }, { "epoch": 0.22936194462457504, "grad_norm": 0.30390244722366333, "learning_rate": 4.841627577886329e-05, "loss": 1.1929, "step": 1459 }, { "epoch": 0.2295191495215076, "grad_norm": 0.2657618820667267, "learning_rate": 4.8414110993032535e-05, "loss": 1.1913, "step": 1460 }, { "epoch": 0.22967635441844014, "grad_norm": 0.27086779475212097, "learning_rate": 4.841194477715696e-05, "loss": 1.2208, "step": 1461 }, { "epoch": 0.22983355931537267, "grad_norm": 0.2512415647506714, "learning_rate": 4.840977713136887e-05, "loss": 1.1451, "step": 1462 }, { "epoch": 0.22999076421230522, "grad_norm": 0.2690669298171997, "learning_rate": 4.8407608055800656e-05, "loss": 1.2148, "step": 1463 }, { "epoch": 0.23014796910923777, "grad_norm": 0.2143964320421219, "learning_rate": 4.8405437550584816e-05, "loss": 1.2793, "step": 1464 }, { "epoch": 0.2303051740061703, "grad_norm": 0.21806931495666504, "learning_rate": 4.8403265615853894e-05, "loss": 1.1906, "step": 1465 }, { "epoch": 0.23046237890310284, "grad_norm": 0.23820914328098297, "learning_rate": 4.8401092251740555e-05, "loss": 1.3061, "step": 1466 }, { "epoch": 0.23061958380003536, "grad_norm": 0.2485547661781311, "learning_rate": 4.839891745837753e-05, "loss": 1.2976, "step": 1467 }, { "epoch": 0.2307767886969679, "grad_norm": 0.2677574157714844, "learning_rate": 4.8396741235897655e-05, "loss": 1.0837, "step": 1468 }, { "epoch": 0.23093399359390046, "grad_norm": 0.21473652124404907, "learning_rate": 4.839456358443385e-05, "loss": 1.1982, "step": 1469 }, { "epoch": 0.23109119849083298, "grad_norm": 0.27341189980506897, "learning_rate": 4.8392384504119116e-05, "loss": 1.1508, "step": 1470 }, { "epoch": 0.23124840338776553, "grad_norm": 0.20952042937278748, "learning_rate": 4.8390203995086525e-05, "loss": 1.2806, "step": 1471 }, { "epoch": 0.23140560828469806, "grad_norm": 0.19671006500720978, "learning_rate": 4.838802205746927e-05, "loss": 1.1397, "step": 1472 }, { "epoch": 0.2315628131816306, "grad_norm": 0.2122461348772049, "learning_rate": 4.838583869140063e-05, "loss": 1.2206, "step": 1473 }, { "epoch": 0.23172001807856316, "grad_norm": 0.2837271988391876, "learning_rate": 4.838365389701392e-05, "loss": 1.1932, "step": 1474 }, { "epoch": 0.23187722297549568, "grad_norm": 0.20707464218139648, "learning_rate": 4.838146767444261e-05, "loss": 1.1619, "step": 1475 }, { "epoch": 0.23203442787242823, "grad_norm": 0.2600030303001404, "learning_rate": 4.837928002382021e-05, "loss": 1.1239, "step": 1476 }, { "epoch": 0.23219163276936078, "grad_norm": 0.1838095337152481, "learning_rate": 4.837709094528035e-05, "loss": 1.2329, "step": 1477 }, { "epoch": 0.2323488376662933, "grad_norm": 0.21165348589420319, "learning_rate": 4.83749004389567e-05, "loss": 1.2103, "step": 1478 }, { "epoch": 0.23250604256322585, "grad_norm": 0.21954618394374847, "learning_rate": 4.837270850498308e-05, "loss": 1.1957, "step": 1479 }, { "epoch": 0.23266324746015837, "grad_norm": 0.1750314086675644, "learning_rate": 4.8370515143493346e-05, "loss": 1.2116, "step": 1480 }, { "epoch": 0.23282045235709092, "grad_norm": 0.24269573390483856, "learning_rate": 4.8368320354621474e-05, "loss": 1.1123, "step": 1481 }, { "epoch": 0.23297765725402347, "grad_norm": 0.2614949345588684, "learning_rate": 4.83661241385015e-05, "loss": 1.2208, "step": 1482 }, { "epoch": 0.233134862150956, "grad_norm": 0.252847284078598, "learning_rate": 4.836392649526756e-05, "loss": 1.1886, "step": 1483 }, { "epoch": 0.23329206704788855, "grad_norm": 0.26073166728019714, "learning_rate": 4.8361727425053895e-05, "loss": 1.1645, "step": 1484 }, { "epoch": 0.23344927194482107, "grad_norm": 0.33702126145362854, "learning_rate": 4.83595269279948e-05, "loss": 1.278, "step": 1485 }, { "epoch": 0.23360647684175362, "grad_norm": 0.23756952583789825, "learning_rate": 4.8357325004224675e-05, "loss": 1.2537, "step": 1486 }, { "epoch": 0.23376368173868617, "grad_norm": 0.2630425989627838, "learning_rate": 4.835512165387801e-05, "loss": 1.1478, "step": 1487 }, { "epoch": 0.2339208866356187, "grad_norm": 0.2218979150056839, "learning_rate": 4.835291687708937e-05, "loss": 1.1887, "step": 1488 }, { "epoch": 0.23407809153255124, "grad_norm": 0.23264755308628082, "learning_rate": 4.8350710673993425e-05, "loss": 1.1516, "step": 1489 }, { "epoch": 0.2342352964294838, "grad_norm": 0.2717369794845581, "learning_rate": 4.834850304472491e-05, "loss": 1.1529, "step": 1490 }, { "epoch": 0.2343925013264163, "grad_norm": 0.19794031977653503, "learning_rate": 4.8346293989418666e-05, "loss": 1.2004, "step": 1491 }, { "epoch": 0.23454970622334886, "grad_norm": 0.21382425725460052, "learning_rate": 4.8344083508209614e-05, "loss": 1.1859, "step": 1492 }, { "epoch": 0.2347069111202814, "grad_norm": 0.3373514413833618, "learning_rate": 4.834187160123276e-05, "loss": 1.1786, "step": 1493 }, { "epoch": 0.23486411601721394, "grad_norm": 0.2665463089942932, "learning_rate": 4.83396582686232e-05, "loss": 1.2605, "step": 1494 }, { "epoch": 0.2350213209141465, "grad_norm": 0.2265530377626419, "learning_rate": 4.833744351051611e-05, "loss": 1.2043, "step": 1495 }, { "epoch": 0.235178525811079, "grad_norm": 0.23344869911670685, "learning_rate": 4.833522732704677e-05, "loss": 1.2518, "step": 1496 }, { "epoch": 0.23533573070801156, "grad_norm": 0.2448417842388153, "learning_rate": 4.833300971835053e-05, "loss": 1.2219, "step": 1497 }, { "epoch": 0.23549293560494408, "grad_norm": 0.21578192710876465, "learning_rate": 4.8330790684562827e-05, "loss": 1.1285, "step": 1498 }, { "epoch": 0.23565014050187663, "grad_norm": 0.25885578989982605, "learning_rate": 4.8328570225819195e-05, "loss": 1.1466, "step": 1499 }, { "epoch": 0.23580734539880918, "grad_norm": 0.22060716152191162, "learning_rate": 4.832634834225526e-05, "loss": 1.126, "step": 1500 }, { "epoch": 0.2359645502957417, "grad_norm": 0.23449023067951202, "learning_rate": 4.8324125034006715e-05, "loss": 1.1628, "step": 1501 }, { "epoch": 0.23612175519267425, "grad_norm": 0.2863908112049103, "learning_rate": 4.832190030120936e-05, "loss": 1.1403, "step": 1502 }, { "epoch": 0.2362789600896068, "grad_norm": 0.25917142629623413, "learning_rate": 4.8319674143999063e-05, "loss": 1.1563, "step": 1503 }, { "epoch": 0.23643616498653933, "grad_norm": 0.2577166259288788, "learning_rate": 4.83174465625118e-05, "loss": 1.2036, "step": 1504 }, { "epoch": 0.23659336988347188, "grad_norm": 0.26499852538108826, "learning_rate": 4.831521755688361e-05, "loss": 1.2324, "step": 1505 }, { "epoch": 0.2367505747804044, "grad_norm": 0.32207560539245605, "learning_rate": 4.831298712725065e-05, "loss": 1.109, "step": 1506 }, { "epoch": 0.23690777967733695, "grad_norm": 0.2242748886346817, "learning_rate": 4.831075527374913e-05, "loss": 1.0943, "step": 1507 }, { "epoch": 0.2370649845742695, "grad_norm": 0.23681138455867767, "learning_rate": 4.830852199651537e-05, "loss": 1.2074, "step": 1508 }, { "epoch": 0.23722218947120202, "grad_norm": 0.23508530855178833, "learning_rate": 4.830628729568577e-05, "loss": 1.1592, "step": 1509 }, { "epoch": 0.23737939436813457, "grad_norm": 0.24309656023979187, "learning_rate": 4.8304051171396815e-05, "loss": 1.1721, "step": 1510 }, { "epoch": 0.2375365992650671, "grad_norm": 0.22660185396671295, "learning_rate": 4.830181362378509e-05, "loss": 1.2004, "step": 1511 }, { "epoch": 0.23769380416199964, "grad_norm": 0.19725121557712555, "learning_rate": 4.8299574652987236e-05, "loss": 1.142, "step": 1512 }, { "epoch": 0.2378510090589322, "grad_norm": 0.251643568277359, "learning_rate": 4.8297334259140015e-05, "loss": 1.24, "step": 1513 }, { "epoch": 0.23800821395586472, "grad_norm": 0.21497240662574768, "learning_rate": 4.829509244238026e-05, "loss": 1.0919, "step": 1514 }, { "epoch": 0.23816541885279727, "grad_norm": 0.20723997056484222, "learning_rate": 4.829284920284488e-05, "loss": 1.2857, "step": 1515 }, { "epoch": 0.23832262374972982, "grad_norm": 0.1896023154258728, "learning_rate": 4.82906045406709e-05, "loss": 1.1827, "step": 1516 }, { "epoch": 0.23847982864666234, "grad_norm": 0.2979254126548767, "learning_rate": 4.828835845599542e-05, "loss": 1.1791, "step": 1517 }, { "epoch": 0.2386370335435949, "grad_norm": 0.22216519713401794, "learning_rate": 4.82861109489556e-05, "loss": 1.1883, "step": 1518 }, { "epoch": 0.2387942384405274, "grad_norm": 0.21972821652889252, "learning_rate": 4.828386201968873e-05, "loss": 1.1213, "step": 1519 }, { "epoch": 0.23895144333745996, "grad_norm": 0.22813616693019867, "learning_rate": 4.828161166833215e-05, "loss": 1.1776, "step": 1520 }, { "epoch": 0.2391086482343925, "grad_norm": 0.2273959070444107, "learning_rate": 4.827935989502331e-05, "loss": 1.0856, "step": 1521 }, { "epoch": 0.23926585313132503, "grad_norm": 0.219630166888237, "learning_rate": 4.827710669989974e-05, "loss": 1.1141, "step": 1522 }, { "epoch": 0.23942305802825758, "grad_norm": 0.278255432844162, "learning_rate": 4.8274852083099065e-05, "loss": 1.1454, "step": 1523 }, { "epoch": 0.2395802629251901, "grad_norm": 0.27673086524009705, "learning_rate": 4.8272596044758974e-05, "loss": 1.1366, "step": 1524 }, { "epoch": 0.23973746782212266, "grad_norm": 0.32165762782096863, "learning_rate": 4.827033858501726e-05, "loss": 1.2367, "step": 1525 }, { "epoch": 0.2398946727190552, "grad_norm": 0.20993465185165405, "learning_rate": 4.82680797040118e-05, "loss": 1.2368, "step": 1526 }, { "epoch": 0.24005187761598773, "grad_norm": 0.23632824420928955, "learning_rate": 4.8265819401880575e-05, "loss": 1.2395, "step": 1527 }, { "epoch": 0.24020908251292028, "grad_norm": 0.2364589422941208, "learning_rate": 4.826355767876161e-05, "loss": 1.1977, "step": 1528 }, { "epoch": 0.24036628740985283, "grad_norm": 0.2299450784921646, "learning_rate": 4.826129453479306e-05, "loss": 1.2261, "step": 1529 }, { "epoch": 0.24052349230678535, "grad_norm": 0.2673666477203369, "learning_rate": 4.825902997011314e-05, "loss": 1.2938, "step": 1530 }, { "epoch": 0.2406806972037179, "grad_norm": 0.19846846163272858, "learning_rate": 4.8256763984860164e-05, "loss": 1.1954, "step": 1531 }, { "epoch": 0.24083790210065043, "grad_norm": 0.23329487442970276, "learning_rate": 4.825449657917253e-05, "loss": 1.2483, "step": 1532 }, { "epoch": 0.24099510699758298, "grad_norm": 0.22006340324878693, "learning_rate": 4.825222775318872e-05, "loss": 1.14, "step": 1533 }, { "epoch": 0.24115231189451553, "grad_norm": 0.2518123984336853, "learning_rate": 4.8249957507047315e-05, "loss": 1.041, "step": 1534 }, { "epoch": 0.24130951679144805, "grad_norm": 0.26999431848526, "learning_rate": 4.824768584088696e-05, "loss": 1.1871, "step": 1535 }, { "epoch": 0.2414667216883806, "grad_norm": 0.26676446199417114, "learning_rate": 4.824541275484641e-05, "loss": 1.1527, "step": 1536 }, { "epoch": 0.24162392658531312, "grad_norm": 0.30466800928115845, "learning_rate": 4.824313824906449e-05, "loss": 1.0792, "step": 1537 }, { "epoch": 0.24178113148224567, "grad_norm": 0.23601631820201874, "learning_rate": 4.824086232368011e-05, "loss": 1.1099, "step": 1538 }, { "epoch": 0.24193833637917822, "grad_norm": 0.21866478025913239, "learning_rate": 4.82385849788323e-05, "loss": 1.2394, "step": 1539 }, { "epoch": 0.24209554127611074, "grad_norm": 0.2272699475288391, "learning_rate": 4.823630621466013e-05, "loss": 1.2503, "step": 1540 }, { "epoch": 0.2422527461730433, "grad_norm": 0.2118367850780487, "learning_rate": 4.823402603130279e-05, "loss": 1.2348, "step": 1541 }, { "epoch": 0.24240995106997584, "grad_norm": 0.2032533437013626, "learning_rate": 4.823174442889953e-05, "loss": 1.205, "step": 1542 }, { "epoch": 0.24256715596690837, "grad_norm": 0.22420114278793335, "learning_rate": 4.822946140758972e-05, "loss": 1.1653, "step": 1543 }, { "epoch": 0.24272436086384092, "grad_norm": 0.22077587246894836, "learning_rate": 4.8227176967512785e-05, "loss": 1.0768, "step": 1544 }, { "epoch": 0.24288156576077344, "grad_norm": 0.25682517886161804, "learning_rate": 4.8224891108808255e-05, "loss": 1.1728, "step": 1545 }, { "epoch": 0.243038770657706, "grad_norm": 0.28335657715797424, "learning_rate": 4.8222603831615744e-05, "loss": 1.1615, "step": 1546 }, { "epoch": 0.24319597555463854, "grad_norm": 0.2960416376590729, "learning_rate": 4.8220315136074946e-05, "loss": 1.1786, "step": 1547 }, { "epoch": 0.24335318045157106, "grad_norm": 0.24929380416870117, "learning_rate": 4.821802502232565e-05, "loss": 1.1626, "step": 1548 }, { "epoch": 0.2435103853485036, "grad_norm": 0.2701495587825775, "learning_rate": 4.821573349050772e-05, "loss": 1.1638, "step": 1549 }, { "epoch": 0.24366759024543613, "grad_norm": 0.21584048867225647, "learning_rate": 4.821344054076111e-05, "loss": 1.083, "step": 1550 }, { "epoch": 0.24382479514236868, "grad_norm": 0.20484817028045654, "learning_rate": 4.8211146173225884e-05, "loss": 1.1552, "step": 1551 }, { "epoch": 0.24398200003930123, "grad_norm": 0.2720612585544586, "learning_rate": 4.8208850388042166e-05, "loss": 1.2093, "step": 1552 }, { "epoch": 0.24413920493623376, "grad_norm": 0.2705625295639038, "learning_rate": 4.820655318535017e-05, "loss": 1.2524, "step": 1553 }, { "epoch": 0.2442964098331663, "grad_norm": 0.2290465533733368, "learning_rate": 4.820425456529019e-05, "loss": 1.2138, "step": 1554 }, { "epoch": 0.24445361473009886, "grad_norm": 0.20924489200115204, "learning_rate": 4.8201954528002634e-05, "loss": 1.215, "step": 1555 }, { "epoch": 0.24461081962703138, "grad_norm": 0.4101350009441376, "learning_rate": 4.819965307362797e-05, "loss": 1.1646, "step": 1556 }, { "epoch": 0.24476802452396393, "grad_norm": 0.3153195381164551, "learning_rate": 4.819735020230677e-05, "loss": 1.0581, "step": 1557 }, { "epoch": 0.24492522942089645, "grad_norm": 0.2894124686717987, "learning_rate": 4.819504591417967e-05, "loss": 1.1619, "step": 1558 }, { "epoch": 0.245082434317829, "grad_norm": 0.2255009263753891, "learning_rate": 4.8192740209387425e-05, "loss": 1.2306, "step": 1559 }, { "epoch": 0.24523963921476155, "grad_norm": 0.23586878180503845, "learning_rate": 4.819043308807085e-05, "loss": 1.0897, "step": 1560 }, { "epoch": 0.24539684411169407, "grad_norm": 0.19656193256378174, "learning_rate": 4.818812455037086e-05, "loss": 1.2547, "step": 1561 }, { "epoch": 0.24555404900862662, "grad_norm": 0.19186031818389893, "learning_rate": 4.818581459642844e-05, "loss": 1.2, "step": 1562 }, { "epoch": 0.24571125390555915, "grad_norm": 0.30889350175857544, "learning_rate": 4.8183503226384685e-05, "loss": 1.0726, "step": 1563 }, { "epoch": 0.2458684588024917, "grad_norm": 0.28955504298210144, "learning_rate": 4.8181190440380755e-05, "loss": 1.0853, "step": 1564 }, { "epoch": 0.24602566369942425, "grad_norm": 0.20687651634216309, "learning_rate": 4.817887623855792e-05, "loss": 1.1026, "step": 1565 }, { "epoch": 0.24618286859635677, "grad_norm": 0.20754511654376984, "learning_rate": 4.817656062105751e-05, "loss": 1.1793, "step": 1566 }, { "epoch": 0.24634007349328932, "grad_norm": 0.20881116390228271, "learning_rate": 4.817424358802096e-05, "loss": 1.1812, "step": 1567 }, { "epoch": 0.24649727839022187, "grad_norm": 0.22653476893901825, "learning_rate": 4.8171925139589777e-05, "loss": 1.1733, "step": 1568 }, { "epoch": 0.2466544832871544, "grad_norm": 0.2595761716365814, "learning_rate": 4.8169605275905574e-05, "loss": 1.1817, "step": 1569 }, { "epoch": 0.24681168818408694, "grad_norm": 0.265379935503006, "learning_rate": 4.8167283997110044e-05, "loss": 1.0631, "step": 1570 }, { "epoch": 0.24696889308101946, "grad_norm": 0.28662997484207153, "learning_rate": 4.816496130334494e-05, "loss": 1.171, "step": 1571 }, { "epoch": 0.24712609797795201, "grad_norm": 0.28659316897392273, "learning_rate": 4.8162637194752146e-05, "loss": 1.158, "step": 1572 }, { "epoch": 0.24728330287488456, "grad_norm": 0.25767526030540466, "learning_rate": 4.8160311671473596e-05, "loss": 1.1179, "step": 1573 }, { "epoch": 0.2474405077718171, "grad_norm": 0.2345789521932602, "learning_rate": 4.815798473365133e-05, "loss": 1.0722, "step": 1574 }, { "epoch": 0.24759771266874964, "grad_norm": 0.20178478956222534, "learning_rate": 4.8155656381427464e-05, "loss": 1.1837, "step": 1575 }, { "epoch": 0.24775491756568216, "grad_norm": 0.2254524976015091, "learning_rate": 4.815332661494421e-05, "loss": 1.0697, "step": 1576 }, { "epoch": 0.2479121224626147, "grad_norm": 0.24192413687705994, "learning_rate": 4.815099543434386e-05, "loss": 1.1321, "step": 1577 }, { "epoch": 0.24806932735954726, "grad_norm": 0.24133244156837463, "learning_rate": 4.814866283976879e-05, "loss": 0.9985, "step": 1578 }, { "epoch": 0.24822653225647978, "grad_norm": 0.25949928164482117, "learning_rate": 4.814632883136146e-05, "loss": 1.1197, "step": 1579 }, { "epoch": 0.24838373715341233, "grad_norm": 0.2583736479282379, "learning_rate": 4.8143993409264446e-05, "loss": 1.1219, "step": 1580 }, { "epoch": 0.24854094205034488, "grad_norm": 0.27031365036964417, "learning_rate": 4.814165657362037e-05, "loss": 1.1165, "step": 1581 }, { "epoch": 0.2486981469472774, "grad_norm": 0.21057547628879547, "learning_rate": 4.813931832457195e-05, "loss": 1.2199, "step": 1582 }, { "epoch": 0.24885535184420995, "grad_norm": 0.16742344200611115, "learning_rate": 4.813697866226201e-05, "loss": 1.1562, "step": 1583 }, { "epoch": 0.24901255674114248, "grad_norm": 0.17304782569408417, "learning_rate": 4.813463758683345e-05, "loss": 1.28, "step": 1584 }, { "epoch": 0.24916976163807503, "grad_norm": 0.22176629304885864, "learning_rate": 4.813229509842924e-05, "loss": 1.261, "step": 1585 }, { "epoch": 0.24932696653500758, "grad_norm": 0.2836274802684784, "learning_rate": 4.812995119719246e-05, "loss": 1.0815, "step": 1586 }, { "epoch": 0.2494841714319401, "grad_norm": 0.28422456979751587, "learning_rate": 4.812760588326627e-05, "loss": 1.2035, "step": 1587 }, { "epoch": 0.24964137632887265, "grad_norm": 0.21732592582702637, "learning_rate": 4.81252591567939e-05, "loss": 1.2436, "step": 1588 }, { "epoch": 0.24979858122580517, "grad_norm": 0.24617740511894226, "learning_rate": 4.8122911017918694e-05, "loss": 1.1604, "step": 1589 }, { "epoch": 0.24995578612273772, "grad_norm": 0.22099459171295166, "learning_rate": 4.8120561466784056e-05, "loss": 1.2597, "step": 1590 }, { "epoch": 0.25011299101967027, "grad_norm": 0.21290400624275208, "learning_rate": 4.811821050353349e-05, "loss": 1.2431, "step": 1591 }, { "epoch": 0.2502701959166028, "grad_norm": 0.32815465331077576, "learning_rate": 4.811585812831059e-05, "loss": 1.202, "step": 1592 }, { "epoch": 0.2504274008135353, "grad_norm": 0.19528530538082123, "learning_rate": 4.811350434125902e-05, "loss": 1.2669, "step": 1593 }, { "epoch": 0.25058460571046787, "grad_norm": 0.2987164855003357, "learning_rate": 4.8111149142522545e-05, "loss": 1.1263, "step": 1594 }, { "epoch": 0.2507418106074004, "grad_norm": 0.22502346336841583, "learning_rate": 4.810879253224502e-05, "loss": 1.0881, "step": 1595 }, { "epoch": 0.25089901550433297, "grad_norm": 0.18701261281967163, "learning_rate": 4.810643451057036e-05, "loss": 1.1612, "step": 1596 }, { "epoch": 0.2510562204012655, "grad_norm": 0.2719084322452545, "learning_rate": 4.81040750776426e-05, "loss": 1.0618, "step": 1597 }, { "epoch": 0.251213425298198, "grad_norm": 0.2659015655517578, "learning_rate": 4.8101714233605845e-05, "loss": 1.2364, "step": 1598 }, { "epoch": 0.25137063019513056, "grad_norm": 0.23821905255317688, "learning_rate": 4.809935197860427e-05, "loss": 1.2978, "step": 1599 }, { "epoch": 0.2515278350920631, "grad_norm": 0.2146797925233841, "learning_rate": 4.8096988312782174e-05, "loss": 1.2891, "step": 1600 }, { "epoch": 0.2515278350920631, "eval_loss": 1.1705546379089355, "eval_runtime": 2320.6543, "eval_samples_per_second": 3.989, "eval_steps_per_second": 1.995, "step": 1600 }, { "epoch": 0.25168503998899566, "grad_norm": 0.2162550389766693, "learning_rate": 4.80946232362839e-05, "loss": 1.1259, "step": 1601 }, { "epoch": 0.2518422448859282, "grad_norm": 0.24746862053871155, "learning_rate": 4.809225674925392e-05, "loss": 1.1532, "step": 1602 }, { "epoch": 0.25199944978286076, "grad_norm": 0.21859407424926758, "learning_rate": 4.808988885183675e-05, "loss": 1.2823, "step": 1603 }, { "epoch": 0.25215665467979326, "grad_norm": 0.24052634835243225, "learning_rate": 4.808751954417702e-05, "loss": 1.2075, "step": 1604 }, { "epoch": 0.2523138595767258, "grad_norm": 0.3079143464565277, "learning_rate": 4.808514882641944e-05, "loss": 1.2258, "step": 1605 }, { "epoch": 0.25247106447365836, "grad_norm": 0.2353007048368454, "learning_rate": 4.8082776698708805e-05, "loss": 1.0725, "step": 1606 }, { "epoch": 0.2526282693705909, "grad_norm": 0.24463996291160583, "learning_rate": 4.808040316118999e-05, "loss": 1.1763, "step": 1607 }, { "epoch": 0.25278547426752346, "grad_norm": 0.24785065650939941, "learning_rate": 4.807802821400796e-05, "loss": 1.146, "step": 1608 }, { "epoch": 0.25294267916445595, "grad_norm": 0.3086298704147339, "learning_rate": 4.8075651857307786e-05, "loss": 1.1113, "step": 1609 }, { "epoch": 0.2530998840613885, "grad_norm": 0.2168322056531906, "learning_rate": 4.807327409123459e-05, "loss": 1.2544, "step": 1610 }, { "epoch": 0.25325708895832105, "grad_norm": 0.23688393831253052, "learning_rate": 4.807089491593359e-05, "loss": 1.2082, "step": 1611 }, { "epoch": 0.2534142938552536, "grad_norm": 0.184744194149971, "learning_rate": 4.8068514331550116e-05, "loss": 1.2256, "step": 1612 }, { "epoch": 0.25357149875218615, "grad_norm": 0.26341819763183594, "learning_rate": 4.8066132338229564e-05, "loss": 1.0727, "step": 1613 }, { "epoch": 0.25372870364911865, "grad_norm": 0.24953097105026245, "learning_rate": 4.80637489361174e-05, "loss": 1.197, "step": 1614 }, { "epoch": 0.2538859085460512, "grad_norm": 0.26930662989616394, "learning_rate": 4.8061364125359204e-05, "loss": 1.2333, "step": 1615 }, { "epoch": 0.25404311344298375, "grad_norm": 0.22073645889759064, "learning_rate": 4.805897790610063e-05, "loss": 1.1749, "step": 1616 }, { "epoch": 0.2542003183399163, "grad_norm": 0.23651045560836792, "learning_rate": 4.805659027848742e-05, "loss": 1.2632, "step": 1617 }, { "epoch": 0.25435752323684885, "grad_norm": 0.2211553007364273, "learning_rate": 4.80542012426654e-05, "loss": 1.2297, "step": 1618 }, { "epoch": 0.25451472813378134, "grad_norm": 0.21516153216362, "learning_rate": 4.805181079878048e-05, "loss": 1.2396, "step": 1619 }, { "epoch": 0.2546719330307139, "grad_norm": 0.2111159861087799, "learning_rate": 4.804941894697867e-05, "loss": 1.0961, "step": 1620 }, { "epoch": 0.25482913792764644, "grad_norm": 0.2587776184082031, "learning_rate": 4.804702568740604e-05, "loss": 1.1243, "step": 1621 }, { "epoch": 0.254986342824579, "grad_norm": 0.21024088561534882, "learning_rate": 4.804463102020878e-05, "loss": 1.1276, "step": 1622 }, { "epoch": 0.25514354772151154, "grad_norm": 0.2880707383155823, "learning_rate": 4.8042234945533127e-05, "loss": 1.0841, "step": 1623 }, { "epoch": 0.25530075261844404, "grad_norm": 0.21504652500152588, "learning_rate": 4.803983746352544e-05, "loss": 1.0336, "step": 1624 }, { "epoch": 0.2554579575153766, "grad_norm": 0.2103573977947235, "learning_rate": 4.803743857433214e-05, "loss": 1.2497, "step": 1625 }, { "epoch": 0.25561516241230914, "grad_norm": 0.24323992431163788, "learning_rate": 4.803503827809974e-05, "loss": 1.1702, "step": 1626 }, { "epoch": 0.2557723673092417, "grad_norm": 0.1768578141927719, "learning_rate": 4.8032636574974845e-05, "loss": 1.2472, "step": 1627 }, { "epoch": 0.25592957220617424, "grad_norm": 0.20027288794517517, "learning_rate": 4.803023346510415e-05, "loss": 1.1757, "step": 1628 }, { "epoch": 0.2560867771031068, "grad_norm": 0.2526282072067261, "learning_rate": 4.8027828948634405e-05, "loss": 1.1338, "step": 1629 }, { "epoch": 0.2562439820000393, "grad_norm": 0.2610374689102173, "learning_rate": 4.802542302571249e-05, "loss": 1.0715, "step": 1630 }, { "epoch": 0.25640118689697183, "grad_norm": 0.2409505993127823, "learning_rate": 4.802301569648534e-05, "loss": 1.1915, "step": 1631 }, { "epoch": 0.2565583917939044, "grad_norm": 0.3539654314517975, "learning_rate": 4.8020606961099996e-05, "loss": 1.1752, "step": 1632 }, { "epoch": 0.25671559669083693, "grad_norm": 0.23121047019958496, "learning_rate": 4.801819681970357e-05, "loss": 1.1966, "step": 1633 }, { "epoch": 0.2568728015877695, "grad_norm": 0.2272186279296875, "learning_rate": 4.801578527244325e-05, "loss": 1.1131, "step": 1634 }, { "epoch": 0.257030006484702, "grad_norm": 0.183711439371109, "learning_rate": 4.801337231946633e-05, "loss": 1.1569, "step": 1635 }, { "epoch": 0.25718721138163453, "grad_norm": 0.22497425973415375, "learning_rate": 4.80109579609202e-05, "loss": 1.1491, "step": 1636 }, { "epoch": 0.2573444162785671, "grad_norm": 0.22534243762493134, "learning_rate": 4.80085421969523e-05, "loss": 1.1827, "step": 1637 }, { "epoch": 0.25750162117549963, "grad_norm": 0.14900386333465576, "learning_rate": 4.800612502771019e-05, "loss": 1.1905, "step": 1638 }, { "epoch": 0.2576588260724322, "grad_norm": 0.26234182715415955, "learning_rate": 4.80037064533415e-05, "loss": 1.1197, "step": 1639 }, { "epoch": 0.2578160309693647, "grad_norm": 0.23205281794071198, "learning_rate": 4.800128647399393e-05, "loss": 1.1325, "step": 1640 }, { "epoch": 0.2579732358662972, "grad_norm": 0.22982637584209442, "learning_rate": 4.799886508981531e-05, "loss": 1.1097, "step": 1641 }, { "epoch": 0.2581304407632298, "grad_norm": 0.2268412709236145, "learning_rate": 4.799644230095351e-05, "loss": 1.2452, "step": 1642 }, { "epoch": 0.2582876456601623, "grad_norm": 0.2071431428194046, "learning_rate": 4.799401810755651e-05, "loss": 1.2338, "step": 1643 }, { "epoch": 0.2584448505570949, "grad_norm": 0.21589568257331848, "learning_rate": 4.799159250977237e-05, "loss": 1.1177, "step": 1644 }, { "epoch": 0.25860205545402737, "grad_norm": 0.19412028789520264, "learning_rate": 4.798916550774924e-05, "loss": 1.113, "step": 1645 }, { "epoch": 0.2587592603509599, "grad_norm": 0.2528839707374573, "learning_rate": 4.798673710163535e-05, "loss": 1.2024, "step": 1646 }, { "epoch": 0.25891646524789247, "grad_norm": 0.21848973631858826, "learning_rate": 4.798430729157901e-05, "loss": 0.9633, "step": 1647 }, { "epoch": 0.259073670144825, "grad_norm": 0.2397562563419342, "learning_rate": 4.7981876077728625e-05, "loss": 1.14, "step": 1648 }, { "epoch": 0.25923087504175757, "grad_norm": 0.19531749188899994, "learning_rate": 4.7979443460232703e-05, "loss": 1.1785, "step": 1649 }, { "epoch": 0.25938807993869006, "grad_norm": 0.20693683624267578, "learning_rate": 4.79770094392398e-05, "loss": 1.2601, "step": 1650 }, { "epoch": 0.2595452848356226, "grad_norm": 0.26081717014312744, "learning_rate": 4.797457401489858e-05, "loss": 1.1489, "step": 1651 }, { "epoch": 0.25970248973255516, "grad_norm": 0.23810921609401703, "learning_rate": 4.7972137187357795e-05, "loss": 1.1896, "step": 1652 }, { "epoch": 0.2598596946294877, "grad_norm": 0.22295863926410675, "learning_rate": 4.796969895676627e-05, "loss": 1.1954, "step": 1653 }, { "epoch": 0.26001689952642026, "grad_norm": 0.26917287707328796, "learning_rate": 4.7967259323272935e-05, "loss": 1.2985, "step": 1654 }, { "epoch": 0.2601741044233528, "grad_norm": 0.20920400321483612, "learning_rate": 4.796481828702678e-05, "loss": 1.2217, "step": 1655 }, { "epoch": 0.2603313093202853, "grad_norm": 0.24075470864772797, "learning_rate": 4.79623758481769e-05, "loss": 1.2349, "step": 1656 }, { "epoch": 0.26048851421721786, "grad_norm": 0.2986784279346466, "learning_rate": 4.795993200687247e-05, "loss": 0.9704, "step": 1657 }, { "epoch": 0.2606457191141504, "grad_norm": 0.21063056588172913, "learning_rate": 4.795748676326275e-05, "loss": 1.1764, "step": 1658 }, { "epoch": 0.26080292401108296, "grad_norm": 0.17640748620033264, "learning_rate": 4.7955040117497084e-05, "loss": 1.1998, "step": 1659 }, { "epoch": 0.2609601289080155, "grad_norm": 0.3088374137878418, "learning_rate": 4.79525920697249e-05, "loss": 1.108, "step": 1660 }, { "epoch": 0.261117333804948, "grad_norm": 0.21907536685466766, "learning_rate": 4.795014262009573e-05, "loss": 1.2134, "step": 1661 }, { "epoch": 0.26127453870188055, "grad_norm": 0.25505852699279785, "learning_rate": 4.794769176875917e-05, "loss": 1.2556, "step": 1662 }, { "epoch": 0.2614317435988131, "grad_norm": 0.2881571650505066, "learning_rate": 4.79452395158649e-05, "loss": 1.1194, "step": 1663 }, { "epoch": 0.26158894849574565, "grad_norm": 0.24344374239444733, "learning_rate": 4.794278586156271e-05, "loss": 1.114, "step": 1664 }, { "epoch": 0.2617461533926782, "grad_norm": 0.2396748960018158, "learning_rate": 4.794033080600244e-05, "loss": 1.0962, "step": 1665 }, { "epoch": 0.2619033582896107, "grad_norm": 0.2769325375556946, "learning_rate": 4.7937874349334056e-05, "loss": 1.0822, "step": 1666 }, { "epoch": 0.26206056318654325, "grad_norm": 0.20145046710968018, "learning_rate": 4.793541649170757e-05, "loss": 1.1856, "step": 1667 }, { "epoch": 0.2622177680834758, "grad_norm": 0.2373715192079544, "learning_rate": 4.7932957233273123e-05, "loss": 1.195, "step": 1668 }, { "epoch": 0.26237497298040835, "grad_norm": 0.26741209626197815, "learning_rate": 4.7930496574180894e-05, "loss": 1.1812, "step": 1669 }, { "epoch": 0.2625321778773409, "grad_norm": 0.18911203742027283, "learning_rate": 4.7928034514581174e-05, "loss": 1.1561, "step": 1670 }, { "epoch": 0.2626893827742734, "grad_norm": 0.23078587651252747, "learning_rate": 4.792557105462434e-05, "loss": 1.1746, "step": 1671 }, { "epoch": 0.26284658767120594, "grad_norm": 0.25644704699516296, "learning_rate": 4.792310619446087e-05, "loss": 1.1917, "step": 1672 }, { "epoch": 0.2630037925681385, "grad_norm": 0.2350175380706787, "learning_rate": 4.7920639934241274e-05, "loss": 1.2823, "step": 1673 }, { "epoch": 0.26316099746507104, "grad_norm": 0.28728723526000977, "learning_rate": 4.79181722741162e-05, "loss": 1.2291, "step": 1674 }, { "epoch": 0.2633182023620036, "grad_norm": 0.28967636823654175, "learning_rate": 4.791570321423637e-05, "loss": 1.0443, "step": 1675 }, { "epoch": 0.2634754072589361, "grad_norm": 0.20903019607067108, "learning_rate": 4.791323275475257e-05, "loss": 1.2378, "step": 1676 }, { "epoch": 0.26363261215586864, "grad_norm": 0.25918152928352356, "learning_rate": 4.791076089581569e-05, "loss": 1.0944, "step": 1677 }, { "epoch": 0.2637898170528012, "grad_norm": 0.2330826371908188, "learning_rate": 4.790828763757671e-05, "loss": 1.2158, "step": 1678 }, { "epoch": 0.26394702194973374, "grad_norm": 0.22032979130744934, "learning_rate": 4.790581298018667e-05, "loss": 1.0987, "step": 1679 }, { "epoch": 0.2641042268466663, "grad_norm": 0.25354599952697754, "learning_rate": 4.7903336923796736e-05, "loss": 1.2191, "step": 1680 }, { "epoch": 0.26426143174359884, "grad_norm": 0.29382967948913574, "learning_rate": 4.7900859468558123e-05, "loss": 1.1906, "step": 1681 }, { "epoch": 0.26441863664053133, "grad_norm": 0.26418671011924744, "learning_rate": 4.7898380614622144e-05, "loss": 1.1357, "step": 1682 }, { "epoch": 0.2645758415374639, "grad_norm": 0.4071028232574463, "learning_rate": 4.78959003621402e-05, "loss": 1.1547, "step": 1683 }, { "epoch": 0.26473304643439644, "grad_norm": 0.20212817192077637, "learning_rate": 4.789341871126378e-05, "loss": 1.2021, "step": 1684 }, { "epoch": 0.264890251331329, "grad_norm": 0.22908912599086761, "learning_rate": 4.789093566214444e-05, "loss": 1.3014, "step": 1685 }, { "epoch": 0.26504745622826154, "grad_norm": 0.2181553691625595, "learning_rate": 4.788845121493385e-05, "loss": 1.2074, "step": 1686 }, { "epoch": 0.26520466112519403, "grad_norm": 0.2574876546859741, "learning_rate": 4.788596536978374e-05, "loss": 1.1311, "step": 1687 }, { "epoch": 0.2653618660221266, "grad_norm": 0.20918451249599457, "learning_rate": 4.7883478126845945e-05, "loss": 1.1652, "step": 1688 }, { "epoch": 0.26551907091905913, "grad_norm": 0.1706637293100357, "learning_rate": 4.7880989486272366e-05, "loss": 1.1742, "step": 1689 }, { "epoch": 0.2656762758159917, "grad_norm": 0.22445620596408844, "learning_rate": 4.787849944821501e-05, "loss": 1.1579, "step": 1690 }, { "epoch": 0.26583348071292423, "grad_norm": 0.25707024335861206, "learning_rate": 4.787600801282596e-05, "loss": 1.0905, "step": 1691 }, { "epoch": 0.2659906856098567, "grad_norm": 0.22234748303890228, "learning_rate": 4.787351518025737e-05, "loss": 1.1657, "step": 1692 }, { "epoch": 0.2661478905067893, "grad_norm": 0.26487070322036743, "learning_rate": 4.78710209506615e-05, "loss": 1.2234, "step": 1693 }, { "epoch": 0.2663050954037218, "grad_norm": 0.26550817489624023, "learning_rate": 4.786852532419069e-05, "loss": 1.1627, "step": 1694 }, { "epoch": 0.2664623003006544, "grad_norm": 0.24167504906654358, "learning_rate": 4.786602830099737e-05, "loss": 1.2216, "step": 1695 }, { "epoch": 0.2666195051975869, "grad_norm": 0.30660563707351685, "learning_rate": 4.786352988123403e-05, "loss": 1.1593, "step": 1696 }, { "epoch": 0.2667767100945194, "grad_norm": 0.222911536693573, "learning_rate": 4.786103006505328e-05, "loss": 1.2223, "step": 1697 }, { "epoch": 0.26693391499145197, "grad_norm": 0.19984647631645203, "learning_rate": 4.78585288526078e-05, "loss": 1.2453, "step": 1698 }, { "epoch": 0.2670911198883845, "grad_norm": 0.19021253287792206, "learning_rate": 4.785602624405034e-05, "loss": 1.2094, "step": 1699 }, { "epoch": 0.26724832478531707, "grad_norm": 0.24404233694076538, "learning_rate": 4.785352223953376e-05, "loss": 1.1247, "step": 1700 }, { "epoch": 0.2674055296822496, "grad_norm": 0.24491339921951294, "learning_rate": 4.7851016839210995e-05, "loss": 1.2667, "step": 1701 }, { "epoch": 0.2675627345791821, "grad_norm": 0.2175382375717163, "learning_rate": 4.7848510043235064e-05, "loss": 1.1664, "step": 1702 }, { "epoch": 0.26771993947611467, "grad_norm": 0.31710314750671387, "learning_rate": 4.784600185175907e-05, "loss": 1.1909, "step": 1703 }, { "epoch": 0.2678771443730472, "grad_norm": 0.21164441108703613, "learning_rate": 4.7843492264936214e-05, "loss": 1.1851, "step": 1704 }, { "epoch": 0.26803434926997977, "grad_norm": 0.18407614529132843, "learning_rate": 4.784098128291976e-05, "loss": 1.175, "step": 1705 }, { "epoch": 0.2681915541669123, "grad_norm": 0.2403915822505951, "learning_rate": 4.783846890586307e-05, "loss": 1.1775, "step": 1706 }, { "epoch": 0.26834875906384487, "grad_norm": 0.25552019476890564, "learning_rate": 4.78359551339196e-05, "loss": 1.1153, "step": 1707 }, { "epoch": 0.26850596396077736, "grad_norm": 0.24397552013397217, "learning_rate": 4.783343996724287e-05, "loss": 1.1795, "step": 1708 }, { "epoch": 0.2686631688577099, "grad_norm": 0.2242836058139801, "learning_rate": 4.78309234059865e-05, "loss": 1.1872, "step": 1709 }, { "epoch": 0.26882037375464246, "grad_norm": 0.2508406639099121, "learning_rate": 4.78284054503042e-05, "loss": 1.177, "step": 1710 }, { "epoch": 0.268977578651575, "grad_norm": 0.31523460149765015, "learning_rate": 4.7825886100349756e-05, "loss": 1.0777, "step": 1711 }, { "epoch": 0.26913478354850756, "grad_norm": 0.2677992582321167, "learning_rate": 4.782336535627703e-05, "loss": 1.135, "step": 1712 }, { "epoch": 0.26929198844544006, "grad_norm": 0.2804398536682129, "learning_rate": 4.782084321823998e-05, "loss": 1.2421, "step": 1713 }, { "epoch": 0.2694491933423726, "grad_norm": 0.2114623337984085, "learning_rate": 4.781831968639266e-05, "loss": 1.1308, "step": 1714 }, { "epoch": 0.26960639823930516, "grad_norm": 0.24072472751140594, "learning_rate": 4.7815794760889196e-05, "loss": 1.0372, "step": 1715 }, { "epoch": 0.2697636031362377, "grad_norm": 0.25040605664253235, "learning_rate": 4.7813268441883784e-05, "loss": 1.2141, "step": 1716 }, { "epoch": 0.26992080803317026, "grad_norm": 0.2768096625804901, "learning_rate": 4.781074072953074e-05, "loss": 1.1514, "step": 1717 }, { "epoch": 0.27007801293010275, "grad_norm": 0.21474453806877136, "learning_rate": 4.780821162398444e-05, "loss": 1.1375, "step": 1718 }, { "epoch": 0.2702352178270353, "grad_norm": 0.2671203911304474, "learning_rate": 4.780568112539936e-05, "loss": 1.1992, "step": 1719 }, { "epoch": 0.27039242272396785, "grad_norm": 0.19425953924655914, "learning_rate": 4.780314923393005e-05, "loss": 1.1796, "step": 1720 }, { "epoch": 0.2705496276209004, "grad_norm": 0.21683478355407715, "learning_rate": 4.780061594973114e-05, "loss": 1.102, "step": 1721 }, { "epoch": 0.27070683251783295, "grad_norm": 0.22656619548797607, "learning_rate": 4.779808127295735e-05, "loss": 1.2064, "step": 1722 }, { "epoch": 0.27086403741476545, "grad_norm": 0.27819374203681946, "learning_rate": 4.779554520376351e-05, "loss": 1.1881, "step": 1723 }, { "epoch": 0.271021242311698, "grad_norm": 0.2076413929462433, "learning_rate": 4.779300774230449e-05, "loss": 1.1586, "step": 1724 }, { "epoch": 0.27117844720863055, "grad_norm": 0.28731614351272583, "learning_rate": 4.779046888873529e-05, "loss": 0.9967, "step": 1725 }, { "epoch": 0.2713356521055631, "grad_norm": 0.23616041243076324, "learning_rate": 4.7787928643210955e-05, "loss": 1.2109, "step": 1726 }, { "epoch": 0.27149285700249565, "grad_norm": 0.3352977931499481, "learning_rate": 4.778538700588664e-05, "loss": 1.316, "step": 1727 }, { "epoch": 0.27165006189942814, "grad_norm": 0.275420218706131, "learning_rate": 4.778284397691758e-05, "loss": 1.2542, "step": 1728 }, { "epoch": 0.2718072667963607, "grad_norm": 0.20388716459274292, "learning_rate": 4.77802995564591e-05, "loss": 1.187, "step": 1729 }, { "epoch": 0.27196447169329324, "grad_norm": 0.24802348017692566, "learning_rate": 4.777775374466659e-05, "loss": 1.2225, "step": 1730 }, { "epoch": 0.2721216765902258, "grad_norm": 0.2412358671426773, "learning_rate": 4.777520654169554e-05, "loss": 1.1717, "step": 1731 }, { "epoch": 0.27227888148715834, "grad_norm": 0.24212747812271118, "learning_rate": 4.777265794770153e-05, "loss": 1.0965, "step": 1732 }, { "epoch": 0.2724360863840909, "grad_norm": 0.2116345465183258, "learning_rate": 4.7770107962840225e-05, "loss": 1.0881, "step": 1733 }, { "epoch": 0.2725932912810234, "grad_norm": 0.1802130490541458, "learning_rate": 4.7767556587267356e-05, "loss": 1.2671, "step": 1734 }, { "epoch": 0.27275049617795594, "grad_norm": 0.22544068098068237, "learning_rate": 4.776500382113875e-05, "loss": 1.1557, "step": 1735 }, { "epoch": 0.2729077010748885, "grad_norm": 0.22627638280391693, "learning_rate": 4.776244966461034e-05, "loss": 1.0589, "step": 1736 }, { "epoch": 0.27306490597182104, "grad_norm": 0.24201518297195435, "learning_rate": 4.77598941178381e-05, "loss": 1.217, "step": 1737 }, { "epoch": 0.2732221108687536, "grad_norm": 0.2708778381347656, "learning_rate": 4.775733718097812e-05, "loss": 1.1762, "step": 1738 }, { "epoch": 0.2733793157656861, "grad_norm": 0.22093307971954346, "learning_rate": 4.775477885418658e-05, "loss": 1.1599, "step": 1739 }, { "epoch": 0.27353652066261863, "grad_norm": 0.2544403672218323, "learning_rate": 4.775221913761971e-05, "loss": 1.1177, "step": 1740 }, { "epoch": 0.2736937255595512, "grad_norm": 0.2178761512041092, "learning_rate": 4.7749658031433873e-05, "loss": 1.1493, "step": 1741 }, { "epoch": 0.27385093045648373, "grad_norm": 0.2969399392604828, "learning_rate": 4.774709553578548e-05, "loss": 1.1048, "step": 1742 }, { "epoch": 0.2740081353534163, "grad_norm": 0.18794193863868713, "learning_rate": 4.7744531650831034e-05, "loss": 1.2954, "step": 1743 }, { "epoch": 0.2741653402503488, "grad_norm": 0.19405794143676758, "learning_rate": 4.774196637672714e-05, "loss": 1.1211, "step": 1744 }, { "epoch": 0.2743225451472813, "grad_norm": 0.24118031561374664, "learning_rate": 4.773939971363046e-05, "loss": 1.0933, "step": 1745 }, { "epoch": 0.2744797500442139, "grad_norm": 0.24610240757465363, "learning_rate": 4.7736831661697766e-05, "loss": 1.3337, "step": 1746 }, { "epoch": 0.2746369549411464, "grad_norm": 0.1838586926460266, "learning_rate": 4.77342622210859e-05, "loss": 1.1453, "step": 1747 }, { "epoch": 0.274794159838079, "grad_norm": 0.26292094588279724, "learning_rate": 4.77316913919518e-05, "loss": 1.1206, "step": 1748 }, { "epoch": 0.27495136473501147, "grad_norm": 0.15059764683246613, "learning_rate": 4.7729119174452475e-05, "loss": 1.2451, "step": 1749 }, { "epoch": 0.275108569631944, "grad_norm": 0.2880924940109253, "learning_rate": 4.772654556874503e-05, "loss": 1.1765, "step": 1750 }, { "epoch": 0.2752657745288766, "grad_norm": 0.20336076617240906, "learning_rate": 4.7723970574986656e-05, "loss": 1.1369, "step": 1751 }, { "epoch": 0.2754229794258091, "grad_norm": 0.24111486971378326, "learning_rate": 4.77213941933346e-05, "loss": 1.0741, "step": 1752 }, { "epoch": 0.2755801843227417, "grad_norm": 0.23586514592170715, "learning_rate": 4.7718816423946256e-05, "loss": 1.214, "step": 1753 }, { "epoch": 0.27573738921967417, "grad_norm": 0.2305273711681366, "learning_rate": 4.7716237266979036e-05, "loss": 1.1988, "step": 1754 }, { "epoch": 0.2758945941166067, "grad_norm": 0.18287290632724762, "learning_rate": 4.7713656722590475e-05, "loss": 1.1478, "step": 1755 }, { "epoch": 0.27605179901353927, "grad_norm": 0.28260156512260437, "learning_rate": 4.7711074790938184e-05, "loss": 1.0779, "step": 1756 }, { "epoch": 0.2762090039104718, "grad_norm": 0.2582015097141266, "learning_rate": 4.770849147217985e-05, "loss": 1.1002, "step": 1757 }, { "epoch": 0.27636620880740437, "grad_norm": 0.28041109442710876, "learning_rate": 4.770590676647326e-05, "loss": 1.0999, "step": 1758 }, { "epoch": 0.2765234137043369, "grad_norm": 0.23540420830249786, "learning_rate": 4.770332067397627e-05, "loss": 1.1478, "step": 1759 }, { "epoch": 0.2766806186012694, "grad_norm": 0.29831403493881226, "learning_rate": 4.770073319484684e-05, "loss": 1.0932, "step": 1760 }, { "epoch": 0.2766806186012694, "eval_loss": 1.1590473651885986, "eval_runtime": 2321.8933, "eval_samples_per_second": 3.987, "eval_steps_per_second": 1.994, "step": 1760 }, { "epoch": 0.27683782349820196, "grad_norm": 0.21759991347789764, "learning_rate": 4.769814432924299e-05, "loss": 1.166, "step": 1761 }, { "epoch": 0.2769950283951345, "grad_norm": 0.2622128129005432, "learning_rate": 4.7695554077322845e-05, "loss": 1.1155, "step": 1762 }, { "epoch": 0.27715223329206706, "grad_norm": 0.24021178483963013, "learning_rate": 4.769296243924462e-05, "loss": 1.1922, "step": 1763 }, { "epoch": 0.2773094381889996, "grad_norm": 0.23425696790218353, "learning_rate": 4.769036941516658e-05, "loss": 1.0903, "step": 1764 }, { "epoch": 0.2774666430859321, "grad_norm": 0.17257444560527802, "learning_rate": 4.768777500524711e-05, "loss": 1.1319, "step": 1765 }, { "epoch": 0.27762384798286466, "grad_norm": 0.2458474487066269, "learning_rate": 4.7685179209644664e-05, "loss": 1.2144, "step": 1766 }, { "epoch": 0.2777810528797972, "grad_norm": 0.18771199882030487, "learning_rate": 4.7682582028517784e-05, "loss": 1.1654, "step": 1767 }, { "epoch": 0.27793825777672976, "grad_norm": 0.2979085445404053, "learning_rate": 4.767998346202509e-05, "loss": 1.1537, "step": 1768 }, { "epoch": 0.2780954626736623, "grad_norm": 0.22919511795043945, "learning_rate": 4.767738351032531e-05, "loss": 1.15, "step": 1769 }, { "epoch": 0.2782526675705948, "grad_norm": 0.24717088043689728, "learning_rate": 4.7674782173577214e-05, "loss": 1.1205, "step": 1770 }, { "epoch": 0.27840987246752735, "grad_norm": 0.21218983829021454, "learning_rate": 4.7672179451939704e-05, "loss": 1.2007, "step": 1771 }, { "epoch": 0.2785670773644599, "grad_norm": 0.16957569122314453, "learning_rate": 4.766957534557173e-05, "loss": 1.183, "step": 1772 }, { "epoch": 0.27872428226139245, "grad_norm": 0.17404644191265106, "learning_rate": 4.766696985463235e-05, "loss": 1.0727, "step": 1773 }, { "epoch": 0.278881487158325, "grad_norm": 0.22840437293052673, "learning_rate": 4.766436297928068e-05, "loss": 1.0331, "step": 1774 }, { "epoch": 0.2790386920552575, "grad_norm": 0.18459245562553406, "learning_rate": 4.766175471967597e-05, "loss": 1.1837, "step": 1775 }, { "epoch": 0.27919589695219005, "grad_norm": 0.1901281177997589, "learning_rate": 4.7659145075977496e-05, "loss": 1.2815, "step": 1776 }, { "epoch": 0.2793531018491226, "grad_norm": 0.20891423523426056, "learning_rate": 4.7656534048344656e-05, "loss": 1.1554, "step": 1777 }, { "epoch": 0.27951030674605515, "grad_norm": 0.2291899174451828, "learning_rate": 4.765392163693691e-05, "loss": 1.137, "step": 1778 }, { "epoch": 0.2796675116429877, "grad_norm": 0.21621862053871155, "learning_rate": 4.765130784191384e-05, "loss": 1.1818, "step": 1779 }, { "epoch": 0.2798247165399202, "grad_norm": 0.24925227463245392, "learning_rate": 4.7648692663435054e-05, "loss": 1.1611, "step": 1780 }, { "epoch": 0.27998192143685274, "grad_norm": 0.26903223991394043, "learning_rate": 4.76460761016603e-05, "loss": 1.1493, "step": 1781 }, { "epoch": 0.2801391263337853, "grad_norm": 0.17821714282035828, "learning_rate": 4.764345815674937e-05, "loss": 1.1292, "step": 1782 }, { "epoch": 0.28029633123071784, "grad_norm": 0.19402022659778595, "learning_rate": 4.764083882886218e-05, "loss": 1.1462, "step": 1783 }, { "epoch": 0.2804535361276504, "grad_norm": 0.18578824400901794, "learning_rate": 4.7638218118158694e-05, "loss": 1.1745, "step": 1784 }, { "epoch": 0.2806107410245829, "grad_norm": 0.2654874920845032, "learning_rate": 4.763559602479898e-05, "loss": 1.1627, "step": 1785 }, { "epoch": 0.28076794592151544, "grad_norm": 0.3167024850845337, "learning_rate": 4.763297254894318e-05, "loss": 1.0536, "step": 1786 }, { "epoch": 0.280925150818448, "grad_norm": 0.21574537456035614, "learning_rate": 4.763034769075153e-05, "loss": 1.1318, "step": 1787 }, { "epoch": 0.28108235571538054, "grad_norm": 0.2255323827266693, "learning_rate": 4.7627721450384354e-05, "loss": 1.299, "step": 1788 }, { "epoch": 0.2812395606123131, "grad_norm": 0.23979492485523224, "learning_rate": 4.7625093828002035e-05, "loss": 1.2133, "step": 1789 }, { "epoch": 0.28139676550924564, "grad_norm": 0.26039859652519226, "learning_rate": 4.762246482376507e-05, "loss": 1.1472, "step": 1790 }, { "epoch": 0.28155397040617813, "grad_norm": 0.2618144452571869, "learning_rate": 4.761983443783403e-05, "loss": 1.1851, "step": 1791 }, { "epoch": 0.2817111753031107, "grad_norm": 0.2428680956363678, "learning_rate": 4.7617202670369556e-05, "loss": 1.1583, "step": 1792 }, { "epoch": 0.28186838020004323, "grad_norm": 0.280111700296402, "learning_rate": 4.76145695215324e-05, "loss": 1.2112, "step": 1793 }, { "epoch": 0.2820255850969758, "grad_norm": 0.24310675263404846, "learning_rate": 4.761193499148339e-05, "loss": 1.1552, "step": 1794 }, { "epoch": 0.28218278999390833, "grad_norm": 0.28014469146728516, "learning_rate": 4.7609299080383415e-05, "loss": 1.2571, "step": 1795 }, { "epoch": 0.28233999489084083, "grad_norm": 0.3029899001121521, "learning_rate": 4.760666178839347e-05, "loss": 1.1485, "step": 1796 }, { "epoch": 0.2824971997877734, "grad_norm": 0.34275612235069275, "learning_rate": 4.7604023115674644e-05, "loss": 1.101, "step": 1797 }, { "epoch": 0.28265440468470593, "grad_norm": 0.20968905091285706, "learning_rate": 4.760138306238809e-05, "loss": 1.0953, "step": 1798 }, { "epoch": 0.2828116095816385, "grad_norm": 0.19242046773433685, "learning_rate": 4.759874162869505e-05, "loss": 1.1754, "step": 1799 }, { "epoch": 0.28296881447857103, "grad_norm": 0.31839972734451294, "learning_rate": 4.759609881475685e-05, "loss": 1.0707, "step": 1800 }, { "epoch": 0.2831260193755035, "grad_norm": 0.3110107183456421, "learning_rate": 4.7593454620734914e-05, "loss": 1.1576, "step": 1801 }, { "epoch": 0.2832832242724361, "grad_norm": 0.2405199557542801, "learning_rate": 4.759080904679072e-05, "loss": 1.179, "step": 1802 }, { "epoch": 0.2834404291693686, "grad_norm": 0.21492940187454224, "learning_rate": 4.758816209308587e-05, "loss": 1.1806, "step": 1803 }, { "epoch": 0.2835976340663012, "grad_norm": 0.2658292055130005, "learning_rate": 4.758551375978202e-05, "loss": 1.0843, "step": 1804 }, { "epoch": 0.2837548389632337, "grad_norm": 0.2901574969291687, "learning_rate": 4.758286404704092e-05, "loss": 1.1257, "step": 1805 }, { "epoch": 0.2839120438601662, "grad_norm": 0.34066644310951233, "learning_rate": 4.758021295502441e-05, "loss": 1.1073, "step": 1806 }, { "epoch": 0.28406924875709877, "grad_norm": 0.2622152268886566, "learning_rate": 4.7577560483894406e-05, "loss": 1.1046, "step": 1807 }, { "epoch": 0.2842264536540313, "grad_norm": 0.1930551379919052, "learning_rate": 4.757490663381291e-05, "loss": 1.2328, "step": 1808 }, { "epoch": 0.28438365855096387, "grad_norm": 0.30099764466285706, "learning_rate": 4.7572251404942e-05, "loss": 1.1743, "step": 1809 }, { "epoch": 0.2845408634478964, "grad_norm": 0.24297019839286804, "learning_rate": 4.756959479744386e-05, "loss": 1.1596, "step": 1810 }, { "epoch": 0.2846980683448289, "grad_norm": 0.2792363464832306, "learning_rate": 4.7566936811480744e-05, "loss": 1.0061, "step": 1811 }, { "epoch": 0.28485527324176146, "grad_norm": 0.2141488492488861, "learning_rate": 4.756427744721499e-05, "loss": 1.1381, "step": 1812 }, { "epoch": 0.285012478138694, "grad_norm": 0.23667530715465546, "learning_rate": 4.756161670480902e-05, "loss": 1.1434, "step": 1813 }, { "epoch": 0.28516968303562656, "grad_norm": 0.21142233908176422, "learning_rate": 4.755895458442534e-05, "loss": 1.1959, "step": 1814 }, { "epoch": 0.2853268879325591, "grad_norm": 0.22952231764793396, "learning_rate": 4.755629108622655e-05, "loss": 1.132, "step": 1815 }, { "epoch": 0.28548409282949166, "grad_norm": 0.2787313759326935, "learning_rate": 4.7553626210375326e-05, "loss": 0.9319, "step": 1816 }, { "epoch": 0.28564129772642416, "grad_norm": 0.1895769089460373, "learning_rate": 4.755095995703441e-05, "loss": 1.2251, "step": 1817 }, { "epoch": 0.2857985026233567, "grad_norm": 0.2436160445213318, "learning_rate": 4.754829232636667e-05, "loss": 1.14, "step": 1818 }, { "epoch": 0.28595570752028926, "grad_norm": 0.3004104793071747, "learning_rate": 4.7545623318535024e-05, "loss": 1.1939, "step": 1819 }, { "epoch": 0.2861129124172218, "grad_norm": 0.30581870675086975, "learning_rate": 4.754295293370248e-05, "loss": 1.1904, "step": 1820 }, { "epoch": 0.28627011731415436, "grad_norm": 0.3518725335597992, "learning_rate": 4.754028117203215e-05, "loss": 1.1693, "step": 1821 }, { "epoch": 0.28642732221108685, "grad_norm": 0.2542705237865448, "learning_rate": 4.7537608033687204e-05, "loss": 1.0971, "step": 1822 }, { "epoch": 0.2865845271080194, "grad_norm": 0.19205254316329956, "learning_rate": 4.7534933518830904e-05, "loss": 1.0652, "step": 1823 }, { "epoch": 0.28674173200495195, "grad_norm": 0.2477174997329712, "learning_rate": 4.753225762762661e-05, "loss": 1.2265, "step": 1824 }, { "epoch": 0.2868989369018845, "grad_norm": 0.19431249797344208, "learning_rate": 4.7529580360237744e-05, "loss": 1.1722, "step": 1825 }, { "epoch": 0.28705614179881705, "grad_norm": 0.24031542241573334, "learning_rate": 4.7526901716827846e-05, "loss": 1.1593, "step": 1826 }, { "epoch": 0.28721334669574955, "grad_norm": 0.3656073808670044, "learning_rate": 4.752422169756048e-05, "loss": 1.1655, "step": 1827 }, { "epoch": 0.2873705515926821, "grad_norm": 0.24645781517028809, "learning_rate": 4.752154030259936e-05, "loss": 1.0771, "step": 1828 }, { "epoch": 0.28752775648961465, "grad_norm": 0.22396346926689148, "learning_rate": 4.7518857532108245e-05, "loss": 1.2857, "step": 1829 }, { "epoch": 0.2876849613865472, "grad_norm": 0.37394481897354126, "learning_rate": 4.751617338625099e-05, "loss": 1.1057, "step": 1830 }, { "epoch": 0.28784216628347975, "grad_norm": 0.260421484708786, "learning_rate": 4.751348786519154e-05, "loss": 1.1764, "step": 1831 }, { "epoch": 0.28799937118041224, "grad_norm": 0.23633582890033722, "learning_rate": 4.7510800969093904e-05, "loss": 1.2257, "step": 1832 }, { "epoch": 0.2881565760773448, "grad_norm": 0.2963007092475891, "learning_rate": 4.750811269812219e-05, "loss": 1.1226, "step": 1833 }, { "epoch": 0.28831378097427734, "grad_norm": 0.2502879798412323, "learning_rate": 4.75054230524406e-05, "loss": 1.1942, "step": 1834 }, { "epoch": 0.2884709858712099, "grad_norm": 0.20743811130523682, "learning_rate": 4.750273203221339e-05, "loss": 1.2321, "step": 1835 }, { "epoch": 0.28862819076814245, "grad_norm": 0.3560789227485657, "learning_rate": 4.750003963760493e-05, "loss": 1.1188, "step": 1836 }, { "epoch": 0.28878539566507494, "grad_norm": 0.3189667761325836, "learning_rate": 4.7497345868779644e-05, "loss": 1.3159, "step": 1837 }, { "epoch": 0.2889426005620075, "grad_norm": 0.22662386298179626, "learning_rate": 4.749465072590208e-05, "loss": 1.055, "step": 1838 }, { "epoch": 0.28909980545894004, "grad_norm": 0.18307331204414368, "learning_rate": 4.749195420913683e-05, "loss": 1.3076, "step": 1839 }, { "epoch": 0.2892570103558726, "grad_norm": 0.25430959463119507, "learning_rate": 4.74892563186486e-05, "loss": 1.2806, "step": 1840 }, { "epoch": 0.28941421525280514, "grad_norm": 0.21403716504573822, "learning_rate": 4.748655705460215e-05, "loss": 1.1572, "step": 1841 }, { "epoch": 0.2895714201497377, "grad_norm": 0.1849280297756195, "learning_rate": 4.7483856417162365e-05, "loss": 1.1613, "step": 1842 }, { "epoch": 0.2897286250466702, "grad_norm": 0.21544575691223145, "learning_rate": 4.7481154406494164e-05, "loss": 1.1786, "step": 1843 }, { "epoch": 0.28988582994360274, "grad_norm": 0.2631804943084717, "learning_rate": 4.7478451022762596e-05, "loss": 1.1764, "step": 1844 }, { "epoch": 0.2900430348405353, "grad_norm": 0.21197448670864105, "learning_rate": 4.747574626613276e-05, "loss": 1.1235, "step": 1845 }, { "epoch": 0.29020023973746784, "grad_norm": 0.26589202880859375, "learning_rate": 4.7473040136769855e-05, "loss": 1.2104, "step": 1846 }, { "epoch": 0.2903574446344004, "grad_norm": 0.3811849057674408, "learning_rate": 4.7470332634839165e-05, "loss": 1.1747, "step": 1847 }, { "epoch": 0.2905146495313329, "grad_norm": 0.23455043137073517, "learning_rate": 4.7467623760506054e-05, "loss": 1.1475, "step": 1848 }, { "epoch": 0.29067185442826543, "grad_norm": 0.2337242066860199, "learning_rate": 4.746491351393596e-05, "loss": 1.1085, "step": 1849 }, { "epoch": 0.290829059325198, "grad_norm": 0.30514079332351685, "learning_rate": 4.746220189529442e-05, "loss": 1.235, "step": 1850 }, { "epoch": 0.29098626422213053, "grad_norm": 0.20872575044631958, "learning_rate": 4.7459488904747064e-05, "loss": 1.1893, "step": 1851 }, { "epoch": 0.2911434691190631, "grad_norm": 0.23578737676143646, "learning_rate": 4.745677454245957e-05, "loss": 1.1986, "step": 1852 }, { "epoch": 0.2913006740159956, "grad_norm": 0.26326242089271545, "learning_rate": 4.745405880859773e-05, "loss": 1.213, "step": 1853 }, { "epoch": 0.2914578789129281, "grad_norm": 0.23654600977897644, "learning_rate": 4.7451341703327414e-05, "loss": 1.19, "step": 1854 }, { "epoch": 0.2916150838098607, "grad_norm": 0.2641269564628601, "learning_rate": 4.744862322681457e-05, "loss": 1.1428, "step": 1855 }, { "epoch": 0.2917722887067932, "grad_norm": 0.24482795596122742, "learning_rate": 4.744590337922522e-05, "loss": 1.1508, "step": 1856 }, { "epoch": 0.2919294936037258, "grad_norm": 0.28008776903152466, "learning_rate": 4.744318216072551e-05, "loss": 1.3053, "step": 1857 }, { "epoch": 0.29208669850065827, "grad_norm": 0.23641882836818695, "learning_rate": 4.744045957148161e-05, "loss": 1.0879, "step": 1858 }, { "epoch": 0.2922439033975908, "grad_norm": 0.25675082206726074, "learning_rate": 4.743773561165982e-05, "loss": 1.2489, "step": 1859 }, { "epoch": 0.29240110829452337, "grad_norm": 0.25265899300575256, "learning_rate": 4.743501028142652e-05, "loss": 1.1042, "step": 1860 }, { "epoch": 0.2925583131914559, "grad_norm": 0.2375422716140747, "learning_rate": 4.743228358094814e-05, "loss": 1.2003, "step": 1861 }, { "epoch": 0.29271551808838847, "grad_norm": 0.24216416478157043, "learning_rate": 4.742955551039123e-05, "loss": 1.1058, "step": 1862 }, { "epoch": 0.29287272298532097, "grad_norm": 0.3152044713497162, "learning_rate": 4.7426826069922416e-05, "loss": 1.2503, "step": 1863 }, { "epoch": 0.2930299278822535, "grad_norm": 0.19799089431762695, "learning_rate": 4.7424095259708384e-05, "loss": 1.0635, "step": 1864 }, { "epoch": 0.29318713277918607, "grad_norm": 0.24691304564476013, "learning_rate": 4.742136307991594e-05, "loss": 1.232, "step": 1865 }, { "epoch": 0.2933443376761186, "grad_norm": 0.24523115158081055, "learning_rate": 4.741862953071194e-05, "loss": 1.1678, "step": 1866 }, { "epoch": 0.29350154257305117, "grad_norm": 0.26968106627464294, "learning_rate": 4.7415894612263344e-05, "loss": 1.194, "step": 1867 }, { "epoch": 0.2936587474699837, "grad_norm": 0.23013943433761597, "learning_rate": 4.7413158324737206e-05, "loss": 1.1052, "step": 1868 }, { "epoch": 0.2938159523669162, "grad_norm": 0.21373358368873596, "learning_rate": 4.741042066830062e-05, "loss": 1.1555, "step": 1869 }, { "epoch": 0.29397315726384876, "grad_norm": 0.26143181324005127, "learning_rate": 4.740768164312081e-05, "loss": 1.0752, "step": 1870 }, { "epoch": 0.2941303621607813, "grad_norm": 0.216646209359169, "learning_rate": 4.7404941249365066e-05, "loss": 1.1678, "step": 1871 }, { "epoch": 0.29428756705771386, "grad_norm": 0.22760845720767975, "learning_rate": 4.740219948720075e-05, "loss": 1.0414, "step": 1872 }, { "epoch": 0.2944447719546464, "grad_norm": 0.2713901400566101, "learning_rate": 4.739945635679532e-05, "loss": 1.2089, "step": 1873 }, { "epoch": 0.2946019768515789, "grad_norm": 0.2115233987569809, "learning_rate": 4.739671185831633e-05, "loss": 1.2092, "step": 1874 }, { "epoch": 0.29475918174851146, "grad_norm": 0.24696362018585205, "learning_rate": 4.739396599193139e-05, "loss": 1.1438, "step": 1875 }, { "epoch": 0.294916386645444, "grad_norm": 0.29846012592315674, "learning_rate": 4.739121875780821e-05, "loss": 1.1604, "step": 1876 }, { "epoch": 0.29507359154237656, "grad_norm": 0.22811514139175415, "learning_rate": 4.7388470156114576e-05, "loss": 1.2149, "step": 1877 }, { "epoch": 0.2952307964393091, "grad_norm": 0.24468787014484406, "learning_rate": 4.738572018701838e-05, "loss": 1.0935, "step": 1878 }, { "epoch": 0.2953880013362416, "grad_norm": 0.28037217259407043, "learning_rate": 4.738296885068756e-05, "loss": 1.0999, "step": 1879 }, { "epoch": 0.29554520623317415, "grad_norm": 0.2207103669643402, "learning_rate": 4.738021614729016e-05, "loss": 1.242, "step": 1880 }, { "epoch": 0.2957024111301067, "grad_norm": 0.16394954919815063, "learning_rate": 4.7377462076994313e-05, "loss": 1.1138, "step": 1881 }, { "epoch": 0.29585961602703925, "grad_norm": 0.3569471538066864, "learning_rate": 4.7374706639968224e-05, "loss": 1.1034, "step": 1882 }, { "epoch": 0.2960168209239718, "grad_norm": 0.19428043067455292, "learning_rate": 4.737194983638018e-05, "loss": 1.2082, "step": 1883 }, { "epoch": 0.2961740258209043, "grad_norm": 0.23990128934383392, "learning_rate": 4.736919166639856e-05, "loss": 1.2445, "step": 1884 }, { "epoch": 0.29633123071783685, "grad_norm": 0.2282816767692566, "learning_rate": 4.736643213019183e-05, "loss": 1.0495, "step": 1885 }, { "epoch": 0.2964884356147694, "grad_norm": 0.279305100440979, "learning_rate": 4.736367122792852e-05, "loss": 1.0828, "step": 1886 }, { "epoch": 0.29664564051170195, "grad_norm": 0.18702171742916107, "learning_rate": 4.736090895977725e-05, "loss": 1.2215, "step": 1887 }, { "epoch": 0.2968028454086345, "grad_norm": 0.232543483376503, "learning_rate": 4.735814532590675e-05, "loss": 1.0841, "step": 1888 }, { "epoch": 0.296960050305567, "grad_norm": 0.1895131915807724, "learning_rate": 4.7355380326485796e-05, "loss": 1.169, "step": 1889 }, { "epoch": 0.29711725520249954, "grad_norm": 0.1745792180299759, "learning_rate": 4.735261396168327e-05, "loss": 1.2403, "step": 1890 }, { "epoch": 0.2972744600994321, "grad_norm": 0.25837990641593933, "learning_rate": 4.734984623166813e-05, "loss": 1.0719, "step": 1891 }, { "epoch": 0.29743166499636464, "grad_norm": 0.21535173058509827, "learning_rate": 4.7347077136609416e-05, "loss": 1.1058, "step": 1892 }, { "epoch": 0.2975888698932972, "grad_norm": 0.25150197744369507, "learning_rate": 4.7344306676676254e-05, "loss": 1.1828, "step": 1893 }, { "epoch": 0.29774607479022974, "grad_norm": 0.2469971477985382, "learning_rate": 4.734153485203786e-05, "loss": 1.0672, "step": 1894 }, { "epoch": 0.29790327968716224, "grad_norm": 0.2318231761455536, "learning_rate": 4.733876166286352e-05, "loss": 1.1708, "step": 1895 }, { "epoch": 0.2980604845840948, "grad_norm": 0.1978289932012558, "learning_rate": 4.733598710932261e-05, "loss": 1.2133, "step": 1896 }, { "epoch": 0.29821768948102734, "grad_norm": 0.22039441764354706, "learning_rate": 4.733321119158459e-05, "loss": 1.1628, "step": 1897 }, { "epoch": 0.2983748943779599, "grad_norm": 0.1828751415014267, "learning_rate": 4.7330433909819e-05, "loss": 1.0819, "step": 1898 }, { "epoch": 0.29853209927489244, "grad_norm": 0.3677425980567932, "learning_rate": 4.732765526419547e-05, "loss": 1.2305, "step": 1899 }, { "epoch": 0.29868930417182493, "grad_norm": 0.2010020911693573, "learning_rate": 4.732487525488371e-05, "loss": 1.1933, "step": 1900 }, { "epoch": 0.2988465090687575, "grad_norm": 0.2198139876127243, "learning_rate": 4.732209388205351e-05, "loss": 1.1228, "step": 1901 }, { "epoch": 0.29900371396569003, "grad_norm": 0.26586952805519104, "learning_rate": 4.731931114587474e-05, "loss": 1.1978, "step": 1902 }, { "epoch": 0.2991609188626226, "grad_norm": 0.18092826008796692, "learning_rate": 4.731652704651737e-05, "loss": 1.2788, "step": 1903 }, { "epoch": 0.29931812375955513, "grad_norm": 0.23159582912921906, "learning_rate": 4.731374158415144e-05, "loss": 1.0886, "step": 1904 }, { "epoch": 0.2994753286564876, "grad_norm": 0.24246764183044434, "learning_rate": 4.7310954758947066e-05, "loss": 1.1374, "step": 1905 }, { "epoch": 0.2996325335534202, "grad_norm": 0.25246042013168335, "learning_rate": 4.730816657107446e-05, "loss": 1.2619, "step": 1906 }, { "epoch": 0.2997897384503527, "grad_norm": 0.23286496102809906, "learning_rate": 4.730537702070393e-05, "loss": 1.2179, "step": 1907 }, { "epoch": 0.2999469433472853, "grad_norm": 0.23141705989837646, "learning_rate": 4.7302586108005834e-05, "loss": 1.219, "step": 1908 }, { "epoch": 0.3001041482442178, "grad_norm": 0.2506294846534729, "learning_rate": 4.7299793833150624e-05, "loss": 1.173, "step": 1909 }, { "epoch": 0.3002613531411503, "grad_norm": 0.23536083102226257, "learning_rate": 4.729700019630886e-05, "loss": 0.9852, "step": 1910 }, { "epoch": 0.3004185580380829, "grad_norm": 0.27178215980529785, "learning_rate": 4.729420519765115e-05, "loss": 1.2663, "step": 1911 }, { "epoch": 0.3005757629350154, "grad_norm": 0.2641531229019165, "learning_rate": 4.7291408837348224e-05, "loss": 1.2179, "step": 1912 }, { "epoch": 0.300732967831948, "grad_norm": 0.2041010558605194, "learning_rate": 4.728861111557085e-05, "loss": 1.1546, "step": 1913 }, { "epoch": 0.3008901727288805, "grad_norm": 0.24100835621356964, "learning_rate": 4.728581203248992e-05, "loss": 1.2314, "step": 1914 }, { "epoch": 0.301047377625813, "grad_norm": 0.2499173879623413, "learning_rate": 4.7283011588276374e-05, "loss": 1.1391, "step": 1915 }, { "epoch": 0.30120458252274557, "grad_norm": 0.17696775496006012, "learning_rate": 4.7280209783101265e-05, "loss": 1.1424, "step": 1916 }, { "epoch": 0.3013617874196781, "grad_norm": 0.2696003019809723, "learning_rate": 4.727740661713571e-05, "loss": 1.1785, "step": 1917 }, { "epoch": 0.30151899231661067, "grad_norm": 0.19227688014507294, "learning_rate": 4.727460209055092e-05, "loss": 1.1431, "step": 1918 }, { "epoch": 0.3016761972135432, "grad_norm": 0.2030278742313385, "learning_rate": 4.7271796203518184e-05, "loss": 1.1968, "step": 1919 }, { "epoch": 0.30183340211047577, "grad_norm": 0.2489650398492813, "learning_rate": 4.726898895620888e-05, "loss": 1.1133, "step": 1920 }, { "epoch": 0.30183340211047577, "eval_loss": 1.1504688262939453, "eval_runtime": 2346.6617, "eval_samples_per_second": 3.945, "eval_steps_per_second": 1.973, "step": 1920 }, { "epoch": 0.30199060700740826, "grad_norm": 0.20980967581272125, "learning_rate": 4.7266180348794456e-05, "loss": 1.1863, "step": 1921 }, { "epoch": 0.3021478119043408, "grad_norm": 0.25084492564201355, "learning_rate": 4.726337038144645e-05, "loss": 1.1693, "step": 1922 }, { "epoch": 0.30230501680127336, "grad_norm": 0.24765540659427643, "learning_rate": 4.726055905433649e-05, "loss": 1.1415, "step": 1923 }, { "epoch": 0.3024622216982059, "grad_norm": 0.21769382059574127, "learning_rate": 4.725774636763628e-05, "loss": 1.2113, "step": 1924 }, { "epoch": 0.30261942659513846, "grad_norm": 0.18449430167675018, "learning_rate": 4.725493232151761e-05, "loss": 1.1187, "step": 1925 }, { "epoch": 0.30277663149207096, "grad_norm": 0.2294544279575348, "learning_rate": 4.725211691615234e-05, "loss": 1.011, "step": 1926 }, { "epoch": 0.3029338363890035, "grad_norm": 0.235177144408226, "learning_rate": 4.724930015171244e-05, "loss": 1.1604, "step": 1927 }, { "epoch": 0.30309104128593606, "grad_norm": 0.32669633626937866, "learning_rate": 4.724648202836993e-05, "loss": 1.1437, "step": 1928 }, { "epoch": 0.3032482461828686, "grad_norm": 0.1598648726940155, "learning_rate": 4.7243662546296954e-05, "loss": 1.2719, "step": 1929 }, { "epoch": 0.30340545107980116, "grad_norm": 0.222189262509346, "learning_rate": 4.724084170566569e-05, "loss": 1.1576, "step": 1930 }, { "epoch": 0.30356265597673365, "grad_norm": 0.20110036432743073, "learning_rate": 4.723801950664844e-05, "loss": 1.1906, "step": 1931 }, { "epoch": 0.3037198608736662, "grad_norm": 0.25523391366004944, "learning_rate": 4.7235195949417564e-05, "loss": 1.1705, "step": 1932 }, { "epoch": 0.30387706577059875, "grad_norm": 0.20221413671970367, "learning_rate": 4.723237103414553e-05, "loss": 1.13, "step": 1933 }, { "epoch": 0.3040342706675313, "grad_norm": 0.2736821472644806, "learning_rate": 4.722954476100485e-05, "loss": 0.9517, "step": 1934 }, { "epoch": 0.30419147556446385, "grad_norm": 0.20103007555007935, "learning_rate": 4.722671713016816e-05, "loss": 1.2214, "step": 1935 }, { "epoch": 0.30434868046139635, "grad_norm": 0.23819975554943085, "learning_rate": 4.7223888141808156e-05, "loss": 1.1757, "step": 1936 }, { "epoch": 0.3045058853583289, "grad_norm": 0.22529488801956177, "learning_rate": 4.7221057796097614e-05, "loss": 1.2031, "step": 1937 }, { "epoch": 0.30466309025526145, "grad_norm": 0.2521119713783264, "learning_rate": 4.7218226093209416e-05, "loss": 1.1728, "step": 1938 }, { "epoch": 0.304820295152194, "grad_norm": 0.23056873679161072, "learning_rate": 4.72153930333165e-05, "loss": 1.2113, "step": 1939 }, { "epoch": 0.30497750004912655, "grad_norm": 0.29192107915878296, "learning_rate": 4.7212558616591895e-05, "loss": 1.0284, "step": 1940 }, { "epoch": 0.30513470494605904, "grad_norm": 0.1903986781835556, "learning_rate": 4.7209722843208725e-05, "loss": 1.2149, "step": 1941 }, { "epoch": 0.3052919098429916, "grad_norm": 0.18553946912288666, "learning_rate": 4.720688571334019e-05, "loss": 1.1815, "step": 1942 }, { "epoch": 0.30544911473992414, "grad_norm": 0.2598629593849182, "learning_rate": 4.720404722715957e-05, "loss": 1.0763, "step": 1943 }, { "epoch": 0.3056063196368567, "grad_norm": 0.3054077625274658, "learning_rate": 4.720120738484022e-05, "loss": 1.1202, "step": 1944 }, { "epoch": 0.30576352453378924, "grad_norm": 0.20831403136253357, "learning_rate": 4.71983661865556e-05, "loss": 1.1807, "step": 1945 }, { "epoch": 0.3059207294307218, "grad_norm": 0.18264667689800262, "learning_rate": 4.7195523632479226e-05, "loss": 1.172, "step": 1946 }, { "epoch": 0.3060779343276543, "grad_norm": 0.18661391735076904, "learning_rate": 4.719267972278472e-05, "loss": 1.2601, "step": 1947 }, { "epoch": 0.30623513922458684, "grad_norm": 0.19430875778198242, "learning_rate": 4.7189834457645775e-05, "loss": 1.1476, "step": 1948 }, { "epoch": 0.3063923441215194, "grad_norm": 0.23284098505973816, "learning_rate": 4.718698783723616e-05, "loss": 1.0859, "step": 1949 }, { "epoch": 0.30654954901845194, "grad_norm": 0.19055165350437164, "learning_rate": 4.7184139861729756e-05, "loss": 1.1245, "step": 1950 }, { "epoch": 0.3067067539153845, "grad_norm": 0.20607997477054596, "learning_rate": 4.7181290531300496e-05, "loss": 1.2386, "step": 1951 }, { "epoch": 0.306863958812317, "grad_norm": 0.16489629447460175, "learning_rate": 4.717843984612239e-05, "loss": 1.2782, "step": 1952 }, { "epoch": 0.30702116370924953, "grad_norm": 0.22474288940429688, "learning_rate": 4.717558780636957e-05, "loss": 1.219, "step": 1953 }, { "epoch": 0.3071783686061821, "grad_norm": 0.22763358056545258, "learning_rate": 4.7172734412216224e-05, "loss": 1.0742, "step": 1954 }, { "epoch": 0.30733557350311463, "grad_norm": 0.2578442692756653, "learning_rate": 4.7169879663836614e-05, "loss": 1.0377, "step": 1955 }, { "epoch": 0.3074927784000472, "grad_norm": 0.26955658197402954, "learning_rate": 4.716702356140511e-05, "loss": 1.0492, "step": 1956 }, { "epoch": 0.3076499832969797, "grad_norm": 0.2685282826423645, "learning_rate": 4.716416610509614e-05, "loss": 1.1809, "step": 1957 }, { "epoch": 0.30780718819391223, "grad_norm": 0.26675713062286377, "learning_rate": 4.716130729508424e-05, "loss": 1.2132, "step": 1958 }, { "epoch": 0.3079643930908448, "grad_norm": 0.22978322207927704, "learning_rate": 4.7158447131544e-05, "loss": 1.1327, "step": 1959 }, { "epoch": 0.30812159798777733, "grad_norm": 0.2514789402484894, "learning_rate": 4.7155585614650134e-05, "loss": 1.2075, "step": 1960 }, { "epoch": 0.3082788028847099, "grad_norm": 0.2616156339645386, "learning_rate": 4.715272274457738e-05, "loss": 1.0809, "step": 1961 }, { "epoch": 0.3084360077816424, "grad_norm": 0.3378554880619049, "learning_rate": 4.7149858521500614e-05, "loss": 1.1245, "step": 1962 }, { "epoch": 0.3085932126785749, "grad_norm": 0.20755833387374878, "learning_rate": 4.714699294559476e-05, "loss": 1.2121, "step": 1963 }, { "epoch": 0.3087504175755075, "grad_norm": 0.2765822410583496, "learning_rate": 4.714412601703484e-05, "loss": 1.0652, "step": 1964 }, { "epoch": 0.30890762247244, "grad_norm": 0.2736920416355133, "learning_rate": 4.714125773599596e-05, "loss": 1.1376, "step": 1965 }, { "epoch": 0.3090648273693726, "grad_norm": 0.15031395852565765, "learning_rate": 4.7138388102653295e-05, "loss": 1.1655, "step": 1966 }, { "epoch": 0.30922203226630507, "grad_norm": 0.30680006742477417, "learning_rate": 4.713551711718212e-05, "loss": 1.1064, "step": 1967 }, { "epoch": 0.3093792371632376, "grad_norm": 0.2674984335899353, "learning_rate": 4.713264477975777e-05, "loss": 1.2259, "step": 1968 }, { "epoch": 0.30953644206017017, "grad_norm": 0.2514277398586273, "learning_rate": 4.7129771090555694e-05, "loss": 1.1184, "step": 1969 }, { "epoch": 0.3096936469571027, "grad_norm": 0.25297811627388, "learning_rate": 4.71268960497514e-05, "loss": 1.0941, "step": 1970 }, { "epoch": 0.30985085185403527, "grad_norm": 0.24282206594944, "learning_rate": 4.712401965752048e-05, "loss": 1.0437, "step": 1971 }, { "epoch": 0.3100080567509678, "grad_norm": 0.17618770897388458, "learning_rate": 4.712114191403862e-05, "loss": 1.2034, "step": 1972 }, { "epoch": 0.3101652616479003, "grad_norm": 0.2321152687072754, "learning_rate": 4.7118262819481576e-05, "loss": 1.1255, "step": 1973 }, { "epoch": 0.31032246654483286, "grad_norm": 0.2128259241580963, "learning_rate": 4.711538237402518e-05, "loss": 1.2008, "step": 1974 }, { "epoch": 0.3104796714417654, "grad_norm": 0.20988841354846954, "learning_rate": 4.711250057784539e-05, "loss": 1.2586, "step": 1975 }, { "epoch": 0.31063687633869796, "grad_norm": 0.16251863539218903, "learning_rate": 4.7109617431118195e-05, "loss": 1.1829, "step": 1976 }, { "epoch": 0.3107940812356305, "grad_norm": 0.24834904074668884, "learning_rate": 4.710673293401968e-05, "loss": 1.1107, "step": 1977 }, { "epoch": 0.310951286132563, "grad_norm": 0.42274773120880127, "learning_rate": 4.7103847086726026e-05, "loss": 1.1066, "step": 1978 }, { "epoch": 0.31110849102949556, "grad_norm": 0.22393664717674255, "learning_rate": 4.7100959889413505e-05, "loss": 1.1203, "step": 1979 }, { "epoch": 0.3112656959264281, "grad_norm": 0.2756640315055847, "learning_rate": 4.709807134225843e-05, "loss": 1.0559, "step": 1980 }, { "epoch": 0.31142290082336066, "grad_norm": 0.3086540997028351, "learning_rate": 4.709518144543724e-05, "loss": 1.0609, "step": 1981 }, { "epoch": 0.3115801057202932, "grad_norm": 0.22327710688114166, "learning_rate": 4.7092290199126444e-05, "loss": 1.1378, "step": 1982 }, { "epoch": 0.3117373106172257, "grad_norm": 0.24802890419960022, "learning_rate": 4.708939760350261e-05, "loss": 1.3069, "step": 1983 }, { "epoch": 0.31189451551415825, "grad_norm": 0.24601417779922485, "learning_rate": 4.708650365874241e-05, "loss": 0.9913, "step": 1984 }, { "epoch": 0.3120517204110908, "grad_norm": 0.1627042442560196, "learning_rate": 4.70836083650226e-05, "loss": 1.1499, "step": 1985 }, { "epoch": 0.31220892530802336, "grad_norm": 0.28036314249038696, "learning_rate": 4.708071172252002e-05, "loss": 1.2129, "step": 1986 }, { "epoch": 0.3123661302049559, "grad_norm": 0.1716984510421753, "learning_rate": 4.707781373141158e-05, "loss": 1.0891, "step": 1987 }, { "epoch": 0.3125233351018884, "grad_norm": 0.23412878811359406, "learning_rate": 4.707491439187427e-05, "loss": 1.1228, "step": 1988 }, { "epoch": 0.31268053999882095, "grad_norm": 0.1973845213651657, "learning_rate": 4.707201370408518e-05, "loss": 1.0968, "step": 1989 }, { "epoch": 0.3128377448957535, "grad_norm": 0.22760836780071259, "learning_rate": 4.7069111668221476e-05, "loss": 1.0974, "step": 1990 }, { "epoch": 0.31299494979268605, "grad_norm": 0.24710045754909515, "learning_rate": 4.70662082844604e-05, "loss": 1.189, "step": 1991 }, { "epoch": 0.3131521546896186, "grad_norm": 0.1627720296382904, "learning_rate": 4.7063303552979275e-05, "loss": 1.2501, "step": 1992 }, { "epoch": 0.3133093595865511, "grad_norm": 0.2823135554790497, "learning_rate": 4.706039747395552e-05, "loss": 1.1686, "step": 1993 }, { "epoch": 0.31346656448348365, "grad_norm": 0.16645194590091705, "learning_rate": 4.705749004756662e-05, "loss": 1.1522, "step": 1994 }, { "epoch": 0.3136237693804162, "grad_norm": 0.2954739034175873, "learning_rate": 4.705458127399015e-05, "loss": 1.0194, "step": 1995 }, { "epoch": 0.31378097427734875, "grad_norm": 0.21758581697940826, "learning_rate": 4.705167115340376e-05, "loss": 1.1406, "step": 1996 }, { "epoch": 0.3139381791742813, "grad_norm": 0.2450348287820816, "learning_rate": 4.704875968598521e-05, "loss": 1.1483, "step": 1997 }, { "epoch": 0.31409538407121385, "grad_norm": 0.22131307423114777, "learning_rate": 4.70458468719123e-05, "loss": 1.2813, "step": 1998 }, { "epoch": 0.31425258896814634, "grad_norm": 0.20907478034496307, "learning_rate": 4.704293271136294e-05, "loss": 1.2129, "step": 1999 }, { "epoch": 0.3144097938650789, "grad_norm": 0.18173012137413025, "learning_rate": 4.704001720451513e-05, "loss": 1.1854, "step": 2000 }, { "epoch": 0.31456699876201144, "grad_norm": 0.24892567098140717, "learning_rate": 4.7037100351546914e-05, "loss": 1.2041, "step": 2001 }, { "epoch": 0.314724203658944, "grad_norm": 0.24656441807746887, "learning_rate": 4.703418215263646e-05, "loss": 1.1454, "step": 2002 }, { "epoch": 0.31488140855587654, "grad_norm": 0.31545913219451904, "learning_rate": 4.703126260796199e-05, "loss": 1.0506, "step": 2003 }, { "epoch": 0.31503861345280904, "grad_norm": 0.19431567192077637, "learning_rate": 4.702834171770183e-05, "loss": 1.1689, "step": 2004 }, { "epoch": 0.3151958183497416, "grad_norm": 0.22574302554130554, "learning_rate": 4.702541948203436e-05, "loss": 1.1604, "step": 2005 }, { "epoch": 0.31535302324667414, "grad_norm": 0.18993429839611053, "learning_rate": 4.7022495901138084e-05, "loss": 1.2154, "step": 2006 }, { "epoch": 0.3155102281436067, "grad_norm": 0.21746498346328735, "learning_rate": 4.7019570975191544e-05, "loss": 1.1902, "step": 2007 }, { "epoch": 0.31566743304053924, "grad_norm": 0.274848610162735, "learning_rate": 4.701664470437338e-05, "loss": 1.0607, "step": 2008 }, { "epoch": 0.31582463793747173, "grad_norm": 0.2473202347755432, "learning_rate": 4.701371708886233e-05, "loss": 1.1909, "step": 2009 }, { "epoch": 0.3159818428344043, "grad_norm": 0.22823210060596466, "learning_rate": 4.701078812883719e-05, "loss": 1.1499, "step": 2010 }, { "epoch": 0.31613904773133683, "grad_norm": 0.21258869767189026, "learning_rate": 4.700785782447686e-05, "loss": 1.029, "step": 2011 }, { "epoch": 0.3162962526282694, "grad_norm": 0.17091694474220276, "learning_rate": 4.700492617596032e-05, "loss": 1.1888, "step": 2012 }, { "epoch": 0.31645345752520193, "grad_norm": 0.22551240026950836, "learning_rate": 4.70019931834666e-05, "loss": 1.1017, "step": 2013 }, { "epoch": 0.3166106624221344, "grad_norm": 0.24327991902828217, "learning_rate": 4.6999058847174856e-05, "loss": 1.1317, "step": 2014 }, { "epoch": 0.316767867319067, "grad_norm": 0.2276056557893753, "learning_rate": 4.699612316726429e-05, "loss": 1.1481, "step": 2015 }, { "epoch": 0.3169250722159995, "grad_norm": 0.3077816069126129, "learning_rate": 4.699318614391422e-05, "loss": 1.1448, "step": 2016 }, { "epoch": 0.3170822771129321, "grad_norm": 0.17493993043899536, "learning_rate": 4.699024777730402e-05, "loss": 1.1789, "step": 2017 }, { "epoch": 0.3172394820098646, "grad_norm": 0.2110036462545395, "learning_rate": 4.698730806761314e-05, "loss": 1.1272, "step": 2018 }, { "epoch": 0.3173966869067971, "grad_norm": 0.23337996006011963, "learning_rate": 4.6984367015021154e-05, "loss": 1.0113, "step": 2019 }, { "epoch": 0.31755389180372967, "grad_norm": 0.23595476150512695, "learning_rate": 4.698142461970767e-05, "loss": 1.0771, "step": 2020 }, { "epoch": 0.3177110967006622, "grad_norm": 0.3300482928752899, "learning_rate": 4.697848088185241e-05, "loss": 1.0853, "step": 2021 }, { "epoch": 0.31786830159759477, "grad_norm": 0.26284661889076233, "learning_rate": 4.6975535801635146e-05, "loss": 1.1216, "step": 2022 }, { "epoch": 0.3180255064945273, "grad_norm": 0.2010652720928192, "learning_rate": 4.6972589379235775e-05, "loss": 1.1745, "step": 2023 }, { "epoch": 0.31818271139145987, "grad_norm": 0.29553526639938354, "learning_rate": 4.6969641614834244e-05, "loss": 1.1722, "step": 2024 }, { "epoch": 0.31833991628839237, "grad_norm": 0.2783648669719696, "learning_rate": 4.6966692508610586e-05, "loss": 1.1632, "step": 2025 }, { "epoch": 0.3184971211853249, "grad_norm": 0.2214052528142929, "learning_rate": 4.696374206074494e-05, "loss": 1.2004, "step": 2026 }, { "epoch": 0.31865432608225747, "grad_norm": 0.3076881766319275, "learning_rate": 4.696079027141749e-05, "loss": 1.1441, "step": 2027 }, { "epoch": 0.31881153097919, "grad_norm": 0.2371261715888977, "learning_rate": 4.695783714080852e-05, "loss": 1.1661, "step": 2028 }, { "epoch": 0.31896873587612257, "grad_norm": 0.20042219758033752, "learning_rate": 4.695488266909841e-05, "loss": 1.2247, "step": 2029 }, { "epoch": 0.31912594077305506, "grad_norm": 0.1989884376525879, "learning_rate": 4.69519268564676e-05, "loss": 1.1628, "step": 2030 }, { "epoch": 0.3192831456699876, "grad_norm": 0.16829486191272736, "learning_rate": 4.6948969703096614e-05, "loss": 1.0928, "step": 2031 }, { "epoch": 0.31944035056692016, "grad_norm": 0.20480427145957947, "learning_rate": 4.694601120916607e-05, "loss": 1.3009, "step": 2032 }, { "epoch": 0.3195975554638527, "grad_norm": 0.20662841200828552, "learning_rate": 4.694305137485666e-05, "loss": 1.1324, "step": 2033 }, { "epoch": 0.31975476036078526, "grad_norm": 0.22225816547870636, "learning_rate": 4.6940090200349165e-05, "loss": 1.0711, "step": 2034 }, { "epoch": 0.31991196525771776, "grad_norm": 0.1969664990901947, "learning_rate": 4.6937127685824426e-05, "loss": 1.223, "step": 2035 }, { "epoch": 0.3200691701546503, "grad_norm": 0.21913392841815948, "learning_rate": 4.6934163831463405e-05, "loss": 1.1268, "step": 2036 }, { "epoch": 0.32022637505158286, "grad_norm": 0.21957406401634216, "learning_rate": 4.69311986374471e-05, "loss": 1.1839, "step": 2037 }, { "epoch": 0.3203835799485154, "grad_norm": 0.189280167222023, "learning_rate": 4.6928232103956635e-05, "loss": 1.1528, "step": 2038 }, { "epoch": 0.32054078484544796, "grad_norm": 0.26548171043395996, "learning_rate": 4.6925264231173185e-05, "loss": 1.1047, "step": 2039 }, { "epoch": 0.32069798974238045, "grad_norm": 0.20444843173027039, "learning_rate": 4.6922295019278005e-05, "loss": 1.0089, "step": 2040 }, { "epoch": 0.320855194639313, "grad_norm": 0.2803153693675995, "learning_rate": 4.691932446845246e-05, "loss": 1.0605, "step": 2041 }, { "epoch": 0.32101239953624555, "grad_norm": 0.22570960223674774, "learning_rate": 4.691635257887798e-05, "loss": 1.2089, "step": 2042 }, { "epoch": 0.3211696044331781, "grad_norm": 0.2413252294063568, "learning_rate": 4.691337935073606e-05, "loss": 1.2844, "step": 2043 }, { "epoch": 0.32132680933011065, "grad_norm": 0.2029626965522766, "learning_rate": 4.691040478420832e-05, "loss": 1.1383, "step": 2044 }, { "epoch": 0.32148401422704315, "grad_norm": 0.22830082476139069, "learning_rate": 4.6907428879476404e-05, "loss": 1.2141, "step": 2045 }, { "epoch": 0.3216412191239757, "grad_norm": 0.2177695333957672, "learning_rate": 4.690445163672209e-05, "loss": 1.0376, "step": 2046 }, { "epoch": 0.32179842402090825, "grad_norm": 0.24039793014526367, "learning_rate": 4.690147305612721e-05, "loss": 1.1292, "step": 2047 }, { "epoch": 0.3219556289178408, "grad_norm": 0.20877334475517273, "learning_rate": 4.6898493137873687e-05, "loss": 1.1784, "step": 2048 }, { "epoch": 0.32211283381477335, "grad_norm": 0.238002210855484, "learning_rate": 4.689551188214352e-05, "loss": 1.1585, "step": 2049 }, { "epoch": 0.3222700387117059, "grad_norm": 0.24130697548389435, "learning_rate": 4.68925292891188e-05, "loss": 1.0301, "step": 2050 }, { "epoch": 0.3224272436086384, "grad_norm": 0.2053060382604599, "learning_rate": 4.688954535898168e-05, "loss": 1.2153, "step": 2051 }, { "epoch": 0.32258444850557094, "grad_norm": 0.2303832322359085, "learning_rate": 4.6886560091914415e-05, "loss": 1.1762, "step": 2052 }, { "epoch": 0.3227416534025035, "grad_norm": 0.25025346875190735, "learning_rate": 4.688357348809933e-05, "loss": 1.111, "step": 2053 }, { "epoch": 0.32289885829943604, "grad_norm": 0.2501404583454132, "learning_rate": 4.6880585547718845e-05, "loss": 1.0784, "step": 2054 }, { "epoch": 0.3230560631963686, "grad_norm": 0.2034914195537567, "learning_rate": 4.687759627095544e-05, "loss": 1.17, "step": 2055 }, { "epoch": 0.3232132680933011, "grad_norm": 0.17908981442451477, "learning_rate": 4.68746056579917e-05, "loss": 1.1601, "step": 2056 }, { "epoch": 0.32337047299023364, "grad_norm": 0.22271564602851868, "learning_rate": 4.6871613709010266e-05, "loss": 1.2194, "step": 2057 }, { "epoch": 0.3235276778871662, "grad_norm": 0.23161068558692932, "learning_rate": 4.6868620424193885e-05, "loss": 1.1704, "step": 2058 }, { "epoch": 0.32368488278409874, "grad_norm": 0.24310912191867828, "learning_rate": 4.6865625803725375e-05, "loss": 1.0526, "step": 2059 }, { "epoch": 0.3238420876810313, "grad_norm": 0.2134714424610138, "learning_rate": 4.6862629847787633e-05, "loss": 1.1974, "step": 2060 }, { "epoch": 0.3239992925779638, "grad_norm": 0.17639285326004028, "learning_rate": 4.685963255656364e-05, "loss": 1.2024, "step": 2061 }, { "epoch": 0.32415649747489633, "grad_norm": 0.22746217250823975, "learning_rate": 4.6856633930236457e-05, "loss": 1.1888, "step": 2062 }, { "epoch": 0.3243137023718289, "grad_norm": 0.17931844294071198, "learning_rate": 4.6853633968989244e-05, "loss": 1.2835, "step": 2063 }, { "epoch": 0.32447090726876143, "grad_norm": 0.18917080760002136, "learning_rate": 4.68506326730052e-05, "loss": 1.2193, "step": 2064 }, { "epoch": 0.324628112165694, "grad_norm": 0.2527455687522888, "learning_rate": 4.684763004246766e-05, "loss": 1.2187, "step": 2065 }, { "epoch": 0.3247853170626265, "grad_norm": 0.1864977329969406, "learning_rate": 4.684462607756001e-05, "loss": 1.2025, "step": 2066 }, { "epoch": 0.324942521959559, "grad_norm": 0.3215543031692505, "learning_rate": 4.6841620778465695e-05, "loss": 1.16, "step": 2067 }, { "epoch": 0.3250997268564916, "grad_norm": 0.3193890452384949, "learning_rate": 4.683861414536829e-05, "loss": 1.0823, "step": 2068 }, { "epoch": 0.32525693175342413, "grad_norm": 0.2766779959201813, "learning_rate": 4.683560617845143e-05, "loss": 1.0615, "step": 2069 }, { "epoch": 0.3254141366503567, "grad_norm": 0.2471621334552765, "learning_rate": 4.683259687789881e-05, "loss": 1.1033, "step": 2070 }, { "epoch": 0.3255713415472892, "grad_norm": 0.22947926819324493, "learning_rate": 4.682958624389426e-05, "loss": 1.3086, "step": 2071 }, { "epoch": 0.3257285464442217, "grad_norm": 0.19345812499523163, "learning_rate": 4.682657427662163e-05, "loss": 1.1013, "step": 2072 }, { "epoch": 0.3258857513411543, "grad_norm": 0.21470381319522858, "learning_rate": 4.682356097626488e-05, "loss": 1.1439, "step": 2073 }, { "epoch": 0.3260429562380868, "grad_norm": 0.19939084351062775, "learning_rate": 4.682054634300807e-05, "loss": 1.0446, "step": 2074 }, { "epoch": 0.3262001611350194, "grad_norm": 0.23096968233585358, "learning_rate": 4.68175303770353e-05, "loss": 1.1722, "step": 2075 }, { "epoch": 0.32635736603195187, "grad_norm": 0.19705285131931305, "learning_rate": 4.6814513078530796e-05, "loss": 1.2121, "step": 2076 }, { "epoch": 0.3265145709288844, "grad_norm": 0.14740808308124542, "learning_rate": 4.681149444767883e-05, "loss": 1.0735, "step": 2077 }, { "epoch": 0.32667177582581697, "grad_norm": 0.1801644265651703, "learning_rate": 4.680847448466376e-05, "loss": 1.14, "step": 2078 }, { "epoch": 0.3268289807227495, "grad_norm": 0.221450537443161, "learning_rate": 4.680545318967006e-05, "loss": 1.1093, "step": 2079 }, { "epoch": 0.32698618561968207, "grad_norm": 0.25395452976226807, "learning_rate": 4.6802430562882226e-05, "loss": 1.1791, "step": 2080 }, { "epoch": 0.32698618561968207, "eval_loss": 1.1437312364578247, "eval_runtime": 2317.1332, "eval_samples_per_second": 3.995, "eval_steps_per_second": 1.998, "step": 2080 }, { "epoch": 0.3271433905166146, "grad_norm": 0.15380804240703583, "learning_rate": 4.6799406604484894e-05, "loss": 1.1315, "step": 2081 }, { "epoch": 0.3273005954135471, "grad_norm": 0.24307769536972046, "learning_rate": 4.679638131466275e-05, "loss": 1.0447, "step": 2082 }, { "epoch": 0.32745780031047966, "grad_norm": 0.19176973402500153, "learning_rate": 4.6793354693600565e-05, "loss": 1.1879, "step": 2083 }, { "epoch": 0.3276150052074122, "grad_norm": 0.2640921175479889, "learning_rate": 4.679032674148319e-05, "loss": 1.0972, "step": 2084 }, { "epoch": 0.32777221010434476, "grad_norm": 0.2140382081270218, "learning_rate": 4.678729745849557e-05, "loss": 1.2197, "step": 2085 }, { "epoch": 0.3279294150012773, "grad_norm": 0.2784416675567627, "learning_rate": 4.678426684482272e-05, "loss": 1.1183, "step": 2086 }, { "epoch": 0.3280866198982098, "grad_norm": 0.2722271978855133, "learning_rate": 4.678123490064973e-05, "loss": 1.1631, "step": 2087 }, { "epoch": 0.32824382479514236, "grad_norm": 0.18923896551132202, "learning_rate": 4.6778201626161776e-05, "loss": 1.1795, "step": 2088 }, { "epoch": 0.3284010296920749, "grad_norm": 0.18081101775169373, "learning_rate": 4.6775167021544136e-05, "loss": 1.1308, "step": 2089 }, { "epoch": 0.32855823458900746, "grad_norm": 0.24127820134162903, "learning_rate": 4.677213108698214e-05, "loss": 1.1461, "step": 2090 }, { "epoch": 0.32871543948594, "grad_norm": 0.20159919559955597, "learning_rate": 4.6769093822661214e-05, "loss": 1.2414, "step": 2091 }, { "epoch": 0.3288726443828725, "grad_norm": 0.2225925475358963, "learning_rate": 4.676605522876687e-05, "loss": 1.3019, "step": 2092 }, { "epoch": 0.32902984927980505, "grad_norm": 0.20402312278747559, "learning_rate": 4.676301530548468e-05, "loss": 0.9615, "step": 2093 }, { "epoch": 0.3291870541767376, "grad_norm": 0.2000385820865631, "learning_rate": 4.6759974053000324e-05, "loss": 1.1958, "step": 2094 }, { "epoch": 0.32934425907367015, "grad_norm": 0.27835071086883545, "learning_rate": 4.6756931471499546e-05, "loss": 1.1249, "step": 2095 }, { "epoch": 0.3295014639706027, "grad_norm": 0.20449519157409668, "learning_rate": 4.675388756116816e-05, "loss": 1.0496, "step": 2096 }, { "epoch": 0.3296586688675352, "grad_norm": 0.26013025641441345, "learning_rate": 4.67508423221921e-05, "loss": 1.1194, "step": 2097 }, { "epoch": 0.32981587376446775, "grad_norm": 0.19826790690422058, "learning_rate": 4.6747795754757354e-05, "loss": 1.1878, "step": 2098 }, { "epoch": 0.3299730786614003, "grad_norm": 0.20292681455612183, "learning_rate": 4.6744747859049975e-05, "loss": 1.2216, "step": 2099 }, { "epoch": 0.33013028355833285, "grad_norm": 0.2701912224292755, "learning_rate": 4.674169863525614e-05, "loss": 1.1728, "step": 2100 }, { "epoch": 0.3302874884552654, "grad_norm": 0.2722685933113098, "learning_rate": 4.673864808356206e-05, "loss": 1.1967, "step": 2101 }, { "epoch": 0.3304446933521979, "grad_norm": 0.19252356886863708, "learning_rate": 4.673559620415408e-05, "loss": 1.1614, "step": 2102 }, { "epoch": 0.33060189824913044, "grad_norm": 0.2063681185245514, "learning_rate": 4.673254299721858e-05, "loss": 1.192, "step": 2103 }, { "epoch": 0.330759103146063, "grad_norm": 0.22739455103874207, "learning_rate": 4.6729488462942036e-05, "loss": 1.207, "step": 2104 }, { "epoch": 0.33091630804299554, "grad_norm": 0.220958411693573, "learning_rate": 4.672643260151101e-05, "loss": 1.1716, "step": 2105 }, { "epoch": 0.3310735129399281, "grad_norm": 0.24066145718097687, "learning_rate": 4.672337541311215e-05, "loss": 1.1382, "step": 2106 }, { "epoch": 0.33123071783686064, "grad_norm": 0.25801724195480347, "learning_rate": 4.672031689793217e-05, "loss": 1.1494, "step": 2107 }, { "epoch": 0.33138792273379314, "grad_norm": 0.2185317426919937, "learning_rate": 4.671725705615787e-05, "loss": 1.1325, "step": 2108 }, { "epoch": 0.3315451276307257, "grad_norm": 0.25223109126091003, "learning_rate": 4.671419588797615e-05, "loss": 1.1606, "step": 2109 }, { "epoch": 0.33170233252765824, "grad_norm": 0.320625364780426, "learning_rate": 4.6711133393573945e-05, "loss": 1.0385, "step": 2110 }, { "epoch": 0.3318595374245908, "grad_norm": 0.2799528241157532, "learning_rate": 4.6708069573138335e-05, "loss": 1.1079, "step": 2111 }, { "epoch": 0.33201674232152334, "grad_norm": 0.2274870127439499, "learning_rate": 4.670500442685642e-05, "loss": 1.0838, "step": 2112 }, { "epoch": 0.33217394721845583, "grad_norm": 0.27315694093704224, "learning_rate": 4.670193795491542e-05, "loss": 1.095, "step": 2113 }, { "epoch": 0.3323311521153884, "grad_norm": 0.21600811183452606, "learning_rate": 4.669887015750262e-05, "loss": 1.0079, "step": 2114 }, { "epoch": 0.33248835701232093, "grad_norm": 0.2445525974035263, "learning_rate": 4.669580103480539e-05, "loss": 1.1513, "step": 2115 }, { "epoch": 0.3326455619092535, "grad_norm": 0.2136092483997345, "learning_rate": 4.669273058701117e-05, "loss": 1.203, "step": 2116 }, { "epoch": 0.33280276680618603, "grad_norm": 0.2629857361316681, "learning_rate": 4.668965881430751e-05, "loss": 1.0757, "step": 2117 }, { "epoch": 0.33295997170311853, "grad_norm": 0.2461937963962555, "learning_rate": 4.668658571688201e-05, "loss": 1.0329, "step": 2118 }, { "epoch": 0.3331171766000511, "grad_norm": 0.21537308394908905, "learning_rate": 4.668351129492237e-05, "loss": 1.1246, "step": 2119 }, { "epoch": 0.33327438149698363, "grad_norm": 0.21597877144813538, "learning_rate": 4.6680435548616366e-05, "loss": 1.1728, "step": 2120 }, { "epoch": 0.3334315863939162, "grad_norm": 0.2764485776424408, "learning_rate": 4.667735847815183e-05, "loss": 1.1108, "step": 2121 }, { "epoch": 0.33358879129084873, "grad_norm": 0.24224579334259033, "learning_rate": 4.667428008371674e-05, "loss": 1.1129, "step": 2122 }, { "epoch": 0.3337459961877812, "grad_norm": 0.2688109874725342, "learning_rate": 4.667120036549907e-05, "loss": 1.0214, "step": 2123 }, { "epoch": 0.3339032010847138, "grad_norm": 0.30181390047073364, "learning_rate": 4.666811932368693e-05, "loss": 1.1261, "step": 2124 }, { "epoch": 0.3340604059816463, "grad_norm": 0.20725978910923004, "learning_rate": 4.666503695846852e-05, "loss": 1.193, "step": 2125 }, { "epoch": 0.3342176108785789, "grad_norm": 0.1896386742591858, "learning_rate": 4.666195327003208e-05, "loss": 1.1811, "step": 2126 }, { "epoch": 0.3343748157755114, "grad_norm": 0.1871979534626007, "learning_rate": 4.665886825856594e-05, "loss": 1.2184, "step": 2127 }, { "epoch": 0.3345320206724439, "grad_norm": 0.2065206915140152, "learning_rate": 4.665578192425854e-05, "loss": 1.1762, "step": 2128 }, { "epoch": 0.33468922556937647, "grad_norm": 0.293587863445282, "learning_rate": 4.665269426729838e-05, "loss": 1.1041, "step": 2129 }, { "epoch": 0.334846430466309, "grad_norm": 0.23634661734104156, "learning_rate": 4.664960528787403e-05, "loss": 1.0625, "step": 2130 }, { "epoch": 0.33500363536324157, "grad_norm": 0.21932774782180786, "learning_rate": 4.664651498617417e-05, "loss": 1.0748, "step": 2131 }, { "epoch": 0.3351608402601741, "grad_norm": 0.23783034086227417, "learning_rate": 4.6643423362387526e-05, "loss": 1.1149, "step": 2132 }, { "epoch": 0.33531804515710667, "grad_norm": 0.24081239104270935, "learning_rate": 4.664033041670293e-05, "loss": 1.2532, "step": 2133 }, { "epoch": 0.33547525005403916, "grad_norm": 0.23142661154270172, "learning_rate": 4.6637236149309296e-05, "loss": 1.178, "step": 2134 }, { "epoch": 0.3356324549509717, "grad_norm": 0.27728673815727234, "learning_rate": 4.663414056039559e-05, "loss": 1.0755, "step": 2135 }, { "epoch": 0.33578965984790426, "grad_norm": 0.22718535363674164, "learning_rate": 4.6631043650150905e-05, "loss": 1.1332, "step": 2136 }, { "epoch": 0.3359468647448368, "grad_norm": 0.28607967495918274, "learning_rate": 4.6627945418764366e-05, "loss": 1.2812, "step": 2137 }, { "epoch": 0.33610406964176937, "grad_norm": 0.19152137637138367, "learning_rate": 4.6624845866425215e-05, "loss": 1.0983, "step": 2138 }, { "epoch": 0.33626127453870186, "grad_norm": 0.230951726436615, "learning_rate": 4.662174499332275e-05, "loss": 1.1511, "step": 2139 }, { "epoch": 0.3364184794356344, "grad_norm": 0.23929493129253387, "learning_rate": 4.661864279964637e-05, "loss": 1.1524, "step": 2140 }, { "epoch": 0.33657568433256696, "grad_norm": 0.2901476323604584, "learning_rate": 4.661553928558554e-05, "loss": 1.1344, "step": 2141 }, { "epoch": 0.3367328892294995, "grad_norm": 0.26585260033607483, "learning_rate": 4.661243445132981e-05, "loss": 1.1513, "step": 2142 }, { "epoch": 0.33689009412643206, "grad_norm": 0.23678666353225708, "learning_rate": 4.660932829706882e-05, "loss": 1.0852, "step": 2143 }, { "epoch": 0.33704729902336455, "grad_norm": 0.4830389618873596, "learning_rate": 4.660622082299227e-05, "loss": 1.1329, "step": 2144 }, { "epoch": 0.3372045039202971, "grad_norm": 0.21181213855743408, "learning_rate": 4.660311202928996e-05, "loss": 1.1513, "step": 2145 }, { "epoch": 0.33736170881722966, "grad_norm": 0.22538620233535767, "learning_rate": 4.660000191615176e-05, "loss": 1.1165, "step": 2146 }, { "epoch": 0.3375189137141622, "grad_norm": 0.19452321529388428, "learning_rate": 4.659689048376763e-05, "loss": 1.1231, "step": 2147 }, { "epoch": 0.33767611861109476, "grad_norm": 0.2382746934890747, "learning_rate": 4.6593777732327595e-05, "loss": 1.1805, "step": 2148 }, { "epoch": 0.33783332350802725, "grad_norm": 0.26696088910102844, "learning_rate": 4.659066366202178e-05, "loss": 1.1743, "step": 2149 }, { "epoch": 0.3379905284049598, "grad_norm": 0.19131755828857422, "learning_rate": 4.658754827304037e-05, "loss": 1.1538, "step": 2150 }, { "epoch": 0.33814773330189235, "grad_norm": 0.20308135449886322, "learning_rate": 4.658443156557365e-05, "loss": 1.1274, "step": 2151 }, { "epoch": 0.3383049381988249, "grad_norm": 0.25910693407058716, "learning_rate": 4.658131353981198e-05, "loss": 1.2042, "step": 2152 }, { "epoch": 0.33846214309575745, "grad_norm": 0.2608768939971924, "learning_rate": 4.6578194195945776e-05, "loss": 1.0935, "step": 2153 }, { "epoch": 0.33861934799268995, "grad_norm": 0.22527897357940674, "learning_rate": 4.657507353416558e-05, "loss": 1.1007, "step": 2154 }, { "epoch": 0.3387765528896225, "grad_norm": 0.2805950343608856, "learning_rate": 4.657195155466198e-05, "loss": 1.0509, "step": 2155 }, { "epoch": 0.33893375778655505, "grad_norm": 0.244426891207695, "learning_rate": 4.656882825762565e-05, "loss": 1.0811, "step": 2156 }, { "epoch": 0.3390909626834876, "grad_norm": 0.20174293220043182, "learning_rate": 4.656570364324736e-05, "loss": 1.197, "step": 2157 }, { "epoch": 0.33924816758042015, "grad_norm": 0.2714351713657379, "learning_rate": 4.656257771171795e-05, "loss": 1.1018, "step": 2158 }, { "epoch": 0.3394053724773527, "grad_norm": 0.266418993473053, "learning_rate": 4.6559450463228316e-05, "loss": 1.1652, "step": 2159 }, { "epoch": 0.3395625773742852, "grad_norm": 0.29422682523727417, "learning_rate": 4.655632189796949e-05, "loss": 1.0787, "step": 2160 }, { "epoch": 0.33971978227121774, "grad_norm": 0.26280736923217773, "learning_rate": 4.655319201613253e-05, "loss": 1.2125, "step": 2161 }, { "epoch": 0.3398769871681503, "grad_norm": 0.20241224765777588, "learning_rate": 4.655006081790861e-05, "loss": 1.227, "step": 2162 }, { "epoch": 0.34003419206508284, "grad_norm": 0.23165303468704224, "learning_rate": 4.654692830348897e-05, "loss": 1.0675, "step": 2163 }, { "epoch": 0.3401913969620154, "grad_norm": 0.3363533020019531, "learning_rate": 4.654379447306493e-05, "loss": 1.1536, "step": 2164 }, { "epoch": 0.3403486018589479, "grad_norm": 0.21518482267856598, "learning_rate": 4.654065932682789e-05, "loss": 1.1749, "step": 2165 }, { "epoch": 0.34050580675588044, "grad_norm": 0.16388140618801117, "learning_rate": 4.653752286496933e-05, "loss": 1.2337, "step": 2166 }, { "epoch": 0.340663011652813, "grad_norm": 0.20146115124225616, "learning_rate": 4.6534385087680824e-05, "loss": 1.1984, "step": 2167 }, { "epoch": 0.34082021654974554, "grad_norm": 0.22612817585468292, "learning_rate": 4.653124599515401e-05, "loss": 1.1037, "step": 2168 }, { "epoch": 0.3409774214466781, "grad_norm": 0.23036076128482819, "learning_rate": 4.652810558758061e-05, "loss": 1.2341, "step": 2169 }, { "epoch": 0.3411346263436106, "grad_norm": 0.23168572783470154, "learning_rate": 4.652496386515243e-05, "loss": 1.167, "step": 2170 }, { "epoch": 0.34129183124054313, "grad_norm": 0.24674174189567566, "learning_rate": 4.6521820828061354e-05, "loss": 1.1577, "step": 2171 }, { "epoch": 0.3414490361374757, "grad_norm": 0.1900520622730255, "learning_rate": 4.6518676476499353e-05, "loss": 1.1402, "step": 2172 }, { "epoch": 0.34160624103440823, "grad_norm": 0.18541108071804047, "learning_rate": 4.651553081065846e-05, "loss": 1.0964, "step": 2173 }, { "epoch": 0.3417634459313408, "grad_norm": 0.2279513031244278, "learning_rate": 4.651238383073081e-05, "loss": 1.0895, "step": 2174 }, { "epoch": 0.3419206508282733, "grad_norm": 0.2681223750114441, "learning_rate": 4.65092355369086e-05, "loss": 1.0725, "step": 2175 }, { "epoch": 0.3420778557252058, "grad_norm": 0.2828345000743866, "learning_rate": 4.6506085929384124e-05, "loss": 1.0066, "step": 2176 }, { "epoch": 0.3422350606221384, "grad_norm": 0.20930872857570648, "learning_rate": 4.6502935008349747e-05, "loss": 1.2589, "step": 2177 }, { "epoch": 0.3423922655190709, "grad_norm": 0.17460274696350098, "learning_rate": 4.6499782773997906e-05, "loss": 1.1705, "step": 2178 }, { "epoch": 0.3425494704160035, "grad_norm": 0.24282746016979218, "learning_rate": 4.649662922652114e-05, "loss": 1.152, "step": 2179 }, { "epoch": 0.34270667531293597, "grad_norm": 0.21630579233169556, "learning_rate": 4.649347436611205e-05, "loss": 1.1686, "step": 2180 }, { "epoch": 0.3428638802098685, "grad_norm": 0.2531259059906006, "learning_rate": 4.649031819296332e-05, "loss": 1.1321, "step": 2181 }, { "epoch": 0.34302108510680107, "grad_norm": 0.26737165451049805, "learning_rate": 4.648716070726772e-05, "loss": 1.1569, "step": 2182 }, { "epoch": 0.3431782900037336, "grad_norm": 0.27461764216423035, "learning_rate": 4.64840019092181e-05, "loss": 1.0203, "step": 2183 }, { "epoch": 0.34333549490066617, "grad_norm": 0.2627179026603699, "learning_rate": 4.648084179900739e-05, "loss": 1.2092, "step": 2184 }, { "epoch": 0.3434926997975987, "grad_norm": 0.23868447542190552, "learning_rate": 4.647768037682858e-05, "loss": 1.0356, "step": 2185 }, { "epoch": 0.3436499046945312, "grad_norm": 0.24952903389930725, "learning_rate": 4.647451764287478e-05, "loss": 1.0801, "step": 2186 }, { "epoch": 0.34380710959146377, "grad_norm": 0.23518601059913635, "learning_rate": 4.647135359733914e-05, "loss": 1.0065, "step": 2187 }, { "epoch": 0.3439643144883963, "grad_norm": 0.20000571012496948, "learning_rate": 4.6468188240414924e-05, "loss": 1.1048, "step": 2188 }, { "epoch": 0.34412151938532887, "grad_norm": 0.2511467933654785, "learning_rate": 4.646502157229544e-05, "loss": 1.1423, "step": 2189 }, { "epoch": 0.3442787242822614, "grad_norm": 0.18525715172290802, "learning_rate": 4.646185359317412e-05, "loss": 1.1245, "step": 2190 }, { "epoch": 0.3444359291791939, "grad_norm": 0.18307547271251678, "learning_rate": 4.6458684303244435e-05, "loss": 1.0499, "step": 2191 }, { "epoch": 0.34459313407612646, "grad_norm": 0.263036847114563, "learning_rate": 4.645551370269995e-05, "loss": 1.1585, "step": 2192 }, { "epoch": 0.344750338973059, "grad_norm": 0.276479572057724, "learning_rate": 4.6452341791734335e-05, "loss": 1.1801, "step": 2193 }, { "epoch": 0.34490754386999156, "grad_norm": 0.19550184905529022, "learning_rate": 4.644916857054129e-05, "loss": 1.09, "step": 2194 }, { "epoch": 0.3450647487669241, "grad_norm": 0.22002463042736053, "learning_rate": 4.644599403931465e-05, "loss": 1.0928, "step": 2195 }, { "epoch": 0.3452219536638566, "grad_norm": 0.26032400131225586, "learning_rate": 4.6442818198248276e-05, "loss": 1.1699, "step": 2196 }, { "epoch": 0.34537915856078916, "grad_norm": 0.19974485039710999, "learning_rate": 4.643964104753617e-05, "loss": 1.1668, "step": 2197 }, { "epoch": 0.3455363634577217, "grad_norm": 0.2452566772699356, "learning_rate": 4.6436462587372345e-05, "loss": 1.1597, "step": 2198 }, { "epoch": 0.34569356835465426, "grad_norm": 0.2143898904323578, "learning_rate": 4.643328281795095e-05, "loss": 1.1134, "step": 2199 }, { "epoch": 0.3458507732515868, "grad_norm": 0.21687857806682587, "learning_rate": 4.643010173946619e-05, "loss": 1.1802, "step": 2200 }, { "epoch": 0.3460079781485193, "grad_norm": 0.21475355327129364, "learning_rate": 4.6426919352112355e-05, "loss": 1.2065, "step": 2201 }, { "epoch": 0.34616518304545185, "grad_norm": 0.25029754638671875, "learning_rate": 4.64237356560838e-05, "loss": 1.1723, "step": 2202 }, { "epoch": 0.3463223879423844, "grad_norm": 0.23117133975028992, "learning_rate": 4.642055065157499e-05, "loss": 1.1915, "step": 2203 }, { "epoch": 0.34647959283931695, "grad_norm": 0.28021204471588135, "learning_rate": 4.641736433878045e-05, "loss": 1.0219, "step": 2204 }, { "epoch": 0.3466367977362495, "grad_norm": 0.24753634631633759, "learning_rate": 4.641417671789478e-05, "loss": 1.0664, "step": 2205 }, { "epoch": 0.346794002633182, "grad_norm": 0.2812795341014862, "learning_rate": 4.6410987789112676e-05, "loss": 1.1794, "step": 2206 }, { "epoch": 0.34695120753011455, "grad_norm": 0.17878840863704681, "learning_rate": 4.64077975526289e-05, "loss": 1.1064, "step": 2207 }, { "epoch": 0.3471084124270471, "grad_norm": 0.20422449707984924, "learning_rate": 4.6404606008638295e-05, "loss": 1.2858, "step": 2208 }, { "epoch": 0.34726561732397965, "grad_norm": 0.2892017066478729, "learning_rate": 4.6401413157335796e-05, "loss": 1.1227, "step": 2209 }, { "epoch": 0.3474228222209122, "grad_norm": 0.16432105004787445, "learning_rate": 4.639821899891641e-05, "loss": 1.1985, "step": 2210 }, { "epoch": 0.34758002711784475, "grad_norm": 0.2849401533603668, "learning_rate": 4.639502353357522e-05, "loss": 1.056, "step": 2211 }, { "epoch": 0.34773723201477724, "grad_norm": 0.17997637391090393, "learning_rate": 4.6391826761507403e-05, "loss": 1.1146, "step": 2212 }, { "epoch": 0.3478944369117098, "grad_norm": 0.22177653014659882, "learning_rate": 4.6388628682908186e-05, "loss": 1.1871, "step": 2213 }, { "epoch": 0.34805164180864234, "grad_norm": 0.18821461498737335, "learning_rate": 4.6385429297972914e-05, "loss": 1.1184, "step": 2214 }, { "epoch": 0.3482088467055749, "grad_norm": 0.2185995727777481, "learning_rate": 4.6382228606896994e-05, "loss": 1.2098, "step": 2215 }, { "epoch": 0.34836605160250744, "grad_norm": 0.2325439453125, "learning_rate": 4.6379026609875894e-05, "loss": 1.1827, "step": 2216 }, { "epoch": 0.34852325649943994, "grad_norm": 0.23903654515743256, "learning_rate": 4.637582330710519e-05, "loss": 1.1347, "step": 2217 }, { "epoch": 0.3486804613963725, "grad_norm": 0.2033540904521942, "learning_rate": 4.637261869878054e-05, "loss": 1.0412, "step": 2218 }, { "epoch": 0.34883766629330504, "grad_norm": 0.21630467474460602, "learning_rate": 4.6369412785097644e-05, "loss": 1.2504, "step": 2219 }, { "epoch": 0.3489948711902376, "grad_norm": 0.2330722063779831, "learning_rate": 4.636620556625233e-05, "loss": 1.2137, "step": 2220 }, { "epoch": 0.34915207608717014, "grad_norm": 0.22527045011520386, "learning_rate": 4.636299704244047e-05, "loss": 1.1868, "step": 2221 }, { "epoch": 0.34930928098410263, "grad_norm": 0.26466044783592224, "learning_rate": 4.635978721385803e-05, "loss": 1.0843, "step": 2222 }, { "epoch": 0.3494664858810352, "grad_norm": 0.1883208006620407, "learning_rate": 4.6356576080701054e-05, "loss": 1.1547, "step": 2223 }, { "epoch": 0.34962369077796773, "grad_norm": 0.23476985096931458, "learning_rate": 4.635336364316567e-05, "loss": 1.1051, "step": 2224 }, { "epoch": 0.3497808956749003, "grad_norm": 0.224160835146904, "learning_rate": 4.635014990144808e-05, "loss": 1.1945, "step": 2225 }, { "epoch": 0.34993810057183283, "grad_norm": 0.19404496252536774, "learning_rate": 4.634693485574457e-05, "loss": 1.2492, "step": 2226 }, { "epoch": 0.3500953054687653, "grad_norm": 0.253548800945282, "learning_rate": 4.6343718506251485e-05, "loss": 1.2643, "step": 2227 }, { "epoch": 0.3502525103656979, "grad_norm": 0.3645852208137512, "learning_rate": 4.634050085316529e-05, "loss": 1.147, "step": 2228 }, { "epoch": 0.35040971526263043, "grad_norm": 0.25876176357269287, "learning_rate": 4.6337281896682504e-05, "loss": 0.9978, "step": 2229 }, { "epoch": 0.350566920159563, "grad_norm": 0.20177343487739563, "learning_rate": 4.633406163699972e-05, "loss": 1.1169, "step": 2230 }, { "epoch": 0.35072412505649553, "grad_norm": 0.23162272572517395, "learning_rate": 4.633084007431361e-05, "loss": 1.0649, "step": 2231 }, { "epoch": 0.350881329953428, "grad_norm": 0.18436746299266815, "learning_rate": 4.6327617208820964e-05, "loss": 1.2476, "step": 2232 }, { "epoch": 0.3510385348503606, "grad_norm": 0.17864318192005157, "learning_rate": 4.6324393040718596e-05, "loss": 1.1171, "step": 2233 }, { "epoch": 0.3511957397472931, "grad_norm": 0.21066221594810486, "learning_rate": 4.632116757020343e-05, "loss": 1.1708, "step": 2234 }, { "epoch": 0.3513529446442257, "grad_norm": 0.19731271266937256, "learning_rate": 4.631794079747248e-05, "loss": 1.0509, "step": 2235 }, { "epoch": 0.3515101495411582, "grad_norm": 0.24790634214878082, "learning_rate": 4.631471272272281e-05, "loss": 1.0759, "step": 2236 }, { "epoch": 0.3516673544380908, "grad_norm": 0.23736214637756348, "learning_rate": 4.6311483346151587e-05, "loss": 1.1302, "step": 2237 }, { "epoch": 0.35182455933502327, "grad_norm": 0.20299890637397766, "learning_rate": 4.630825266795605e-05, "loss": 1.171, "step": 2238 }, { "epoch": 0.3519817642319558, "grad_norm": 0.17163951694965363, "learning_rate": 4.63050206883335e-05, "loss": 1.1312, "step": 2239 }, { "epoch": 0.35213896912888837, "grad_norm": 0.22283077239990234, "learning_rate": 4.6301787407481356e-05, "loss": 1.2501, "step": 2240 }, { "epoch": 0.35213896912888837, "eval_loss": 1.137406349182129, "eval_runtime": 2319.6638, "eval_samples_per_second": 3.991, "eval_steps_per_second": 1.996, "step": 2240 }, { "epoch": 0.3522961740258209, "grad_norm": 0.21536953747272491, "learning_rate": 4.6298552825597084e-05, "loss": 1.1445, "step": 2241 }, { "epoch": 0.35245337892275347, "grad_norm": 0.18569806218147278, "learning_rate": 4.629531694287824e-05, "loss": 1.1741, "step": 2242 }, { "epoch": 0.35261058381968596, "grad_norm": 0.21473194658756256, "learning_rate": 4.629207975952247e-05, "loss": 1.1673, "step": 2243 }, { "epoch": 0.3527677887166185, "grad_norm": 0.2836077809333801, "learning_rate": 4.628884127572747e-05, "loss": 1.1015, "step": 2244 }, { "epoch": 0.35292499361355106, "grad_norm": 0.25129783153533936, "learning_rate": 4.6285601491691044e-05, "loss": 1.0962, "step": 2245 }, { "epoch": 0.3530821985104836, "grad_norm": 0.1728084534406662, "learning_rate": 4.628236040761106e-05, "loss": 1.2618, "step": 2246 }, { "epoch": 0.35323940340741616, "grad_norm": 0.24836137890815735, "learning_rate": 4.6279118023685485e-05, "loss": 1.1205, "step": 2247 }, { "epoch": 0.35339660830434866, "grad_norm": 0.18902529776096344, "learning_rate": 4.627587434011234e-05, "loss": 1.2781, "step": 2248 }, { "epoch": 0.3535538132012812, "grad_norm": 0.16344639658927917, "learning_rate": 4.6272629357089745e-05, "loss": 1.1919, "step": 2249 }, { "epoch": 0.35371101809821376, "grad_norm": 0.2835703194141388, "learning_rate": 4.6269383074815874e-05, "loss": 1.0918, "step": 2250 }, { "epoch": 0.3538682229951463, "grad_norm": 0.21615809202194214, "learning_rate": 4.6266135493489015e-05, "loss": 1.1403, "step": 2251 }, { "epoch": 0.35402542789207886, "grad_norm": 0.2002420276403427, "learning_rate": 4.6262886613307516e-05, "loss": 1.2075, "step": 2252 }, { "epoch": 0.35418263278901135, "grad_norm": 0.21766100823879242, "learning_rate": 4.62596364344698e-05, "loss": 1.0956, "step": 2253 }, { "epoch": 0.3543398376859439, "grad_norm": 0.24325627088546753, "learning_rate": 4.625638495717438e-05, "loss": 1.1861, "step": 2254 }, { "epoch": 0.35449704258287645, "grad_norm": 0.2508534789085388, "learning_rate": 4.625313218161984e-05, "loss": 1.1318, "step": 2255 }, { "epoch": 0.354654247479809, "grad_norm": 0.20808853209018707, "learning_rate": 4.624987810800485e-05, "loss": 1.1406, "step": 2256 }, { "epoch": 0.35481145237674155, "grad_norm": 0.18308702111244202, "learning_rate": 4.6246622736528154e-05, "loss": 1.099, "step": 2257 }, { "epoch": 0.35496865727367405, "grad_norm": 0.2754043936729431, "learning_rate": 4.6243366067388585e-05, "loss": 1.1066, "step": 2258 }, { "epoch": 0.3551258621706066, "grad_norm": 0.22903695702552795, "learning_rate": 4.624010810078504e-05, "loss": 1.1673, "step": 2259 }, { "epoch": 0.35528306706753915, "grad_norm": 0.23207323253154755, "learning_rate": 4.62368488369165e-05, "loss": 1.0022, "step": 2260 }, { "epoch": 0.3554402719644717, "grad_norm": 0.17466799914836884, "learning_rate": 4.623358827598204e-05, "loss": 1.2006, "step": 2261 }, { "epoch": 0.35559747686140425, "grad_norm": 0.17655743658542633, "learning_rate": 4.623032641818079e-05, "loss": 1.2022, "step": 2262 }, { "epoch": 0.3557546817583368, "grad_norm": 0.21429038047790527, "learning_rate": 4.622706326371199e-05, "loss": 1.1119, "step": 2263 }, { "epoch": 0.3559118866552693, "grad_norm": 0.2043597400188446, "learning_rate": 4.622379881277492e-05, "loss": 1.0538, "step": 2264 }, { "epoch": 0.35606909155220184, "grad_norm": 0.2455209642648697, "learning_rate": 4.622053306556897e-05, "loss": 1.1122, "step": 2265 }, { "epoch": 0.3562262964491344, "grad_norm": 0.22231566905975342, "learning_rate": 4.6217266022293605e-05, "loss": 1.0947, "step": 2266 }, { "epoch": 0.35638350134606694, "grad_norm": 0.26289692521095276, "learning_rate": 4.6213997683148355e-05, "loss": 1.2072, "step": 2267 }, { "epoch": 0.3565407062429995, "grad_norm": 0.208029642701149, "learning_rate": 4.621072804833284e-05, "loss": 1.2221, "step": 2268 }, { "epoch": 0.356697911139932, "grad_norm": 0.23250968754291534, "learning_rate": 4.620745711804676e-05, "loss": 1.1863, "step": 2269 }, { "epoch": 0.35685511603686454, "grad_norm": 0.26194247603416443, "learning_rate": 4.6204184892489875e-05, "loss": 1.0326, "step": 2270 }, { "epoch": 0.3570123209337971, "grad_norm": 0.3551010191440582, "learning_rate": 4.6200911371862063e-05, "loss": 1.1552, "step": 2271 }, { "epoch": 0.35716952583072964, "grad_norm": 0.24021030962467194, "learning_rate": 4.6197636556363256e-05, "loss": 1.1127, "step": 2272 }, { "epoch": 0.3573267307276622, "grad_norm": 0.22826530039310455, "learning_rate": 4.619436044619345e-05, "loss": 1.134, "step": 2273 }, { "epoch": 0.3574839356245947, "grad_norm": 0.2583426237106323, "learning_rate": 4.619108304155275e-05, "loss": 1.0296, "step": 2274 }, { "epoch": 0.35764114052152723, "grad_norm": 0.2455807328224182, "learning_rate": 4.618780434264133e-05, "loss": 1.087, "step": 2275 }, { "epoch": 0.3577983454184598, "grad_norm": 0.21768860518932343, "learning_rate": 4.618452434965943e-05, "loss": 1.1911, "step": 2276 }, { "epoch": 0.35795555031539233, "grad_norm": 0.19263923168182373, "learning_rate": 4.6181243062807387e-05, "loss": 1.1437, "step": 2277 }, { "epoch": 0.3581127552123249, "grad_norm": 0.23038311302661896, "learning_rate": 4.61779604822856e-05, "loss": 1.1394, "step": 2278 }, { "epoch": 0.3582699601092574, "grad_norm": 0.38476505875587463, "learning_rate": 4.6174676608294574e-05, "loss": 1.1242, "step": 2279 }, { "epoch": 0.35842716500618993, "grad_norm": 0.349468469619751, "learning_rate": 4.617139144103486e-05, "loss": 1.2413, "step": 2280 }, { "epoch": 0.3585843699031225, "grad_norm": 0.18502017855644226, "learning_rate": 4.6168104980707107e-05, "loss": 1.202, "step": 2281 }, { "epoch": 0.35874157480005503, "grad_norm": 0.18562203645706177, "learning_rate": 4.616481722751205e-05, "loss": 1.0617, "step": 2282 }, { "epoch": 0.3588987796969876, "grad_norm": 0.24319809675216675, "learning_rate": 4.616152818165047e-05, "loss": 1.1994, "step": 2283 }, { "epoch": 0.3590559845939201, "grad_norm": 0.21097324788570404, "learning_rate": 4.615823784332327e-05, "loss": 1.1771, "step": 2284 }, { "epoch": 0.3592131894908526, "grad_norm": 0.20706066489219666, "learning_rate": 4.61549462127314e-05, "loss": 1.0416, "step": 2285 }, { "epoch": 0.3593703943877852, "grad_norm": 0.17828263342380524, "learning_rate": 4.615165329007591e-05, "loss": 1.0867, "step": 2286 }, { "epoch": 0.3595275992847177, "grad_norm": 0.20547035336494446, "learning_rate": 4.6148359075557915e-05, "loss": 1.1115, "step": 2287 }, { "epoch": 0.3596848041816503, "grad_norm": 0.30150124430656433, "learning_rate": 4.614506356937861e-05, "loss": 1.0364, "step": 2288 }, { "epoch": 0.3598420090785828, "grad_norm": 0.2805284261703491, "learning_rate": 4.6141766771739267e-05, "loss": 1.1075, "step": 2289 }, { "epoch": 0.3599992139755153, "grad_norm": 0.2630583345890045, "learning_rate": 4.613846868284126e-05, "loss": 1.1411, "step": 2290 }, { "epoch": 0.36015641887244787, "grad_norm": 0.20439459383487701, "learning_rate": 4.6135169302886006e-05, "loss": 1.0507, "step": 2291 }, { "epoch": 0.3603136237693804, "grad_norm": 0.18172521889209747, "learning_rate": 4.6131868632075024e-05, "loss": 1.1667, "step": 2292 }, { "epoch": 0.36047082866631297, "grad_norm": 0.14271654188632965, "learning_rate": 4.612856667060991e-05, "loss": 1.1963, "step": 2293 }, { "epoch": 0.3606280335632455, "grad_norm": 0.22915297746658325, "learning_rate": 4.612526341869233e-05, "loss": 1.1523, "step": 2294 }, { "epoch": 0.360785238460178, "grad_norm": 0.19867637753486633, "learning_rate": 4.612195887652404e-05, "loss": 1.1402, "step": 2295 }, { "epoch": 0.36094244335711057, "grad_norm": 0.2168378382921219, "learning_rate": 4.611865304430687e-05, "loss": 1.1781, "step": 2296 }, { "epoch": 0.3610996482540431, "grad_norm": 0.2496039867401123, "learning_rate": 4.6115345922242716e-05, "loss": 1.1168, "step": 2297 }, { "epoch": 0.36125685315097567, "grad_norm": 0.15307332575321198, "learning_rate": 4.6112037510533574e-05, "loss": 1.1752, "step": 2298 }, { "epoch": 0.3614140580479082, "grad_norm": 0.22675082087516785, "learning_rate": 4.610872780938151e-05, "loss": 1.2404, "step": 2299 }, { "epoch": 0.3615712629448407, "grad_norm": 0.216874897480011, "learning_rate": 4.610541681898865e-05, "loss": 1.1117, "step": 2300 }, { "epoch": 0.36172846784177326, "grad_norm": 0.2463904470205307, "learning_rate": 4.6102104539557254e-05, "loss": 1.0656, "step": 2301 }, { "epoch": 0.3618856727387058, "grad_norm": 0.21900813281536102, "learning_rate": 4.609879097128959e-05, "loss": 1.149, "step": 2302 }, { "epoch": 0.36204287763563836, "grad_norm": 0.23547019064426422, "learning_rate": 4.6095476114388046e-05, "loss": 1.0561, "step": 2303 }, { "epoch": 0.3622000825325709, "grad_norm": 0.23986342549324036, "learning_rate": 4.609215996905509e-05, "loss": 1.1362, "step": 2304 }, { "epoch": 0.3623572874295034, "grad_norm": 0.1994561403989792, "learning_rate": 4.6088842535493247e-05, "loss": 1.256, "step": 2305 }, { "epoch": 0.36251449232643596, "grad_norm": 0.2824605405330658, "learning_rate": 4.608552381390515e-05, "loss": 1.0065, "step": 2306 }, { "epoch": 0.3626716972233685, "grad_norm": 0.24372591078281403, "learning_rate": 4.6082203804493474e-05, "loss": 1.1457, "step": 2307 }, { "epoch": 0.36282890212030106, "grad_norm": 0.19230058789253235, "learning_rate": 4.607888250746101e-05, "loss": 1.2162, "step": 2308 }, { "epoch": 0.3629861070172336, "grad_norm": 0.22559943795204163, "learning_rate": 4.6075559923010594e-05, "loss": 1.0851, "step": 2309 }, { "epoch": 0.3631433119141661, "grad_norm": 0.30149683356285095, "learning_rate": 4.607223605134517e-05, "loss": 1.1287, "step": 2310 }, { "epoch": 0.36330051681109865, "grad_norm": 0.21107155084609985, "learning_rate": 4.6068910892667744e-05, "loss": 1.0637, "step": 2311 }, { "epoch": 0.3634577217080312, "grad_norm": 0.23039104044437408, "learning_rate": 4.60655844471814e-05, "loss": 1.1839, "step": 2312 }, { "epoch": 0.36361492660496375, "grad_norm": 0.19493019580841064, "learning_rate": 4.6062256715089304e-05, "loss": 1.0697, "step": 2313 }, { "epoch": 0.3637721315018963, "grad_norm": 0.24054986238479614, "learning_rate": 4.605892769659471e-05, "loss": 1.1201, "step": 2314 }, { "epoch": 0.36392933639882885, "grad_norm": 0.2367154210805893, "learning_rate": 4.6055597391900933e-05, "loss": 1.1292, "step": 2315 }, { "epoch": 0.36408654129576135, "grad_norm": 0.18168602883815765, "learning_rate": 4.605226580121138e-05, "loss": 1.116, "step": 2316 }, { "epoch": 0.3642437461926939, "grad_norm": 0.22764810919761658, "learning_rate": 4.604893292472954e-05, "loss": 1.0941, "step": 2317 }, { "epoch": 0.36440095108962645, "grad_norm": 0.1890590786933899, "learning_rate": 4.6045598762658945e-05, "loss": 1.1693, "step": 2318 }, { "epoch": 0.364558155986559, "grad_norm": 0.30014708638191223, "learning_rate": 4.604226331520326e-05, "loss": 1.1798, "step": 2319 }, { "epoch": 0.36471536088349155, "grad_norm": 0.29448553919792175, "learning_rate": 4.603892658256619e-05, "loss": 1.1618, "step": 2320 }, { "epoch": 0.36487256578042404, "grad_norm": 0.220899298787117, "learning_rate": 4.603558856495154e-05, "loss": 1.1511, "step": 2321 }, { "epoch": 0.3650297706773566, "grad_norm": 0.21098098158836365, "learning_rate": 4.603224926256317e-05, "loss": 1.0377, "step": 2322 }, { "epoch": 0.36518697557428914, "grad_norm": 0.24701356887817383, "learning_rate": 4.602890867560503e-05, "loss": 1.1466, "step": 2323 }, { "epoch": 0.3653441804712217, "grad_norm": 0.24385958909988403, "learning_rate": 4.602556680428117e-05, "loss": 1.0753, "step": 2324 }, { "epoch": 0.36550138536815424, "grad_norm": 0.28453299403190613, "learning_rate": 4.6022223648795685e-05, "loss": 1.2767, "step": 2325 }, { "epoch": 0.36565859026508674, "grad_norm": 0.16848404705524445, "learning_rate": 4.6018879209352755e-05, "loss": 1.1453, "step": 2326 }, { "epoch": 0.3658157951620193, "grad_norm": 0.20252664387226105, "learning_rate": 4.601553348615666e-05, "loss": 1.1672, "step": 2327 }, { "epoch": 0.36597300005895184, "grad_norm": 0.25076255202293396, "learning_rate": 4.601218647941174e-05, "loss": 1.1014, "step": 2328 }, { "epoch": 0.3661302049558844, "grad_norm": 0.2385731041431427, "learning_rate": 4.600883818932241e-05, "loss": 1.2667, "step": 2329 }, { "epoch": 0.36628740985281694, "grad_norm": 0.22775901854038239, "learning_rate": 4.600548861609318e-05, "loss": 1.1281, "step": 2330 }, { "epoch": 0.36644461474974943, "grad_norm": 0.2670734226703644, "learning_rate": 4.600213775992863e-05, "loss": 1.2638, "step": 2331 }, { "epoch": 0.366601819646682, "grad_norm": 0.19620922207832336, "learning_rate": 4.599878562103341e-05, "loss": 1.2204, "step": 2332 }, { "epoch": 0.36675902454361453, "grad_norm": 0.20084300637245178, "learning_rate": 4.599543219961226e-05, "loss": 1.1472, "step": 2333 }, { "epoch": 0.3669162294405471, "grad_norm": 0.18164968490600586, "learning_rate": 4.599207749587e-05, "loss": 1.2188, "step": 2334 }, { "epoch": 0.36707343433747963, "grad_norm": 0.2300948202610016, "learning_rate": 4.598872151001151e-05, "loss": 1.1519, "step": 2335 }, { "epoch": 0.3672306392344121, "grad_norm": 0.22130456566810608, "learning_rate": 4.598536424224177e-05, "loss": 1.1081, "step": 2336 }, { "epoch": 0.3673878441313447, "grad_norm": 0.2190874069929123, "learning_rate": 4.598200569276582e-05, "loss": 0.9442, "step": 2337 }, { "epoch": 0.3675450490282772, "grad_norm": 0.244587242603302, "learning_rate": 4.5978645861788793e-05, "loss": 0.9082, "step": 2338 }, { "epoch": 0.3677022539252098, "grad_norm": 0.1806013435125351, "learning_rate": 4.5975284749515904e-05, "loss": 1.1944, "step": 2339 }, { "epoch": 0.3678594588221423, "grad_norm": 0.21137726306915283, "learning_rate": 4.597192235615242e-05, "loss": 1.1479, "step": 2340 }, { "epoch": 0.3680166637190749, "grad_norm": 0.18507908284664154, "learning_rate": 4.5968558681903716e-05, "loss": 1.1367, "step": 2341 }, { "epoch": 0.36817386861600737, "grad_norm": 0.2921428680419922, "learning_rate": 4.596519372697523e-05, "loss": 1.0496, "step": 2342 }, { "epoch": 0.3683310735129399, "grad_norm": 0.1963038295507431, "learning_rate": 4.596182749157247e-05, "loss": 1.2727, "step": 2343 }, { "epoch": 0.36848827840987247, "grad_norm": 0.16632871329784393, "learning_rate": 4.5958459975901046e-05, "loss": 1.0428, "step": 2344 }, { "epoch": 0.368645483306805, "grad_norm": 0.21543806791305542, "learning_rate": 4.595509118016663e-05, "loss": 1.1868, "step": 2345 }, { "epoch": 0.36880268820373757, "grad_norm": 0.20484694838523865, "learning_rate": 4.595172110457497e-05, "loss": 1.1649, "step": 2346 }, { "epoch": 0.36895989310067007, "grad_norm": 0.2756751775741577, "learning_rate": 4.594834974933191e-05, "loss": 1.0987, "step": 2347 }, { "epoch": 0.3691170979976026, "grad_norm": 0.2065090388059616, "learning_rate": 4.5944977114643346e-05, "loss": 1.2065, "step": 2348 }, { "epoch": 0.36927430289453517, "grad_norm": 0.2959758937358856, "learning_rate": 4.594160320071527e-05, "loss": 1.0922, "step": 2349 }, { "epoch": 0.3694315077914677, "grad_norm": 0.19524092972278595, "learning_rate": 4.593822800775375e-05, "loss": 1.1071, "step": 2350 }, { "epoch": 0.36958871268840027, "grad_norm": 0.17790867388248444, "learning_rate": 4.593485153596492e-05, "loss": 1.171, "step": 2351 }, { "epoch": 0.36974591758533276, "grad_norm": 0.18420405685901642, "learning_rate": 4.593147378555501e-05, "loss": 1.1355, "step": 2352 }, { "epoch": 0.3699031224822653, "grad_norm": 0.2588077485561371, "learning_rate": 4.5928094756730326e-05, "loss": 1.1258, "step": 2353 }, { "epoch": 0.37006032737919786, "grad_norm": 0.2283763289451599, "learning_rate": 4.592471444969724e-05, "loss": 1.1717, "step": 2354 }, { "epoch": 0.3702175322761304, "grad_norm": 0.2567402422428131, "learning_rate": 4.5921332864662215e-05, "loss": 1.1495, "step": 2355 }, { "epoch": 0.37037473717306296, "grad_norm": 0.18225303292274475, "learning_rate": 4.5917950001831766e-05, "loss": 1.2088, "step": 2356 }, { "epoch": 0.37053194206999546, "grad_norm": 0.2010773867368698, "learning_rate": 4.591456586141253e-05, "loss": 1.2081, "step": 2357 }, { "epoch": 0.370689146966928, "grad_norm": 0.19127878546714783, "learning_rate": 4.591118044361118e-05, "loss": 1.0942, "step": 2358 }, { "epoch": 0.37084635186386056, "grad_norm": 0.18000200390815735, "learning_rate": 4.590779374863449e-05, "loss": 0.9802, "step": 2359 }, { "epoch": 0.3710035567607931, "grad_norm": 0.16793574392795563, "learning_rate": 4.590440577668931e-05, "loss": 1.2521, "step": 2360 }, { "epoch": 0.37116076165772566, "grad_norm": 0.16233059763908386, "learning_rate": 4.5901016527982555e-05, "loss": 1.1423, "step": 2361 }, { "epoch": 0.37131796655465815, "grad_norm": 0.2126810997724533, "learning_rate": 4.5897626002721236e-05, "loss": 1.125, "step": 2362 }, { "epoch": 0.3714751714515907, "grad_norm": 0.20584918558597565, "learning_rate": 4.589423420111244e-05, "loss": 1.1424, "step": 2363 }, { "epoch": 0.37163237634852325, "grad_norm": 0.19902043044567108, "learning_rate": 4.5890841123363305e-05, "loss": 1.0977, "step": 2364 }, { "epoch": 0.3717895812454558, "grad_norm": 0.20691698789596558, "learning_rate": 4.588744676968109e-05, "loss": 1.1506, "step": 2365 }, { "epoch": 0.37194678614238835, "grad_norm": 0.2217256873846054, "learning_rate": 4.588405114027309e-05, "loss": 1.2978, "step": 2366 }, { "epoch": 0.37210399103932085, "grad_norm": 0.24339617788791656, "learning_rate": 4.5880654235346705e-05, "loss": 1.1033, "step": 2367 }, { "epoch": 0.3722611959362534, "grad_norm": 0.22742657363414764, "learning_rate": 4.587725605510941e-05, "loss": 1.1811, "step": 2368 }, { "epoch": 0.37241840083318595, "grad_norm": 0.3227018117904663, "learning_rate": 4.587385659976874e-05, "loss": 1.1963, "step": 2369 }, { "epoch": 0.3725756057301185, "grad_norm": 0.22187107801437378, "learning_rate": 4.587045586953233e-05, "loss": 1.1627, "step": 2370 }, { "epoch": 0.37273281062705105, "grad_norm": 0.2444133311510086, "learning_rate": 4.586705386460789e-05, "loss": 1.2408, "step": 2371 }, { "epoch": 0.3728900155239836, "grad_norm": 0.21586081385612488, "learning_rate": 4.586365058520319e-05, "loss": 1.1803, "step": 2372 }, { "epoch": 0.3730472204209161, "grad_norm": 0.15065719187259674, "learning_rate": 4.586024603152609e-05, "loss": 1.1824, "step": 2373 }, { "epoch": 0.37320442531784864, "grad_norm": 0.18252897262573242, "learning_rate": 4.585684020378453e-05, "loss": 1.0426, "step": 2374 }, { "epoch": 0.3733616302147812, "grad_norm": 0.27016788721084595, "learning_rate": 4.585343310218653e-05, "loss": 1.0083, "step": 2375 }, { "epoch": 0.37351883511171374, "grad_norm": 0.23528437316417694, "learning_rate": 4.585002472694018e-05, "loss": 1.1698, "step": 2376 }, { "epoch": 0.3736760400086463, "grad_norm": 0.2290978729724884, "learning_rate": 4.5846615078253644e-05, "loss": 1.0959, "step": 2377 }, { "epoch": 0.3738332449055788, "grad_norm": 0.25804629921913147, "learning_rate": 4.5843204156335176e-05, "loss": 1.1847, "step": 2378 }, { "epoch": 0.37399044980251134, "grad_norm": 0.21353112161159515, "learning_rate": 4.58397919613931e-05, "loss": 1.1614, "step": 2379 }, { "epoch": 0.3741476546994439, "grad_norm": 0.1873713582754135, "learning_rate": 4.5836378493635826e-05, "loss": 1.2798, "step": 2380 }, { "epoch": 0.37430485959637644, "grad_norm": 0.22049100697040558, "learning_rate": 4.583296375327182e-05, "loss": 1.1098, "step": 2381 }, { "epoch": 0.374462064493309, "grad_norm": 0.20885449647903442, "learning_rate": 4.582954774050966e-05, "loss": 1.144, "step": 2382 }, { "epoch": 0.3746192693902415, "grad_norm": 0.23146776854991913, "learning_rate": 4.582613045555798e-05, "loss": 1.0994, "step": 2383 }, { "epoch": 0.37477647428717403, "grad_norm": 0.19679352641105652, "learning_rate": 4.582271189862548e-05, "loss": 1.1002, "step": 2384 }, { "epoch": 0.3749336791841066, "grad_norm": 0.20884890854358673, "learning_rate": 4.581929206992097e-05, "loss": 1.0454, "step": 2385 }, { "epoch": 0.37509088408103913, "grad_norm": 0.20319582521915436, "learning_rate": 4.581587096965331e-05, "loss": 1.0725, "step": 2386 }, { "epoch": 0.3752480889779717, "grad_norm": 0.2325729876756668, "learning_rate": 4.581244859803146e-05, "loss": 1.0087, "step": 2387 }, { "epoch": 0.3754052938749042, "grad_norm": 0.19451522827148438, "learning_rate": 4.580902495526442e-05, "loss": 1.1993, "step": 2388 }, { "epoch": 0.37556249877183673, "grad_norm": 0.28159642219543457, "learning_rate": 4.580560004156131e-05, "loss": 1.1678, "step": 2389 }, { "epoch": 0.3757197036687693, "grad_norm": 0.30161935091018677, "learning_rate": 4.580217385713132e-05, "loss": 1.1238, "step": 2390 }, { "epoch": 0.37587690856570183, "grad_norm": 0.34735164046287537, "learning_rate": 4.579874640218369e-05, "loss": 1.1889, "step": 2391 }, { "epoch": 0.3760341134626344, "grad_norm": 0.27134138345718384, "learning_rate": 4.579531767692777e-05, "loss": 1.0985, "step": 2392 }, { "epoch": 0.3761913183595669, "grad_norm": 0.15087197721004486, "learning_rate": 4.5791887681572964e-05, "loss": 1.1239, "step": 2393 }, { "epoch": 0.3763485232564994, "grad_norm": 0.18321779370307922, "learning_rate": 4.5788456416328766e-05, "loss": 1.0854, "step": 2394 }, { "epoch": 0.376505728153432, "grad_norm": 0.21554580330848694, "learning_rate": 4.5785023881404744e-05, "loss": 1.1672, "step": 2395 }, { "epoch": 0.3766629330503645, "grad_norm": 0.21578359603881836, "learning_rate": 4.578159007701055e-05, "loss": 1.0585, "step": 2396 }, { "epoch": 0.3768201379472971, "grad_norm": 0.21935155987739563, "learning_rate": 4.57781550033559e-05, "loss": 1.1878, "step": 2397 }, { "epoch": 0.3769773428442296, "grad_norm": 0.22799645364284515, "learning_rate": 4.5774718660650594e-05, "loss": 1.1155, "step": 2398 }, { "epoch": 0.3771345477411621, "grad_norm": 0.18265032768249512, "learning_rate": 4.577128104910452e-05, "loss": 1.2611, "step": 2399 }, { "epoch": 0.37729175263809467, "grad_norm": 0.24530020356178284, "learning_rate": 4.576784216892763e-05, "loss": 1.0692, "step": 2400 }, { "epoch": 0.37729175263809467, "eval_loss": 1.130866289138794, "eval_runtime": 2318.0007, "eval_samples_per_second": 3.994, "eval_steps_per_second": 1.997, "step": 2400 }, { "epoch": 0.3774489575350272, "grad_norm": 0.22097055613994598, "learning_rate": 4.5764402020329953e-05, "loss": 1.1712, "step": 2401 }, { "epoch": 0.37760616243195977, "grad_norm": 0.24752137064933777, "learning_rate": 4.576096060352161e-05, "loss": 1.0011, "step": 2402 }, { "epoch": 0.3777633673288923, "grad_norm": 0.20604589581489563, "learning_rate": 4.5757517918712775e-05, "loss": 1.1616, "step": 2403 }, { "epoch": 0.3779205722258248, "grad_norm": 0.23652517795562744, "learning_rate": 4.5754073966113734e-05, "loss": 1.0465, "step": 2404 }, { "epoch": 0.37807777712275736, "grad_norm": 0.2082519680261612, "learning_rate": 4.575062874593481e-05, "loss": 1.0695, "step": 2405 }, { "epoch": 0.3782349820196899, "grad_norm": 0.3347269892692566, "learning_rate": 4.574718225838644e-05, "loss": 1.1395, "step": 2406 }, { "epoch": 0.37839218691662246, "grad_norm": 0.19232496619224548, "learning_rate": 4.57437345036791e-05, "loss": 1.0818, "step": 2407 }, { "epoch": 0.378549391813555, "grad_norm": 0.23415827751159668, "learning_rate": 4.5740285482023396e-05, "loss": 1.1472, "step": 2408 }, { "epoch": 0.3787065967104875, "grad_norm": 0.24319365620613098, "learning_rate": 4.5736835193629964e-05, "loss": 1.1827, "step": 2409 }, { "epoch": 0.37886380160742006, "grad_norm": 0.18908008933067322, "learning_rate": 4.5733383638709536e-05, "loss": 1.1589, "step": 2410 }, { "epoch": 0.3790210065043526, "grad_norm": 0.23955419659614563, "learning_rate": 4.572993081747291e-05, "loss": 1.1292, "step": 2411 }, { "epoch": 0.37917821140128516, "grad_norm": 0.27845147252082825, "learning_rate": 4.5726476730130994e-05, "loss": 1.0401, "step": 2412 }, { "epoch": 0.3793354162982177, "grad_norm": 0.2088197022676468, "learning_rate": 4.572302137689474e-05, "loss": 1.1491, "step": 2413 }, { "epoch": 0.3794926211951502, "grad_norm": 0.24244044721126556, "learning_rate": 4.571956475797519e-05, "loss": 1.1756, "step": 2414 }, { "epoch": 0.37964982609208275, "grad_norm": 0.18544475734233856, "learning_rate": 4.571610687358344e-05, "loss": 1.1849, "step": 2415 }, { "epoch": 0.3798070309890153, "grad_norm": 0.21180425584316254, "learning_rate": 4.571264772393071e-05, "loss": 1.1463, "step": 2416 }, { "epoch": 0.37996423588594785, "grad_norm": 0.18626239895820618, "learning_rate": 4.5709187309228273e-05, "loss": 1.0571, "step": 2417 }, { "epoch": 0.3801214407828804, "grad_norm": 0.21916179358959198, "learning_rate": 4.570572562968746e-05, "loss": 1.0754, "step": 2418 }, { "epoch": 0.3802786456798129, "grad_norm": 0.22719772160053253, "learning_rate": 4.570226268551971e-05, "loss": 1.1745, "step": 2419 }, { "epoch": 0.38043585057674545, "grad_norm": 0.1658662110567093, "learning_rate": 4.5698798476936515e-05, "loss": 1.1501, "step": 2420 }, { "epoch": 0.380593055473678, "grad_norm": 0.20030854642391205, "learning_rate": 4.5695333004149465e-05, "loss": 1.1977, "step": 2421 }, { "epoch": 0.38075026037061055, "grad_norm": 0.22993536293506622, "learning_rate": 4.569186626737022e-05, "loss": 1.1129, "step": 2422 }, { "epoch": 0.3809074652675431, "grad_norm": 0.24522928893566132, "learning_rate": 4.568839826681051e-05, "loss": 1.133, "step": 2423 }, { "epoch": 0.38106467016447565, "grad_norm": 0.1903398483991623, "learning_rate": 4.568492900268214e-05, "loss": 1.0693, "step": 2424 }, { "epoch": 0.38122187506140814, "grad_norm": 0.3089503049850464, "learning_rate": 4.568145847519702e-05, "loss": 1.1383, "step": 2425 }, { "epoch": 0.3813790799583407, "grad_norm": 0.1913723647594452, "learning_rate": 4.5677986684567095e-05, "loss": 1.1326, "step": 2426 }, { "epoch": 0.38153628485527324, "grad_norm": 0.2117031067609787, "learning_rate": 4.5674513631004424e-05, "loss": 1.0813, "step": 2427 }, { "epoch": 0.3816934897522058, "grad_norm": 0.21630004048347473, "learning_rate": 4.567103931472112e-05, "loss": 1.0788, "step": 2428 }, { "epoch": 0.38185069464913834, "grad_norm": 0.22654882073402405, "learning_rate": 4.566756373592938e-05, "loss": 1.135, "step": 2429 }, { "epoch": 0.38200789954607084, "grad_norm": 0.20162400603294373, "learning_rate": 4.566408689484148e-05, "loss": 1.1594, "step": 2430 }, { "epoch": 0.3821651044430034, "grad_norm": 0.1734067052602768, "learning_rate": 4.566060879166978e-05, "loss": 1.1672, "step": 2431 }, { "epoch": 0.38232230933993594, "grad_norm": 0.21130593121051788, "learning_rate": 4.56571294266267e-05, "loss": 1.1983, "step": 2432 }, { "epoch": 0.3824795142368685, "grad_norm": 0.2882966697216034, "learning_rate": 4.565364879992475e-05, "loss": 1.0704, "step": 2433 }, { "epoch": 0.38263671913380104, "grad_norm": 0.20934030413627625, "learning_rate": 4.565016691177651e-05, "loss": 1.0241, "step": 2434 }, { "epoch": 0.38279392403073353, "grad_norm": 0.2850327789783478, "learning_rate": 4.564668376239466e-05, "loss": 1.0301, "step": 2435 }, { "epoch": 0.3829511289276661, "grad_norm": 0.19671715795993805, "learning_rate": 4.564319935199191e-05, "loss": 1.1333, "step": 2436 }, { "epoch": 0.38310833382459863, "grad_norm": 0.18312984704971313, "learning_rate": 4.563971368078108e-05, "loss": 1.203, "step": 2437 }, { "epoch": 0.3832655387215312, "grad_norm": 0.1931515783071518, "learning_rate": 4.563622674897507e-05, "loss": 1.0835, "step": 2438 }, { "epoch": 0.38342274361846373, "grad_norm": 0.24310877919197083, "learning_rate": 4.563273855678685e-05, "loss": 1.0888, "step": 2439 }, { "epoch": 0.38357994851539623, "grad_norm": 0.20095118880271912, "learning_rate": 4.562924910442946e-05, "loss": 1.0833, "step": 2440 }, { "epoch": 0.3837371534123288, "grad_norm": 0.25016844272613525, "learning_rate": 4.5625758392116025e-05, "loss": 1.0446, "step": 2441 }, { "epoch": 0.38389435830926133, "grad_norm": 0.21593958139419556, "learning_rate": 4.5622266420059745e-05, "loss": 1.137, "step": 2442 }, { "epoch": 0.3840515632061939, "grad_norm": 0.25021597743034363, "learning_rate": 4.561877318847389e-05, "loss": 1.2507, "step": 2443 }, { "epoch": 0.38420876810312643, "grad_norm": 0.2984952926635742, "learning_rate": 4.561527869757182e-05, "loss": 1.0938, "step": 2444 }, { "epoch": 0.3843659730000589, "grad_norm": 0.20226380228996277, "learning_rate": 4.561178294756696e-05, "loss": 1.1269, "step": 2445 }, { "epoch": 0.3845231778969915, "grad_norm": 0.23726654052734375, "learning_rate": 4.5608285938672826e-05, "loss": 1.1787, "step": 2446 }, { "epoch": 0.384680382793924, "grad_norm": 0.22613126039505005, "learning_rate": 4.560478767110299e-05, "loss": 1.1021, "step": 2447 }, { "epoch": 0.3848375876908566, "grad_norm": 0.31989023089408875, "learning_rate": 4.560128814507112e-05, "loss": 1.0293, "step": 2448 }, { "epoch": 0.3849947925877891, "grad_norm": 0.2094157338142395, "learning_rate": 4.559778736079096e-05, "loss": 1.1724, "step": 2449 }, { "epoch": 0.3851519974847217, "grad_norm": 0.26532888412475586, "learning_rate": 4.5594285318476315e-05, "loss": 1.197, "step": 2450 }, { "epoch": 0.38530920238165417, "grad_norm": 0.17318859696388245, "learning_rate": 4.559078201834107e-05, "loss": 1.1503, "step": 2451 }, { "epoch": 0.3854664072785867, "grad_norm": 0.2698046863079071, "learning_rate": 4.558727746059922e-05, "loss": 1.0119, "step": 2452 }, { "epoch": 0.38562361217551927, "grad_norm": 0.18497617542743683, "learning_rate": 4.558377164546478e-05, "loss": 1.0471, "step": 2453 }, { "epoch": 0.3857808170724518, "grad_norm": 0.257929265499115, "learning_rate": 4.558026457315188e-05, "loss": 1.1298, "step": 2454 }, { "epoch": 0.38593802196938437, "grad_norm": 0.17087925970554352, "learning_rate": 4.557675624387473e-05, "loss": 1.174, "step": 2455 }, { "epoch": 0.38609522686631687, "grad_norm": 0.19346606731414795, "learning_rate": 4.5573246657847595e-05, "loss": 1.152, "step": 2456 }, { "epoch": 0.3862524317632494, "grad_norm": 0.22339493036270142, "learning_rate": 4.556973581528483e-05, "loss": 1.0753, "step": 2457 }, { "epoch": 0.38640963666018197, "grad_norm": 0.19931474328041077, "learning_rate": 4.556622371640087e-05, "loss": 1.1562, "step": 2458 }, { "epoch": 0.3865668415571145, "grad_norm": 0.20702451467514038, "learning_rate": 4.556271036141021e-05, "loss": 1.1617, "step": 2459 }, { "epoch": 0.38672404645404707, "grad_norm": 0.16566786170005798, "learning_rate": 4.5559195750527436e-05, "loss": 1.2268, "step": 2460 }, { "epoch": 0.38688125135097956, "grad_norm": 0.18909968435764313, "learning_rate": 4.5555679883967206e-05, "loss": 1.2718, "step": 2461 }, { "epoch": 0.3870384562479121, "grad_norm": 0.16908888518810272, "learning_rate": 4.555216276194426e-05, "loss": 1.2435, "step": 2462 }, { "epoch": 0.38719566114484466, "grad_norm": 0.18960361182689667, "learning_rate": 4.554864438467341e-05, "loss": 1.081, "step": 2463 }, { "epoch": 0.3873528660417772, "grad_norm": 0.18155843019485474, "learning_rate": 4.5545124752369546e-05, "loss": 1.2362, "step": 2464 }, { "epoch": 0.38751007093870976, "grad_norm": 0.195807084441185, "learning_rate": 4.554160386524763e-05, "loss": 1.1104, "step": 2465 }, { "epoch": 0.38766727583564226, "grad_norm": 0.36397311091423035, "learning_rate": 4.55380817235227e-05, "loss": 1.1308, "step": 2466 }, { "epoch": 0.3878244807325748, "grad_norm": 0.28057196736335754, "learning_rate": 4.553455832740989e-05, "loss": 1.0053, "step": 2467 }, { "epoch": 0.38798168562950736, "grad_norm": 0.24437671899795532, "learning_rate": 4.553103367712438e-05, "loss": 1.2225, "step": 2468 }, { "epoch": 0.3881388905264399, "grad_norm": 0.25742071866989136, "learning_rate": 4.5527507772881454e-05, "loss": 1.1577, "step": 2469 }, { "epoch": 0.38829609542337246, "grad_norm": 0.2540157735347748, "learning_rate": 4.5523980614896445e-05, "loss": 0.9798, "step": 2470 }, { "epoch": 0.38845330032030495, "grad_norm": 0.21823832392692566, "learning_rate": 4.5520452203384795e-05, "loss": 1.0552, "step": 2471 }, { "epoch": 0.3886105052172375, "grad_norm": 0.22293493151664734, "learning_rate": 4.5516922538562e-05, "loss": 1.2099, "step": 2472 }, { "epoch": 0.38876771011417005, "grad_norm": 0.1513427495956421, "learning_rate": 4.551339162064364e-05, "loss": 1.2112, "step": 2473 }, { "epoch": 0.3889249150111026, "grad_norm": 0.18423447012901306, "learning_rate": 4.550985944984536e-05, "loss": 1.2173, "step": 2474 }, { "epoch": 0.38908211990803515, "grad_norm": 0.17693789303302765, "learning_rate": 4.550632602638291e-05, "loss": 1.2094, "step": 2475 }, { "epoch": 0.3892393248049677, "grad_norm": 0.19538095593452454, "learning_rate": 4.550279135047208e-05, "loss": 1.1339, "step": 2476 }, { "epoch": 0.3893965297019002, "grad_norm": 0.17086082696914673, "learning_rate": 4.549925542232877e-05, "loss": 1.0723, "step": 2477 }, { "epoch": 0.38955373459883275, "grad_norm": 0.21500667929649353, "learning_rate": 4.549571824216892e-05, "loss": 1.0772, "step": 2478 }, { "epoch": 0.3897109394957653, "grad_norm": 0.21435636281967163, "learning_rate": 4.549217981020859e-05, "loss": 1.0753, "step": 2479 }, { "epoch": 0.38986814439269785, "grad_norm": 0.21024413406848907, "learning_rate": 4.5488640126663883e-05, "loss": 1.0733, "step": 2480 }, { "epoch": 0.3900253492896304, "grad_norm": 0.18347331881523132, "learning_rate": 4.5485099191751e-05, "loss": 1.1209, "step": 2481 }, { "epoch": 0.3901825541865629, "grad_norm": 0.2669816315174103, "learning_rate": 4.548155700568619e-05, "loss": 1.2356, "step": 2482 }, { "epoch": 0.39033975908349544, "grad_norm": 0.24279122054576874, "learning_rate": 4.547801356868581e-05, "loss": 1.1473, "step": 2483 }, { "epoch": 0.390496963980428, "grad_norm": 0.24355027079582214, "learning_rate": 4.547446888096627e-05, "loss": 1.1443, "step": 2484 }, { "epoch": 0.39065416887736054, "grad_norm": 0.17095209658145905, "learning_rate": 4.547092294274407e-05, "loss": 1.1495, "step": 2485 }, { "epoch": 0.3908113737742931, "grad_norm": 0.2063921093940735, "learning_rate": 4.546737575423579e-05, "loss": 1.0875, "step": 2486 }, { "epoch": 0.3909685786712256, "grad_norm": 0.22583013772964478, "learning_rate": 4.546382731565807e-05, "loss": 1.1615, "step": 2487 }, { "epoch": 0.39112578356815814, "grad_norm": 0.1564696729183197, "learning_rate": 4.546027762722763e-05, "loss": 1.1193, "step": 2488 }, { "epoch": 0.3912829884650907, "grad_norm": 0.23818811774253845, "learning_rate": 4.5456726689161285e-05, "loss": 1.2302, "step": 2489 }, { "epoch": 0.39144019336202324, "grad_norm": 0.21855293214321136, "learning_rate": 4.545317450167591e-05, "loss": 1.1537, "step": 2490 }, { "epoch": 0.3915973982589558, "grad_norm": 0.18843285739421844, "learning_rate": 4.544962106498846e-05, "loss": 1.1447, "step": 2491 }, { "epoch": 0.3917546031558883, "grad_norm": 0.27255892753601074, "learning_rate": 4.544606637931594e-05, "loss": 1.1954, "step": 2492 }, { "epoch": 0.39191180805282083, "grad_norm": 0.22189125418663025, "learning_rate": 4.54425104448755e-05, "loss": 1.1074, "step": 2493 }, { "epoch": 0.3920690129497534, "grad_norm": 0.20159167051315308, "learning_rate": 4.5438953261884286e-05, "loss": 1.1248, "step": 2494 }, { "epoch": 0.39222621784668593, "grad_norm": 0.22371767461299896, "learning_rate": 4.543539483055958e-05, "loss": 1.0238, "step": 2495 }, { "epoch": 0.3923834227436185, "grad_norm": 0.21290668845176697, "learning_rate": 4.5431835151118704e-05, "loss": 1.1483, "step": 2496 }, { "epoch": 0.392540627640551, "grad_norm": 0.21544845402240753, "learning_rate": 4.542827422377908e-05, "loss": 1.1274, "step": 2497 }, { "epoch": 0.3926978325374835, "grad_norm": 0.2112491875886917, "learning_rate": 4.542471204875819e-05, "loss": 1.1891, "step": 2498 }, { "epoch": 0.3928550374344161, "grad_norm": 0.25035732984542847, "learning_rate": 4.54211486262736e-05, "loss": 1.0903, "step": 2499 }, { "epoch": 0.3930122423313486, "grad_norm": 0.27401211857795715, "learning_rate": 4.541758395654294e-05, "loss": 1.1587, "step": 2500 }, { "epoch": 0.3931694472282812, "grad_norm": 0.1791762411594391, "learning_rate": 4.541401803978394e-05, "loss": 1.1841, "step": 2501 }, { "epoch": 0.3933266521252137, "grad_norm": 0.2872371971607208, "learning_rate": 4.54104508762144e-05, "loss": 1.1494, "step": 2502 }, { "epoch": 0.3934838570221462, "grad_norm": 0.2522941529750824, "learning_rate": 4.540688246605217e-05, "loss": 1.0157, "step": 2503 }, { "epoch": 0.39364106191907877, "grad_norm": 0.19489054381847382, "learning_rate": 4.5403312809515194e-05, "loss": 1.1834, "step": 2504 }, { "epoch": 0.3937982668160113, "grad_norm": 0.22451898455619812, "learning_rate": 4.539974190682151e-05, "loss": 1.057, "step": 2505 }, { "epoch": 0.39395547171294387, "grad_norm": 0.22620606422424316, "learning_rate": 4.539616975818921e-05, "loss": 1.1665, "step": 2506 }, { "epoch": 0.3941126766098764, "grad_norm": 0.1968408226966858, "learning_rate": 4.539259636383646e-05, "loss": 1.1033, "step": 2507 }, { "epoch": 0.3942698815068089, "grad_norm": 0.207768514752388, "learning_rate": 4.538902172398151e-05, "loss": 1.1113, "step": 2508 }, { "epoch": 0.39442708640374147, "grad_norm": 0.20376287400722504, "learning_rate": 4.538544583884269e-05, "loss": 1.1207, "step": 2509 }, { "epoch": 0.394584291300674, "grad_norm": 0.22011759877204895, "learning_rate": 4.5381868708638395e-05, "loss": 1.1046, "step": 2510 }, { "epoch": 0.39474149619760657, "grad_norm": 0.1497177630662918, "learning_rate": 4.537829033358711e-05, "loss": 1.2215, "step": 2511 }, { "epoch": 0.3948987010945391, "grad_norm": 0.17269910871982574, "learning_rate": 4.5374710713907386e-05, "loss": 1.2025, "step": 2512 }, { "epoch": 0.3950559059914716, "grad_norm": 0.24141822755336761, "learning_rate": 4.537112984981785e-05, "loss": 1.2037, "step": 2513 }, { "epoch": 0.39521311088840416, "grad_norm": 0.2416863590478897, "learning_rate": 4.536754774153722e-05, "loss": 1.2009, "step": 2514 }, { "epoch": 0.3953703157853367, "grad_norm": 0.19514837861061096, "learning_rate": 4.536396438928426e-05, "loss": 1.0463, "step": 2515 }, { "epoch": 0.39552752068226926, "grad_norm": 0.19753624498844147, "learning_rate": 4.536037979327783e-05, "loss": 1.1548, "step": 2516 }, { "epoch": 0.3956847255792018, "grad_norm": 0.23067662119865417, "learning_rate": 4.535679395373687e-05, "loss": 1.1382, "step": 2517 }, { "epoch": 0.3958419304761343, "grad_norm": 0.28596460819244385, "learning_rate": 4.53532068708804e-05, "loss": 1.1092, "step": 2518 }, { "epoch": 0.39599913537306686, "grad_norm": 0.2753187417984009, "learning_rate": 4.5349618544927486e-05, "loss": 1.0776, "step": 2519 }, { "epoch": 0.3961563402699994, "grad_norm": 0.20237961411476135, "learning_rate": 4.534602897609729e-05, "loss": 1.1795, "step": 2520 }, { "epoch": 0.39631354516693196, "grad_norm": 0.21416132152080536, "learning_rate": 4.534243816460906e-05, "loss": 1.042, "step": 2521 }, { "epoch": 0.3964707500638645, "grad_norm": 0.15869580209255219, "learning_rate": 4.5338846110682106e-05, "loss": 1.1952, "step": 2522 }, { "epoch": 0.396627954960797, "grad_norm": 0.19368652999401093, "learning_rate": 4.533525281453582e-05, "loss": 1.0974, "step": 2523 }, { "epoch": 0.39678515985772955, "grad_norm": 0.19327567517757416, "learning_rate": 4.533165827638965e-05, "loss": 1.2733, "step": 2524 }, { "epoch": 0.3969423647546621, "grad_norm": 0.25186845660209656, "learning_rate": 4.532806249646316e-05, "loss": 1.2386, "step": 2525 }, { "epoch": 0.39709956965159465, "grad_norm": 0.22584927082061768, "learning_rate": 4.5324465474975955e-05, "loss": 1.0643, "step": 2526 }, { "epoch": 0.3972567745485272, "grad_norm": 0.19394904375076294, "learning_rate": 4.532086721214773e-05, "loss": 1.2075, "step": 2527 }, { "epoch": 0.39741397944545975, "grad_norm": 0.20700658857822418, "learning_rate": 4.531726770819825e-05, "loss": 1.1718, "step": 2528 }, { "epoch": 0.39757118434239225, "grad_norm": 0.19494110345840454, "learning_rate": 4.5313666963347356e-05, "loss": 1.0573, "step": 2529 }, { "epoch": 0.3977283892393248, "grad_norm": 0.2095107138156891, "learning_rate": 4.5310064977814977e-05, "loss": 1.1195, "step": 2530 }, { "epoch": 0.39788559413625735, "grad_norm": 0.22246003150939941, "learning_rate": 4.530646175182111e-05, "loss": 1.152, "step": 2531 }, { "epoch": 0.3980427990331899, "grad_norm": 0.16814185678958893, "learning_rate": 4.53028572855858e-05, "loss": 1.1613, "step": 2532 }, { "epoch": 0.39820000393012245, "grad_norm": 0.2411145269870758, "learning_rate": 4.529925157932923e-05, "loss": 1.0908, "step": 2533 }, { "epoch": 0.39835720882705494, "grad_norm": 0.28689107298851013, "learning_rate": 4.529564463327161e-05, "loss": 1.0446, "step": 2534 }, { "epoch": 0.3985144137239875, "grad_norm": 0.1856895238161087, "learning_rate": 4.529203644763322e-05, "loss": 1.2297, "step": 2535 }, { "epoch": 0.39867161862092004, "grad_norm": 0.2174742966890335, "learning_rate": 4.528842702263446e-05, "loss": 1.1054, "step": 2536 }, { "epoch": 0.3988288235178526, "grad_norm": 0.2260787934064865, "learning_rate": 4.528481635849577e-05, "loss": 1.0361, "step": 2537 }, { "epoch": 0.39898602841478514, "grad_norm": 0.2079022079706192, "learning_rate": 4.5281204455437676e-05, "loss": 1.096, "step": 2538 }, { "epoch": 0.39914323331171764, "grad_norm": 0.17258255183696747, "learning_rate": 4.527759131368078e-05, "loss": 1.1946, "step": 2539 }, { "epoch": 0.3993004382086502, "grad_norm": 0.19373294711112976, "learning_rate": 4.527397693344575e-05, "loss": 1.089, "step": 2540 }, { "epoch": 0.39945764310558274, "grad_norm": 0.17514479160308838, "learning_rate": 4.527036131495336e-05, "loss": 1.089, "step": 2541 }, { "epoch": 0.3996148480025153, "grad_norm": 0.2525259852409363, "learning_rate": 4.5266744458424414e-05, "loss": 1.0712, "step": 2542 }, { "epoch": 0.39977205289944784, "grad_norm": 0.19268137216567993, "learning_rate": 4.5263126364079834e-05, "loss": 1.1875, "step": 2543 }, { "epoch": 0.39992925779638033, "grad_norm": 0.20745940506458282, "learning_rate": 4.525950703214058e-05, "loss": 1.0354, "step": 2544 }, { "epoch": 0.4000864626933129, "grad_norm": 0.193342387676239, "learning_rate": 4.525588646282773e-05, "loss": 1.1441, "step": 2545 }, { "epoch": 0.40024366759024543, "grad_norm": 0.1741735190153122, "learning_rate": 4.52522646563624e-05, "loss": 1.1172, "step": 2546 }, { "epoch": 0.400400872487178, "grad_norm": 0.15238472819328308, "learning_rate": 4.52486416129658e-05, "loss": 1.1061, "step": 2547 }, { "epoch": 0.40055807738411053, "grad_norm": 0.2244487702846527, "learning_rate": 4.5245017332859206e-05, "loss": 1.0468, "step": 2548 }, { "epoch": 0.40071528228104303, "grad_norm": 0.19981707632541656, "learning_rate": 4.5241391816263986e-05, "loss": 1.0995, "step": 2549 }, { "epoch": 0.4008724871779756, "grad_norm": 0.20777346193790436, "learning_rate": 4.523776506340157e-05, "loss": 1.0742, "step": 2550 }, { "epoch": 0.40102969207490813, "grad_norm": 0.24414701759815216, "learning_rate": 4.523413707449345e-05, "loss": 1.1748, "step": 2551 }, { "epoch": 0.4011868969718407, "grad_norm": 0.20141269266605377, "learning_rate": 4.523050784976124e-05, "loss": 1.0779, "step": 2552 }, { "epoch": 0.40134410186877323, "grad_norm": 0.21372921764850616, "learning_rate": 4.522687738942658e-05, "loss": 1.0947, "step": 2553 }, { "epoch": 0.4015013067657058, "grad_norm": 0.3348469138145447, "learning_rate": 4.5223245693711196e-05, "loss": 0.9531, "step": 2554 }, { "epoch": 0.4016585116626383, "grad_norm": 0.2114667445421219, "learning_rate": 4.521961276283691e-05, "loss": 1.2247, "step": 2555 }, { "epoch": 0.4018157165595708, "grad_norm": 0.20499029755592346, "learning_rate": 4.521597859702562e-05, "loss": 1.1545, "step": 2556 }, { "epoch": 0.4019729214565034, "grad_norm": 0.21829423308372498, "learning_rate": 4.521234319649927e-05, "loss": 1.1829, "step": 2557 }, { "epoch": 0.4021301263534359, "grad_norm": 0.1649905890226364, "learning_rate": 4.5208706561479895e-05, "loss": 1.0656, "step": 2558 }, { "epoch": 0.4022873312503685, "grad_norm": 0.21117526292800903, "learning_rate": 4.5205068692189617e-05, "loss": 1.0623, "step": 2559 }, { "epoch": 0.40244453614730097, "grad_norm": 0.18778620660305023, "learning_rate": 4.520142958885062e-05, "loss": 1.1203, "step": 2560 }, { "epoch": 0.40244453614730097, "eval_loss": 1.1262409687042236, "eval_runtime": 2319.1512, "eval_samples_per_second": 3.992, "eval_steps_per_second": 1.996, "step": 2560 }, { "epoch": 0.4026017410442335, "grad_norm": 0.17170308530330658, "learning_rate": 4.519778925168516e-05, "loss": 1.0933, "step": 2561 }, { "epoch": 0.40275894594116607, "grad_norm": 0.21042533218860626, "learning_rate": 4.519414768091558e-05, "loss": 1.1722, "step": 2562 }, { "epoch": 0.4029161508380986, "grad_norm": 0.17878204584121704, "learning_rate": 4.5190504876764296e-05, "loss": 1.1351, "step": 2563 }, { "epoch": 0.40307335573503117, "grad_norm": 0.15718936920166016, "learning_rate": 4.5186860839453795e-05, "loss": 0.9882, "step": 2564 }, { "epoch": 0.40323056063196366, "grad_norm": 0.17324978113174438, "learning_rate": 4.518321556920664e-05, "loss": 1.1786, "step": 2565 }, { "epoch": 0.4033877655288962, "grad_norm": 0.2370029240846634, "learning_rate": 4.517956906624546e-05, "loss": 1.1443, "step": 2566 }, { "epoch": 0.40354497042582876, "grad_norm": 0.209358811378479, "learning_rate": 4.517592133079299e-05, "loss": 1.1474, "step": 2567 }, { "epoch": 0.4037021753227613, "grad_norm": 0.23134967684745789, "learning_rate": 4.517227236307201e-05, "loss": 1.1006, "step": 2568 }, { "epoch": 0.40385938021969386, "grad_norm": 0.2563744783401489, "learning_rate": 4.5168622163305384e-05, "loss": 1.095, "step": 2569 }, { "epoch": 0.40401658511662636, "grad_norm": 0.19634851813316345, "learning_rate": 4.516497073171605e-05, "loss": 1.049, "step": 2570 }, { "epoch": 0.4041737900135589, "grad_norm": 0.15794876217842102, "learning_rate": 4.5161318068527025e-05, "loss": 1.2387, "step": 2571 }, { "epoch": 0.40433099491049146, "grad_norm": 0.1933317631483078, "learning_rate": 4.515766417396141e-05, "loss": 1.1564, "step": 2572 }, { "epoch": 0.404488199807424, "grad_norm": 0.22028346359729767, "learning_rate": 4.5154009048242355e-05, "loss": 1.2238, "step": 2573 }, { "epoch": 0.40464540470435656, "grad_norm": 0.19432009756565094, "learning_rate": 4.515035269159311e-05, "loss": 1.1126, "step": 2574 }, { "epoch": 0.40480260960128905, "grad_norm": 0.16745704412460327, "learning_rate": 4.5146695104236985e-05, "loss": 1.1979, "step": 2575 }, { "epoch": 0.4049598144982216, "grad_norm": 0.23030900955200195, "learning_rate": 4.514303628639738e-05, "loss": 1.0651, "step": 2576 }, { "epoch": 0.40511701939515415, "grad_norm": 0.1960119605064392, "learning_rate": 4.513937623829776e-05, "loss": 1.1086, "step": 2577 }, { "epoch": 0.4052742242920867, "grad_norm": 0.17087821662425995, "learning_rate": 4.513571496016166e-05, "loss": 1.1907, "step": 2578 }, { "epoch": 0.40543142918901925, "grad_norm": 0.24511630833148956, "learning_rate": 4.5132052452212706e-05, "loss": 1.0378, "step": 2579 }, { "epoch": 0.4055886340859518, "grad_norm": 0.2701500356197357, "learning_rate": 4.512838871467458e-05, "loss": 1.0078, "step": 2580 }, { "epoch": 0.4057458389828843, "grad_norm": 0.18003912270069122, "learning_rate": 4.512472374777106e-05, "loss": 1.1813, "step": 2581 }, { "epoch": 0.40590304387981685, "grad_norm": 0.19022230803966522, "learning_rate": 4.512105755172599e-05, "loss": 1.105, "step": 2582 }, { "epoch": 0.4060602487767494, "grad_norm": 0.19312334060668945, "learning_rate": 4.5117390126763273e-05, "loss": 1.1694, "step": 2583 }, { "epoch": 0.40621745367368195, "grad_norm": 0.15560056269168854, "learning_rate": 4.5113721473106904e-05, "loss": 1.2581, "step": 2584 }, { "epoch": 0.4063746585706145, "grad_norm": 0.27965423464775085, "learning_rate": 4.511005159098096e-05, "loss": 1.079, "step": 2585 }, { "epoch": 0.406531863467547, "grad_norm": 0.1963738203048706, "learning_rate": 4.5106380480609575e-05, "loss": 1.1422, "step": 2586 }, { "epoch": 0.40668906836447954, "grad_norm": 0.2169744223356247, "learning_rate": 4.5102708142216974e-05, "loss": 1.1722, "step": 2587 }, { "epoch": 0.4068462732614121, "grad_norm": 0.21945542097091675, "learning_rate": 4.509903457602744e-05, "loss": 1.0711, "step": 2588 }, { "epoch": 0.40700347815834464, "grad_norm": 0.16455252468585968, "learning_rate": 4.5095359782265355e-05, "loss": 1.2068, "step": 2589 }, { "epoch": 0.4071606830552772, "grad_norm": 0.18729642033576965, "learning_rate": 4.5091683761155144e-05, "loss": 1.1389, "step": 2590 }, { "epoch": 0.4073178879522097, "grad_norm": 0.15742127597332, "learning_rate": 4.508800651292134e-05, "loss": 1.1224, "step": 2591 }, { "epoch": 0.40747509284914224, "grad_norm": 0.2024814486503601, "learning_rate": 4.5084328037788526e-05, "loss": 1.1028, "step": 2592 }, { "epoch": 0.4076322977460748, "grad_norm": 0.25306496024131775, "learning_rate": 4.5080648335981366e-05, "loss": 1.1402, "step": 2593 }, { "epoch": 0.40778950264300734, "grad_norm": 0.22500720620155334, "learning_rate": 4.5076967407724614e-05, "loss": 1.0391, "step": 2594 }, { "epoch": 0.4079467075399399, "grad_norm": 0.22207467257976532, "learning_rate": 4.507328525324308e-05, "loss": 1.1483, "step": 2595 }, { "epoch": 0.4081039124368724, "grad_norm": 0.16076889634132385, "learning_rate": 4.506960187276164e-05, "loss": 1.1517, "step": 2596 }, { "epoch": 0.40826111733380493, "grad_norm": 0.19102972745895386, "learning_rate": 4.50659172665053e-05, "loss": 1.0793, "step": 2597 }, { "epoch": 0.4084183222307375, "grad_norm": 0.19929270446300507, "learning_rate": 4.506223143469908e-05, "loss": 1.1269, "step": 2598 }, { "epoch": 0.40857552712767004, "grad_norm": 0.2082059532403946, "learning_rate": 4.5058544377568076e-05, "loss": 1.0286, "step": 2599 }, { "epoch": 0.4087327320246026, "grad_norm": 0.21476319432258606, "learning_rate": 4.5054856095337516e-05, "loss": 1.1064, "step": 2600 }, { "epoch": 0.4088899369215351, "grad_norm": 0.20130178332328796, "learning_rate": 4.505116658823264e-05, "loss": 1.0551, "step": 2601 }, { "epoch": 0.40904714181846763, "grad_norm": 0.16284184157848358, "learning_rate": 4.5047475856478805e-05, "loss": 1.1272, "step": 2602 }, { "epoch": 0.4092043467154002, "grad_norm": 0.1783224493265152, "learning_rate": 4.504378390030142e-05, "loss": 1.0616, "step": 2603 }, { "epoch": 0.40936155161233273, "grad_norm": 0.18486501276493073, "learning_rate": 4.504009071992597e-05, "loss": 1.2219, "step": 2604 }, { "epoch": 0.4095187565092653, "grad_norm": 0.17707344889640808, "learning_rate": 4.503639631557803e-05, "loss": 0.9766, "step": 2605 }, { "epoch": 0.40967596140619783, "grad_norm": 0.23095828294754028, "learning_rate": 4.503270068748324e-05, "loss": 1.1355, "step": 2606 }, { "epoch": 0.4098331663031303, "grad_norm": 0.21981950104236603, "learning_rate": 4.5029003835867305e-05, "loss": 1.1692, "step": 2607 }, { "epoch": 0.4099903712000629, "grad_norm": 0.28627362847328186, "learning_rate": 4.502530576095603e-05, "loss": 1.1312, "step": 2608 }, { "epoch": 0.4101475760969954, "grad_norm": 0.22414131462574005, "learning_rate": 4.502160646297526e-05, "loss": 1.1141, "step": 2609 }, { "epoch": 0.410304780993928, "grad_norm": 0.2514089047908783, "learning_rate": 4.501790594215095e-05, "loss": 1.1178, "step": 2610 }, { "epoch": 0.4104619858908605, "grad_norm": 0.21636080741882324, "learning_rate": 4.50142041987091e-05, "loss": 1.1528, "step": 2611 }, { "epoch": 0.410619190787793, "grad_norm": 0.14287950098514557, "learning_rate": 4.501050123287582e-05, "loss": 1.0704, "step": 2612 }, { "epoch": 0.41077639568472557, "grad_norm": 0.2378457635641098, "learning_rate": 4.5006797044877245e-05, "loss": 1.295, "step": 2613 }, { "epoch": 0.4109336005816581, "grad_norm": 0.23281626403331757, "learning_rate": 4.500309163493964e-05, "loss": 1.111, "step": 2614 }, { "epoch": 0.41109080547859067, "grad_norm": 0.20313280820846558, "learning_rate": 4.49993850032893e-05, "loss": 1.1184, "step": 2615 }, { "epoch": 0.4112480103755232, "grad_norm": 0.18402956426143646, "learning_rate": 4.499567715015262e-05, "loss": 1.127, "step": 2616 }, { "epoch": 0.4114052152724557, "grad_norm": 0.15802954137325287, "learning_rate": 4.499196807575605e-05, "loss": 1.108, "step": 2617 }, { "epoch": 0.41156242016938827, "grad_norm": 0.2320290356874466, "learning_rate": 4.498825778032615e-05, "loss": 1.1369, "step": 2618 }, { "epoch": 0.4117196250663208, "grad_norm": 0.19282428920269012, "learning_rate": 4.49845462640895e-05, "loss": 1.1828, "step": 2619 }, { "epoch": 0.41187682996325337, "grad_norm": 0.18736015260219574, "learning_rate": 4.4980833527272804e-05, "loss": 1.1012, "step": 2620 }, { "epoch": 0.4120340348601859, "grad_norm": 0.13990436494350433, "learning_rate": 4.497711957010282e-05, "loss": 1.0997, "step": 2621 }, { "epoch": 0.4121912397571184, "grad_norm": 0.1439363956451416, "learning_rate": 4.497340439280638e-05, "loss": 1.1568, "step": 2622 }, { "epoch": 0.41234844465405096, "grad_norm": 0.23712140321731567, "learning_rate": 4.49696879956104e-05, "loss": 1.2776, "step": 2623 }, { "epoch": 0.4125056495509835, "grad_norm": 0.20338992774486542, "learning_rate": 4.4965970378741845e-05, "loss": 1.2083, "step": 2624 }, { "epoch": 0.41266285444791606, "grad_norm": 0.1320551484823227, "learning_rate": 4.496225154242779e-05, "loss": 1.1028, "step": 2625 }, { "epoch": 0.4128200593448486, "grad_norm": 0.20532973110675812, "learning_rate": 4.495853148689536e-05, "loss": 1.133, "step": 2626 }, { "epoch": 0.4129772642417811, "grad_norm": 0.20016834139823914, "learning_rate": 4.4954810212371766e-05, "loss": 1.1123, "step": 2627 }, { "epoch": 0.41313446913871366, "grad_norm": 0.22745396196842194, "learning_rate": 4.495108771908429e-05, "loss": 1.0813, "step": 2628 }, { "epoch": 0.4132916740356462, "grad_norm": 0.1960359364748001, "learning_rate": 4.494736400726029e-05, "loss": 1.1536, "step": 2629 }, { "epoch": 0.41344887893257876, "grad_norm": 0.22357088327407837, "learning_rate": 4.494363907712719e-05, "loss": 1.0777, "step": 2630 }, { "epoch": 0.4136060838295113, "grad_norm": 0.19476816058158875, "learning_rate": 4.4939912928912484e-05, "loss": 1.1019, "step": 2631 }, { "epoch": 0.41376328872644386, "grad_norm": 0.16308310627937317, "learning_rate": 4.493618556284377e-05, "loss": 1.1504, "step": 2632 }, { "epoch": 0.41392049362337635, "grad_norm": 0.23933067917823792, "learning_rate": 4.493245697914869e-05, "loss": 1.1173, "step": 2633 }, { "epoch": 0.4140776985203089, "grad_norm": 0.19607731699943542, "learning_rate": 4.492872717805498e-05, "loss": 1.1738, "step": 2634 }, { "epoch": 0.41423490341724145, "grad_norm": 0.1769096553325653, "learning_rate": 4.492499615979043e-05, "loss": 1.1922, "step": 2635 }, { "epoch": 0.414392108314174, "grad_norm": 0.2839674949645996, "learning_rate": 4.492126392458293e-05, "loss": 1.1784, "step": 2636 }, { "epoch": 0.41454931321110655, "grad_norm": 0.22033485770225525, "learning_rate": 4.491753047266043e-05, "loss": 1.1593, "step": 2637 }, { "epoch": 0.41470651810803905, "grad_norm": 0.23080600798130035, "learning_rate": 4.491379580425095e-05, "loss": 1.0708, "step": 2638 }, { "epoch": 0.4148637230049716, "grad_norm": 0.19914209842681885, "learning_rate": 4.491005991958258e-05, "loss": 1.1696, "step": 2639 }, { "epoch": 0.41502092790190415, "grad_norm": 0.1926482617855072, "learning_rate": 4.490632281888351e-05, "loss": 1.1159, "step": 2640 }, { "epoch": 0.4151781327988367, "grad_norm": 0.20513269305229187, "learning_rate": 4.4902584502381986e-05, "loss": 1.0759, "step": 2641 }, { "epoch": 0.41533533769576925, "grad_norm": 0.19525010883808136, "learning_rate": 4.4898844970306306e-05, "loss": 1.1967, "step": 2642 }, { "epoch": 0.41549254259270174, "grad_norm": 0.2566875219345093, "learning_rate": 4.48951042228849e-05, "loss": 1.148, "step": 2643 }, { "epoch": 0.4156497474896343, "grad_norm": 0.23048824071884155, "learning_rate": 4.4891362260346226e-05, "loss": 1.2564, "step": 2644 }, { "epoch": 0.41580695238656684, "grad_norm": 0.22120051085948944, "learning_rate": 4.488761908291882e-05, "loss": 1.2008, "step": 2645 }, { "epoch": 0.4159641572834994, "grad_norm": 0.2097010314464569, "learning_rate": 4.488387469083131e-05, "loss": 1.1148, "step": 2646 }, { "epoch": 0.41612136218043194, "grad_norm": 0.18486234545707703, "learning_rate": 4.488012908431239e-05, "loss": 1.0886, "step": 2647 }, { "epoch": 0.41627856707736444, "grad_norm": 0.1906927525997162, "learning_rate": 4.487638226359082e-05, "loss": 1.1823, "step": 2648 }, { "epoch": 0.416435771974297, "grad_norm": 0.2732648253440857, "learning_rate": 4.487263422889545e-05, "loss": 1.0555, "step": 2649 }, { "epoch": 0.41659297687122954, "grad_norm": 0.1381736546754837, "learning_rate": 4.486888498045519e-05, "loss": 1.1222, "step": 2650 }, { "epoch": 0.4167501817681621, "grad_norm": 0.13629379868507385, "learning_rate": 4.486513451849903e-05, "loss": 1.0836, "step": 2651 }, { "epoch": 0.41690738666509464, "grad_norm": 0.18362641334533691, "learning_rate": 4.4861382843256035e-05, "loss": 1.2531, "step": 2652 }, { "epoch": 0.41706459156202713, "grad_norm": 0.24584434926509857, "learning_rate": 4.485762995495534e-05, "loss": 1.1816, "step": 2653 }, { "epoch": 0.4172217964589597, "grad_norm": 0.21411539614200592, "learning_rate": 4.485387585382617e-05, "loss": 1.1371, "step": 2654 }, { "epoch": 0.41737900135589223, "grad_norm": 0.18241286277770996, "learning_rate": 4.485012054009779e-05, "loss": 1.1405, "step": 2655 }, { "epoch": 0.4175362062528248, "grad_norm": 0.18648092448711395, "learning_rate": 4.4846364013999584e-05, "loss": 1.085, "step": 2656 }, { "epoch": 0.41769341114975733, "grad_norm": 0.19255079329013824, "learning_rate": 4.484260627576098e-05, "loss": 1.0671, "step": 2657 }, { "epoch": 0.4178506160466898, "grad_norm": 0.17891213297843933, "learning_rate": 4.483884732561147e-05, "loss": 1.1641, "step": 2658 }, { "epoch": 0.4180078209436224, "grad_norm": 0.26097598671913147, "learning_rate": 4.483508716378064e-05, "loss": 1.0732, "step": 2659 }, { "epoch": 0.4181650258405549, "grad_norm": 0.16144752502441406, "learning_rate": 4.483132579049817e-05, "loss": 1.1039, "step": 2660 }, { "epoch": 0.4183222307374875, "grad_norm": 0.3161677420139313, "learning_rate": 4.482756320599376e-05, "loss": 1.1283, "step": 2661 }, { "epoch": 0.41847943563442, "grad_norm": 0.2448277771472931, "learning_rate": 4.482379941049724e-05, "loss": 1.1227, "step": 2662 }, { "epoch": 0.4186366405313526, "grad_norm": 0.23270602524280548, "learning_rate": 4.482003440423847e-05, "loss": 1.0057, "step": 2663 }, { "epoch": 0.41879384542828507, "grad_norm": 0.2527409791946411, "learning_rate": 4.481626818744741e-05, "loss": 1.0883, "step": 2664 }, { "epoch": 0.4189510503252176, "grad_norm": 0.2619737982749939, "learning_rate": 4.481250076035408e-05, "loss": 1.0826, "step": 2665 }, { "epoch": 0.4191082552221502, "grad_norm": 0.16641849279403687, "learning_rate": 4.4808732123188593e-05, "loss": 1.0928, "step": 2666 }, { "epoch": 0.4192654601190827, "grad_norm": 0.22881364822387695, "learning_rate": 4.4804962276181115e-05, "loss": 1.2395, "step": 2667 }, { "epoch": 0.4194226650160153, "grad_norm": 0.1922225058078766, "learning_rate": 4.4801191219561886e-05, "loss": 1.125, "step": 2668 }, { "epoch": 0.41957986991294777, "grad_norm": 0.16535848379135132, "learning_rate": 4.479741895356124e-05, "loss": 1.1668, "step": 2669 }, { "epoch": 0.4197370748098803, "grad_norm": 0.1907588392496109, "learning_rate": 4.4793645478409576e-05, "loss": 1.1296, "step": 2670 }, { "epoch": 0.41989427970681287, "grad_norm": 0.19568997621536255, "learning_rate": 4.4789870794337355e-05, "loss": 1.0744, "step": 2671 }, { "epoch": 0.4200514846037454, "grad_norm": 0.2092171162366867, "learning_rate": 4.478609490157512e-05, "loss": 1.1545, "step": 2672 }, { "epoch": 0.42020868950067797, "grad_norm": 0.18425819277763367, "learning_rate": 4.4782317800353476e-05, "loss": 1.1108, "step": 2673 }, { "epoch": 0.42036589439761046, "grad_norm": 0.16294459998607635, "learning_rate": 4.477853949090314e-05, "loss": 1.1444, "step": 2674 }, { "epoch": 0.420523099294543, "grad_norm": 0.1816370189189911, "learning_rate": 4.477475997345486e-05, "loss": 1.1692, "step": 2675 }, { "epoch": 0.42068030419147556, "grad_norm": 0.1853678822517395, "learning_rate": 4.477097924823948e-05, "loss": 1.1799, "step": 2676 }, { "epoch": 0.4208375090884081, "grad_norm": 0.19207173585891724, "learning_rate": 4.476719731548792e-05, "loss": 1.1308, "step": 2677 }, { "epoch": 0.42099471398534066, "grad_norm": 0.18002179265022278, "learning_rate": 4.4763414175431146e-05, "loss": 1.1212, "step": 2678 }, { "epoch": 0.42115191888227316, "grad_norm": 0.2948683500289917, "learning_rate": 4.4759629828300234e-05, "loss": 1.1712, "step": 2679 }, { "epoch": 0.4213091237792057, "grad_norm": 0.1672886162996292, "learning_rate": 4.475584427432631e-05, "loss": 1.098, "step": 2680 }, { "epoch": 0.42146632867613826, "grad_norm": 0.1577957570552826, "learning_rate": 4.4752057513740584e-05, "loss": 1.0854, "step": 2681 }, { "epoch": 0.4216235335730708, "grad_norm": 0.20781965553760529, "learning_rate": 4.474826954677434e-05, "loss": 1.1503, "step": 2682 }, { "epoch": 0.42178073847000336, "grad_norm": 0.20414681732654572, "learning_rate": 4.4744480373658925e-05, "loss": 1.1162, "step": 2683 }, { "epoch": 0.42193794336693585, "grad_norm": 0.1716751754283905, "learning_rate": 4.4740689994625775e-05, "loss": 1.0506, "step": 2684 }, { "epoch": 0.4220951482638684, "grad_norm": 0.20430327951908112, "learning_rate": 4.4736898409906385e-05, "loss": 1.1359, "step": 2685 }, { "epoch": 0.42225235316080095, "grad_norm": 0.21866196393966675, "learning_rate": 4.4733105619732334e-05, "loss": 1.138, "step": 2686 }, { "epoch": 0.4224095580577335, "grad_norm": 0.19445818662643433, "learning_rate": 4.4729311624335275e-05, "loss": 1.1227, "step": 2687 }, { "epoch": 0.42256676295466605, "grad_norm": 0.16323702037334442, "learning_rate": 4.472551642394693e-05, "loss": 1.1723, "step": 2688 }, { "epoch": 0.4227239678515986, "grad_norm": 0.18188408017158508, "learning_rate": 4.472172001879909e-05, "loss": 1.1264, "step": 2689 }, { "epoch": 0.4228811727485311, "grad_norm": 0.25359630584716797, "learning_rate": 4.471792240912362e-05, "loss": 1.164, "step": 2690 }, { "epoch": 0.42303837764546365, "grad_norm": 0.14942757785320282, "learning_rate": 4.4714123595152476e-05, "loss": 1.2065, "step": 2691 }, { "epoch": 0.4231955825423962, "grad_norm": 0.16352878510951996, "learning_rate": 4.471032357711767e-05, "loss": 1.1007, "step": 2692 }, { "epoch": 0.42335278743932875, "grad_norm": 0.25480419397354126, "learning_rate": 4.470652235525129e-05, "loss": 1.1349, "step": 2693 }, { "epoch": 0.4235099923362613, "grad_norm": 0.201410710811615, "learning_rate": 4.4702719929785505e-05, "loss": 1.1439, "step": 2694 }, { "epoch": 0.4236671972331938, "grad_norm": 0.20557720959186554, "learning_rate": 4.469891630095256e-05, "loss": 1.192, "step": 2695 }, { "epoch": 0.42382440213012634, "grad_norm": 0.1845296025276184, "learning_rate": 4.469511146898475e-05, "loss": 1.1645, "step": 2696 }, { "epoch": 0.4239816070270589, "grad_norm": 0.1634630411863327, "learning_rate": 4.4691305434114466e-05, "loss": 1.0994, "step": 2697 }, { "epoch": 0.42413881192399144, "grad_norm": 0.25110775232315063, "learning_rate": 4.4687498196574165e-05, "loss": 1.2912, "step": 2698 }, { "epoch": 0.424296016820924, "grad_norm": 0.1885124295949936, "learning_rate": 4.468368975659638e-05, "loss": 1.1205, "step": 2699 }, { "epoch": 0.4244532217178565, "grad_norm": 0.17171499133110046, "learning_rate": 4.467988011441372e-05, "loss": 1.1772, "step": 2700 }, { "epoch": 0.42461042661478904, "grad_norm": 0.19311368465423584, "learning_rate": 4.467606927025886e-05, "loss": 1.1761, "step": 2701 }, { "epoch": 0.4247676315117216, "grad_norm": 0.18169812858104706, "learning_rate": 4.4672257224364545e-05, "loss": 1.0528, "step": 2702 }, { "epoch": 0.42492483640865414, "grad_norm": 0.2034253031015396, "learning_rate": 4.466844397696361e-05, "loss": 1.0786, "step": 2703 }, { "epoch": 0.4250820413055867, "grad_norm": 0.1843811571598053, "learning_rate": 4.466462952828895e-05, "loss": 1.052, "step": 2704 }, { "epoch": 0.4252392462025192, "grad_norm": 0.19026444852352142, "learning_rate": 4.466081387857354e-05, "loss": 1.1456, "step": 2705 }, { "epoch": 0.42539645109945173, "grad_norm": 0.16506510972976685, "learning_rate": 4.4656997028050426e-05, "loss": 1.2883, "step": 2706 }, { "epoch": 0.4255536559963843, "grad_norm": 0.25424107909202576, "learning_rate": 4.465317897695271e-05, "loss": 0.9969, "step": 2707 }, { "epoch": 0.42571086089331683, "grad_norm": 0.18849819898605347, "learning_rate": 4.464935972551361e-05, "loss": 1.1043, "step": 2708 }, { "epoch": 0.4258680657902494, "grad_norm": 0.22406542301177979, "learning_rate": 4.4645539273966374e-05, "loss": 1.1311, "step": 2709 }, { "epoch": 0.4260252706871819, "grad_norm": 0.1915282905101776, "learning_rate": 4.464171762254436e-05, "loss": 1.1565, "step": 2710 }, { "epoch": 0.42618247558411443, "grad_norm": 0.21550290286540985, "learning_rate": 4.463789477148094e-05, "loss": 1.0745, "step": 2711 }, { "epoch": 0.426339680481047, "grad_norm": 0.18291574716567993, "learning_rate": 4.463407072100964e-05, "loss": 1.1443, "step": 2712 }, { "epoch": 0.42649688537797953, "grad_norm": 0.174668088555336, "learning_rate": 4.4630245471364004e-05, "loss": 1.0771, "step": 2713 }, { "epoch": 0.4266540902749121, "grad_norm": 0.21048250794410706, "learning_rate": 4.462641902277765e-05, "loss": 1.1574, "step": 2714 }, { "epoch": 0.42681129517184463, "grad_norm": 0.22583508491516113, "learning_rate": 4.4622591375484316e-05, "loss": 1.1584, "step": 2715 }, { "epoch": 0.4269685000687771, "grad_norm": 0.17098526656627655, "learning_rate": 4.461876252971774e-05, "loss": 1.0009, "step": 2716 }, { "epoch": 0.4271257049657097, "grad_norm": 0.16546213626861572, "learning_rate": 4.4614932485711805e-05, "loss": 1.1723, "step": 2717 }, { "epoch": 0.4272829098626422, "grad_norm": 0.1419089436531067, "learning_rate": 4.461110124370042e-05, "loss": 1.2826, "step": 2718 }, { "epoch": 0.4274401147595748, "grad_norm": 0.2434864342212677, "learning_rate": 4.460726880391759e-05, "loss": 1.0681, "step": 2719 }, { "epoch": 0.4275973196565073, "grad_norm": 0.23129241168498993, "learning_rate": 4.460343516659738e-05, "loss": 1.2189, "step": 2720 }, { "epoch": 0.4275973196565073, "eval_loss": 1.121034026145935, "eval_runtime": 2296.2998, "eval_samples_per_second": 4.032, "eval_steps_per_second": 2.016, "step": 2720 }, { "epoch": 0.4277545245534398, "grad_norm": 0.16803450882434845, "learning_rate": 4.4599600331973936e-05, "loss": 1.3243, "step": 2721 }, { "epoch": 0.42791172945037237, "grad_norm": 0.20555785298347473, "learning_rate": 4.459576430028147e-05, "loss": 1.2458, "step": 2722 }, { "epoch": 0.4280689343473049, "grad_norm": 0.17189499735832214, "learning_rate": 4.459192707175428e-05, "loss": 1.0436, "step": 2723 }, { "epoch": 0.42822613924423747, "grad_norm": 0.21618017554283142, "learning_rate": 4.4588088646626736e-05, "loss": 1.1963, "step": 2724 }, { "epoch": 0.42838334414117, "grad_norm": 0.1805650144815445, "learning_rate": 4.4584249025133256e-05, "loss": 1.1143, "step": 2725 }, { "epoch": 0.4285405490381025, "grad_norm": 0.1871338039636612, "learning_rate": 4.458040820750836e-05, "loss": 1.0584, "step": 2726 }, { "epoch": 0.42869775393503506, "grad_norm": 0.197019562125206, "learning_rate": 4.4576566193986635e-05, "loss": 1.0886, "step": 2727 }, { "epoch": 0.4288549588319676, "grad_norm": 0.1631096750497818, "learning_rate": 4.457272298480273e-05, "loss": 1.1358, "step": 2728 }, { "epoch": 0.42901216372890016, "grad_norm": 0.16281025111675262, "learning_rate": 4.4568878580191364e-05, "loss": 1.1463, "step": 2729 }, { "epoch": 0.4291693686258327, "grad_norm": 0.1618165671825409, "learning_rate": 4.456503298038735e-05, "loss": 1.1639, "step": 2730 }, { "epoch": 0.4293265735227652, "grad_norm": 0.20703785121440887, "learning_rate": 4.4561186185625574e-05, "loss": 1.1258, "step": 2731 }, { "epoch": 0.42948377841969776, "grad_norm": 0.2330520749092102, "learning_rate": 4.455733819614096e-05, "loss": 1.0725, "step": 2732 }, { "epoch": 0.4296409833166303, "grad_norm": 0.18974269926548004, "learning_rate": 4.4553489012168546e-05, "loss": 1.1385, "step": 2733 }, { "epoch": 0.42979818821356286, "grad_norm": 0.17599286139011383, "learning_rate": 4.454963863394343e-05, "loss": 1.1215, "step": 2734 }, { "epoch": 0.4299553931104954, "grad_norm": 0.15310634672641754, "learning_rate": 4.454578706170075e-05, "loss": 1.137, "step": 2735 }, { "epoch": 0.4301125980074279, "grad_norm": 0.15119053423404694, "learning_rate": 4.454193429567577e-05, "loss": 1.152, "step": 2736 }, { "epoch": 0.43026980290436045, "grad_norm": 0.18684156239032745, "learning_rate": 4.45380803361038e-05, "loss": 1.1315, "step": 2737 }, { "epoch": 0.430427007801293, "grad_norm": 0.1770356446504593, "learning_rate": 4.45342251832202e-05, "loss": 1.0439, "step": 2738 }, { "epoch": 0.43058421269822555, "grad_norm": 0.151130810379982, "learning_rate": 4.453036883726047e-05, "loss": 1.1593, "step": 2739 }, { "epoch": 0.4307414175951581, "grad_norm": 0.20127306878566742, "learning_rate": 4.4526511298460114e-05, "loss": 1.1414, "step": 2740 }, { "epoch": 0.43089862249209065, "grad_norm": 0.1737474501132965, "learning_rate": 4.452265256705474e-05, "loss": 1.2059, "step": 2741 }, { "epoch": 0.43105582738902315, "grad_norm": 0.15642091631889343, "learning_rate": 4.451879264328003e-05, "loss": 1.2293, "step": 2742 }, { "epoch": 0.4312130322859557, "grad_norm": 0.1680547147989273, "learning_rate": 4.451493152737172e-05, "loss": 1.0843, "step": 2743 }, { "epoch": 0.43137023718288825, "grad_norm": 0.3224833607673645, "learning_rate": 4.451106921956565e-05, "loss": 1.1196, "step": 2744 }, { "epoch": 0.4315274420798208, "grad_norm": 0.20196394622325897, "learning_rate": 4.450720572009771e-05, "loss": 1.0971, "step": 2745 }, { "epoch": 0.43168464697675335, "grad_norm": 0.1925266832113266, "learning_rate": 4.4503341029203856e-05, "loss": 1.2121, "step": 2746 }, { "epoch": 0.43184185187368584, "grad_norm": 0.1832456737756729, "learning_rate": 4.449947514712014e-05, "loss": 1.0814, "step": 2747 }, { "epoch": 0.4319990567706184, "grad_norm": 0.17716901004314423, "learning_rate": 4.449560807408267e-05, "loss": 1.153, "step": 2748 }, { "epoch": 0.43215626166755094, "grad_norm": 0.20120856165885925, "learning_rate": 4.4491739810327635e-05, "loss": 1.1343, "step": 2749 }, { "epoch": 0.4323134665644835, "grad_norm": 0.15247990190982819, "learning_rate": 4.44878703560913e-05, "loss": 1.1549, "step": 2750 }, { "epoch": 0.43247067146141605, "grad_norm": 0.230882465839386, "learning_rate": 4.448399971160999e-05, "loss": 1.0305, "step": 2751 }, { "epoch": 0.43262787635834854, "grad_norm": 0.23438026010990143, "learning_rate": 4.44801278771201e-05, "loss": 1.1409, "step": 2752 }, { "epoch": 0.4327850812552811, "grad_norm": 0.18865425884723663, "learning_rate": 4.447625485285813e-05, "loss": 1.1122, "step": 2753 }, { "epoch": 0.43294228615221364, "grad_norm": 0.18455225229263306, "learning_rate": 4.4472380639060605e-05, "loss": 1.1954, "step": 2754 }, { "epoch": 0.4330994910491462, "grad_norm": 0.17371979355812073, "learning_rate": 4.4468505235964165e-05, "loss": 1.0564, "step": 2755 }, { "epoch": 0.43325669594607874, "grad_norm": 0.17381125688552856, "learning_rate": 4.4464628643805495e-05, "loss": 1.1591, "step": 2756 }, { "epoch": 0.43341390084301123, "grad_norm": 0.20750965178012848, "learning_rate": 4.4460750862821366e-05, "loss": 1.0045, "step": 2757 }, { "epoch": 0.4335711057399438, "grad_norm": 0.2176593393087387, "learning_rate": 4.445687189324862e-05, "loss": 1.1218, "step": 2758 }, { "epoch": 0.43372831063687634, "grad_norm": 0.20053786039352417, "learning_rate": 4.445299173532416e-05, "loss": 1.2038, "step": 2759 }, { "epoch": 0.4338855155338089, "grad_norm": 0.159897580742836, "learning_rate": 4.444911038928499e-05, "loss": 1.1344, "step": 2760 }, { "epoch": 0.43404272043074144, "grad_norm": 0.19078446924686432, "learning_rate": 4.4445227855368144e-05, "loss": 1.1395, "step": 2761 }, { "epoch": 0.43419992532767393, "grad_norm": 0.15986692905426025, "learning_rate": 4.4441344133810766e-05, "loss": 1.2075, "step": 2762 }, { "epoch": 0.4343571302246065, "grad_norm": 0.1700146645307541, "learning_rate": 4.443745922485006e-05, "loss": 1.1439, "step": 2763 }, { "epoch": 0.43451433512153903, "grad_norm": 0.16691164672374725, "learning_rate": 4.4433573128723306e-05, "loss": 0.9497, "step": 2764 }, { "epoch": 0.4346715400184716, "grad_norm": 0.31677502393722534, "learning_rate": 4.442968584566784e-05, "loss": 1.0701, "step": 2765 }, { "epoch": 0.43482874491540413, "grad_norm": 0.17169591784477234, "learning_rate": 4.442579737592109e-05, "loss": 1.1079, "step": 2766 }, { "epoch": 0.4349859498123367, "grad_norm": 0.33990463614463806, "learning_rate": 4.442190771972054e-05, "loss": 1.0178, "step": 2767 }, { "epoch": 0.4351431547092692, "grad_norm": 0.1680772304534912, "learning_rate": 4.441801687730377e-05, "loss": 1.2274, "step": 2768 }, { "epoch": 0.4353003596062017, "grad_norm": 0.21385186910629272, "learning_rate": 4.441412484890841e-05, "loss": 1.1274, "step": 2769 }, { "epoch": 0.4354575645031343, "grad_norm": 0.19230084121227264, "learning_rate": 4.4410231634772164e-05, "loss": 0.9932, "step": 2770 }, { "epoch": 0.4356147694000668, "grad_norm": 0.21352258324623108, "learning_rate": 4.440633723513282e-05, "loss": 1.0798, "step": 2771 }, { "epoch": 0.4357719742969994, "grad_norm": 0.17857983708381653, "learning_rate": 4.440244165022824e-05, "loss": 1.1705, "step": 2772 }, { "epoch": 0.43592917919393187, "grad_norm": 0.1947200447320938, "learning_rate": 4.439854488029634e-05, "loss": 1.0449, "step": 2773 }, { "epoch": 0.4360863840908644, "grad_norm": 0.22335095703601837, "learning_rate": 4.439464692557514e-05, "loss": 1.0209, "step": 2774 }, { "epoch": 0.43624358898779697, "grad_norm": 0.16146698594093323, "learning_rate": 4.439074778630268e-05, "loss": 1.0747, "step": 2775 }, { "epoch": 0.4364007938847295, "grad_norm": 0.20121018588542938, "learning_rate": 4.4386847462717126e-05, "loss": 1.1736, "step": 2776 }, { "epoch": 0.43655799878166207, "grad_norm": 0.20480762422084808, "learning_rate": 4.43829459550567e-05, "loss": 1.0921, "step": 2777 }, { "epoch": 0.43671520367859457, "grad_norm": 0.1490727663040161, "learning_rate": 4.437904326355967e-05, "loss": 1.1989, "step": 2778 }, { "epoch": 0.4368724085755271, "grad_norm": 0.2289019227027893, "learning_rate": 4.4375139388464415e-05, "loss": 1.0705, "step": 2779 }, { "epoch": 0.43702961347245967, "grad_norm": 0.21382930874824524, "learning_rate": 4.437123433000937e-05, "loss": 1.1479, "step": 2780 }, { "epoch": 0.4371868183693922, "grad_norm": 0.1747499406337738, "learning_rate": 4.4367328088433026e-05, "loss": 1.0534, "step": 2781 }, { "epoch": 0.43734402326632477, "grad_norm": 0.1528068631887436, "learning_rate": 4.436342066397397e-05, "loss": 1.0571, "step": 2782 }, { "epoch": 0.43750122816325726, "grad_norm": 0.2127470076084137, "learning_rate": 4.435951205687086e-05, "loss": 1.0721, "step": 2783 }, { "epoch": 0.4376584330601898, "grad_norm": 0.20668397843837738, "learning_rate": 4.4355602267362404e-05, "loss": 1.0828, "step": 2784 }, { "epoch": 0.43781563795712236, "grad_norm": 0.17368297278881073, "learning_rate": 4.435169129568742e-05, "loss": 1.0282, "step": 2785 }, { "epoch": 0.4379728428540549, "grad_norm": 0.20142029225826263, "learning_rate": 4.434777914208475e-05, "loss": 1.042, "step": 2786 }, { "epoch": 0.43813004775098746, "grad_norm": 0.15896591544151306, "learning_rate": 4.434386580679334e-05, "loss": 1.1075, "step": 2787 }, { "epoch": 0.43828725264791996, "grad_norm": 0.24827557802200317, "learning_rate": 4.433995129005221e-05, "loss": 1.1095, "step": 2788 }, { "epoch": 0.4384444575448525, "grad_norm": 0.20462195575237274, "learning_rate": 4.433603559210043e-05, "loss": 1.0703, "step": 2789 }, { "epoch": 0.43860166244178506, "grad_norm": 0.20382918417453766, "learning_rate": 4.4332118713177175e-05, "loss": 1.1476, "step": 2790 }, { "epoch": 0.4387588673387176, "grad_norm": 0.8631371855735779, "learning_rate": 4.432820065352166e-05, "loss": 1.158, "step": 2791 }, { "epoch": 0.43891607223565016, "grad_norm": 0.18501952290534973, "learning_rate": 4.432428141337318e-05, "loss": 1.1529, "step": 2792 }, { "epoch": 0.4390732771325827, "grad_norm": 0.21414092183113098, "learning_rate": 4.432036099297113e-05, "loss": 1.0746, "step": 2793 }, { "epoch": 0.4392304820295152, "grad_norm": 0.15636850893497467, "learning_rate": 4.4316439392554934e-05, "loss": 1.1779, "step": 2794 }, { "epoch": 0.43938768692644775, "grad_norm": 0.17315103113651276, "learning_rate": 4.4312516612364106e-05, "loss": 1.0921, "step": 2795 }, { "epoch": 0.4395448918233803, "grad_norm": 0.15908555686473846, "learning_rate": 4.4308592652638245e-05, "loss": 1.1729, "step": 2796 }, { "epoch": 0.43970209672031285, "grad_norm": 0.14251650869846344, "learning_rate": 4.4304667513617014e-05, "loss": 1.1684, "step": 2797 }, { "epoch": 0.4398593016172454, "grad_norm": 0.29002174735069275, "learning_rate": 4.4300741195540144e-05, "loss": 1.1376, "step": 2798 }, { "epoch": 0.4400165065141779, "grad_norm": 0.1559830904006958, "learning_rate": 4.429681369864743e-05, "loss": 1.1366, "step": 2799 }, { "epoch": 0.44017371141111045, "grad_norm": 0.2127392441034317, "learning_rate": 4.429288502317876e-05, "loss": 1.144, "step": 2800 }, { "epoch": 0.440330916308043, "grad_norm": 0.1910010576248169, "learning_rate": 4.428895516937408e-05, "loss": 1.0268, "step": 2801 }, { "epoch": 0.44048812120497555, "grad_norm": 0.14085833728313446, "learning_rate": 4.42850241374734e-05, "loss": 1.2426, "step": 2802 }, { "epoch": 0.4406453261019081, "grad_norm": 0.16497376561164856, "learning_rate": 4.428109192771682e-05, "loss": 0.9953, "step": 2803 }, { "epoch": 0.4408025309988406, "grad_norm": 0.24869468808174133, "learning_rate": 4.427715854034451e-05, "loss": 1.1183, "step": 2804 }, { "epoch": 0.44095973589577314, "grad_norm": 0.2633094787597656, "learning_rate": 4.4273223975596704e-05, "loss": 1.1522, "step": 2805 }, { "epoch": 0.4411169407927057, "grad_norm": 0.17634916305541992, "learning_rate": 4.4269288233713704e-05, "loss": 1.2075, "step": 2806 }, { "epoch": 0.44127414568963824, "grad_norm": 0.2116928994655609, "learning_rate": 4.426535131493589e-05, "loss": 1.0832, "step": 2807 }, { "epoch": 0.4414313505865708, "grad_norm": 0.17454542219638824, "learning_rate": 4.4261413219503714e-05, "loss": 1.0844, "step": 2808 }, { "epoch": 0.4415885554835033, "grad_norm": 0.21070100367069244, "learning_rate": 4.425747394765771e-05, "loss": 1.105, "step": 2809 }, { "epoch": 0.44174576038043584, "grad_norm": 0.17848363518714905, "learning_rate": 4.425353349963847e-05, "loss": 1.1904, "step": 2810 }, { "epoch": 0.4419029652773684, "grad_norm": 0.15250985324382782, "learning_rate": 4.4249591875686655e-05, "loss": 1.1707, "step": 2811 }, { "epoch": 0.44206017017430094, "grad_norm": 0.1811569631099701, "learning_rate": 4.4245649076043e-05, "loss": 1.1703, "step": 2812 }, { "epoch": 0.4422173750712335, "grad_norm": 0.2516029477119446, "learning_rate": 4.424170510094834e-05, "loss": 1.1403, "step": 2813 }, { "epoch": 0.442374579968166, "grad_norm": 0.19467385113239288, "learning_rate": 4.423775995064353e-05, "loss": 1.079, "step": 2814 }, { "epoch": 0.44253178486509853, "grad_norm": 0.17406821250915527, "learning_rate": 4.4233813625369547e-05, "loss": 1.0794, "step": 2815 }, { "epoch": 0.4426889897620311, "grad_norm": 0.18775492906570435, "learning_rate": 4.4229866125367404e-05, "loss": 1.134, "step": 2816 }, { "epoch": 0.44284619465896363, "grad_norm": 0.18786782026290894, "learning_rate": 4.42259174508782e-05, "loss": 1.0186, "step": 2817 }, { "epoch": 0.4430033995558962, "grad_norm": 0.23192930221557617, "learning_rate": 4.422196760214311e-05, "loss": 1.1521, "step": 2818 }, { "epoch": 0.44316060445282873, "grad_norm": 0.21265646815299988, "learning_rate": 4.421801657940337e-05, "loss": 1.0756, "step": 2819 }, { "epoch": 0.4433178093497612, "grad_norm": 0.1617029756307602, "learning_rate": 4.42140643829003e-05, "loss": 0.9996, "step": 2820 }, { "epoch": 0.4434750142466938, "grad_norm": 0.15906690061092377, "learning_rate": 4.421011101287529e-05, "loss": 1.0924, "step": 2821 }, { "epoch": 0.4436322191436263, "grad_norm": 0.17704284191131592, "learning_rate": 4.4206156469569774e-05, "loss": 1.1341, "step": 2822 }, { "epoch": 0.4437894240405589, "grad_norm": 0.1560877412557602, "learning_rate": 4.420220075322531e-05, "loss": 1.1215, "step": 2823 }, { "epoch": 0.4439466289374914, "grad_norm": 0.16209112107753754, "learning_rate": 4.4198243864083474e-05, "loss": 1.2121, "step": 2824 }, { "epoch": 0.4441038338344239, "grad_norm": 0.21708954870700836, "learning_rate": 4.4194285802385946e-05, "loss": 1.1038, "step": 2825 }, { "epoch": 0.4442610387313565, "grad_norm": 0.18496249616146088, "learning_rate": 4.419032656837448e-05, "loss": 1.1421, "step": 2826 }, { "epoch": 0.444418243628289, "grad_norm": 0.1930168867111206, "learning_rate": 4.418636616229087e-05, "loss": 1.0371, "step": 2827 }, { "epoch": 0.4445754485252216, "grad_norm": 0.287266343832016, "learning_rate": 4.4182404584377026e-05, "loss": 1.0948, "step": 2828 }, { "epoch": 0.4447326534221541, "grad_norm": 0.16841591894626617, "learning_rate": 4.417844183487488e-05, "loss": 1.0902, "step": 2829 }, { "epoch": 0.4448898583190866, "grad_norm": 0.1748381108045578, "learning_rate": 4.417447791402649e-05, "loss": 1.124, "step": 2830 }, { "epoch": 0.44504706321601917, "grad_norm": 0.23243847489356995, "learning_rate": 4.417051282207394e-05, "loss": 1.0163, "step": 2831 }, { "epoch": 0.4452042681129517, "grad_norm": 0.15848687291145325, "learning_rate": 4.41665465592594e-05, "loss": 1.0835, "step": 2832 }, { "epoch": 0.44536147300988427, "grad_norm": 0.18608181178569794, "learning_rate": 4.4162579125825124e-05, "loss": 1.1433, "step": 2833 }, { "epoch": 0.4455186779068168, "grad_norm": 0.2262060046195984, "learning_rate": 4.4158610522013424e-05, "loss": 1.0953, "step": 2834 }, { "epoch": 0.4456758828037493, "grad_norm": 0.2521316409111023, "learning_rate": 4.415464074806669e-05, "loss": 0.9369, "step": 2835 }, { "epoch": 0.44583308770068186, "grad_norm": 0.16947337985038757, "learning_rate": 4.415066980422737e-05, "loss": 1.119, "step": 2836 }, { "epoch": 0.4459902925976144, "grad_norm": 0.17369608581066132, "learning_rate": 4.4146697690738015e-05, "loss": 1.1021, "step": 2837 }, { "epoch": 0.44614749749454696, "grad_norm": 0.13055743277072906, "learning_rate": 4.41427244078412e-05, "loss": 1.1948, "step": 2838 }, { "epoch": 0.4463047023914795, "grad_norm": 0.1885204315185547, "learning_rate": 4.4138749955779617e-05, "loss": 1.1319, "step": 2839 }, { "epoch": 0.446461907288412, "grad_norm": 0.24677696824073792, "learning_rate": 4.4134774334796005e-05, "loss": 1.1474, "step": 2840 }, { "epoch": 0.44661911218534456, "grad_norm": 0.1810072809457779, "learning_rate": 4.413079754513318e-05, "loss": 1.1885, "step": 2841 }, { "epoch": 0.4467763170822771, "grad_norm": 0.14184872806072235, "learning_rate": 4.412681958703403e-05, "loss": 1.2749, "step": 2842 }, { "epoch": 0.44693352197920966, "grad_norm": 0.1864720582962036, "learning_rate": 4.412284046074151e-05, "loss": 1.0701, "step": 2843 }, { "epoch": 0.4470907268761422, "grad_norm": 0.16632923483848572, "learning_rate": 4.411886016649865e-05, "loss": 1.0741, "step": 2844 }, { "epoch": 0.44724793177307476, "grad_norm": 0.1669205278158188, "learning_rate": 4.4114878704548555e-05, "loss": 1.1825, "step": 2845 }, { "epoch": 0.44740513667000725, "grad_norm": 0.16396629810333252, "learning_rate": 4.41108960751344e-05, "loss": 1.1323, "step": 2846 }, { "epoch": 0.4475623415669398, "grad_norm": 0.2604614794254303, "learning_rate": 4.410691227849942e-05, "loss": 1.1358, "step": 2847 }, { "epoch": 0.44771954646387235, "grad_norm": 0.20713360607624054, "learning_rate": 4.410292731488694e-05, "loss": 1.054, "step": 2848 }, { "epoch": 0.4478767513608049, "grad_norm": 0.20656947791576385, "learning_rate": 4.4098941184540335e-05, "loss": 1.0937, "step": 2849 }, { "epoch": 0.44803395625773745, "grad_norm": 0.19168046116828918, "learning_rate": 4.4094953887703074e-05, "loss": 1.0854, "step": 2850 }, { "epoch": 0.44819116115466995, "grad_norm": 0.2075878083705902, "learning_rate": 4.409096542461868e-05, "loss": 1.0695, "step": 2851 }, { "epoch": 0.4483483660516025, "grad_norm": 0.226437047123909, "learning_rate": 4.408697579553076e-05, "loss": 1.1658, "step": 2852 }, { "epoch": 0.44850557094853505, "grad_norm": 0.14772802591323853, "learning_rate": 4.408298500068297e-05, "loss": 1.2319, "step": 2853 }, { "epoch": 0.4486627758454676, "grad_norm": 0.17494453489780426, "learning_rate": 4.407899304031906e-05, "loss": 1.0833, "step": 2854 }, { "epoch": 0.44881998074240015, "grad_norm": 0.21048682928085327, "learning_rate": 4.407499991468286e-05, "loss": 1.0763, "step": 2855 }, { "epoch": 0.44897718563933264, "grad_norm": 0.17920631170272827, "learning_rate": 4.407100562401823e-05, "loss": 1.0633, "step": 2856 }, { "epoch": 0.4491343905362652, "grad_norm": 0.17860840260982513, "learning_rate": 4.406701016856914e-05, "loss": 1.1584, "step": 2857 }, { "epoch": 0.44929159543319774, "grad_norm": 0.15417364239692688, "learning_rate": 4.406301354857962e-05, "loss": 1.1879, "step": 2858 }, { "epoch": 0.4494488003301303, "grad_norm": 0.15544764697551727, "learning_rate": 4.405901576429375e-05, "loss": 1.1779, "step": 2859 }, { "epoch": 0.44960600522706284, "grad_norm": 0.21157599985599518, "learning_rate": 4.4055016815955716e-05, "loss": 1.1368, "step": 2860 }, { "epoch": 0.44976321012399534, "grad_norm": 0.1670396625995636, "learning_rate": 4.405101670380976e-05, "loss": 1.1953, "step": 2861 }, { "epoch": 0.4499204150209279, "grad_norm": 0.17523956298828125, "learning_rate": 4.4047015428100184e-05, "loss": 1.1532, "step": 2862 }, { "epoch": 0.45007761991786044, "grad_norm": 0.1506781280040741, "learning_rate": 4.404301298907138e-05, "loss": 1.1732, "step": 2863 }, { "epoch": 0.450234824814793, "grad_norm": 0.2258424013853073, "learning_rate": 4.403900938696779e-05, "loss": 1.0762, "step": 2864 }, { "epoch": 0.45039202971172554, "grad_norm": 0.15648561716079712, "learning_rate": 4.403500462203395e-05, "loss": 1.1401, "step": 2865 }, { "epoch": 0.45054923460865803, "grad_norm": 0.16834858059883118, "learning_rate": 4.403099869451445e-05, "loss": 1.1393, "step": 2866 }, { "epoch": 0.4507064395055906, "grad_norm": 0.24699564278125763, "learning_rate": 4.4026991604653954e-05, "loss": 1.1564, "step": 2867 }, { "epoch": 0.45086364440252313, "grad_norm": 0.21116980910301208, "learning_rate": 4.402298335269721e-05, "loss": 1.1344, "step": 2868 }, { "epoch": 0.4510208492994557, "grad_norm": 0.1752268224954605, "learning_rate": 4.401897393888902e-05, "loss": 1.0779, "step": 2869 }, { "epoch": 0.45117805419638823, "grad_norm": 0.22002138197422028, "learning_rate": 4.401496336347426e-05, "loss": 0.9862, "step": 2870 }, { "epoch": 0.4513352590933208, "grad_norm": 1.107559323310852, "learning_rate": 4.401095162669788e-05, "loss": 1.0143, "step": 2871 }, { "epoch": 0.4514924639902533, "grad_norm": 0.23041501641273499, "learning_rate": 4.400693872880491e-05, "loss": 1.1625, "step": 2872 }, { "epoch": 0.45164966888718583, "grad_norm": 0.18464645743370056, "learning_rate": 4.400292467004044e-05, "loss": 1.15, "step": 2873 }, { "epoch": 0.4518068737841184, "grad_norm": 0.19527535140514374, "learning_rate": 4.3998909450649644e-05, "loss": 1.1722, "step": 2874 }, { "epoch": 0.45196407868105093, "grad_norm": 0.2334446758031845, "learning_rate": 4.3994893070877734e-05, "loss": 1.1482, "step": 2875 }, { "epoch": 0.4521212835779835, "grad_norm": 0.18088001012802124, "learning_rate": 4.3990875530970034e-05, "loss": 1.1087, "step": 2876 }, { "epoch": 0.452278488474916, "grad_norm": 0.21173341572284698, "learning_rate": 4.39868568311719e-05, "loss": 1.1277, "step": 2877 }, { "epoch": 0.4524356933718485, "grad_norm": 0.33623263239860535, "learning_rate": 4.39828369717288e-05, "loss": 1.1106, "step": 2878 }, { "epoch": 0.4525928982687811, "grad_norm": 0.1798340082168579, "learning_rate": 4.397881595288624e-05, "loss": 1.1459, "step": 2879 }, { "epoch": 0.4527501031657136, "grad_norm": 0.17362205684185028, "learning_rate": 4.397479377488981e-05, "loss": 1.2209, "step": 2880 }, { "epoch": 0.4527501031657136, "eval_loss": 1.1185150146484375, "eval_runtime": 2300.8084, "eval_samples_per_second": 4.024, "eval_steps_per_second": 2.012, "step": 2880 }, { "epoch": 0.4529073080626462, "grad_norm": 0.22614096105098724, "learning_rate": 4.397077043798517e-05, "loss": 1.1362, "step": 2881 }, { "epoch": 0.45306451295957867, "grad_norm": 0.23455561697483063, "learning_rate": 4.3966745942418056e-05, "loss": 1.0824, "step": 2882 }, { "epoch": 0.4532217178565112, "grad_norm": 0.2692466974258423, "learning_rate": 4.3962720288434254e-05, "loss": 1.125, "step": 2883 }, { "epoch": 0.45337892275344377, "grad_norm": 0.23672759532928467, "learning_rate": 4.395869347627966e-05, "loss": 1.1478, "step": 2884 }, { "epoch": 0.4535361276503763, "grad_norm": 0.25377583503723145, "learning_rate": 4.395466550620019e-05, "loss": 1.177, "step": 2885 }, { "epoch": 0.45369333254730887, "grad_norm": 0.14862841367721558, "learning_rate": 4.395063637844187e-05, "loss": 1.156, "step": 2886 }, { "epoch": 0.45385053744424136, "grad_norm": 0.2521793842315674, "learning_rate": 4.3946606093250786e-05, "loss": 1.0896, "step": 2887 }, { "epoch": 0.4540077423411739, "grad_norm": 0.2162342667579651, "learning_rate": 4.3942574650873084e-05, "loss": 1.0524, "step": 2888 }, { "epoch": 0.45416494723810646, "grad_norm": 0.1862705945968628, "learning_rate": 4.3938542051555e-05, "loss": 1.1185, "step": 2889 }, { "epoch": 0.454322152135039, "grad_norm": 0.2547942101955414, "learning_rate": 4.393450829554282e-05, "loss": 1.0975, "step": 2890 }, { "epoch": 0.45447935703197156, "grad_norm": 0.20143720507621765, "learning_rate": 4.393047338308292e-05, "loss": 1.1355, "step": 2891 }, { "epoch": 0.45463656192890406, "grad_norm": 0.18321287631988525, "learning_rate": 4.392643731442172e-05, "loss": 1.146, "step": 2892 }, { "epoch": 0.4547937668258366, "grad_norm": 0.19959893822669983, "learning_rate": 4.392240008980575e-05, "loss": 1.1397, "step": 2893 }, { "epoch": 0.45495097172276916, "grad_norm": 0.17295321822166443, "learning_rate": 4.391836170948157e-05, "loss": 1.1841, "step": 2894 }, { "epoch": 0.4551081766197017, "grad_norm": 0.6232892274856567, "learning_rate": 4.391432217369584e-05, "loss": 1.0642, "step": 2895 }, { "epoch": 0.45526538151663426, "grad_norm": 0.18548434972763062, "learning_rate": 4.391028148269528e-05, "loss": 1.2487, "step": 2896 }, { "epoch": 0.4554225864135668, "grad_norm": 0.7635773420333862, "learning_rate": 4.390623963672667e-05, "loss": 1.0099, "step": 2897 }, { "epoch": 0.4555797913104993, "grad_norm": 0.28948745131492615, "learning_rate": 4.3902196636036874e-05, "loss": 1.1306, "step": 2898 }, { "epoch": 0.45573699620743185, "grad_norm": 0.1994236707687378, "learning_rate": 4.389815248087284e-05, "loss": 1.0493, "step": 2899 }, { "epoch": 0.4558942011043644, "grad_norm": 0.18875907361507416, "learning_rate": 4.389410717148154e-05, "loss": 1.2152, "step": 2900 }, { "epoch": 0.45605140600129696, "grad_norm": 0.17050974071025848, "learning_rate": 4.389006070811007e-05, "loss": 1.2183, "step": 2901 }, { "epoch": 0.4562086108982295, "grad_norm": 0.1716713309288025, "learning_rate": 4.3886013091005554e-05, "loss": 1.1924, "step": 2902 }, { "epoch": 0.456365815795162, "grad_norm": 0.2430170327425003, "learning_rate": 4.388196432041522e-05, "loss": 1.2085, "step": 2903 }, { "epoch": 0.45652302069209455, "grad_norm": 0.21377846598625183, "learning_rate": 4.387791439658635e-05, "loss": 1.194, "step": 2904 }, { "epoch": 0.4566802255890271, "grad_norm": 0.2664497494697571, "learning_rate": 4.3873863319766294e-05, "loss": 1.096, "step": 2905 }, { "epoch": 0.45683743048595965, "grad_norm": 0.3957104980945587, "learning_rate": 4.386981109020248e-05, "loss": 1.2151, "step": 2906 }, { "epoch": 0.4569946353828922, "grad_norm": 0.2563398778438568, "learning_rate": 4.38657577081424e-05, "loss": 1.0582, "step": 2907 }, { "epoch": 0.4571518402798247, "grad_norm": 0.19324427843093872, "learning_rate": 4.3861703173833606e-05, "loss": 1.1949, "step": 2908 }, { "epoch": 0.45730904517675725, "grad_norm": 0.24101825058460236, "learning_rate": 4.385764748752376e-05, "loss": 1.1812, "step": 2909 }, { "epoch": 0.4574662500736898, "grad_norm": 0.1889408826828003, "learning_rate": 4.385359064946054e-05, "loss": 1.0492, "step": 2910 }, { "epoch": 0.45762345497062235, "grad_norm": 0.18517924845218658, "learning_rate": 4.3849532659891746e-05, "loss": 1.082, "step": 2911 }, { "epoch": 0.4577806598675549, "grad_norm": 0.20901206135749817, "learning_rate": 4.384547351906522e-05, "loss": 1.1381, "step": 2912 }, { "epoch": 0.4579378647644874, "grad_norm": 0.211951345205307, "learning_rate": 4.384141322722886e-05, "loss": 1.0802, "step": 2913 }, { "epoch": 0.45809506966141994, "grad_norm": 0.2100234031677246, "learning_rate": 4.3837351784630676e-05, "loss": 1.1165, "step": 2914 }, { "epoch": 0.4582522745583525, "grad_norm": 0.20928257703781128, "learning_rate": 4.383328919151871e-05, "loss": 1.0882, "step": 2915 }, { "epoch": 0.45840947945528504, "grad_norm": 0.2149987667798996, "learning_rate": 4.38292254481411e-05, "loss": 1.2111, "step": 2916 }, { "epoch": 0.4585666843522176, "grad_norm": 0.16672222316265106, "learning_rate": 4.382516055474605e-05, "loss": 1.2464, "step": 2917 }, { "epoch": 0.4587238892491501, "grad_norm": 0.2167981117963791, "learning_rate": 4.382109451158181e-05, "loss": 1.1166, "step": 2918 }, { "epoch": 0.45888109414608264, "grad_norm": 0.17324289679527283, "learning_rate": 4.381702731889672e-05, "loss": 1.0655, "step": 2919 }, { "epoch": 0.4590382990430152, "grad_norm": 0.1950710564851761, "learning_rate": 4.38129589769392e-05, "loss": 1.1824, "step": 2920 }, { "epoch": 0.45919550393994774, "grad_norm": 0.31070321798324585, "learning_rate": 4.3808889485957726e-05, "loss": 1.1666, "step": 2921 }, { "epoch": 0.4593527088368803, "grad_norm": 0.24002663791179657, "learning_rate": 4.380481884620084e-05, "loss": 1.1344, "step": 2922 }, { "epoch": 0.45950991373381284, "grad_norm": 0.2190941721200943, "learning_rate": 4.380074705791718e-05, "loss": 1.0767, "step": 2923 }, { "epoch": 0.45966711863074533, "grad_norm": 0.24419142305850983, "learning_rate": 4.3796674121355416e-05, "loss": 1.1538, "step": 2924 }, { "epoch": 0.4598243235276779, "grad_norm": 0.24275580048561096, "learning_rate": 4.379260003676431e-05, "loss": 1.2278, "step": 2925 }, { "epoch": 0.45998152842461043, "grad_norm": 0.20563790202140808, "learning_rate": 4.3788524804392694e-05, "loss": 1.1544, "step": 2926 }, { "epoch": 0.460138733321543, "grad_norm": 0.2236025035381317, "learning_rate": 4.3784448424489476e-05, "loss": 1.0229, "step": 2927 }, { "epoch": 0.46029593821847553, "grad_norm": 0.22830867767333984, "learning_rate": 4.378037089730361e-05, "loss": 1.1557, "step": 2928 }, { "epoch": 0.460453143115408, "grad_norm": 0.1820029616355896, "learning_rate": 4.3776292223084146e-05, "loss": 1.195, "step": 2929 }, { "epoch": 0.4606103480123406, "grad_norm": 0.19844214618206024, "learning_rate": 4.377221240208019e-05, "loss": 1.1008, "step": 2930 }, { "epoch": 0.4607675529092731, "grad_norm": 0.20659616589546204, "learning_rate": 4.376813143454093e-05, "loss": 1.1284, "step": 2931 }, { "epoch": 0.4609247578062057, "grad_norm": 0.21897144615650177, "learning_rate": 4.3764049320715606e-05, "loss": 1.0937, "step": 2932 }, { "epoch": 0.4610819627031382, "grad_norm": 0.20715178549289703, "learning_rate": 4.3759966060853545e-05, "loss": 0.9869, "step": 2933 }, { "epoch": 0.4612391676000707, "grad_norm": 0.2190580666065216, "learning_rate": 4.3755881655204136e-05, "loss": 1.2428, "step": 2934 }, { "epoch": 0.46139637249700327, "grad_norm": 0.17025412619113922, "learning_rate": 4.375179610401683e-05, "loss": 1.2025, "step": 2935 }, { "epoch": 0.4615535773939358, "grad_norm": 0.17598217725753784, "learning_rate": 4.3747709407541174e-05, "loss": 1.1, "step": 2936 }, { "epoch": 0.46171078229086837, "grad_norm": 0.20214369893074036, "learning_rate": 4.374362156602675e-05, "loss": 1.1479, "step": 2937 }, { "epoch": 0.4618679871878009, "grad_norm": 0.1486193835735321, "learning_rate": 4.373953257972323e-05, "loss": 1.2626, "step": 2938 }, { "epoch": 0.4620251920847334, "grad_norm": 0.1790590137243271, "learning_rate": 4.373544244888037e-05, "loss": 1.1798, "step": 2939 }, { "epoch": 0.46218239698166597, "grad_norm": 0.18835453689098358, "learning_rate": 4.373135117374797e-05, "loss": 1.0728, "step": 2940 }, { "epoch": 0.4623396018785985, "grad_norm": 0.21751640737056732, "learning_rate": 4.37272587545759e-05, "loss": 1.0957, "step": 2941 }, { "epoch": 0.46249680677553107, "grad_norm": 0.17027625441551208, "learning_rate": 4.3723165191614126e-05, "loss": 1.1387, "step": 2942 }, { "epoch": 0.4626540116724636, "grad_norm": 0.2957732081413269, "learning_rate": 4.3719070485112646e-05, "loss": 1.1947, "step": 2943 }, { "epoch": 0.4628112165693961, "grad_norm": 0.22199559211730957, "learning_rate": 4.371497463532157e-05, "loss": 1.1139, "step": 2944 }, { "epoch": 0.46296842146632866, "grad_norm": 0.14452876150608063, "learning_rate": 4.371087764249106e-05, "loss": 1.1742, "step": 2945 }, { "epoch": 0.4631256263632612, "grad_norm": 0.2115442007780075, "learning_rate": 4.370677950687132e-05, "loss": 1.124, "step": 2946 }, { "epoch": 0.46328283126019376, "grad_norm": 0.21930637955665588, "learning_rate": 4.370268022871267e-05, "loss": 1.1338, "step": 2947 }, { "epoch": 0.4634400361571263, "grad_norm": 0.17294028401374817, "learning_rate": 4.369857980826546e-05, "loss": 1.1796, "step": 2948 }, { "epoch": 0.4635972410540588, "grad_norm": 0.21426047384738922, "learning_rate": 4.369447824578015e-05, "loss": 1.153, "step": 2949 }, { "epoch": 0.46375444595099136, "grad_norm": 0.19322267174720764, "learning_rate": 4.369037554150723e-05, "loss": 1.1341, "step": 2950 }, { "epoch": 0.4639116508479239, "grad_norm": 0.2040453553199768, "learning_rate": 4.368627169569729e-05, "loss": 1.1638, "step": 2951 }, { "epoch": 0.46406885574485646, "grad_norm": 0.18749818205833435, "learning_rate": 4.368216670860096e-05, "loss": 1.0691, "step": 2952 }, { "epoch": 0.464226060641789, "grad_norm": 0.18389207124710083, "learning_rate": 4.3678060580468984e-05, "loss": 1.0091, "step": 2953 }, { "epoch": 0.46438326553872156, "grad_norm": 0.23443341255187988, "learning_rate": 4.3673953311552115e-05, "loss": 1.1391, "step": 2954 }, { "epoch": 0.46454047043565405, "grad_norm": 0.14893463253974915, "learning_rate": 4.366984490210124e-05, "loss": 1.1696, "step": 2955 }, { "epoch": 0.4646976753325866, "grad_norm": 0.2501336336135864, "learning_rate": 4.366573535236728e-05, "loss": 1.0786, "step": 2956 }, { "epoch": 0.46485488022951915, "grad_norm": 0.17904464900493622, "learning_rate": 4.366162466260121e-05, "loss": 1.2305, "step": 2957 }, { "epoch": 0.4650120851264517, "grad_norm": 0.22989200055599213, "learning_rate": 4.365751283305411e-05, "loss": 1.1096, "step": 2958 }, { "epoch": 0.46516929002338425, "grad_norm": 0.20203103125095367, "learning_rate": 4.365339986397712e-05, "loss": 1.1126, "step": 2959 }, { "epoch": 0.46532649492031675, "grad_norm": 0.21098080277442932, "learning_rate": 4.364928575562143e-05, "loss": 1.1223, "step": 2960 }, { "epoch": 0.4654836998172493, "grad_norm": 0.1513175219297409, "learning_rate": 4.364517050823832e-05, "loss": 1.0855, "step": 2961 }, { "epoch": 0.46564090471418185, "grad_norm": 0.14790455996990204, "learning_rate": 4.364105412207914e-05, "loss": 1.1123, "step": 2962 }, { "epoch": 0.4657981096111144, "grad_norm": 0.24311016499996185, "learning_rate": 4.36369365973953e-05, "loss": 1.1073, "step": 2963 }, { "epoch": 0.46595531450804695, "grad_norm": 0.24297219514846802, "learning_rate": 4.3632817934438284e-05, "loss": 0.9989, "step": 2964 }, { "epoch": 0.46611251940497944, "grad_norm": 0.21929419040679932, "learning_rate": 4.362869813345964e-05, "loss": 1.1306, "step": 2965 }, { "epoch": 0.466269724301912, "grad_norm": 0.21477848291397095, "learning_rate": 4.3624577194710993e-05, "loss": 1.1154, "step": 2966 }, { "epoch": 0.46642692919884454, "grad_norm": 0.22623370587825775, "learning_rate": 4.362045511844402e-05, "loss": 1.167, "step": 2967 }, { "epoch": 0.4665841340957771, "grad_norm": 0.17645582556724548, "learning_rate": 4.3616331904910515e-05, "loss": 1.1196, "step": 2968 }, { "epoch": 0.46674133899270964, "grad_norm": 0.1720646470785141, "learning_rate": 4.361220755436227e-05, "loss": 1.1096, "step": 2969 }, { "epoch": 0.46689854388964214, "grad_norm": 0.2820630371570587, "learning_rate": 4.3608082067051214e-05, "loss": 1.1802, "step": 2970 }, { "epoch": 0.4670557487865747, "grad_norm": 0.15399809181690216, "learning_rate": 4.36039554432293e-05, "loss": 1.0906, "step": 2971 }, { "epoch": 0.46721295368350724, "grad_norm": 0.14993947744369507, "learning_rate": 4.359982768314857e-05, "loss": 1.1125, "step": 2972 }, { "epoch": 0.4673701585804398, "grad_norm": 0.2066517025232315, "learning_rate": 4.359569878706113e-05, "loss": 1.1647, "step": 2973 }, { "epoch": 0.46752736347737234, "grad_norm": 0.1963212788105011, "learning_rate": 4.359156875521917e-05, "loss": 1.127, "step": 2974 }, { "epoch": 0.46768456837430483, "grad_norm": 0.17257174849510193, "learning_rate": 4.3587437587874926e-05, "loss": 1.0375, "step": 2975 }, { "epoch": 0.4678417732712374, "grad_norm": 0.1978078931570053, "learning_rate": 4.3583305285280704e-05, "loss": 1.132, "step": 2976 }, { "epoch": 0.46799897816816993, "grad_norm": 0.17845632135868073, "learning_rate": 4.35791718476889e-05, "loss": 1.0805, "step": 2977 }, { "epoch": 0.4681561830651025, "grad_norm": 0.24879387021064758, "learning_rate": 4.357503727535198e-05, "loss": 1.0959, "step": 2978 }, { "epoch": 0.46831338796203503, "grad_norm": 0.23771560192108154, "learning_rate": 4.3570901568522445e-05, "loss": 1.0856, "step": 2979 }, { "epoch": 0.4684705928589676, "grad_norm": 0.1961091309785843, "learning_rate": 4.3566764727452914e-05, "loss": 1.1077, "step": 2980 }, { "epoch": 0.4686277977559001, "grad_norm": 0.2253684103488922, "learning_rate": 4.356262675239603e-05, "loss": 1.1091, "step": 2981 }, { "epoch": 0.4687850026528326, "grad_norm": 0.22095037996768951, "learning_rate": 4.355848764360453e-05, "loss": 1.0995, "step": 2982 }, { "epoch": 0.4689422075497652, "grad_norm": 0.15505705773830414, "learning_rate": 4.355434740133121e-05, "loss": 1.0732, "step": 2983 }, { "epoch": 0.4690994124466977, "grad_norm": 0.13467110693454742, "learning_rate": 4.355020602582895e-05, "loss": 1.0693, "step": 2984 }, { "epoch": 0.4692566173436303, "grad_norm": 0.2177276611328125, "learning_rate": 4.354606351735068e-05, "loss": 1.0611, "step": 2985 }, { "epoch": 0.4694138222405628, "grad_norm": 0.2077556699514389, "learning_rate": 4.3541919876149416e-05, "loss": 1.1161, "step": 2986 }, { "epoch": 0.4695710271374953, "grad_norm": 0.17732571065425873, "learning_rate": 4.3537775102478234e-05, "loss": 1.1127, "step": 2987 }, { "epoch": 0.4697282320344279, "grad_norm": 0.15690331161022186, "learning_rate": 4.353362919659028e-05, "loss": 1.0747, "step": 2988 }, { "epoch": 0.4698854369313604, "grad_norm": 0.14969655871391296, "learning_rate": 4.352948215873877e-05, "loss": 1.1483, "step": 2989 }, { "epoch": 0.470042641828293, "grad_norm": 0.15860646963119507, "learning_rate": 4.3525333989177e-05, "loss": 1.1119, "step": 2990 }, { "epoch": 0.47019984672522547, "grad_norm": 0.25320160388946533, "learning_rate": 4.35211846881583e-05, "loss": 1.1342, "step": 2991 }, { "epoch": 0.470357051622158, "grad_norm": 0.18844164907932281, "learning_rate": 4.3517034255936104e-05, "loss": 1.1295, "step": 2992 }, { "epoch": 0.47051425651909057, "grad_norm": 0.17962507903575897, "learning_rate": 4.3512882692763926e-05, "loss": 1.1112, "step": 2993 }, { "epoch": 0.4706714614160231, "grad_norm": 0.2303573489189148, "learning_rate": 4.35087299988953e-05, "loss": 1.1863, "step": 2994 }, { "epoch": 0.47082866631295567, "grad_norm": 0.2209206074476242, "learning_rate": 4.350457617458387e-05, "loss": 1.0956, "step": 2995 }, { "epoch": 0.47098587120988816, "grad_norm": 0.15327110886573792, "learning_rate": 4.350042122008333e-05, "loss": 1.2205, "step": 2996 }, { "epoch": 0.4711430761068207, "grad_norm": 0.19386257231235504, "learning_rate": 4.349626513564745e-05, "loss": 1.1507, "step": 2997 }, { "epoch": 0.47130028100375326, "grad_norm": 0.19012616574764252, "learning_rate": 4.3492107921530067e-05, "loss": 1.1616, "step": 2998 }, { "epoch": 0.4714574859006858, "grad_norm": 0.3165668845176697, "learning_rate": 4.3487949577985096e-05, "loss": 1.1514, "step": 2999 }, { "epoch": 0.47161469079761836, "grad_norm": 0.185637429356575, "learning_rate": 4.348379010526651e-05, "loss": 1.0717, "step": 3000 }, { "epoch": 0.47177189569455086, "grad_norm": 0.1926163285970688, "learning_rate": 4.347962950362834e-05, "loss": 1.0976, "step": 3001 }, { "epoch": 0.4719291005914834, "grad_norm": 0.21438013017177582, "learning_rate": 4.347546777332472e-05, "loss": 1.101, "step": 3002 }, { "epoch": 0.47208630548841596, "grad_norm": 0.13781698048114777, "learning_rate": 4.347130491460982e-05, "loss": 1.1528, "step": 3003 }, { "epoch": 0.4722435103853485, "grad_norm": 0.2153354436159134, "learning_rate": 4.346714092773789e-05, "loss": 1.0979, "step": 3004 }, { "epoch": 0.47240071528228106, "grad_norm": 0.1563863605260849, "learning_rate": 4.3462975812963255e-05, "loss": 1.0688, "step": 3005 }, { "epoch": 0.4725579201792136, "grad_norm": 0.14853888750076294, "learning_rate": 4.3458809570540315e-05, "loss": 1.3536, "step": 3006 }, { "epoch": 0.4727151250761461, "grad_norm": 0.18929365277290344, "learning_rate": 4.34546422007235e-05, "loss": 1.1523, "step": 3007 }, { "epoch": 0.47287232997307865, "grad_norm": 0.1504976898431778, "learning_rate": 4.345047370376737e-05, "loss": 1.1512, "step": 3008 }, { "epoch": 0.4730295348700112, "grad_norm": 0.164264515042305, "learning_rate": 4.3446304079926505e-05, "loss": 1.1382, "step": 3009 }, { "epoch": 0.47318673976694375, "grad_norm": 0.17672760784626007, "learning_rate": 4.344213332945557e-05, "loss": 1.1067, "step": 3010 }, { "epoch": 0.4733439446638763, "grad_norm": 0.15469646453857422, "learning_rate": 4.343796145260929e-05, "loss": 1.1057, "step": 3011 }, { "epoch": 0.4735011495608088, "grad_norm": 0.2399524748325348, "learning_rate": 4.3433788449642485e-05, "loss": 1.0207, "step": 3012 }, { "epoch": 0.47365835445774135, "grad_norm": 0.21121852099895477, "learning_rate": 4.342961432081001e-05, "loss": 1.0743, "step": 3013 }, { "epoch": 0.4738155593546739, "grad_norm": 0.18658918142318726, "learning_rate": 4.342543906636682e-05, "loss": 1.0925, "step": 3014 }, { "epoch": 0.47397276425160645, "grad_norm": 0.16221532225608826, "learning_rate": 4.342126268656791e-05, "loss": 1.2256, "step": 3015 }, { "epoch": 0.474129969148539, "grad_norm": 0.3126663863658905, "learning_rate": 4.341708518166837e-05, "loss": 1.0866, "step": 3016 }, { "epoch": 0.4742871740454715, "grad_norm": 0.16433748602867126, "learning_rate": 4.341290655192333e-05, "loss": 1.1915, "step": 3017 }, { "epoch": 0.47444437894240404, "grad_norm": 0.15654461085796356, "learning_rate": 4.3408726797588023e-05, "loss": 1.0935, "step": 3018 }, { "epoch": 0.4746015838393366, "grad_norm": 0.17568379640579224, "learning_rate": 4.3404545918917724e-05, "loss": 1.2203, "step": 3019 }, { "epoch": 0.47475878873626914, "grad_norm": 0.15613645315170288, "learning_rate": 4.3400363916167774e-05, "loss": 1.1754, "step": 3020 }, { "epoch": 0.4749159936332017, "grad_norm": 0.1775684654712677, "learning_rate": 4.339618078959362e-05, "loss": 1.1522, "step": 3021 }, { "epoch": 0.4750731985301342, "grad_norm": 0.25972139835357666, "learning_rate": 4.339199653945072e-05, "loss": 1.0544, "step": 3022 }, { "epoch": 0.47523040342706674, "grad_norm": 0.2272106260061264, "learning_rate": 4.338781116599466e-05, "loss": 1.2017, "step": 3023 }, { "epoch": 0.4753876083239993, "grad_norm": 0.17775173485279083, "learning_rate": 4.338362466948105e-05, "loss": 1.0349, "step": 3024 }, { "epoch": 0.47554481322093184, "grad_norm": 0.17978572845458984, "learning_rate": 4.3379437050165595e-05, "loss": 1.2645, "step": 3025 }, { "epoch": 0.4757020181178644, "grad_norm": 0.27326762676239014, "learning_rate": 4.337524830830405e-05, "loss": 1.1615, "step": 3026 }, { "epoch": 0.4758592230147969, "grad_norm": 0.15005174279212952, "learning_rate": 4.337105844415226e-05, "loss": 1.0954, "step": 3027 }, { "epoch": 0.47601642791172943, "grad_norm": 0.1960383504629135, "learning_rate": 4.3366867457966106e-05, "loss": 1.1727, "step": 3028 }, { "epoch": 0.476173632808662, "grad_norm": 0.2822120785713196, "learning_rate": 4.336267535000157e-05, "loss": 1.0665, "step": 3029 }, { "epoch": 0.47633083770559453, "grad_norm": 0.19774426519870758, "learning_rate": 4.33584821205147e-05, "loss": 1.0842, "step": 3030 }, { "epoch": 0.4764880426025271, "grad_norm": 0.18241934478282928, "learning_rate": 4.3354287769761584e-05, "loss": 1.062, "step": 3031 }, { "epoch": 0.47664524749945963, "grad_norm": 0.15259404480457306, "learning_rate": 4.33500922979984e-05, "loss": 1.1752, "step": 3032 }, { "epoch": 0.47680245239639213, "grad_norm": 0.28016725182533264, "learning_rate": 4.33458957054814e-05, "loss": 1.1332, "step": 3033 }, { "epoch": 0.4769596572933247, "grad_norm": 0.21748925745487213, "learning_rate": 4.33416979924669e-05, "loss": 1.1161, "step": 3034 }, { "epoch": 0.47711686219025723, "grad_norm": 0.20696724951267242, "learning_rate": 4.333749915921126e-05, "loss": 1.176, "step": 3035 }, { "epoch": 0.4772740670871898, "grad_norm": 0.17570556700229645, "learning_rate": 4.3333299205970946e-05, "loss": 1.1311, "step": 3036 }, { "epoch": 0.47743127198412233, "grad_norm": 0.15662014484405518, "learning_rate": 4.3329098133002475e-05, "loss": 1.0178, "step": 3037 }, { "epoch": 0.4775884768810548, "grad_norm": 0.16522455215454102, "learning_rate": 4.332489594056242e-05, "loss": 1.0892, "step": 3038 }, { "epoch": 0.4777456817779874, "grad_norm": 0.2099331170320511, "learning_rate": 4.332069262890745e-05, "loss": 1.1686, "step": 3039 }, { "epoch": 0.4779028866749199, "grad_norm": 0.2070057988166809, "learning_rate": 4.331648819829427e-05, "loss": 1.1487, "step": 3040 }, { "epoch": 0.4779028866749199, "eval_loss": 1.1146655082702637, "eval_runtime": 2325.1865, "eval_samples_per_second": 3.982, "eval_steps_per_second": 1.991, "step": 3040 }, { "epoch": 0.4780600915718525, "grad_norm": 0.20269645750522614, "learning_rate": 4.331228264897968e-05, "loss": 1.1289, "step": 3041 }, { "epoch": 0.478217296468785, "grad_norm": 0.22121499478816986, "learning_rate": 4.3308075981220555e-05, "loss": 1.0923, "step": 3042 }, { "epoch": 0.4783745013657175, "grad_norm": 0.3529304265975952, "learning_rate": 4.330386819527379e-05, "loss": 1.1289, "step": 3043 }, { "epoch": 0.47853170626265007, "grad_norm": 0.18771342933177948, "learning_rate": 4.329965929139641e-05, "loss": 1.0981, "step": 3044 }, { "epoch": 0.4786889111595826, "grad_norm": 0.2015509307384491, "learning_rate": 4.329544926984546e-05, "loss": 1.0751, "step": 3045 }, { "epoch": 0.47884611605651517, "grad_norm": 0.18288064002990723, "learning_rate": 4.329123813087808e-05, "loss": 1.2024, "step": 3046 }, { "epoch": 0.4790033209534477, "grad_norm": 0.17577502131462097, "learning_rate": 4.3287025874751466e-05, "loss": 1.0602, "step": 3047 }, { "epoch": 0.4791605258503802, "grad_norm": 0.14473573863506317, "learning_rate": 4.3282812501722895e-05, "loss": 1.1354, "step": 3048 }, { "epoch": 0.47931773074731276, "grad_norm": 0.13793261349201202, "learning_rate": 4.3278598012049685e-05, "loss": 1.1413, "step": 3049 }, { "epoch": 0.4794749356442453, "grad_norm": 0.19814302027225494, "learning_rate": 4.3274382405989266e-05, "loss": 1.1041, "step": 3050 }, { "epoch": 0.47963214054117786, "grad_norm": 0.17834322154521942, "learning_rate": 4.32701656837991e-05, "loss": 1.0496, "step": 3051 }, { "epoch": 0.4797893454381104, "grad_norm": 0.17347043752670288, "learning_rate": 4.326594784573672e-05, "loss": 1.1002, "step": 3052 }, { "epoch": 0.4799465503350429, "grad_norm": 0.18119116127490997, "learning_rate": 4.326172889205975e-05, "loss": 1.1539, "step": 3053 }, { "epoch": 0.48010375523197546, "grad_norm": 0.18658742308616638, "learning_rate": 4.325750882302586e-05, "loss": 1.0959, "step": 3054 }, { "epoch": 0.480260960128908, "grad_norm": 0.19841469824314117, "learning_rate": 4.325328763889279e-05, "loss": 1.1105, "step": 3055 }, { "epoch": 0.48041816502584056, "grad_norm": 0.21206001937389374, "learning_rate": 4.324906533991836e-05, "loss": 1.0634, "step": 3056 }, { "epoch": 0.4805753699227731, "grad_norm": 0.18314093351364136, "learning_rate": 4.324484192636046e-05, "loss": 1.1106, "step": 3057 }, { "epoch": 0.48073257481970566, "grad_norm": 0.18414467573165894, "learning_rate": 4.324061739847702e-05, "loss": 0.9478, "step": 3058 }, { "epoch": 0.48088977971663815, "grad_norm": 0.19265106320381165, "learning_rate": 4.323639175652608e-05, "loss": 1.1306, "step": 3059 }, { "epoch": 0.4810469846135707, "grad_norm": 0.17755688726902008, "learning_rate": 4.323216500076572e-05, "loss": 1.0935, "step": 3060 }, { "epoch": 0.48120418951050326, "grad_norm": 0.1681739240884781, "learning_rate": 4.322793713145408e-05, "loss": 1.1834, "step": 3061 }, { "epoch": 0.4813613944074358, "grad_norm": 0.17772947251796722, "learning_rate": 4.3223708148849404e-05, "loss": 1.2605, "step": 3062 }, { "epoch": 0.48151859930436836, "grad_norm": 0.16665947437286377, "learning_rate": 4.321947805320996e-05, "loss": 1.173, "step": 3063 }, { "epoch": 0.48167580420130085, "grad_norm": 0.1512874960899353, "learning_rate": 4.321524684479412e-05, "loss": 1.2195, "step": 3064 }, { "epoch": 0.4818330090982334, "grad_norm": 0.14165686070919037, "learning_rate": 4.3211014523860315e-05, "loss": 1.1116, "step": 3065 }, { "epoch": 0.48199021399516595, "grad_norm": 0.205230712890625, "learning_rate": 4.3206781090667026e-05, "loss": 1.0755, "step": 3066 }, { "epoch": 0.4821474188920985, "grad_norm": 0.15291771292686462, "learning_rate": 4.3202546545472824e-05, "loss": 0.9513, "step": 3067 }, { "epoch": 0.48230462378903105, "grad_norm": 0.2035183608531952, "learning_rate": 4.3198310888536325e-05, "loss": 1.089, "step": 3068 }, { "epoch": 0.48246182868596355, "grad_norm": 0.18657858669757843, "learning_rate": 4.319407412011625e-05, "loss": 1.1266, "step": 3069 }, { "epoch": 0.4826190335828961, "grad_norm": 0.19956351816654205, "learning_rate": 4.3189836240471335e-05, "loss": 1.1221, "step": 3070 }, { "epoch": 0.48277623847982865, "grad_norm": 0.2228332757949829, "learning_rate": 4.318559724986044e-05, "loss": 1.1047, "step": 3071 }, { "epoch": 0.4829334433767612, "grad_norm": 0.1445583552122116, "learning_rate": 4.318135714854246e-05, "loss": 1.1637, "step": 3072 }, { "epoch": 0.48309064827369375, "grad_norm": 0.19175522029399872, "learning_rate": 4.317711593677636e-05, "loss": 1.1594, "step": 3073 }, { "epoch": 0.48324785317062624, "grad_norm": 0.19144673645496368, "learning_rate": 4.3172873614821176e-05, "loss": 1.1604, "step": 3074 }, { "epoch": 0.4834050580675588, "grad_norm": 0.20282785594463348, "learning_rate": 4.316863018293601e-05, "loss": 1.1843, "step": 3075 }, { "epoch": 0.48356226296449134, "grad_norm": 0.1457735151052475, "learning_rate": 4.3164385641380045e-05, "loss": 1.0704, "step": 3076 }, { "epoch": 0.4837194678614239, "grad_norm": 0.1576244980096817, "learning_rate": 4.316013999041252e-05, "loss": 1.204, "step": 3077 }, { "epoch": 0.48387667275835644, "grad_norm": 0.15812274813652039, "learning_rate": 4.315589323029273e-05, "loss": 1.048, "step": 3078 }, { "epoch": 0.48403387765528894, "grad_norm": 0.19766604900360107, "learning_rate": 4.315164536128007e-05, "loss": 1.0627, "step": 3079 }, { "epoch": 0.4841910825522215, "grad_norm": 0.1403280347585678, "learning_rate": 4.314739638363396e-05, "loss": 1.2214, "step": 3080 }, { "epoch": 0.48434828744915404, "grad_norm": 0.19247771799564362, "learning_rate": 4.314314629761393e-05, "loss": 1.1174, "step": 3081 }, { "epoch": 0.4845054923460866, "grad_norm": 0.15992018580436707, "learning_rate": 4.313889510347956e-05, "loss": 1.0445, "step": 3082 }, { "epoch": 0.48466269724301914, "grad_norm": 0.1454923152923584, "learning_rate": 4.313464280149049e-05, "loss": 1.0791, "step": 3083 }, { "epoch": 0.4848199021399517, "grad_norm": 0.17863498628139496, "learning_rate": 4.313038939190644e-05, "loss": 1.1887, "step": 3084 }, { "epoch": 0.4849771070368842, "grad_norm": 0.16536812484264374, "learning_rate": 4.3126134874987176e-05, "loss": 1.0926, "step": 3085 }, { "epoch": 0.48513431193381673, "grad_norm": 0.21133893728256226, "learning_rate": 4.3121879250992566e-05, "loss": 1.1867, "step": 3086 }, { "epoch": 0.4852915168307493, "grad_norm": 0.13754390180110931, "learning_rate": 4.311762252018252e-05, "loss": 1.1501, "step": 3087 }, { "epoch": 0.48544872172768183, "grad_norm": 0.18948908150196075, "learning_rate": 4.3113364682817024e-05, "loss": 1.0505, "step": 3088 }, { "epoch": 0.4856059266246144, "grad_norm": 0.18722207844257355, "learning_rate": 4.310910573915613e-05, "loss": 1.1081, "step": 3089 }, { "epoch": 0.4857631315215469, "grad_norm": 0.16461189091205597, "learning_rate": 4.310484568945996e-05, "loss": 1.1096, "step": 3090 }, { "epoch": 0.4859203364184794, "grad_norm": 0.16495181620121002, "learning_rate": 4.3100584533988694e-05, "loss": 1.2194, "step": 3091 }, { "epoch": 0.486077541315412, "grad_norm": 0.1575373113155365, "learning_rate": 4.30963222730026e-05, "loss": 1.0526, "step": 3092 }, { "epoch": 0.4862347462123445, "grad_norm": 0.16486801207065582, "learning_rate": 4.309205890676199e-05, "loss": 1.1296, "step": 3093 }, { "epoch": 0.4863919511092771, "grad_norm": 0.16061507165431976, "learning_rate": 4.308779443552726e-05, "loss": 1.2397, "step": 3094 }, { "epoch": 0.48654915600620957, "grad_norm": 0.20702579617500305, "learning_rate": 4.3083528859558855e-05, "loss": 1.1798, "step": 3095 }, { "epoch": 0.4867063609031421, "grad_norm": 0.1880490630865097, "learning_rate": 4.3079262179117316e-05, "loss": 0.9903, "step": 3096 }, { "epoch": 0.48686356580007467, "grad_norm": 0.1547381728887558, "learning_rate": 4.307499439446324e-05, "loss": 1.1069, "step": 3097 }, { "epoch": 0.4870207706970072, "grad_norm": 0.1779354214668274, "learning_rate": 4.307072550585727e-05, "loss": 1.1234, "step": 3098 }, { "epoch": 0.48717797559393977, "grad_norm": 0.1460437774658203, "learning_rate": 4.306645551356014e-05, "loss": 1.1703, "step": 3099 }, { "epoch": 0.48733518049087227, "grad_norm": 0.19177353382110596, "learning_rate": 4.3062184417832644e-05, "loss": 1.0394, "step": 3100 }, { "epoch": 0.4874923853878048, "grad_norm": 0.19918255507946014, "learning_rate": 4.305791221893565e-05, "loss": 1.1018, "step": 3101 }, { "epoch": 0.48764959028473737, "grad_norm": 0.19703230261802673, "learning_rate": 4.305363891713008e-05, "loss": 1.1699, "step": 3102 }, { "epoch": 0.4878067951816699, "grad_norm": 0.20058362185955048, "learning_rate": 4.304936451267694e-05, "loss": 1.1041, "step": 3103 }, { "epoch": 0.48796400007860247, "grad_norm": 0.16634905338287354, "learning_rate": 4.304508900583728e-05, "loss": 0.9865, "step": 3104 }, { "epoch": 0.48812120497553496, "grad_norm": 0.13455606997013092, "learning_rate": 4.304081239687225e-05, "loss": 1.1838, "step": 3105 }, { "epoch": 0.4882784098724675, "grad_norm": 0.146565243601799, "learning_rate": 4.303653468604303e-05, "loss": 1.0494, "step": 3106 }, { "epoch": 0.48843561476940006, "grad_norm": 0.21972596645355225, "learning_rate": 4.3032255873610905e-05, "loss": 1.0792, "step": 3107 }, { "epoch": 0.4885928196663326, "grad_norm": 0.20464344322681427, "learning_rate": 4.30279759598372e-05, "loss": 0.9665, "step": 3108 }, { "epoch": 0.48875002456326516, "grad_norm": 0.17815683782100677, "learning_rate": 4.3023694944983305e-05, "loss": 1.0264, "step": 3109 }, { "epoch": 0.4889072294601977, "grad_norm": 0.16300149261951447, "learning_rate": 4.3019412829310704e-05, "loss": 1.0808, "step": 3110 }, { "epoch": 0.4890644343571302, "grad_norm": 0.16352546215057373, "learning_rate": 4.301512961308093e-05, "loss": 1.0634, "step": 3111 }, { "epoch": 0.48922163925406276, "grad_norm": 0.14276129007339478, "learning_rate": 4.301084529655558e-05, "loss": 1.1966, "step": 3112 }, { "epoch": 0.4893788441509953, "grad_norm": 0.17422689497470856, "learning_rate": 4.300655987999633e-05, "loss": 1.082, "step": 3113 }, { "epoch": 0.48953604904792786, "grad_norm": 0.23078230023384094, "learning_rate": 4.30022733636649e-05, "loss": 1.1523, "step": 3114 }, { "epoch": 0.4896932539448604, "grad_norm": 0.16097494959831238, "learning_rate": 4.299798574782312e-05, "loss": 1.2632, "step": 3115 }, { "epoch": 0.4898504588417929, "grad_norm": 0.1580386459827423, "learning_rate": 4.299369703273285e-05, "loss": 1.0741, "step": 3116 }, { "epoch": 0.49000766373872545, "grad_norm": 0.18079139292240143, "learning_rate": 4.298940721865602e-05, "loss": 1.1011, "step": 3117 }, { "epoch": 0.490164868635658, "grad_norm": 0.21938787400722504, "learning_rate": 4.298511630585464e-05, "loss": 1.1851, "step": 3118 }, { "epoch": 0.49032207353259055, "grad_norm": 0.17028893530368805, "learning_rate": 4.298082429459079e-05, "loss": 1.1701, "step": 3119 }, { "epoch": 0.4904792784295231, "grad_norm": 0.20295727252960205, "learning_rate": 4.29765311851266e-05, "loss": 1.1091, "step": 3120 }, { "epoch": 0.4906364833264556, "grad_norm": 0.1343192160129547, "learning_rate": 4.297223697772429e-05, "loss": 1.1188, "step": 3121 }, { "epoch": 0.49079368822338815, "grad_norm": 0.17286306619644165, "learning_rate": 4.296794167264612e-05, "loss": 1.1054, "step": 3122 }, { "epoch": 0.4909508931203207, "grad_norm": 0.17326287925243378, "learning_rate": 4.296364527015443e-05, "loss": 0.98, "step": 3123 }, { "epoch": 0.49110809801725325, "grad_norm": 0.15772922337055206, "learning_rate": 4.295934777051164e-05, "loss": 1.0586, "step": 3124 }, { "epoch": 0.4912653029141858, "grad_norm": 0.20056964457035065, "learning_rate": 4.295504917398022e-05, "loss": 1.0914, "step": 3125 }, { "epoch": 0.4914225078111183, "grad_norm": 0.16219423711299896, "learning_rate": 4.2950749480822714e-05, "loss": 1.0313, "step": 3126 }, { "epoch": 0.49157971270805084, "grad_norm": 0.18571913242340088, "learning_rate": 4.294644869130172e-05, "loss": 1.0954, "step": 3127 }, { "epoch": 0.4917369176049834, "grad_norm": 0.17019760608673096, "learning_rate": 4.294214680567993e-05, "loss": 1.1577, "step": 3128 }, { "epoch": 0.49189412250191594, "grad_norm": 0.19843801856040955, "learning_rate": 4.293784382422007e-05, "loss": 1.083, "step": 3129 }, { "epoch": 0.4920513273988485, "grad_norm": 0.1765783727169037, "learning_rate": 4.2933539747184966e-05, "loss": 1.081, "step": 3130 }, { "epoch": 0.492208532295781, "grad_norm": 0.14573058485984802, "learning_rate": 4.292923457483748e-05, "loss": 0.9873, "step": 3131 }, { "epoch": 0.49236573719271354, "grad_norm": 0.24786439538002014, "learning_rate": 4.292492830744057e-05, "loss": 1.1195, "step": 3132 }, { "epoch": 0.4925229420896461, "grad_norm": 0.16276319324970245, "learning_rate": 4.292062094525723e-05, "loss": 1.1914, "step": 3133 }, { "epoch": 0.49268014698657864, "grad_norm": 0.15106429159641266, "learning_rate": 4.291631248855055e-05, "loss": 1.2236, "step": 3134 }, { "epoch": 0.4928373518835112, "grad_norm": 0.15074944496154785, "learning_rate": 4.2912002937583674e-05, "loss": 1.1511, "step": 3135 }, { "epoch": 0.49299455678044374, "grad_norm": 0.14793723821640015, "learning_rate": 4.2907692292619804e-05, "loss": 0.9877, "step": 3136 }, { "epoch": 0.49315176167737623, "grad_norm": 0.13366791605949402, "learning_rate": 4.290338055392223e-05, "loss": 1.1199, "step": 3137 }, { "epoch": 0.4933089665743088, "grad_norm": 0.19258376955986023, "learning_rate": 4.289906772175428e-05, "loss": 1.1465, "step": 3138 }, { "epoch": 0.49346617147124133, "grad_norm": 0.22038963437080383, "learning_rate": 4.289475379637938e-05, "loss": 1.1204, "step": 3139 }, { "epoch": 0.4936233763681739, "grad_norm": 0.18796618282794952, "learning_rate": 4.289043877806101e-05, "loss": 1.0417, "step": 3140 }, { "epoch": 0.49378058126510643, "grad_norm": 0.1992577463388443, "learning_rate": 4.28861226670627e-05, "loss": 1.1426, "step": 3141 }, { "epoch": 0.4939377861620389, "grad_norm": 0.199345663189888, "learning_rate": 4.2881805463648075e-05, "loss": 1.1781, "step": 3142 }, { "epoch": 0.4940949910589715, "grad_norm": 0.19986248016357422, "learning_rate": 4.28774871680808e-05, "loss": 1.1761, "step": 3143 }, { "epoch": 0.49425219595590403, "grad_norm": 0.2036089152097702, "learning_rate": 4.2873167780624634e-05, "loss": 1.0096, "step": 3144 }, { "epoch": 0.4944094008528366, "grad_norm": 0.16037362813949585, "learning_rate": 4.286884730154338e-05, "loss": 1.132, "step": 3145 }, { "epoch": 0.49456660574976913, "grad_norm": 0.17637239396572113, "learning_rate": 4.286452573110092e-05, "loss": 1.1883, "step": 3146 }, { "epoch": 0.4947238106467016, "grad_norm": 0.1752503663301468, "learning_rate": 4.28602030695612e-05, "loss": 1.1, "step": 3147 }, { "epoch": 0.4948810155436342, "grad_norm": 0.1588784009218216, "learning_rate": 4.285587931718823e-05, "loss": 1.0101, "step": 3148 }, { "epoch": 0.4950382204405667, "grad_norm": 0.18488885462284088, "learning_rate": 4.285155447424609e-05, "loss": 1.0712, "step": 3149 }, { "epoch": 0.4951954253374993, "grad_norm": 0.2019781470298767, "learning_rate": 4.284722854099892e-05, "loss": 0.9967, "step": 3150 }, { "epoch": 0.4953526302344318, "grad_norm": 0.12347476184368134, "learning_rate": 4.284290151771094e-05, "loss": 1.1921, "step": 3151 }, { "epoch": 0.4955098351313643, "grad_norm": 0.16759006679058075, "learning_rate": 4.283857340464642e-05, "loss": 1.0069, "step": 3152 }, { "epoch": 0.49566704002829687, "grad_norm": 0.2088867723941803, "learning_rate": 4.283424420206971e-05, "loss": 1.1376, "step": 3153 }, { "epoch": 0.4958242449252294, "grad_norm": 0.13864010572433472, "learning_rate": 4.282991391024521e-05, "loss": 1.2096, "step": 3154 }, { "epoch": 0.49598144982216197, "grad_norm": 0.1731109619140625, "learning_rate": 4.282558252943741e-05, "loss": 1.1171, "step": 3155 }, { "epoch": 0.4961386547190945, "grad_norm": 0.15584808588027954, "learning_rate": 4.2821250059910857e-05, "loss": 1.2236, "step": 3156 }, { "epoch": 0.496295859616027, "grad_norm": 0.14465118944644928, "learning_rate": 4.281691650193016e-05, "loss": 1.0953, "step": 3157 }, { "epoch": 0.49645306451295956, "grad_norm": 0.16158504784107208, "learning_rate": 4.281258185575998e-05, "loss": 1.0887, "step": 3158 }, { "epoch": 0.4966102694098921, "grad_norm": 0.15371543169021606, "learning_rate": 4.2808246121665075e-05, "loss": 1.1856, "step": 3159 }, { "epoch": 0.49676747430682466, "grad_norm": 0.15584081411361694, "learning_rate": 4.280390929991026e-05, "loss": 1.1122, "step": 3160 }, { "epoch": 0.4969246792037572, "grad_norm": 0.18433384597301483, "learning_rate": 4.27995713907604e-05, "loss": 1.0427, "step": 3161 }, { "epoch": 0.49708188410068976, "grad_norm": 0.13477365672588348, "learning_rate": 4.279523239448044e-05, "loss": 1.1669, "step": 3162 }, { "epoch": 0.49723908899762226, "grad_norm": 0.15489578247070312, "learning_rate": 4.27908923113354e-05, "loss": 1.1288, "step": 3163 }, { "epoch": 0.4973962938945548, "grad_norm": 0.16293781995773315, "learning_rate": 4.278655114159034e-05, "loss": 1.0994, "step": 3164 }, { "epoch": 0.49755349879148736, "grad_norm": 0.15055860579013824, "learning_rate": 4.278220888551041e-05, "loss": 1.0994, "step": 3165 }, { "epoch": 0.4977107036884199, "grad_norm": 0.16553258895874023, "learning_rate": 4.277786554336082e-05, "loss": 1.1377, "step": 3166 }, { "epoch": 0.49786790858535246, "grad_norm": 0.17897070944309235, "learning_rate": 4.277352111540685e-05, "loss": 1.0658, "step": 3167 }, { "epoch": 0.49802511348228495, "grad_norm": 0.17738717794418335, "learning_rate": 4.276917560191382e-05, "loss": 1.1975, "step": 3168 }, { "epoch": 0.4981823183792175, "grad_norm": 0.17369163036346436, "learning_rate": 4.2764829003147155e-05, "loss": 1.1395, "step": 3169 }, { "epoch": 0.49833952327615005, "grad_norm": 0.14780254662036896, "learning_rate": 4.276048131937233e-05, "loss": 1.2145, "step": 3170 }, { "epoch": 0.4984967281730826, "grad_norm": 0.15877056121826172, "learning_rate": 4.275613255085488e-05, "loss": 1.1127, "step": 3171 }, { "epoch": 0.49865393307001515, "grad_norm": 0.1346055120229721, "learning_rate": 4.2751782697860407e-05, "loss": 1.1607, "step": 3172 }, { "epoch": 0.49881113796694765, "grad_norm": 0.2160274088382721, "learning_rate": 4.2747431760654596e-05, "loss": 1.1515, "step": 3173 }, { "epoch": 0.4989683428638802, "grad_norm": 0.21286068856716156, "learning_rate": 4.274307973950317e-05, "loss": 1.1661, "step": 3174 }, { "epoch": 0.49912554776081275, "grad_norm": 0.2089259922504425, "learning_rate": 4.2738726634671944e-05, "loss": 1.1204, "step": 3175 }, { "epoch": 0.4992827526577453, "grad_norm": 0.20848344266414642, "learning_rate": 4.273437244642678e-05, "loss": 1.1643, "step": 3176 }, { "epoch": 0.49943995755467785, "grad_norm": 0.1899596005678177, "learning_rate": 4.273001717503364e-05, "loss": 1.1176, "step": 3177 }, { "epoch": 0.49959716245161034, "grad_norm": 0.14908720552921295, "learning_rate": 4.2725660820758494e-05, "loss": 1.1538, "step": 3178 }, { "epoch": 0.4997543673485429, "grad_norm": 0.2636309564113617, "learning_rate": 4.2721303383867426e-05, "loss": 1.0771, "step": 3179 }, { "epoch": 0.49991157224547544, "grad_norm": 0.14900867640972137, "learning_rate": 4.2716944864626585e-05, "loss": 1.08, "step": 3180 }, { "epoch": 0.5000687771424079, "grad_norm": 0.16542400419712067, "learning_rate": 4.271258526330215e-05, "loss": 1.1431, "step": 3181 }, { "epoch": 0.5002259820393405, "grad_norm": 0.29188400506973267, "learning_rate": 4.27082245801604e-05, "loss": 0.9727, "step": 3182 }, { "epoch": 0.500383186936273, "grad_norm": 0.13522717356681824, "learning_rate": 4.2703862815467674e-05, "loss": 1.1593, "step": 3183 }, { "epoch": 0.5005403918332056, "grad_norm": 0.16434258222579956, "learning_rate": 4.269949996949036e-05, "loss": 1.1627, "step": 3184 }, { "epoch": 0.5006975967301381, "grad_norm": 0.15872086584568024, "learning_rate": 4.2695136042494934e-05, "loss": 1.1622, "step": 3185 }, { "epoch": 0.5008548016270706, "grad_norm": 0.16532592475414276, "learning_rate": 4.2690771034747916e-05, "loss": 1.1645, "step": 3186 }, { "epoch": 0.5010120065240032, "grad_norm": 0.28270232677459717, "learning_rate": 4.2686404946515926e-05, "loss": 1.2106, "step": 3187 }, { "epoch": 0.5011692114209357, "grad_norm": 0.17568837106227875, "learning_rate": 4.26820377780656e-05, "loss": 1.1278, "step": 3188 }, { "epoch": 0.5013264163178683, "grad_norm": 0.21069854497909546, "learning_rate": 4.267766952966369e-05, "loss": 1.0745, "step": 3189 }, { "epoch": 0.5014836212148008, "grad_norm": 0.18528971076011658, "learning_rate": 4.267330020157698e-05, "loss": 1.2046, "step": 3190 }, { "epoch": 0.5016408261117333, "grad_norm": 0.18909358978271484, "learning_rate": 4.266892979407234e-05, "loss": 1.035, "step": 3191 }, { "epoch": 0.5017980310086659, "grad_norm": 0.198701873421669, "learning_rate": 4.26645583074167e-05, "loss": 1.0765, "step": 3192 }, { "epoch": 0.5019552359055984, "grad_norm": 0.14333534240722656, "learning_rate": 4.266018574187703e-05, "loss": 1.2583, "step": 3193 }, { "epoch": 0.502112440802531, "grad_norm": 0.1988966464996338, "learning_rate": 4.265581209772043e-05, "loss": 1.1101, "step": 3194 }, { "epoch": 0.5022696456994635, "grad_norm": 0.15289467573165894, "learning_rate": 4.265143737521399e-05, "loss": 1.042, "step": 3195 }, { "epoch": 0.502426850596396, "grad_norm": 0.1811387538909912, "learning_rate": 4.2647061574624916e-05, "loss": 1.0892, "step": 3196 }, { "epoch": 0.5025840554933286, "grad_norm": 0.1622971147298813, "learning_rate": 4.264268469622046e-05, "loss": 1.1167, "step": 3197 }, { "epoch": 0.5027412603902611, "grad_norm": 0.13076646625995636, "learning_rate": 4.263830674026795e-05, "loss": 1.1922, "step": 3198 }, { "epoch": 0.5028984652871937, "grad_norm": 0.16915321350097656, "learning_rate": 4.2633927707034785e-05, "loss": 1.0426, "step": 3199 }, { "epoch": 0.5030556701841262, "grad_norm": 0.15100719034671783, "learning_rate": 4.26295475967884e-05, "loss": 1.0658, "step": 3200 }, { "epoch": 0.5030556701841262, "eval_loss": 1.1112091541290283, "eval_runtime": 2329.136, "eval_samples_per_second": 3.975, "eval_steps_per_second": 1.987, "step": 3200 }, { "epoch": 0.5032128750810588, "grad_norm": 0.22507062554359436, "learning_rate": 4.262516640979632e-05, "loss": 1.0526, "step": 3201 }, { "epoch": 0.5033700799779913, "grad_norm": 0.1575806438922882, "learning_rate": 4.2620784146326134e-05, "loss": 1.1379, "step": 3202 }, { "epoch": 0.5035272848749238, "grad_norm": 0.23570889234542847, "learning_rate": 4.26164008066455e-05, "loss": 1.0043, "step": 3203 }, { "epoch": 0.5036844897718564, "grad_norm": 0.15176087617874146, "learning_rate": 4.261201639102214e-05, "loss": 1.1273, "step": 3204 }, { "epoch": 0.5038416946687889, "grad_norm": 0.2037598192691803, "learning_rate": 4.2607630899723815e-05, "loss": 1.1443, "step": 3205 }, { "epoch": 0.5039988995657215, "grad_norm": 0.18929214775562286, "learning_rate": 4.260324433301839e-05, "loss": 1.1648, "step": 3206 }, { "epoch": 0.504156104462654, "grad_norm": 0.1639530211687088, "learning_rate": 4.259885669117377e-05, "loss": 1.1273, "step": 3207 }, { "epoch": 0.5043133093595865, "grad_norm": 0.16363751888275146, "learning_rate": 4.259446797445795e-05, "loss": 1.022, "step": 3208 }, { "epoch": 0.5044705142565191, "grad_norm": 0.15995655953884125, "learning_rate": 4.2590078183138976e-05, "loss": 1.1725, "step": 3209 }, { "epoch": 0.5046277191534516, "grad_norm": 0.14447644352912903, "learning_rate": 4.258568731748494e-05, "loss": 1.1501, "step": 3210 }, { "epoch": 0.5047849240503842, "grad_norm": 0.183898463845253, "learning_rate": 4.258129537776405e-05, "loss": 1.1573, "step": 3211 }, { "epoch": 0.5049421289473167, "grad_norm": 0.16828583180904388, "learning_rate": 4.257690236424451e-05, "loss": 1.2383, "step": 3212 }, { "epoch": 0.5050993338442492, "grad_norm": 0.15795843303203583, "learning_rate": 4.257250827719466e-05, "loss": 1.0214, "step": 3213 }, { "epoch": 0.5052565387411818, "grad_norm": 0.2043066918849945, "learning_rate": 4.2568113116882854e-05, "loss": 0.9322, "step": 3214 }, { "epoch": 0.5054137436381143, "grad_norm": 0.15502794086933136, "learning_rate": 4.256371688357755e-05, "loss": 1.1537, "step": 3215 }, { "epoch": 0.5055709485350469, "grad_norm": 0.19955813884735107, "learning_rate": 4.255931957754725e-05, "loss": 1.0808, "step": 3216 }, { "epoch": 0.5057281534319794, "grad_norm": 0.13643762469291687, "learning_rate": 4.255492119906051e-05, "loss": 1.0076, "step": 3217 }, { "epoch": 0.5058853583289119, "grad_norm": 0.18335899710655212, "learning_rate": 4.2550521748385975e-05, "loss": 1.0465, "step": 3218 }, { "epoch": 0.5060425632258445, "grad_norm": 0.13511648774147034, "learning_rate": 4.254612122579235e-05, "loss": 1.0214, "step": 3219 }, { "epoch": 0.506199768122777, "grad_norm": 0.1416211724281311, "learning_rate": 4.25417196315484e-05, "loss": 1.0799, "step": 3220 }, { "epoch": 0.5063569730197096, "grad_norm": 0.1651175618171692, "learning_rate": 4.253731696592295e-05, "loss": 1.1739, "step": 3221 }, { "epoch": 0.5065141779166421, "grad_norm": 0.1566598117351532, "learning_rate": 4.253291322918491e-05, "loss": 1.0812, "step": 3222 }, { "epoch": 0.5066713828135746, "grad_norm": 0.1521669626235962, "learning_rate": 4.252850842160324e-05, "loss": 1.1608, "step": 3223 }, { "epoch": 0.5068285877105072, "grad_norm": 0.1704331487417221, "learning_rate": 4.252410254344696e-05, "loss": 1.1423, "step": 3224 }, { "epoch": 0.5069857926074397, "grad_norm": 0.1667756289243698, "learning_rate": 4.251969559498519e-05, "loss": 1.1358, "step": 3225 }, { "epoch": 0.5071429975043723, "grad_norm": 0.1840224266052246, "learning_rate": 4.251528757648705e-05, "loss": 1.1496, "step": 3226 }, { "epoch": 0.5073002024013048, "grad_norm": 0.2619423270225525, "learning_rate": 4.2510878488221794e-05, "loss": 1.058, "step": 3227 }, { "epoch": 0.5074574072982373, "grad_norm": 0.16940724849700928, "learning_rate": 4.2506468330458706e-05, "loss": 1.0027, "step": 3228 }, { "epoch": 0.5076146121951699, "grad_norm": 0.14660978317260742, "learning_rate": 4.250205710346714e-05, "loss": 1.064, "step": 3229 }, { "epoch": 0.5077718170921024, "grad_norm": 0.1640421599149704, "learning_rate": 4.249764480751652e-05, "loss": 1.1039, "step": 3230 }, { "epoch": 0.507929021989035, "grad_norm": 0.14718511700630188, "learning_rate": 4.249323144287632e-05, "loss": 1.1057, "step": 3231 }, { "epoch": 0.5080862268859675, "grad_norm": 0.16258497536182404, "learning_rate": 4.248881700981611e-05, "loss": 1.1102, "step": 3232 }, { "epoch": 0.5082434317829, "grad_norm": 0.18936386704444885, "learning_rate": 4.248440150860549e-05, "loss": 1.1816, "step": 3233 }, { "epoch": 0.5084006366798326, "grad_norm": 0.1487811654806137, "learning_rate": 4.2479984939514155e-05, "loss": 1.1632, "step": 3234 }, { "epoch": 0.5085578415767651, "grad_norm": 0.19328673183918, "learning_rate": 4.2475567302811846e-05, "loss": 1.085, "step": 3235 }, { "epoch": 0.5087150464736977, "grad_norm": 0.16193972527980804, "learning_rate": 4.2471148598768375e-05, "loss": 1.0772, "step": 3236 }, { "epoch": 0.5088722513706302, "grad_norm": 0.16668765246868134, "learning_rate": 4.246672882765362e-05, "loss": 1.1176, "step": 3237 }, { "epoch": 0.5090294562675627, "grad_norm": 0.24110597372055054, "learning_rate": 4.246230798973753e-05, "loss": 1.0638, "step": 3238 }, { "epoch": 0.5091866611644953, "grad_norm": 0.1574033796787262, "learning_rate": 4.24578860852901e-05, "loss": 1.2187, "step": 3239 }, { "epoch": 0.5093438660614278, "grad_norm": 0.14084014296531677, "learning_rate": 4.2453463114581414e-05, "loss": 1.0872, "step": 3240 }, { "epoch": 0.5095010709583604, "grad_norm": 0.16895703971385956, "learning_rate": 4.2449039077881616e-05, "loss": 1.0453, "step": 3241 }, { "epoch": 0.5096582758552929, "grad_norm": 0.2261844277381897, "learning_rate": 4.244461397546089e-05, "loss": 1.1039, "step": 3242 }, { "epoch": 0.5098154807522254, "grad_norm": 0.178606778383255, "learning_rate": 4.2440187807589515e-05, "loss": 1.043, "step": 3243 }, { "epoch": 0.509972685649158, "grad_norm": 0.1622498333454132, "learning_rate": 4.243576057453783e-05, "loss": 1.0987, "step": 3244 }, { "epoch": 0.5101298905460905, "grad_norm": 0.14160633087158203, "learning_rate": 4.243133227657622e-05, "loss": 1.1027, "step": 3245 }, { "epoch": 0.5102870954430231, "grad_norm": 0.14689600467681885, "learning_rate": 4.2426902913975165e-05, "loss": 1.113, "step": 3246 }, { "epoch": 0.5104443003399556, "grad_norm": 0.5163748860359192, "learning_rate": 4.242247248700518e-05, "loss": 0.9741, "step": 3247 }, { "epoch": 0.5106015052368881, "grad_norm": 0.13907118141651154, "learning_rate": 4.241804099593686e-05, "loss": 1.0915, "step": 3248 }, { "epoch": 0.5107587101338207, "grad_norm": 0.16090743243694305, "learning_rate": 4.241360844104087e-05, "loss": 1.1899, "step": 3249 }, { "epoch": 0.5109159150307532, "grad_norm": 0.1669221967458725, "learning_rate": 4.240917482258794e-05, "loss": 1.0962, "step": 3250 }, { "epoch": 0.5110731199276858, "grad_norm": 0.18096497654914856, "learning_rate": 4.240474014084884e-05, "loss": 1.2463, "step": 3251 }, { "epoch": 0.5112303248246183, "grad_norm": 0.1275794804096222, "learning_rate": 4.240030439609444e-05, "loss": 1.1379, "step": 3252 }, { "epoch": 0.5113875297215508, "grad_norm": 0.13416628539562225, "learning_rate": 4.239586758859564e-05, "loss": 1.1639, "step": 3253 }, { "epoch": 0.5115447346184834, "grad_norm": 0.15411725640296936, "learning_rate": 4.2391429718623445e-05, "loss": 1.0484, "step": 3254 }, { "epoch": 0.5117019395154159, "grad_norm": 0.15557514131069183, "learning_rate": 4.238699078644889e-05, "loss": 1.0661, "step": 3255 }, { "epoch": 0.5118591444123485, "grad_norm": 0.21330618858337402, "learning_rate": 4.238255079234309e-05, "loss": 1.2001, "step": 3256 }, { "epoch": 0.512016349309281, "grad_norm": 0.1585479974746704, "learning_rate": 4.237810973657722e-05, "loss": 1.1734, "step": 3257 }, { "epoch": 0.5121735542062136, "grad_norm": 0.1720305234193802, "learning_rate": 4.237366761942253e-05, "loss": 1.0198, "step": 3258 }, { "epoch": 0.5123307591031461, "grad_norm": 0.16566184163093567, "learning_rate": 4.2369224441150324e-05, "loss": 1.2449, "step": 3259 }, { "epoch": 0.5124879640000786, "grad_norm": 0.17463403940200806, "learning_rate": 4.236478020203198e-05, "loss": 1.1379, "step": 3260 }, { "epoch": 0.5126451688970112, "grad_norm": 0.16861651837825775, "learning_rate": 4.236033490233892e-05, "loss": 1.2033, "step": 3261 }, { "epoch": 0.5128023737939437, "grad_norm": 0.17625701427459717, "learning_rate": 4.2355888542342666e-05, "loss": 1.1482, "step": 3262 }, { "epoch": 0.5129595786908763, "grad_norm": 0.1449575424194336, "learning_rate": 4.2351441122314764e-05, "loss": 1.1017, "step": 3263 }, { "epoch": 0.5131167835878088, "grad_norm": 0.12848585844039917, "learning_rate": 4.234699264252687e-05, "loss": 1.1988, "step": 3264 }, { "epoch": 0.5132739884847413, "grad_norm": 0.16907472908496857, "learning_rate": 4.2342543103250654e-05, "loss": 0.9311, "step": 3265 }, { "epoch": 0.5134311933816739, "grad_norm": 0.1578347235918045, "learning_rate": 4.2338092504757896e-05, "loss": 1.1661, "step": 3266 }, { "epoch": 0.5135883982786064, "grad_norm": 0.15366345643997192, "learning_rate": 4.233364084732041e-05, "loss": 1.0702, "step": 3267 }, { "epoch": 0.513745603175539, "grad_norm": 0.14660310745239258, "learning_rate": 4.2329188131210094e-05, "loss": 1.0974, "step": 3268 }, { "epoch": 0.5139028080724715, "grad_norm": 0.1957070529460907, "learning_rate": 4.23247343566989e-05, "loss": 0.9215, "step": 3269 }, { "epoch": 0.514060012969404, "grad_norm": 0.16161002218723297, "learning_rate": 4.2320279524058855e-05, "loss": 1.0315, "step": 3270 }, { "epoch": 0.5142172178663366, "grad_norm": 0.15061277151107788, "learning_rate": 4.2315823633562025e-05, "loss": 1.2509, "step": 3271 }, { "epoch": 0.5143744227632691, "grad_norm": 0.13233885169029236, "learning_rate": 4.231136668548057e-05, "loss": 1.0631, "step": 3272 }, { "epoch": 0.5145316276602017, "grad_norm": 0.15317268669605255, "learning_rate": 4.230690868008671e-05, "loss": 1.151, "step": 3273 }, { "epoch": 0.5146888325571342, "grad_norm": 0.18332509696483612, "learning_rate": 4.2302449617652716e-05, "loss": 1.1068, "step": 3274 }, { "epoch": 0.5148460374540667, "grad_norm": 0.17250898480415344, "learning_rate": 4.229798949845093e-05, "loss": 1.1823, "step": 3275 }, { "epoch": 0.5150032423509993, "grad_norm": 0.14509278535842896, "learning_rate": 4.2293528322753754e-05, "loss": 1.1604, "step": 3276 }, { "epoch": 0.5151604472479318, "grad_norm": 0.20953206717967987, "learning_rate": 4.2289066090833674e-05, "loss": 1.0799, "step": 3277 }, { "epoch": 0.5153176521448644, "grad_norm": 0.14778871834278107, "learning_rate": 4.2284602802963216e-05, "loss": 1.1736, "step": 3278 }, { "epoch": 0.5154748570417969, "grad_norm": 0.20026050508022308, "learning_rate": 4.2280138459414976e-05, "loss": 1.1321, "step": 3279 }, { "epoch": 0.5156320619387293, "grad_norm": 0.20624299347400665, "learning_rate": 4.227567306046164e-05, "loss": 1.1062, "step": 3280 }, { "epoch": 0.515789266835662, "grad_norm": 0.12861621379852295, "learning_rate": 4.2271206606375904e-05, "loss": 1.1786, "step": 3281 }, { "epoch": 0.5159464717325944, "grad_norm": 0.1462821513414383, "learning_rate": 4.226673909743059e-05, "loss": 1.1002, "step": 3282 }, { "epoch": 0.516103676629527, "grad_norm": 0.1743149310350418, "learning_rate": 4.226227053389855e-05, "loss": 1.1643, "step": 3283 }, { "epoch": 0.5162608815264595, "grad_norm": 0.17978879809379578, "learning_rate": 4.22578009160527e-05, "loss": 1.2249, "step": 3284 }, { "epoch": 0.516418086423392, "grad_norm": 0.14567448198795319, "learning_rate": 4.2253330244166035e-05, "loss": 1.161, "step": 3285 }, { "epoch": 0.5165752913203246, "grad_norm": 0.1618671864271164, "learning_rate": 4.2248858518511605e-05, "loss": 1.2158, "step": 3286 }, { "epoch": 0.5167324962172571, "grad_norm": 0.16569803655147552, "learning_rate": 4.224438573936252e-05, "loss": 1.0314, "step": 3287 }, { "epoch": 0.5168897011141897, "grad_norm": 0.1353185921907425, "learning_rate": 4.223991190699197e-05, "loss": 1.0542, "step": 3288 }, { "epoch": 0.5170469060111222, "grad_norm": 0.16214190423488617, "learning_rate": 4.223543702167319e-05, "loss": 1.1062, "step": 3289 }, { "epoch": 0.5172041109080547, "grad_norm": 0.20043228566646576, "learning_rate": 4.2230961083679485e-05, "loss": 1.0467, "step": 3290 }, { "epoch": 0.5173613158049873, "grad_norm": 0.1516493409872055, "learning_rate": 4.222648409328425e-05, "loss": 1.2023, "step": 3291 }, { "epoch": 0.5175185207019198, "grad_norm": 0.15186642110347748, "learning_rate": 4.22220060507609e-05, "loss": 1.1551, "step": 3292 }, { "epoch": 0.5176757255988524, "grad_norm": 0.24731656908988953, "learning_rate": 4.221752695638296e-05, "loss": 1.1195, "step": 3293 }, { "epoch": 0.5178329304957849, "grad_norm": 0.14959721267223358, "learning_rate": 4.221304681042397e-05, "loss": 1.0982, "step": 3294 }, { "epoch": 0.5179901353927174, "grad_norm": 0.16160228848457336, "learning_rate": 4.220856561315757e-05, "loss": 1.0693, "step": 3295 }, { "epoch": 0.51814734028965, "grad_norm": 0.17099249362945557, "learning_rate": 4.220408336485746e-05, "loss": 1.0608, "step": 3296 }, { "epoch": 0.5183045451865825, "grad_norm": 0.1416684240102768, "learning_rate": 4.219960006579739e-05, "loss": 1.1713, "step": 3297 }, { "epoch": 0.5184617500835151, "grad_norm": 0.2992304265499115, "learning_rate": 4.21951157162512e-05, "loss": 1.0846, "step": 3298 }, { "epoch": 0.5186189549804476, "grad_norm": 0.14916300773620605, "learning_rate": 4.219063031649276e-05, "loss": 1.0304, "step": 3299 }, { "epoch": 0.5187761598773801, "grad_norm": 0.12246151268482208, "learning_rate": 4.2186143866796025e-05, "loss": 1.2179, "step": 3300 }, { "epoch": 0.5189333647743127, "grad_norm": 0.1627214252948761, "learning_rate": 4.218165636743502e-05, "loss": 1.1276, "step": 3301 }, { "epoch": 0.5190905696712452, "grad_norm": 0.14838017523288727, "learning_rate": 4.217716781868381e-05, "loss": 1.2015, "step": 3302 }, { "epoch": 0.5192477745681778, "grad_norm": 0.14416925609111786, "learning_rate": 4.217267822081654e-05, "loss": 1.296, "step": 3303 }, { "epoch": 0.5194049794651103, "grad_norm": 0.15439528226852417, "learning_rate": 4.2168187574107424e-05, "loss": 1.2018, "step": 3304 }, { "epoch": 0.5195621843620428, "grad_norm": 0.15193216502666473, "learning_rate": 4.216369587883073e-05, "loss": 1.0985, "step": 3305 }, { "epoch": 0.5197193892589754, "grad_norm": 0.14423884451389313, "learning_rate": 4.2159203135260804e-05, "loss": 1.1073, "step": 3306 }, { "epoch": 0.5198765941559079, "grad_norm": 0.15461623668670654, "learning_rate": 4.215470934367203e-05, "loss": 1.0362, "step": 3307 }, { "epoch": 0.5200337990528405, "grad_norm": 0.14225488901138306, "learning_rate": 4.215021450433888e-05, "loss": 1.0318, "step": 3308 }, { "epoch": 0.520191003949773, "grad_norm": 0.16612344980239868, "learning_rate": 4.214571861753588e-05, "loss": 1.1269, "step": 3309 }, { "epoch": 0.5203482088467056, "grad_norm": 0.21814720332622528, "learning_rate": 4.2141221683537624e-05, "loss": 1.2525, "step": 3310 }, { "epoch": 0.5205054137436381, "grad_norm": 0.15520301461219788, "learning_rate": 4.2136723702618765e-05, "loss": 1.1255, "step": 3311 }, { "epoch": 0.5206626186405706, "grad_norm": 0.15896332263946533, "learning_rate": 4.213222467505402e-05, "loss": 1.1534, "step": 3312 }, { "epoch": 0.5208198235375032, "grad_norm": 0.14684036374092102, "learning_rate": 4.212772460111818e-05, "loss": 1.1959, "step": 3313 }, { "epoch": 0.5209770284344357, "grad_norm": 0.1430623084306717, "learning_rate": 4.2123223481086084e-05, "loss": 1.2392, "step": 3314 }, { "epoch": 0.5211342333313683, "grad_norm": 0.19666410982608795, "learning_rate": 4.211872131523265e-05, "loss": 1.1901, "step": 3315 }, { "epoch": 0.5212914382283008, "grad_norm": 0.1641821265220642, "learning_rate": 4.211421810383285e-05, "loss": 1.0118, "step": 3316 }, { "epoch": 0.5214486431252333, "grad_norm": 0.16409622132778168, "learning_rate": 4.210971384716173e-05, "loss": 1.0415, "step": 3317 }, { "epoch": 0.5216058480221659, "grad_norm": 0.25037920475006104, "learning_rate": 4.2105208545494375e-05, "loss": 1.075, "step": 3318 }, { "epoch": 0.5217630529190984, "grad_norm": 0.1819911003112793, "learning_rate": 4.210070219910597e-05, "loss": 1.0585, "step": 3319 }, { "epoch": 0.521920257816031, "grad_norm": 0.19422422349452972, "learning_rate": 4.209619480827173e-05, "loss": 1.153, "step": 3320 }, { "epoch": 0.5220774627129635, "grad_norm": 0.22342312335968018, "learning_rate": 4.209168637326697e-05, "loss": 0.981, "step": 3321 }, { "epoch": 0.522234667609896, "grad_norm": 0.18515369296073914, "learning_rate": 4.208717689436703e-05, "loss": 1.089, "step": 3322 }, { "epoch": 0.5223918725068286, "grad_norm": 0.14646635949611664, "learning_rate": 4.208266637184734e-05, "loss": 1.061, "step": 3323 }, { "epoch": 0.5225490774037611, "grad_norm": 0.15196828544139862, "learning_rate": 4.207815480598338e-05, "loss": 1.1832, "step": 3324 }, { "epoch": 0.5227062823006937, "grad_norm": 0.1385664939880371, "learning_rate": 4.207364219705071e-05, "loss": 1.1758, "step": 3325 }, { "epoch": 0.5228634871976262, "grad_norm": 0.18854454159736633, "learning_rate": 4.206912854532492e-05, "loss": 1.0566, "step": 3326 }, { "epoch": 0.5230206920945587, "grad_norm": 0.16105569899082184, "learning_rate": 4.2064613851081717e-05, "loss": 1.0328, "step": 3327 }, { "epoch": 0.5231778969914913, "grad_norm": 0.15872891247272491, "learning_rate": 4.2060098114596824e-05, "loss": 1.0453, "step": 3328 }, { "epoch": 0.5233351018884238, "grad_norm": 0.1624576896429062, "learning_rate": 4.205558133614604e-05, "loss": 1.1419, "step": 3329 }, { "epoch": 0.5234923067853564, "grad_norm": 0.14295019209384918, "learning_rate": 4.205106351600525e-05, "loss": 1.1008, "step": 3330 }, { "epoch": 0.5236495116822889, "grad_norm": 0.19790048897266388, "learning_rate": 4.204654465445037e-05, "loss": 1.181, "step": 3331 }, { "epoch": 0.5238067165792214, "grad_norm": 0.1526133269071579, "learning_rate": 4.20420247517574e-05, "loss": 1.1758, "step": 3332 }, { "epoch": 0.523963921476154, "grad_norm": 0.18274520337581635, "learning_rate": 4.20375038082024e-05, "loss": 1.1191, "step": 3333 }, { "epoch": 0.5241211263730865, "grad_norm": 0.14900778234004974, "learning_rate": 4.20329818240615e-05, "loss": 1.01, "step": 3334 }, { "epoch": 0.5242783312700191, "grad_norm": 0.17267143726348877, "learning_rate": 4.202845879961086e-05, "loss": 1.006, "step": 3335 }, { "epoch": 0.5244355361669516, "grad_norm": 0.18299135565757751, "learning_rate": 4.202393473512676e-05, "loss": 1.0996, "step": 3336 }, { "epoch": 0.5245927410638841, "grad_norm": 0.1378333568572998, "learning_rate": 4.2019409630885485e-05, "loss": 1.1321, "step": 3337 }, { "epoch": 0.5247499459608167, "grad_norm": 0.184214249253273, "learning_rate": 4.2014883487163434e-05, "loss": 1.0813, "step": 3338 }, { "epoch": 0.5249071508577492, "grad_norm": 0.14198952913284302, "learning_rate": 4.201035630423703e-05, "loss": 1.2111, "step": 3339 }, { "epoch": 0.5250643557546818, "grad_norm": 0.18620987236499786, "learning_rate": 4.2005828082382784e-05, "loss": 1.0249, "step": 3340 }, { "epoch": 0.5252215606516143, "grad_norm": 0.17725297808647156, "learning_rate": 4.200129882187726e-05, "loss": 1.0831, "step": 3341 }, { "epoch": 0.5253787655485468, "grad_norm": 0.1756705790758133, "learning_rate": 4.19967685229971e-05, "loss": 1.1297, "step": 3342 }, { "epoch": 0.5255359704454794, "grad_norm": 0.18759998679161072, "learning_rate": 4.199223718601899e-05, "loss": 1.0577, "step": 3343 }, { "epoch": 0.5256931753424119, "grad_norm": 0.1379726082086563, "learning_rate": 4.198770481121967e-05, "loss": 1.053, "step": 3344 }, { "epoch": 0.5258503802393445, "grad_norm": 0.223505437374115, "learning_rate": 4.198317139887598e-05, "loss": 1.0955, "step": 3345 }, { "epoch": 0.526007585136277, "grad_norm": 0.1413012593984604, "learning_rate": 4.197863694926479e-05, "loss": 1.1897, "step": 3346 }, { "epoch": 0.5261647900332095, "grad_norm": 0.20731431245803833, "learning_rate": 4.1974101462663075e-05, "loss": 1.1718, "step": 3347 }, { "epoch": 0.5263219949301421, "grad_norm": 0.13680782914161682, "learning_rate": 4.19695649393478e-05, "loss": 1.1427, "step": 3348 }, { "epoch": 0.5264791998270746, "grad_norm": 0.1617634892463684, "learning_rate": 4.1965027379596077e-05, "loss": 0.9997, "step": 3349 }, { "epoch": 0.5266364047240072, "grad_norm": 0.12036359310150146, "learning_rate": 4.196048878368503e-05, "loss": 1.2218, "step": 3350 }, { "epoch": 0.5267936096209397, "grad_norm": 0.23155498504638672, "learning_rate": 4.195594915189186e-05, "loss": 1.1274, "step": 3351 }, { "epoch": 0.5269508145178722, "grad_norm": 0.1686663031578064, "learning_rate": 4.195140848449383e-05, "loss": 1.0256, "step": 3352 }, { "epoch": 0.5271080194148048, "grad_norm": 0.19032715260982513, "learning_rate": 4.1946866781768256e-05, "loss": 1.1329, "step": 3353 }, { "epoch": 0.5272652243117373, "grad_norm": 0.18130408227443695, "learning_rate": 4.1942324043992546e-05, "loss": 1.0983, "step": 3354 }, { "epoch": 0.5274224292086699, "grad_norm": 0.16532330214977264, "learning_rate": 4.193778027144414e-05, "loss": 1.153, "step": 3355 }, { "epoch": 0.5275796341056024, "grad_norm": 0.14856065809726715, "learning_rate": 4.1933235464400554e-05, "loss": 1.1062, "step": 3356 }, { "epoch": 0.5277368390025349, "grad_norm": 0.16016262769699097, "learning_rate": 4.1928689623139385e-05, "loss": 1.1015, "step": 3357 }, { "epoch": 0.5278940438994675, "grad_norm": 0.16563931107521057, "learning_rate": 4.192414274793825e-05, "loss": 1.074, "step": 3358 }, { "epoch": 0.5280512487964, "grad_norm": 0.18133817613124847, "learning_rate": 4.1919594839074884e-05, "loss": 1.0512, "step": 3359 }, { "epoch": 0.5282084536933326, "grad_norm": 0.14429429173469543, "learning_rate": 4.191504589682702e-05, "loss": 1.0633, "step": 3360 }, { "epoch": 0.5282084536933326, "eval_loss": 1.108935832977295, "eval_runtime": 2328.9809, "eval_samples_per_second": 3.975, "eval_steps_per_second": 1.988, "step": 3360 }, { "epoch": 0.5283656585902651, "grad_norm": 0.15838821232318878, "learning_rate": 4.1910495921472525e-05, "loss": 1.0943, "step": 3361 }, { "epoch": 0.5285228634871977, "grad_norm": 0.16242218017578125, "learning_rate": 4.190594491328928e-05, "loss": 1.1898, "step": 3362 }, { "epoch": 0.5286800683841302, "grad_norm": 0.15097495913505554, "learning_rate": 4.190139287255524e-05, "loss": 1.1512, "step": 3363 }, { "epoch": 0.5288372732810627, "grad_norm": 0.16428695619106293, "learning_rate": 4.1896839799548424e-05, "loss": 0.9446, "step": 3364 }, { "epoch": 0.5289944781779953, "grad_norm": 0.1545208841562271, "learning_rate": 4.189228569454693e-05, "loss": 1.1657, "step": 3365 }, { "epoch": 0.5291516830749278, "grad_norm": 0.14587946236133575, "learning_rate": 4.1887730557828886e-05, "loss": 1.216, "step": 3366 }, { "epoch": 0.5293088879718604, "grad_norm": 0.17108961939811707, "learning_rate": 4.188317438967252e-05, "loss": 1.1622, "step": 3367 }, { "epoch": 0.5294660928687929, "grad_norm": 0.1690489500761032, "learning_rate": 4.1878617190356095e-05, "loss": 1.1143, "step": 3368 }, { "epoch": 0.5296232977657254, "grad_norm": 0.1476382315158844, "learning_rate": 4.187405896015795e-05, "loss": 1.2572, "step": 3369 }, { "epoch": 0.529780502662658, "grad_norm": 0.12622134387493134, "learning_rate": 4.1869499699356494e-05, "loss": 1.0056, "step": 3370 }, { "epoch": 0.5299377075595905, "grad_norm": 0.1347476989030838, "learning_rate": 4.186493940823018e-05, "loss": 1.1663, "step": 3371 }, { "epoch": 0.5300949124565231, "grad_norm": 0.1345609575510025, "learning_rate": 4.186037808705753e-05, "loss": 1.0417, "step": 3372 }, { "epoch": 0.5302521173534556, "grad_norm": 0.18443985283374786, "learning_rate": 4.1855815736117135e-05, "loss": 0.9926, "step": 3373 }, { "epoch": 0.5304093222503881, "grad_norm": 0.15589120984077454, "learning_rate": 4.185125235568764e-05, "loss": 1.0637, "step": 3374 }, { "epoch": 0.5305665271473207, "grad_norm": 0.15485121309757233, "learning_rate": 4.184668794604777e-05, "loss": 1.1427, "step": 3375 }, { "epoch": 0.5307237320442532, "grad_norm": 0.14066243171691895, "learning_rate": 4.184212250747631e-05, "loss": 1.0034, "step": 3376 }, { "epoch": 0.5308809369411858, "grad_norm": 0.13589130342006683, "learning_rate": 4.183755604025208e-05, "loss": 1.1053, "step": 3377 }, { "epoch": 0.5310381418381183, "grad_norm": 0.13392047584056854, "learning_rate": 4.183298854465398e-05, "loss": 1.0974, "step": 3378 }, { "epoch": 0.5311953467350508, "grad_norm": 0.13384903967380524, "learning_rate": 4.182842002096099e-05, "loss": 1.0901, "step": 3379 }, { "epoch": 0.5313525516319834, "grad_norm": 0.1484198421239853, "learning_rate": 4.182385046945214e-05, "loss": 1.1706, "step": 3380 }, { "epoch": 0.5315097565289159, "grad_norm": 0.13713057339191437, "learning_rate": 4.1819279890406506e-05, "loss": 1.1243, "step": 3381 }, { "epoch": 0.5316669614258485, "grad_norm": 0.2148253470659256, "learning_rate": 4.181470828410325e-05, "loss": 1.1012, "step": 3382 }, { "epoch": 0.531824166322781, "grad_norm": 0.1586213856935501, "learning_rate": 4.1810135650821604e-05, "loss": 1.1232, "step": 3383 }, { "epoch": 0.5319813712197135, "grad_norm": 0.15845105051994324, "learning_rate": 4.180556199084082e-05, "loss": 1.1245, "step": 3384 }, { "epoch": 0.5321385761166461, "grad_norm": 0.14359626173973083, "learning_rate": 4.180098730444024e-05, "loss": 1.0408, "step": 3385 }, { "epoch": 0.5322957810135786, "grad_norm": 0.15530677139759064, "learning_rate": 4.179641159189929e-05, "loss": 0.9341, "step": 3386 }, { "epoch": 0.5324529859105112, "grad_norm": 0.16154368221759796, "learning_rate": 4.179183485349742e-05, "loss": 1.1421, "step": 3387 }, { "epoch": 0.5326101908074437, "grad_norm": 0.16615764796733856, "learning_rate": 4.178725708951418e-05, "loss": 1.0734, "step": 3388 }, { "epoch": 0.5327673957043761, "grad_norm": 0.2428814321756363, "learning_rate": 4.178267830022913e-05, "loss": 1.0714, "step": 3389 }, { "epoch": 0.5329246006013088, "grad_norm": 0.14412826299667358, "learning_rate": 4.177809848592195e-05, "loss": 1.2244, "step": 3390 }, { "epoch": 0.5330818054982412, "grad_norm": 0.18421795964241028, "learning_rate": 4.177351764687235e-05, "loss": 1.1815, "step": 3391 }, { "epoch": 0.5332390103951739, "grad_norm": 0.14347229897975922, "learning_rate": 4.176893578336012e-05, "loss": 1.0274, "step": 3392 }, { "epoch": 0.5333962152921063, "grad_norm": 0.12033331394195557, "learning_rate": 4.1764352895665085e-05, "loss": 1.1543, "step": 3393 }, { "epoch": 0.5335534201890388, "grad_norm": 0.15218985080718994, "learning_rate": 4.175976898406716e-05, "loss": 1.1674, "step": 3394 }, { "epoch": 0.5337106250859714, "grad_norm": 0.13008874654769897, "learning_rate": 4.1755184048846316e-05, "loss": 0.9576, "step": 3395 }, { "epoch": 0.5338678299829039, "grad_norm": 0.14899078011512756, "learning_rate": 4.175059809028258e-05, "loss": 1.0168, "step": 3396 }, { "epoch": 0.5340250348798365, "grad_norm": 0.1490127146244049, "learning_rate": 4.1746011108656045e-05, "loss": 1.1041, "step": 3397 }, { "epoch": 0.534182239776769, "grad_norm": 0.15424524247646332, "learning_rate": 4.1741423104246855e-05, "loss": 1.1484, "step": 3398 }, { "epoch": 0.5343394446737015, "grad_norm": 0.13760043680667877, "learning_rate": 4.173683407733525e-05, "loss": 1.1936, "step": 3399 }, { "epoch": 0.5344966495706341, "grad_norm": 0.1429462432861328, "learning_rate": 4.1732244028201495e-05, "loss": 1.1503, "step": 3400 }, { "epoch": 0.5346538544675666, "grad_norm": 0.1386295109987259, "learning_rate": 4.172765295712594e-05, "loss": 1.0223, "step": 3401 }, { "epoch": 0.5348110593644992, "grad_norm": 0.1693735420703888, "learning_rate": 4.172306086438898e-05, "loss": 1.1685, "step": 3402 }, { "epoch": 0.5349682642614317, "grad_norm": 0.19189637899398804, "learning_rate": 4.1718467750271095e-05, "loss": 1.1067, "step": 3403 }, { "epoch": 0.5351254691583642, "grad_norm": 0.14684249460697174, "learning_rate": 4.1713873615052815e-05, "loss": 1.0856, "step": 3404 }, { "epoch": 0.5352826740552968, "grad_norm": 0.15605910122394562, "learning_rate": 4.1709278459014713e-05, "loss": 1.045, "step": 3405 }, { "epoch": 0.5354398789522293, "grad_norm": 0.14440178871154785, "learning_rate": 4.170468228243747e-05, "loss": 1.0972, "step": 3406 }, { "epoch": 0.5355970838491619, "grad_norm": 0.13100458681583405, "learning_rate": 4.170008508560178e-05, "loss": 1.2334, "step": 3407 }, { "epoch": 0.5357542887460944, "grad_norm": 0.19869981706142426, "learning_rate": 4.1695486868788435e-05, "loss": 1.1948, "step": 3408 }, { "epoch": 0.5359114936430269, "grad_norm": 0.1492564082145691, "learning_rate": 4.169088763227828e-05, "loss": 1.0754, "step": 3409 }, { "epoch": 0.5360686985399595, "grad_norm": 0.1578557938337326, "learning_rate": 4.168628737635221e-05, "loss": 1.1661, "step": 3410 }, { "epoch": 0.536225903436892, "grad_norm": 0.12749029695987701, "learning_rate": 4.1681686101291194e-05, "loss": 1.1115, "step": 3411 }, { "epoch": 0.5363831083338246, "grad_norm": 0.12497510015964508, "learning_rate": 4.167708380737626e-05, "loss": 1.1506, "step": 3412 }, { "epoch": 0.5365403132307571, "grad_norm": 0.18353544175624847, "learning_rate": 4.1672480494888496e-05, "loss": 1.0797, "step": 3413 }, { "epoch": 0.5366975181276897, "grad_norm": 0.19030232727527618, "learning_rate": 4.1667876164109065e-05, "loss": 1.0587, "step": 3414 }, { "epoch": 0.5368547230246222, "grad_norm": 0.1877097338438034, "learning_rate": 4.1663270815319176e-05, "loss": 1.1747, "step": 3415 }, { "epoch": 0.5370119279215547, "grad_norm": 0.1300366222858429, "learning_rate": 4.16586644488001e-05, "loss": 1.0576, "step": 3416 }, { "epoch": 0.5371691328184873, "grad_norm": 0.15688586235046387, "learning_rate": 4.165405706483318e-05, "loss": 1.0936, "step": 3417 }, { "epoch": 0.5373263377154198, "grad_norm": 0.17134688794612885, "learning_rate": 4.164944866369983e-05, "loss": 1.1725, "step": 3418 }, { "epoch": 0.5374835426123524, "grad_norm": 0.19654420018196106, "learning_rate": 4.164483924568149e-05, "loss": 1.1048, "step": 3419 }, { "epoch": 0.5376407475092849, "grad_norm": 0.1443120241165161, "learning_rate": 4.16402288110597e-05, "loss": 1.1554, "step": 3420 }, { "epoch": 0.5377979524062174, "grad_norm": 0.1997511386871338, "learning_rate": 4.1635617360116056e-05, "loss": 1.1937, "step": 3421 }, { "epoch": 0.53795515730315, "grad_norm": 0.13346394896507263, "learning_rate": 4.1631004893132186e-05, "loss": 1.1473, "step": 3422 }, { "epoch": 0.5381123622000825, "grad_norm": 0.15485936403274536, "learning_rate": 4.162639141038982e-05, "loss": 1.0169, "step": 3423 }, { "epoch": 0.5382695670970151, "grad_norm": 0.14727017283439636, "learning_rate": 4.1621776912170726e-05, "loss": 1.1214, "step": 3424 }, { "epoch": 0.5384267719939476, "grad_norm": 0.12584324181079865, "learning_rate": 4.161716139875674e-05, "loss": 1.1039, "step": 3425 }, { "epoch": 0.5385839768908801, "grad_norm": 0.15168297290802002, "learning_rate": 4.161254487042976e-05, "loss": 1.1374, "step": 3426 }, { "epoch": 0.5387411817878127, "grad_norm": 0.16526435315608978, "learning_rate": 4.1607927327471746e-05, "loss": 1.1036, "step": 3427 }, { "epoch": 0.5388983866847452, "grad_norm": 0.13458026945590973, "learning_rate": 4.160330877016472e-05, "loss": 1.2455, "step": 3428 }, { "epoch": 0.5390555915816778, "grad_norm": 0.1569700390100479, "learning_rate": 4.159868919879076e-05, "loss": 1.0301, "step": 3429 }, { "epoch": 0.5392127964786103, "grad_norm": 0.35492467880249023, "learning_rate": 4.159406861363202e-05, "loss": 1.1051, "step": 3430 }, { "epoch": 0.5393700013755428, "grad_norm": 0.13499435782432556, "learning_rate": 4.158944701497071e-05, "loss": 1.0865, "step": 3431 }, { "epoch": 0.5395272062724754, "grad_norm": 0.1455809772014618, "learning_rate": 4.1584824403089096e-05, "loss": 1.194, "step": 3432 }, { "epoch": 0.5396844111694079, "grad_norm": 0.13883116841316223, "learning_rate": 4.1580200778269504e-05, "loss": 1.0438, "step": 3433 }, { "epoch": 0.5398416160663405, "grad_norm": 0.32132571935653687, "learning_rate": 4.157557614079433e-05, "loss": 1.1206, "step": 3434 }, { "epoch": 0.539998820963273, "grad_norm": 0.17668181657791138, "learning_rate": 4.157095049094604e-05, "loss": 1.0786, "step": 3435 }, { "epoch": 0.5401560258602055, "grad_norm": 0.16002698242664337, "learning_rate": 4.156632382900713e-05, "loss": 1.0721, "step": 3436 }, { "epoch": 0.5403132307571381, "grad_norm": 0.14776268601417542, "learning_rate": 4.15616961552602e-05, "loss": 1.1081, "step": 3437 }, { "epoch": 0.5404704356540706, "grad_norm": 0.186686173081398, "learning_rate": 4.155706746998788e-05, "loss": 1.2066, "step": 3438 }, { "epoch": 0.5406276405510032, "grad_norm": 0.1298559308052063, "learning_rate": 4.155243777347287e-05, "loss": 1.1499, "step": 3439 }, { "epoch": 0.5407848454479357, "grad_norm": 0.14942237734794617, "learning_rate": 4.154780706599795e-05, "loss": 1.1508, "step": 3440 }, { "epoch": 0.5409420503448682, "grad_norm": 0.13668204843997955, "learning_rate": 4.154317534784593e-05, "loss": 1.1353, "step": 3441 }, { "epoch": 0.5410992552418008, "grad_norm": 0.1622249037027359, "learning_rate": 4.15385426192997e-05, "loss": 1.1168, "step": 3442 }, { "epoch": 0.5412564601387333, "grad_norm": 0.1504237949848175, "learning_rate": 4.1533908880642206e-05, "loss": 1.0531, "step": 3443 }, { "epoch": 0.5414136650356659, "grad_norm": 0.19511030614376068, "learning_rate": 4.152927413215647e-05, "loss": 1.017, "step": 3444 }, { "epoch": 0.5415708699325984, "grad_norm": 0.15601330995559692, "learning_rate": 4.1524638374125565e-05, "loss": 1.0748, "step": 3445 }, { "epoch": 0.5417280748295309, "grad_norm": 0.21786673367023468, "learning_rate": 4.1520001606832616e-05, "loss": 1.1413, "step": 3446 }, { "epoch": 0.5418852797264635, "grad_norm": 0.1456231325864792, "learning_rate": 4.1515363830560824e-05, "loss": 1.0997, "step": 3447 }, { "epoch": 0.542042484623396, "grad_norm": 0.17294257879257202, "learning_rate": 4.151072504559344e-05, "loss": 1.1522, "step": 3448 }, { "epoch": 0.5421996895203286, "grad_norm": 0.17040883004665375, "learning_rate": 4.15060852522138e-05, "loss": 1.1087, "step": 3449 }, { "epoch": 0.5423568944172611, "grad_norm": 0.18714837729930878, "learning_rate": 4.150144445070527e-05, "loss": 1.1267, "step": 3450 }, { "epoch": 0.5425140993141936, "grad_norm": 0.1632552444934845, "learning_rate": 4.1496802641351295e-05, "loss": 1.1252, "step": 3451 }, { "epoch": 0.5426713042111262, "grad_norm": 0.15470536053180695, "learning_rate": 4.1492159824435386e-05, "loss": 1.1384, "step": 3452 }, { "epoch": 0.5428285091080587, "grad_norm": 0.1544426828622818, "learning_rate": 4.14875160002411e-05, "loss": 1.1067, "step": 3453 }, { "epoch": 0.5429857140049913, "grad_norm": 0.1383865624666214, "learning_rate": 4.1482871169052065e-05, "loss": 1.2324, "step": 3454 }, { "epoch": 0.5431429189019238, "grad_norm": 0.22764433920383453, "learning_rate": 4.1478225331151976e-05, "loss": 1.0468, "step": 3455 }, { "epoch": 0.5433001237988563, "grad_norm": 0.12846636772155762, "learning_rate": 4.1473578486824585e-05, "loss": 1.1248, "step": 3456 }, { "epoch": 0.5434573286957889, "grad_norm": 0.17757047712802887, "learning_rate": 4.146893063635369e-05, "loss": 1.125, "step": 3457 }, { "epoch": 0.5436145335927214, "grad_norm": 0.16342827677726746, "learning_rate": 4.1464281780023165e-05, "loss": 1.1254, "step": 3458 }, { "epoch": 0.543771738489654, "grad_norm": 0.15827199816703796, "learning_rate": 4.145963191811696e-05, "loss": 1.0685, "step": 3459 }, { "epoch": 0.5439289433865865, "grad_norm": 0.12205367535352707, "learning_rate": 4.1454981050919064e-05, "loss": 1.1013, "step": 3460 }, { "epoch": 0.544086148283519, "grad_norm": 0.1325395554304123, "learning_rate": 4.1450329178713535e-05, "loss": 1.1054, "step": 3461 }, { "epoch": 0.5442433531804516, "grad_norm": 0.13637445867061615, "learning_rate": 4.144567630178447e-05, "loss": 1.0977, "step": 3462 }, { "epoch": 0.5444005580773841, "grad_norm": 0.17633673548698425, "learning_rate": 4.144102242041609e-05, "loss": 1.1647, "step": 3463 }, { "epoch": 0.5445577629743167, "grad_norm": 0.2620643675327301, "learning_rate": 4.1436367534892604e-05, "loss": 1.1291, "step": 3464 }, { "epoch": 0.5447149678712492, "grad_norm": 0.18059346079826355, "learning_rate": 4.1431711645498325e-05, "loss": 0.998, "step": 3465 }, { "epoch": 0.5448721727681818, "grad_norm": 0.13098302483558655, "learning_rate": 4.1427054752517626e-05, "loss": 1.1569, "step": 3466 }, { "epoch": 0.5450293776651143, "grad_norm": 0.14102932810783386, "learning_rate": 4.142239685623492e-05, "loss": 1.1395, "step": 3467 }, { "epoch": 0.5451865825620468, "grad_norm": 0.15154092013835907, "learning_rate": 4.1417737956934696e-05, "loss": 1.1194, "step": 3468 }, { "epoch": 0.5453437874589794, "grad_norm": 0.17950110137462616, "learning_rate": 4.14130780549015e-05, "loss": 1.1436, "step": 3469 }, { "epoch": 0.5455009923559119, "grad_norm": 0.15299329161643982, "learning_rate": 4.140841715041995e-05, "loss": 1.0516, "step": 3470 }, { "epoch": 0.5456581972528445, "grad_norm": 0.15179339051246643, "learning_rate": 4.140375524377471e-05, "loss": 1.0996, "step": 3471 }, { "epoch": 0.545815402149777, "grad_norm": 0.1765875220298767, "learning_rate": 4.13990923352505e-05, "loss": 1.1138, "step": 3472 }, { "epoch": 0.5459726070467095, "grad_norm": 0.15171504020690918, "learning_rate": 4.139442842513214e-05, "loss": 1.0216, "step": 3473 }, { "epoch": 0.5461298119436421, "grad_norm": 0.13052770495414734, "learning_rate": 4.138976351370446e-05, "loss": 1.0444, "step": 3474 }, { "epoch": 0.5462870168405746, "grad_norm": 0.17383016645908356, "learning_rate": 4.138509760125239e-05, "loss": 1.1402, "step": 3475 }, { "epoch": 0.5464442217375072, "grad_norm": 0.17007580399513245, "learning_rate": 4.138043068806089e-05, "loss": 1.0068, "step": 3476 }, { "epoch": 0.5466014266344397, "grad_norm": 0.14631864428520203, "learning_rate": 4.137576277441501e-05, "loss": 1.1191, "step": 3477 }, { "epoch": 0.5467586315313722, "grad_norm": 0.16629987955093384, "learning_rate": 4.137109386059985e-05, "loss": 1.062, "step": 3478 }, { "epoch": 0.5469158364283048, "grad_norm": 0.17852547764778137, "learning_rate": 4.1366423946900565e-05, "loss": 1.0064, "step": 3479 }, { "epoch": 0.5470730413252373, "grad_norm": 0.14198686182498932, "learning_rate": 4.1361753033602365e-05, "loss": 1.159, "step": 3480 }, { "epoch": 0.5472302462221699, "grad_norm": 0.13683508336544037, "learning_rate": 4.135708112099056e-05, "loss": 1.1204, "step": 3481 }, { "epoch": 0.5473874511191024, "grad_norm": 0.14965085685253143, "learning_rate": 4.135240820935046e-05, "loss": 1.1551, "step": 3482 }, { "epoch": 0.5475446560160349, "grad_norm": 0.14537665247917175, "learning_rate": 4.1347734298967486e-05, "loss": 1.0633, "step": 3483 }, { "epoch": 0.5477018609129675, "grad_norm": 0.14488355815410614, "learning_rate": 4.13430593901271e-05, "loss": 1.1625, "step": 3484 }, { "epoch": 0.5478590658099, "grad_norm": 0.12533707916736603, "learning_rate": 4.1338383483114834e-05, "loss": 1.1789, "step": 3485 }, { "epoch": 0.5480162707068326, "grad_norm": 0.1712876558303833, "learning_rate": 4.133370657821627e-05, "loss": 1.0927, "step": 3486 }, { "epoch": 0.5481734756037651, "grad_norm": 0.13493269681930542, "learning_rate": 4.1329028675717044e-05, "loss": 1.1271, "step": 3487 }, { "epoch": 0.5483306805006976, "grad_norm": 0.1372552216053009, "learning_rate": 4.132434977590288e-05, "loss": 1.1048, "step": 3488 }, { "epoch": 0.5484878853976302, "grad_norm": 0.16924694180488586, "learning_rate": 4.131966987905954e-05, "loss": 0.9947, "step": 3489 }, { "epoch": 0.5486450902945627, "grad_norm": 0.1597478687763214, "learning_rate": 4.131498898547286e-05, "loss": 1.0079, "step": 3490 }, { "epoch": 0.5488022951914953, "grad_norm": 0.14878109097480774, "learning_rate": 4.1310307095428726e-05, "loss": 0.9512, "step": 3491 }, { "epoch": 0.5489595000884278, "grad_norm": 0.1394796073436737, "learning_rate": 4.1305624209213084e-05, "loss": 1.0678, "step": 3492 }, { "epoch": 0.5491167049853602, "grad_norm": 0.1500493586063385, "learning_rate": 4.130094032711196e-05, "loss": 1.1213, "step": 3493 }, { "epoch": 0.5492739098822929, "grad_norm": 0.15943877398967743, "learning_rate": 4.129625544941142e-05, "loss": 1.1808, "step": 3494 }, { "epoch": 0.5494311147792253, "grad_norm": 0.13630403578281403, "learning_rate": 4.1291569576397604e-05, "loss": 1.1033, "step": 3495 }, { "epoch": 0.549588319676158, "grad_norm": 0.13915424048900604, "learning_rate": 4.12868827083567e-05, "loss": 1.1821, "step": 3496 }, { "epoch": 0.5497455245730904, "grad_norm": 0.13302691280841827, "learning_rate": 4.1282194845574966e-05, "loss": 1.1093, "step": 3497 }, { "epoch": 0.5499027294700229, "grad_norm": 0.14590945839881897, "learning_rate": 4.127750598833873e-05, "loss": 1.0629, "step": 3498 }, { "epoch": 0.5500599343669555, "grad_norm": 0.20246762037277222, "learning_rate": 4.127281613693435e-05, "loss": 1.25, "step": 3499 }, { "epoch": 0.550217139263888, "grad_norm": 0.17783774435520172, "learning_rate": 4.126812529164828e-05, "loss": 1.0606, "step": 3500 }, { "epoch": 0.5503743441608206, "grad_norm": 0.19174616038799286, "learning_rate": 4.126343345276701e-05, "loss": 1.0292, "step": 3501 }, { "epoch": 0.5505315490577531, "grad_norm": 0.2838459014892578, "learning_rate": 4.1258740620577104e-05, "loss": 1.0346, "step": 3502 }, { "epoch": 0.5506887539546856, "grad_norm": 0.15128780901432037, "learning_rate": 4.125404679536518e-05, "loss": 1.0671, "step": 3503 }, { "epoch": 0.5508459588516182, "grad_norm": 0.14986170828342438, "learning_rate": 4.1249351977417926e-05, "loss": 1.1325, "step": 3504 }, { "epoch": 0.5510031637485507, "grad_norm": 0.13601899147033691, "learning_rate": 4.124465616702207e-05, "loss": 1.1468, "step": 3505 }, { "epoch": 0.5511603686454833, "grad_norm": 0.1459992378950119, "learning_rate": 4.123995936446443e-05, "loss": 1.0934, "step": 3506 }, { "epoch": 0.5513175735424158, "grad_norm": 0.20352213084697723, "learning_rate": 4.123526157003186e-05, "loss": 1.0711, "step": 3507 }, { "epoch": 0.5514747784393483, "grad_norm": 0.1530255526304245, "learning_rate": 4.123056278401128e-05, "loss": 1.192, "step": 3508 }, { "epoch": 0.5516319833362809, "grad_norm": 0.1644168347120285, "learning_rate": 4.122586300668968e-05, "loss": 1.1092, "step": 3509 }, { "epoch": 0.5517891882332134, "grad_norm": 0.14001496136188507, "learning_rate": 4.12211622383541e-05, "loss": 1.1029, "step": 3510 }, { "epoch": 0.551946393130146, "grad_norm": 0.15016533434391022, "learning_rate": 4.121646047929165e-05, "loss": 1.0815, "step": 3511 }, { "epoch": 0.5521035980270785, "grad_norm": 0.23084966838359833, "learning_rate": 4.12117577297895e-05, "loss": 1.0295, "step": 3512 }, { "epoch": 0.552260802924011, "grad_norm": 0.1836826652288437, "learning_rate": 4.120705399013486e-05, "loss": 1.137, "step": 3513 }, { "epoch": 0.5524180078209436, "grad_norm": 0.15557770431041718, "learning_rate": 4.1202349260615034e-05, "loss": 1.1611, "step": 3514 }, { "epoch": 0.5525752127178761, "grad_norm": 0.1517626792192459, "learning_rate": 4.119764354151736e-05, "loss": 1.0761, "step": 3515 }, { "epoch": 0.5527324176148087, "grad_norm": 0.14277459681034088, "learning_rate": 4.119293683312924e-05, "loss": 1.2162, "step": 3516 }, { "epoch": 0.5528896225117412, "grad_norm": 0.14309588074684143, "learning_rate": 4.1188229135738154e-05, "loss": 1.0045, "step": 3517 }, { "epoch": 0.5530468274086738, "grad_norm": 0.18487317860126495, "learning_rate": 4.1183520449631615e-05, "loss": 1.0144, "step": 3518 }, { "epoch": 0.5532040323056063, "grad_norm": 0.12368316948413849, "learning_rate": 4.117881077509723e-05, "loss": 1.0489, "step": 3519 }, { "epoch": 0.5533612372025388, "grad_norm": 0.12940697371959686, "learning_rate": 4.117410011242264e-05, "loss": 1.1486, "step": 3520 }, { "epoch": 0.5533612372025388, "eval_loss": 1.107121229171753, "eval_runtime": 2329.4939, "eval_samples_per_second": 3.974, "eval_steps_per_second": 1.987, "step": 3520 }, { "epoch": 0.5535184420994714, "grad_norm": 0.14380988478660583, "learning_rate": 4.1169388461895557e-05, "loss": 1.1459, "step": 3521 }, { "epoch": 0.5536756469964039, "grad_norm": 0.2936558723449707, "learning_rate": 4.116467582380374e-05, "loss": 1.0386, "step": 3522 }, { "epoch": 0.5538328518933365, "grad_norm": 0.1526966094970703, "learning_rate": 4.1159962198435034e-05, "loss": 1.0906, "step": 3523 }, { "epoch": 0.553990056790269, "grad_norm": 0.14400209486484528, "learning_rate": 4.115524758607731e-05, "loss": 1.0553, "step": 3524 }, { "epoch": 0.5541472616872015, "grad_norm": 0.1706121563911438, "learning_rate": 4.1150531987018535e-05, "loss": 1.1487, "step": 3525 }, { "epoch": 0.5543044665841341, "grad_norm": 0.19591955840587616, "learning_rate": 4.114581540154672e-05, "loss": 1.0226, "step": 3526 }, { "epoch": 0.5544616714810666, "grad_norm": 0.18198204040527344, "learning_rate": 4.114109782994993e-05, "loss": 1.05, "step": 3527 }, { "epoch": 0.5546188763779992, "grad_norm": 0.13738413155078888, "learning_rate": 4.113637927251629e-05, "loss": 1.1371, "step": 3528 }, { "epoch": 0.5547760812749317, "grad_norm": 0.15277525782585144, "learning_rate": 4.1131659729534006e-05, "loss": 1.2617, "step": 3529 }, { "epoch": 0.5549332861718642, "grad_norm": 0.12683087587356567, "learning_rate": 4.112693920129132e-05, "loss": 1.1409, "step": 3530 }, { "epoch": 0.5550904910687968, "grad_norm": 0.13529619574546814, "learning_rate": 4.1122217688076546e-05, "loss": 1.109, "step": 3531 }, { "epoch": 0.5552476959657293, "grad_norm": 0.152116596698761, "learning_rate": 4.111749519017806e-05, "loss": 1.203, "step": 3532 }, { "epoch": 0.5554049008626619, "grad_norm": 0.15577782690525055, "learning_rate": 4.111277170788428e-05, "loss": 1.0422, "step": 3533 }, { "epoch": 0.5555621057595944, "grad_norm": 0.1772596538066864, "learning_rate": 4.110804724148373e-05, "loss": 1.2143, "step": 3534 }, { "epoch": 0.5557193106565269, "grad_norm": 0.1525980532169342, "learning_rate": 4.110332179126493e-05, "loss": 1.0592, "step": 3535 }, { "epoch": 0.5558765155534595, "grad_norm": 0.15575167536735535, "learning_rate": 4.109859535751649e-05, "loss": 1.1221, "step": 3536 }, { "epoch": 0.556033720450392, "grad_norm": 0.15791207551956177, "learning_rate": 4.1093867940527115e-05, "loss": 1.0413, "step": 3537 }, { "epoch": 0.5561909253473246, "grad_norm": 0.22039160132408142, "learning_rate": 4.108913954058552e-05, "loss": 1.0689, "step": 3538 }, { "epoch": 0.5563481302442571, "grad_norm": 0.1370696723461151, "learning_rate": 4.1084410157980484e-05, "loss": 1.1589, "step": 3539 }, { "epoch": 0.5565053351411896, "grad_norm": 0.14719007909297943, "learning_rate": 4.107967979300088e-05, "loss": 1.1352, "step": 3540 }, { "epoch": 0.5566625400381222, "grad_norm": 0.1324034333229065, "learning_rate": 4.107494844593561e-05, "loss": 1.2209, "step": 3541 }, { "epoch": 0.5568197449350547, "grad_norm": 0.13277071714401245, "learning_rate": 4.107021611707366e-05, "loss": 1.1789, "step": 3542 }, { "epoch": 0.5569769498319873, "grad_norm": 0.14232930541038513, "learning_rate": 4.106548280670405e-05, "loss": 1.0172, "step": 3543 }, { "epoch": 0.5571341547289198, "grad_norm": 0.1549500972032547, "learning_rate": 4.106074851511587e-05, "loss": 0.9018, "step": 3544 }, { "epoch": 0.5572913596258523, "grad_norm": 0.1389908343553543, "learning_rate": 4.1056013242598276e-05, "loss": 1.1046, "step": 3545 }, { "epoch": 0.5574485645227849, "grad_norm": 0.13126444816589355, "learning_rate": 4.105127698944049e-05, "loss": 1.2158, "step": 3546 }, { "epoch": 0.5576057694197174, "grad_norm": 0.1347741037607193, "learning_rate": 4.104653975593177e-05, "loss": 1.1548, "step": 3547 }, { "epoch": 0.55776297431665, "grad_norm": 0.15513011813163757, "learning_rate": 4.104180154236146e-05, "loss": 1.1751, "step": 3548 }, { "epoch": 0.5579201792135825, "grad_norm": 0.14042234420776367, "learning_rate": 4.103706234901894e-05, "loss": 1.019, "step": 3549 }, { "epoch": 0.558077384110515, "grad_norm": 0.12702056765556335, "learning_rate": 4.103232217619368e-05, "loss": 1.1166, "step": 3550 }, { "epoch": 0.5582345890074476, "grad_norm": 0.14291422069072723, "learning_rate": 4.102758102417517e-05, "loss": 1.189, "step": 3551 }, { "epoch": 0.5583917939043801, "grad_norm": 0.1567365825176239, "learning_rate": 4.102283889325299e-05, "loss": 1.0894, "step": 3552 }, { "epoch": 0.5585489988013127, "grad_norm": 0.16741162538528442, "learning_rate": 4.101809578371678e-05, "loss": 1.0622, "step": 3553 }, { "epoch": 0.5587062036982452, "grad_norm": 0.2240571677684784, "learning_rate": 4.101335169585623e-05, "loss": 1.1743, "step": 3554 }, { "epoch": 0.5588634085951777, "grad_norm": 0.14278970658779144, "learning_rate": 4.100860662996108e-05, "loss": 1.1325, "step": 3555 }, { "epoch": 0.5590206134921103, "grad_norm": 0.15045011043548584, "learning_rate": 4.100386058632114e-05, "loss": 1.1009, "step": 3556 }, { "epoch": 0.5591778183890428, "grad_norm": 0.18575306236743927, "learning_rate": 4.0999113565226286e-05, "loss": 1.0188, "step": 3557 }, { "epoch": 0.5593350232859754, "grad_norm": 0.1384294033050537, "learning_rate": 4.0994365566966456e-05, "loss": 1.1446, "step": 3558 }, { "epoch": 0.5594922281829079, "grad_norm": 0.14572866261005402, "learning_rate": 4.098961659183163e-05, "loss": 1.0271, "step": 3559 }, { "epoch": 0.5596494330798404, "grad_norm": 0.17086221277713776, "learning_rate": 4.098486664011186e-05, "loss": 1.1636, "step": 3560 }, { "epoch": 0.559806637976773, "grad_norm": 0.15883690118789673, "learning_rate": 4.098011571209724e-05, "loss": 1.1531, "step": 3561 }, { "epoch": 0.5599638428737055, "grad_norm": 0.13134004175662994, "learning_rate": 4.097536380807797e-05, "loss": 1.1382, "step": 3562 }, { "epoch": 0.5601210477706381, "grad_norm": 0.18722890317440033, "learning_rate": 4.097061092834425e-05, "loss": 1.0367, "step": 3563 }, { "epoch": 0.5602782526675706, "grad_norm": 0.15676817297935486, "learning_rate": 4.0965857073186394e-05, "loss": 1.1242, "step": 3564 }, { "epoch": 0.5604354575645031, "grad_norm": 0.14853468537330627, "learning_rate": 4.096110224289472e-05, "loss": 1.1161, "step": 3565 }, { "epoch": 0.5605926624614357, "grad_norm": 0.15379522740840912, "learning_rate": 4.095634643775965e-05, "loss": 1.1923, "step": 3566 }, { "epoch": 0.5607498673583682, "grad_norm": 0.19688835740089417, "learning_rate": 4.095158965807165e-05, "loss": 1.1384, "step": 3567 }, { "epoch": 0.5609070722553008, "grad_norm": 0.13620081543922424, "learning_rate": 4.094683190412125e-05, "loss": 1.1516, "step": 3568 }, { "epoch": 0.5610642771522333, "grad_norm": 0.1553749442100525, "learning_rate": 4.0942073176199036e-05, "loss": 1.114, "step": 3569 }, { "epoch": 0.5612214820491658, "grad_norm": 0.1777694970369339, "learning_rate": 4.093731347459564e-05, "loss": 1.0629, "step": 3570 }, { "epoch": 0.5613786869460984, "grad_norm": 0.1379484385251999, "learning_rate": 4.0932552799601776e-05, "loss": 1.1787, "step": 3571 }, { "epoch": 0.5615358918430309, "grad_norm": 0.1495600789785385, "learning_rate": 4.092779115150821e-05, "loss": 1.2294, "step": 3572 }, { "epoch": 0.5616930967399635, "grad_norm": 0.13353364169597626, "learning_rate": 4.0923028530605756e-05, "loss": 1.1443, "step": 3573 }, { "epoch": 0.561850301636896, "grad_norm": 0.13893137872219086, "learning_rate": 4.09182649371853e-05, "loss": 1.2105, "step": 3574 }, { "epoch": 0.5620075065338286, "grad_norm": 0.12231533229351044, "learning_rate": 4.0913500371537796e-05, "loss": 1.0414, "step": 3575 }, { "epoch": 0.5621647114307611, "grad_norm": 0.1520678997039795, "learning_rate": 4.090873483395422e-05, "loss": 1.0846, "step": 3576 }, { "epoch": 0.5623219163276936, "grad_norm": 0.13837867975234985, "learning_rate": 4.090396832472566e-05, "loss": 0.9917, "step": 3577 }, { "epoch": 0.5624791212246262, "grad_norm": 0.13468261063098907, "learning_rate": 4.089920084414323e-05, "loss": 1.173, "step": 3578 }, { "epoch": 0.5626363261215587, "grad_norm": 0.15696372091770172, "learning_rate": 4.089443239249811e-05, "loss": 1.0865, "step": 3579 }, { "epoch": 0.5627935310184913, "grad_norm": 0.14664031565189362, "learning_rate": 4.088966297008152e-05, "loss": 1.1385, "step": 3580 }, { "epoch": 0.5629507359154238, "grad_norm": 0.18675899505615234, "learning_rate": 4.0884892577184774e-05, "loss": 1.0914, "step": 3581 }, { "epoch": 0.5631079408123563, "grad_norm": 0.14525212347507477, "learning_rate": 4.0880121214099225e-05, "loss": 1.0615, "step": 3582 }, { "epoch": 0.5632651457092889, "grad_norm": 0.12778644263744354, "learning_rate": 4.08753488811163e-05, "loss": 1.0975, "step": 3583 }, { "epoch": 0.5634223506062214, "grad_norm": 0.13698804378509521, "learning_rate": 4.087057557852747e-05, "loss": 1.1787, "step": 3584 }, { "epoch": 0.563579555503154, "grad_norm": 0.1349797397851944, "learning_rate": 4.086580130662426e-05, "loss": 1.2011, "step": 3585 }, { "epoch": 0.5637367604000865, "grad_norm": 0.12913671135902405, "learning_rate": 4.086102606569827e-05, "loss": 1.1346, "step": 3586 }, { "epoch": 0.563893965297019, "grad_norm": 0.15104079246520996, "learning_rate": 4.0856249856041154e-05, "loss": 1.1491, "step": 3587 }, { "epoch": 0.5640511701939516, "grad_norm": 0.13582037389278412, "learning_rate": 4.0851472677944636e-05, "loss": 1.0484, "step": 3588 }, { "epoch": 0.5642083750908841, "grad_norm": 0.13232627511024475, "learning_rate": 4.084669453170047e-05, "loss": 1.0965, "step": 3589 }, { "epoch": 0.5643655799878167, "grad_norm": 0.14906802773475647, "learning_rate": 4.08419154176005e-05, "loss": 1.1288, "step": 3590 }, { "epoch": 0.5645227848847492, "grad_norm": 0.13499197363853455, "learning_rate": 4.0837135335936606e-05, "loss": 1.1255, "step": 3591 }, { "epoch": 0.5646799897816817, "grad_norm": 0.15802523493766785, "learning_rate": 4.083235428700074e-05, "loss": 1.1828, "step": 3592 }, { "epoch": 0.5648371946786143, "grad_norm": 0.14267662167549133, "learning_rate": 4.082757227108492e-05, "loss": 1.1359, "step": 3593 }, { "epoch": 0.5649943995755468, "grad_norm": 0.12380488216876984, "learning_rate": 4.08227892884812e-05, "loss": 1.1136, "step": 3594 }, { "epoch": 0.5651516044724794, "grad_norm": 0.1933763176202774, "learning_rate": 4.081800533948171e-05, "loss": 1.068, "step": 3595 }, { "epoch": 0.5653088093694119, "grad_norm": 0.14156892895698547, "learning_rate": 4.081322042437864e-05, "loss": 1.2321, "step": 3596 }, { "epoch": 0.5654660142663444, "grad_norm": 0.14347857236862183, "learning_rate": 4.0808434543464233e-05, "loss": 1.1036, "step": 3597 }, { "epoch": 0.565623219163277, "grad_norm": 0.15195155143737793, "learning_rate": 4.08036476970308e-05, "loss": 1.1351, "step": 3598 }, { "epoch": 0.5657804240602095, "grad_norm": 0.1291377991437912, "learning_rate": 4.0798859885370676e-05, "loss": 1.1474, "step": 3599 }, { "epoch": 0.5659376289571421, "grad_norm": 0.21259215474128723, "learning_rate": 4.0794071108776314e-05, "loss": 1.1855, "step": 3600 }, { "epoch": 0.5660948338540746, "grad_norm": 0.1222395971417427, "learning_rate": 4.078928136754018e-05, "loss": 1.2164, "step": 3601 }, { "epoch": 0.566252038751007, "grad_norm": 0.18162107467651367, "learning_rate": 4.078449066195481e-05, "loss": 1.0613, "step": 3602 }, { "epoch": 0.5664092436479397, "grad_norm": 0.13709622621536255, "learning_rate": 4.0779698992312806e-05, "loss": 1.1023, "step": 3603 }, { "epoch": 0.5665664485448721, "grad_norm": 0.16631649434566498, "learning_rate": 4.077490635890683e-05, "loss": 1.1133, "step": 3604 }, { "epoch": 0.5667236534418048, "grad_norm": 0.14726059138774872, "learning_rate": 4.077011276202959e-05, "loss": 1.2028, "step": 3605 }, { "epoch": 0.5668808583387372, "grad_norm": 0.1343258172273636, "learning_rate": 4.0765318201973865e-05, "loss": 1.0611, "step": 3606 }, { "epoch": 0.5670380632356697, "grad_norm": 0.20124560594558716, "learning_rate": 4.0760522679032484e-05, "loss": 0.957, "step": 3607 }, { "epoch": 0.5671952681326023, "grad_norm": 0.12561757862567902, "learning_rate": 4.075572619349836e-05, "loss": 1.2235, "step": 3608 }, { "epoch": 0.5673524730295348, "grad_norm": 0.1428644210100174, "learning_rate": 4.075092874566441e-05, "loss": 1.1726, "step": 3609 }, { "epoch": 0.5675096779264674, "grad_norm": 0.12811197340488434, "learning_rate": 4.0746130335823656e-05, "loss": 1.0739, "step": 3610 }, { "epoch": 0.5676668828233999, "grad_norm": 0.14488913118839264, "learning_rate": 4.0741330964269176e-05, "loss": 1.1257, "step": 3611 }, { "epoch": 0.5678240877203324, "grad_norm": 0.15192878246307373, "learning_rate": 4.0736530631294104e-05, "loss": 1.1235, "step": 3612 }, { "epoch": 0.567981292617265, "grad_norm": 0.2228131741285324, "learning_rate": 4.0731729337191606e-05, "loss": 0.9947, "step": 3613 }, { "epoch": 0.5681384975141975, "grad_norm": 0.14630167186260223, "learning_rate": 4.0726927082254934e-05, "loss": 1.1134, "step": 3614 }, { "epoch": 0.5682957024111301, "grad_norm": 0.14998601377010345, "learning_rate": 4.072212386677739e-05, "loss": 1.2102, "step": 3615 }, { "epoch": 0.5684529073080626, "grad_norm": 0.12869413197040558, "learning_rate": 4.071731969105235e-05, "loss": 1.0684, "step": 3616 }, { "epoch": 0.5686101122049951, "grad_norm": 0.13855576515197754, "learning_rate": 4.071251455537321e-05, "loss": 1.1482, "step": 3617 }, { "epoch": 0.5687673171019277, "grad_norm": 0.11902645975351334, "learning_rate": 4.070770846003347e-05, "loss": 1.1163, "step": 3618 }, { "epoch": 0.5689245219988602, "grad_norm": 0.15628483891487122, "learning_rate": 4.070290140532667e-05, "loss": 1.1314, "step": 3619 }, { "epoch": 0.5690817268957928, "grad_norm": 0.1387438029050827, "learning_rate": 4.069809339154638e-05, "loss": 1.0639, "step": 3620 }, { "epoch": 0.5692389317927253, "grad_norm": 0.17734022438526154, "learning_rate": 4.0693284418986286e-05, "loss": 1.0748, "step": 3621 }, { "epoch": 0.5693961366896578, "grad_norm": 0.18251527845859528, "learning_rate": 4.068847448794009e-05, "loss": 1.1747, "step": 3622 }, { "epoch": 0.5695533415865904, "grad_norm": 0.18893775343894958, "learning_rate": 4.0683663598701546e-05, "loss": 1.0855, "step": 3623 }, { "epoch": 0.5697105464835229, "grad_norm": 0.14489051699638367, "learning_rate": 4.0678851751564515e-05, "loss": 1.112, "step": 3624 }, { "epoch": 0.5698677513804555, "grad_norm": 0.18162967264652252, "learning_rate": 4.0674038946822876e-05, "loss": 1.1593, "step": 3625 }, { "epoch": 0.570024956277388, "grad_norm": 0.16984649002552032, "learning_rate": 4.066922518477056e-05, "loss": 1.0952, "step": 3626 }, { "epoch": 0.5701821611743206, "grad_norm": 0.17276768386363983, "learning_rate": 4.0664410465701605e-05, "loss": 1.2318, "step": 3627 }, { "epoch": 0.5703393660712531, "grad_norm": 0.16322898864746094, "learning_rate": 4.065959478991005e-05, "loss": 1.0514, "step": 3628 }, { "epoch": 0.5704965709681856, "grad_norm": 0.11980457603931427, "learning_rate": 4.0654778157690025e-05, "loss": 1.1183, "step": 3629 }, { "epoch": 0.5706537758651182, "grad_norm": 0.14290717244148254, "learning_rate": 4.064996056933571e-05, "loss": 1.1669, "step": 3630 }, { "epoch": 0.5708109807620507, "grad_norm": 0.13066339492797852, "learning_rate": 4.064514202514136e-05, "loss": 1.2166, "step": 3631 }, { "epoch": 0.5709681856589833, "grad_norm": 0.14656370878219604, "learning_rate": 4.0640322525401254e-05, "loss": 1.0351, "step": 3632 }, { "epoch": 0.5711253905559158, "grad_norm": 0.1635439097881317, "learning_rate": 4.063550207040975e-05, "loss": 1.1568, "step": 3633 }, { "epoch": 0.5712825954528483, "grad_norm": 0.15011559426784515, "learning_rate": 4.063068066046127e-05, "loss": 1.1624, "step": 3634 }, { "epoch": 0.5714398003497809, "grad_norm": 0.13049575686454773, "learning_rate": 4.06258582958503e-05, "loss": 1.0406, "step": 3635 }, { "epoch": 0.5715970052467134, "grad_norm": 0.1825868785381317, "learning_rate": 4.0621034976871344e-05, "loss": 1.0516, "step": 3636 }, { "epoch": 0.571754210143646, "grad_norm": 0.14342835545539856, "learning_rate": 4.0616210703819e-05, "loss": 1.1982, "step": 3637 }, { "epoch": 0.5719114150405785, "grad_norm": 0.16550028324127197, "learning_rate": 4.061138547698794e-05, "loss": 1.0964, "step": 3638 }, { "epoch": 0.572068619937511, "grad_norm": 0.13078954815864563, "learning_rate": 4.060655929667284e-05, "loss": 1.1712, "step": 3639 }, { "epoch": 0.5722258248344436, "grad_norm": 0.17670851945877075, "learning_rate": 4.060173216316847e-05, "loss": 1.0302, "step": 3640 }, { "epoch": 0.5723830297313761, "grad_norm": 0.1388305425643921, "learning_rate": 4.0596904076769674e-05, "loss": 1.1351, "step": 3641 }, { "epoch": 0.5725402346283087, "grad_norm": 0.13523413240909576, "learning_rate": 4.059207503777131e-05, "loss": 1.1302, "step": 3642 }, { "epoch": 0.5726974395252412, "grad_norm": 0.14954648911952972, "learning_rate": 4.058724504646834e-05, "loss": 1.2166, "step": 3643 }, { "epoch": 0.5728546444221737, "grad_norm": 0.1510806679725647, "learning_rate": 4.058241410315574e-05, "loss": 1.0743, "step": 3644 }, { "epoch": 0.5730118493191063, "grad_norm": 0.13376949727535248, "learning_rate": 4.057758220812857e-05, "loss": 1.062, "step": 3645 }, { "epoch": 0.5731690542160388, "grad_norm": 0.16394519805908203, "learning_rate": 4.057274936168196e-05, "loss": 1.1083, "step": 3646 }, { "epoch": 0.5733262591129714, "grad_norm": 0.14098455011844635, "learning_rate": 4.056791556411106e-05, "loss": 1.1301, "step": 3647 }, { "epoch": 0.5734834640099039, "grad_norm": 0.13498766720294952, "learning_rate": 4.0563080815711116e-05, "loss": 1.1505, "step": 3648 }, { "epoch": 0.5736406689068364, "grad_norm": 0.1271194964647293, "learning_rate": 4.0558245116777394e-05, "loss": 1.1138, "step": 3649 }, { "epoch": 0.573797873803769, "grad_norm": 0.166845440864563, "learning_rate": 4.055340846760527e-05, "loss": 0.9864, "step": 3650 }, { "epoch": 0.5739550787007015, "grad_norm": 0.13443748652935028, "learning_rate": 4.054857086849013e-05, "loss": 1.1908, "step": 3651 }, { "epoch": 0.5741122835976341, "grad_norm": 0.12793311476707458, "learning_rate": 4.054373231972744e-05, "loss": 1.1881, "step": 3652 }, { "epoch": 0.5742694884945666, "grad_norm": 0.12760072946548462, "learning_rate": 4.053889282161272e-05, "loss": 1.0298, "step": 3653 }, { "epoch": 0.5744266933914991, "grad_norm": 0.14803969860076904, "learning_rate": 4.0534052374441544e-05, "loss": 1.1912, "step": 3654 }, { "epoch": 0.5745838982884317, "grad_norm": 0.15170976519584656, "learning_rate": 4.0529210978509556e-05, "loss": 1.1831, "step": 3655 }, { "epoch": 0.5747411031853642, "grad_norm": 0.13536196947097778, "learning_rate": 4.0524368634112446e-05, "loss": 1.0846, "step": 3656 }, { "epoch": 0.5748983080822968, "grad_norm": 0.14190278947353363, "learning_rate": 4.0519525341545964e-05, "loss": 1.0797, "step": 3657 }, { "epoch": 0.5750555129792293, "grad_norm": 0.14169275760650635, "learning_rate": 4.051468110110593e-05, "loss": 1.1638, "step": 3658 }, { "epoch": 0.5752127178761618, "grad_norm": 0.16677023470401764, "learning_rate": 4.050983591308819e-05, "loss": 1.1189, "step": 3659 }, { "epoch": 0.5753699227730944, "grad_norm": 0.14935442805290222, "learning_rate": 4.050498977778869e-05, "loss": 1.2741, "step": 3660 }, { "epoch": 0.5755271276700269, "grad_norm": 0.2275468111038208, "learning_rate": 4.050014269550342e-05, "loss": 1.1942, "step": 3661 }, { "epoch": 0.5756843325669595, "grad_norm": 0.14003866910934448, "learning_rate": 4.049529466652839e-05, "loss": 1.1577, "step": 3662 }, { "epoch": 0.575841537463892, "grad_norm": 0.14179524779319763, "learning_rate": 4.0490445691159726e-05, "loss": 1.1885, "step": 3663 }, { "epoch": 0.5759987423608245, "grad_norm": 0.14022792875766754, "learning_rate": 4.048559576969357e-05, "loss": 1.1538, "step": 3664 }, { "epoch": 0.5761559472577571, "grad_norm": 0.14557988941669464, "learning_rate": 4.048074490242615e-05, "loss": 1.1012, "step": 3665 }, { "epoch": 0.5763131521546896, "grad_norm": 0.14714285731315613, "learning_rate": 4.047589308965373e-05, "loss": 1.1016, "step": 3666 }, { "epoch": 0.5764703570516222, "grad_norm": 0.17905539274215698, "learning_rate": 4.0471040331672646e-05, "loss": 1.1554, "step": 3667 }, { "epoch": 0.5766275619485547, "grad_norm": 0.13437265157699585, "learning_rate": 4.046618662877928e-05, "loss": 1.1321, "step": 3668 }, { "epoch": 0.5767847668454872, "grad_norm": 0.1353270262479782, "learning_rate": 4.046133198127007e-05, "loss": 1.0082, "step": 3669 }, { "epoch": 0.5769419717424198, "grad_norm": 0.12693148851394653, "learning_rate": 4.045647638944154e-05, "loss": 1.1042, "step": 3670 }, { "epoch": 0.5770991766393523, "grad_norm": 0.14182643592357635, "learning_rate": 4.045161985359024e-05, "loss": 1.1912, "step": 3671 }, { "epoch": 0.5772563815362849, "grad_norm": 0.13259053230285645, "learning_rate": 4.044676237401278e-05, "loss": 1.128, "step": 3672 }, { "epoch": 0.5774135864332174, "grad_norm": 0.13726124167442322, "learning_rate": 4.044190395100585e-05, "loss": 1.0668, "step": 3673 }, { "epoch": 0.5775707913301499, "grad_norm": 0.13621871173381805, "learning_rate": 4.043704458486618e-05, "loss": 1.0255, "step": 3674 }, { "epoch": 0.5777279962270825, "grad_norm": 0.1848558783531189, "learning_rate": 4.043218427589056e-05, "loss": 1.0299, "step": 3675 }, { "epoch": 0.577885201124015, "grad_norm": 0.13466404378414154, "learning_rate": 4.042732302437585e-05, "loss": 1.0444, "step": 3676 }, { "epoch": 0.5780424060209476, "grad_norm": 0.1385255753993988, "learning_rate": 4.042246083061894e-05, "loss": 0.9893, "step": 3677 }, { "epoch": 0.5781996109178801, "grad_norm": 0.14059042930603027, "learning_rate": 4.041759769491679e-05, "loss": 1.1886, "step": 3678 }, { "epoch": 0.5783568158148127, "grad_norm": 0.1421997994184494, "learning_rate": 4.041273361756645e-05, "loss": 1.0924, "step": 3679 }, { "epoch": 0.5785140207117452, "grad_norm": 0.16908836364746094, "learning_rate": 4.040786859886497e-05, "loss": 1.0998, "step": 3680 }, { "epoch": 0.5785140207117452, "eval_loss": 1.1054325103759766, "eval_runtime": 2330.938, "eval_samples_per_second": 3.972, "eval_steps_per_second": 1.986, "step": 3680 }, { "epoch": 0.5786712256086777, "grad_norm": 0.21620066463947296, "learning_rate": 4.04030026391095e-05, "loss": 1.051, "step": 3681 }, { "epoch": 0.5788284305056103, "grad_norm": 0.12915241718292236, "learning_rate": 4.0398135738597244e-05, "loss": 1.0906, "step": 3682 }, { "epoch": 0.5789856354025428, "grad_norm": 0.1334918737411499, "learning_rate": 4.0393267897625434e-05, "loss": 1.066, "step": 3683 }, { "epoch": 0.5791428402994754, "grad_norm": 0.1355300396680832, "learning_rate": 4.038839911649139e-05, "loss": 1.1698, "step": 3684 }, { "epoch": 0.5793000451964079, "grad_norm": 0.15459658205509186, "learning_rate": 4.038352939549247e-05, "loss": 1.0792, "step": 3685 }, { "epoch": 0.5794572500933404, "grad_norm": 0.1558839976787567, "learning_rate": 4.0378658734926116e-05, "loss": 1.1722, "step": 3686 }, { "epoch": 0.579614454990273, "grad_norm": 0.1443861573934555, "learning_rate": 4.0373787135089796e-05, "loss": 1.1336, "step": 3687 }, { "epoch": 0.5797716598872055, "grad_norm": 0.12290564924478531, "learning_rate": 4.036891459628105e-05, "loss": 1.0873, "step": 3688 }, { "epoch": 0.5799288647841381, "grad_norm": 0.1709529608488083, "learning_rate": 4.0364041118797476e-05, "loss": 1.0867, "step": 3689 }, { "epoch": 0.5800860696810706, "grad_norm": 0.15766242146492004, "learning_rate": 4.0359166702936724e-05, "loss": 1.1389, "step": 3690 }, { "epoch": 0.5802432745780031, "grad_norm": 0.1576821208000183, "learning_rate": 4.035429134899652e-05, "loss": 1.176, "step": 3691 }, { "epoch": 0.5804004794749357, "grad_norm": 0.2048172652721405, "learning_rate": 4.0349415057274604e-05, "loss": 1.1504, "step": 3692 }, { "epoch": 0.5805576843718682, "grad_norm": 0.13622994720935822, "learning_rate": 4.0344537828068816e-05, "loss": 1.094, "step": 3693 }, { "epoch": 0.5807148892688008, "grad_norm": 0.14516420662403107, "learning_rate": 4.033965966167705e-05, "loss": 1.0591, "step": 3694 }, { "epoch": 0.5808720941657333, "grad_norm": 0.1271536499261856, "learning_rate": 4.033478055839723e-05, "loss": 1.1008, "step": 3695 }, { "epoch": 0.5810292990626658, "grad_norm": 0.13999730348587036, "learning_rate": 4.032990051852736e-05, "loss": 1.1955, "step": 3696 }, { "epoch": 0.5811865039595984, "grad_norm": 0.1689818650484085, "learning_rate": 4.032501954236549e-05, "loss": 1.0263, "step": 3697 }, { "epoch": 0.5813437088565309, "grad_norm": 0.1439339816570282, "learning_rate": 4.032013763020974e-05, "loss": 0.9829, "step": 3698 }, { "epoch": 0.5815009137534635, "grad_norm": 0.15156951546669006, "learning_rate": 4.031525478235827e-05, "loss": 1.0514, "step": 3699 }, { "epoch": 0.581658118650396, "grad_norm": 0.1496332734823227, "learning_rate": 4.031037099910931e-05, "loss": 1.1401, "step": 3700 }, { "epoch": 0.5818153235473285, "grad_norm": 0.1426810920238495, "learning_rate": 4.030548628076114e-05, "loss": 1.0832, "step": 3701 }, { "epoch": 0.5819725284442611, "grad_norm": 0.14869458973407745, "learning_rate": 4.03006006276121e-05, "loss": 1.0517, "step": 3702 }, { "epoch": 0.5821297333411936, "grad_norm": 0.15788580477237701, "learning_rate": 4.0295714039960595e-05, "loss": 0.912, "step": 3703 }, { "epoch": 0.5822869382381262, "grad_norm": 0.13516800105571747, "learning_rate": 4.029082651810507e-05, "loss": 1.1437, "step": 3704 }, { "epoch": 0.5824441431350587, "grad_norm": 0.15520767867565155, "learning_rate": 4.0285938062344034e-05, "loss": 1.2535, "step": 3705 }, { "epoch": 0.5826013480319912, "grad_norm": 0.13410113751888275, "learning_rate": 4.028104867297606e-05, "loss": 1.1021, "step": 3706 }, { "epoch": 0.5827585529289238, "grad_norm": 0.1681327521800995, "learning_rate": 4.027615835029978e-05, "loss": 1.1215, "step": 3707 }, { "epoch": 0.5829157578258563, "grad_norm": 0.15960273146629333, "learning_rate": 4.0271267094613877e-05, "loss": 1.165, "step": 3708 }, { "epoch": 0.5830729627227889, "grad_norm": 0.12318623811006546, "learning_rate": 4.0266374906217064e-05, "loss": 1.1991, "step": 3709 }, { "epoch": 0.5832301676197214, "grad_norm": 0.1433396190404892, "learning_rate": 4.0261481785408165e-05, "loss": 0.9933, "step": 3710 }, { "epoch": 0.5833873725166538, "grad_norm": 0.14328768849372864, "learning_rate": 4.025658773248603e-05, "loss": 1.0885, "step": 3711 }, { "epoch": 0.5835445774135865, "grad_norm": 0.14786562323570251, "learning_rate": 4.025169274774956e-05, "loss": 1.0613, "step": 3712 }, { "epoch": 0.583701782310519, "grad_norm": 0.1389857828617096, "learning_rate": 4.0246796831497724e-05, "loss": 1.1572, "step": 3713 }, { "epoch": 0.5838589872074516, "grad_norm": 0.14071199297904968, "learning_rate": 4.024189998402955e-05, "loss": 1.1341, "step": 3714 }, { "epoch": 0.584016192104384, "grad_norm": 0.1397501528263092, "learning_rate": 4.0237002205644116e-05, "loss": 1.1668, "step": 3715 }, { "epoch": 0.5841733970013165, "grad_norm": 0.13448472321033478, "learning_rate": 4.023210349664056e-05, "loss": 1.1673, "step": 3716 }, { "epoch": 0.5843306018982491, "grad_norm": 0.1473264843225479, "learning_rate": 4.022720385731808e-05, "loss": 1.0642, "step": 3717 }, { "epoch": 0.5844878067951816, "grad_norm": 0.13606154918670654, "learning_rate": 4.022230328797591e-05, "loss": 1.1701, "step": 3718 }, { "epoch": 0.5846450116921142, "grad_norm": 0.12004934996366501, "learning_rate": 4.021740178891339e-05, "loss": 1.137, "step": 3719 }, { "epoch": 0.5848022165890467, "grad_norm": 0.15330874919891357, "learning_rate": 4.021249936042986e-05, "loss": 1.1412, "step": 3720 }, { "epoch": 0.5849594214859792, "grad_norm": 0.13745856285095215, "learning_rate": 4.020759600282475e-05, "loss": 1.0378, "step": 3721 }, { "epoch": 0.5851166263829118, "grad_norm": 0.13707605004310608, "learning_rate": 4.020269171639754e-05, "loss": 1.0882, "step": 3722 }, { "epoch": 0.5852738312798443, "grad_norm": 0.1425696462392807, "learning_rate": 4.019778650144775e-05, "loss": 1.1595, "step": 3723 }, { "epoch": 0.5854310361767769, "grad_norm": 0.1498594433069229, "learning_rate": 4.0192880358275e-05, "loss": 1.0956, "step": 3724 }, { "epoch": 0.5855882410737094, "grad_norm": 0.1398852914571762, "learning_rate": 4.018797328717891e-05, "loss": 1.0655, "step": 3725 }, { "epoch": 0.5857454459706419, "grad_norm": 0.1315324306488037, "learning_rate": 4.018306528845921e-05, "loss": 1.1341, "step": 3726 }, { "epoch": 0.5859026508675745, "grad_norm": 0.136732816696167, "learning_rate": 4.017815636241565e-05, "loss": 1.126, "step": 3727 }, { "epoch": 0.586059855764507, "grad_norm": 0.12884072959423065, "learning_rate": 4.017324650934804e-05, "loss": 1.203, "step": 3728 }, { "epoch": 0.5862170606614396, "grad_norm": 0.1397540420293808, "learning_rate": 4.016833572955626e-05, "loss": 1.1054, "step": 3729 }, { "epoch": 0.5863742655583721, "grad_norm": 0.14420150220394135, "learning_rate": 4.016342402334026e-05, "loss": 1.1141, "step": 3730 }, { "epoch": 0.5865314704553047, "grad_norm": 0.13877861201763153, "learning_rate": 4.0158511391000006e-05, "loss": 1.0809, "step": 3731 }, { "epoch": 0.5866886753522372, "grad_norm": 0.1263275295495987, "learning_rate": 4.015359783283555e-05, "loss": 1.0784, "step": 3732 }, { "epoch": 0.5868458802491697, "grad_norm": 0.1705816388130188, "learning_rate": 4.0148683349146985e-05, "loss": 1.193, "step": 3733 }, { "epoch": 0.5870030851461023, "grad_norm": 0.1465051770210266, "learning_rate": 4.014376794023449e-05, "loss": 1.0628, "step": 3734 }, { "epoch": 0.5871602900430348, "grad_norm": 0.13974584639072418, "learning_rate": 4.013885160639826e-05, "loss": 1.1602, "step": 3735 }, { "epoch": 0.5873174949399674, "grad_norm": 0.13794563710689545, "learning_rate": 4.013393434793858e-05, "loss": 1.1384, "step": 3736 }, { "epoch": 0.5874746998368999, "grad_norm": 0.12318416684865952, "learning_rate": 4.012901616515578e-05, "loss": 1.1166, "step": 3737 }, { "epoch": 0.5876319047338324, "grad_norm": 0.14967970550060272, "learning_rate": 4.012409705835022e-05, "loss": 1.2414, "step": 3738 }, { "epoch": 0.587789109630765, "grad_norm": 0.13011139631271362, "learning_rate": 4.011917702782236e-05, "loss": 1.1052, "step": 3739 }, { "epoch": 0.5879463145276975, "grad_norm": 0.14660027623176575, "learning_rate": 4.0114256073872694e-05, "loss": 1.1247, "step": 3740 }, { "epoch": 0.5881035194246301, "grad_norm": 0.14377620816230774, "learning_rate": 4.010933419680176e-05, "loss": 1.2025, "step": 3741 }, { "epoch": 0.5882607243215626, "grad_norm": 0.13705839216709137, "learning_rate": 4.01044113969102e-05, "loss": 1.2845, "step": 3742 }, { "epoch": 0.5884179292184951, "grad_norm": 0.2073041945695877, "learning_rate": 4.009948767449865e-05, "loss": 1.0314, "step": 3743 }, { "epoch": 0.5885751341154277, "grad_norm": 0.1414439082145691, "learning_rate": 4.009456302986784e-05, "loss": 1.13, "step": 3744 }, { "epoch": 0.5887323390123602, "grad_norm": 0.1278563141822815, "learning_rate": 4.008963746331855e-05, "loss": 1.0588, "step": 3745 }, { "epoch": 0.5888895439092928, "grad_norm": 0.1332569420337677, "learning_rate": 4.008471097515163e-05, "loss": 1.1562, "step": 3746 }, { "epoch": 0.5890467488062253, "grad_norm": 0.15122583508491516, "learning_rate": 4.0079783565667944e-05, "loss": 1.067, "step": 3747 }, { "epoch": 0.5892039537031578, "grad_norm": 0.1541803926229477, "learning_rate": 4.0074855235168454e-05, "loss": 1.0791, "step": 3748 }, { "epoch": 0.5893611586000904, "grad_norm": 0.165819451212883, "learning_rate": 4.0069925983954165e-05, "loss": 1.1362, "step": 3749 }, { "epoch": 0.5895183634970229, "grad_norm": 0.1396874040365219, "learning_rate": 4.0064995812326135e-05, "loss": 1.0843, "step": 3750 }, { "epoch": 0.5896755683939555, "grad_norm": 0.14084576070308685, "learning_rate": 4.006006472058548e-05, "loss": 1.0624, "step": 3751 }, { "epoch": 0.589832773290888, "grad_norm": 0.17363415658473969, "learning_rate": 4.0055132709033373e-05, "loss": 1.1095, "step": 3752 }, { "epoch": 0.5899899781878205, "grad_norm": 0.1511961817741394, "learning_rate": 4.005019977797103e-05, "loss": 1.0511, "step": 3753 }, { "epoch": 0.5901471830847531, "grad_norm": 0.17082269489765167, "learning_rate": 4.004526592769976e-05, "loss": 1.1251, "step": 3754 }, { "epoch": 0.5903043879816856, "grad_norm": 0.14516089856624603, "learning_rate": 4.004033115852088e-05, "loss": 1.0771, "step": 3755 }, { "epoch": 0.5904615928786182, "grad_norm": 0.18782827258110046, "learning_rate": 4.0035395470735815e-05, "loss": 1.0621, "step": 3756 }, { "epoch": 0.5906187977755507, "grad_norm": 0.14073415100574493, "learning_rate": 4.003045886464599e-05, "loss": 1.1879, "step": 3757 }, { "epoch": 0.5907760026724832, "grad_norm": 0.13535237312316895, "learning_rate": 4.002552134055292e-05, "loss": 1.1784, "step": 3758 }, { "epoch": 0.5909332075694158, "grad_norm": 0.13585159182548523, "learning_rate": 4.002058289875817e-05, "loss": 1.0525, "step": 3759 }, { "epoch": 0.5910904124663483, "grad_norm": 0.13575370609760284, "learning_rate": 4.0015643539563383e-05, "loss": 1.1482, "step": 3760 }, { "epoch": 0.5912476173632809, "grad_norm": 0.16383205354213715, "learning_rate": 4.001070326327021e-05, "loss": 1.0601, "step": 3761 }, { "epoch": 0.5914048222602134, "grad_norm": 0.13209950923919678, "learning_rate": 4.00057620701804e-05, "loss": 1.1027, "step": 3762 }, { "epoch": 0.5915620271571459, "grad_norm": 0.16311690211296082, "learning_rate": 4.000081996059573e-05, "loss": 1.0555, "step": 3763 }, { "epoch": 0.5917192320540785, "grad_norm": 0.1423267275094986, "learning_rate": 3.999587693481804e-05, "loss": 1.1087, "step": 3764 }, { "epoch": 0.591876436951011, "grad_norm": 0.16528518497943878, "learning_rate": 3.9990932993149266e-05, "loss": 1.1218, "step": 3765 }, { "epoch": 0.5920336418479436, "grad_norm": 0.13041681051254272, "learning_rate": 3.9985988135891326e-05, "loss": 1.1765, "step": 3766 }, { "epoch": 0.5921908467448761, "grad_norm": 0.15191826224327087, "learning_rate": 3.998104236334625e-05, "loss": 1.043, "step": 3767 }, { "epoch": 0.5923480516418086, "grad_norm": 0.18016381561756134, "learning_rate": 3.997609567581611e-05, "loss": 1.0738, "step": 3768 }, { "epoch": 0.5925052565387412, "grad_norm": 0.14468325674533844, "learning_rate": 3.997114807360303e-05, "loss": 1.0948, "step": 3769 }, { "epoch": 0.5926624614356737, "grad_norm": 0.14610640704631805, "learning_rate": 3.996619955700918e-05, "loss": 1.0561, "step": 3770 }, { "epoch": 0.5928196663326063, "grad_norm": 0.1404576301574707, "learning_rate": 3.9961250126336803e-05, "loss": 1.1185, "step": 3771 }, { "epoch": 0.5929768712295388, "grad_norm": 0.13908030092716217, "learning_rate": 3.99562997818882e-05, "loss": 1.1209, "step": 3772 }, { "epoch": 0.5931340761264713, "grad_norm": 0.1259194165468216, "learning_rate": 3.99513485239657e-05, "loss": 1.1947, "step": 3773 }, { "epoch": 0.5932912810234039, "grad_norm": 0.1661093682050705, "learning_rate": 3.994639635287172e-05, "loss": 0.96, "step": 3774 }, { "epoch": 0.5934484859203364, "grad_norm": 0.13863816857337952, "learning_rate": 3.994144326890873e-05, "loss": 1.0553, "step": 3775 }, { "epoch": 0.593605690817269, "grad_norm": 0.17332686483860016, "learning_rate": 3.993648927237922e-05, "loss": 1.0102, "step": 3776 }, { "epoch": 0.5937628957142015, "grad_norm": 0.14273682236671448, "learning_rate": 3.9931534363585784e-05, "loss": 1.13, "step": 3777 }, { "epoch": 0.593920100611134, "grad_norm": 0.17309348285198212, "learning_rate": 3.992657854283104e-05, "loss": 1.0525, "step": 3778 }, { "epoch": 0.5940773055080666, "grad_norm": 0.1335354894399643, "learning_rate": 3.992162181041766e-05, "loss": 1.1013, "step": 3779 }, { "epoch": 0.5942345104049991, "grad_norm": 0.12234276533126831, "learning_rate": 3.9916664166648405e-05, "loss": 1.097, "step": 3780 }, { "epoch": 0.5943917153019317, "grad_norm": 0.14885415136814117, "learning_rate": 3.991170561182605e-05, "loss": 1.1586, "step": 3781 }, { "epoch": 0.5945489201988642, "grad_norm": 0.15419688820838928, "learning_rate": 3.990674614625345e-05, "loss": 1.0608, "step": 3782 }, { "epoch": 0.5947061250957968, "grad_norm": 0.1355455368757248, "learning_rate": 3.9901785770233524e-05, "loss": 1.0948, "step": 3783 }, { "epoch": 0.5948633299927293, "grad_norm": 0.13568316400051117, "learning_rate": 3.9896824484069205e-05, "loss": 1.1747, "step": 3784 }, { "epoch": 0.5950205348896618, "grad_norm": 0.13468366861343384, "learning_rate": 3.989186228806354e-05, "loss": 1.1428, "step": 3785 }, { "epoch": 0.5951777397865944, "grad_norm": 0.13422173261642456, "learning_rate": 3.988689918251958e-05, "loss": 1.076, "step": 3786 }, { "epoch": 0.5953349446835269, "grad_norm": 0.1521790474653244, "learning_rate": 3.9881935167740446e-05, "loss": 1.0135, "step": 3787 }, { "epoch": 0.5954921495804595, "grad_norm": 0.14957469701766968, "learning_rate": 3.9876970244029354e-05, "loss": 1.1601, "step": 3788 }, { "epoch": 0.595649354477392, "grad_norm": 0.14396795630455017, "learning_rate": 3.987200441168951e-05, "loss": 1.1368, "step": 3789 }, { "epoch": 0.5958065593743245, "grad_norm": 0.12735560536384583, "learning_rate": 3.986703767102423e-05, "loss": 1.268, "step": 3790 }, { "epoch": 0.5959637642712571, "grad_norm": 0.14245949685573578, "learning_rate": 3.986207002233685e-05, "loss": 1.1013, "step": 3791 }, { "epoch": 0.5961209691681896, "grad_norm": 0.16585348546504974, "learning_rate": 3.985710146593077e-05, "loss": 1.1026, "step": 3792 }, { "epoch": 0.5962781740651222, "grad_norm": 0.14120368659496307, "learning_rate": 3.9852132002109476e-05, "loss": 1.0614, "step": 3793 }, { "epoch": 0.5964353789620547, "grad_norm": 0.14978669583797455, "learning_rate": 3.984716163117646e-05, "loss": 1.0808, "step": 3794 }, { "epoch": 0.5965925838589872, "grad_norm": 0.1450655162334442, "learning_rate": 3.984219035343531e-05, "loss": 1.0358, "step": 3795 }, { "epoch": 0.5967497887559198, "grad_norm": 0.14680932462215424, "learning_rate": 3.983721816918963e-05, "loss": 1.2001, "step": 3796 }, { "epoch": 0.5969069936528523, "grad_norm": 0.14158986508846283, "learning_rate": 3.983224507874312e-05, "loss": 1.0586, "step": 3797 }, { "epoch": 0.5970641985497849, "grad_norm": 0.12906032800674438, "learning_rate": 3.982727108239952e-05, "loss": 1.1182, "step": 3798 }, { "epoch": 0.5972214034467174, "grad_norm": 0.12561413645744324, "learning_rate": 3.9822296180462615e-05, "loss": 1.1288, "step": 3799 }, { "epoch": 0.5973786083436499, "grad_norm": 0.17582055926322937, "learning_rate": 3.981732037323625e-05, "loss": 1.0403, "step": 3800 }, { "epoch": 0.5975358132405825, "grad_norm": 0.1650211364030838, "learning_rate": 3.981234366102434e-05, "loss": 1.1018, "step": 3801 }, { "epoch": 0.597693018137515, "grad_norm": 0.14423269033432007, "learning_rate": 3.9807366044130825e-05, "loss": 1.0561, "step": 3802 }, { "epoch": 0.5978502230344476, "grad_norm": 0.1274254322052002, "learning_rate": 3.980238752285974e-05, "loss": 1.1414, "step": 3803 }, { "epoch": 0.5980074279313801, "grad_norm": 0.1247900202870369, "learning_rate": 3.979740809751514e-05, "loss": 1.0948, "step": 3804 }, { "epoch": 0.5981646328283126, "grad_norm": 0.13164469599723816, "learning_rate": 3.9792427768401153e-05, "loss": 1.0135, "step": 3805 }, { "epoch": 0.5983218377252452, "grad_norm": 0.12268634140491486, "learning_rate": 3.978744653582197e-05, "loss": 1.1238, "step": 3806 }, { "epoch": 0.5984790426221777, "grad_norm": 0.13479411602020264, "learning_rate": 3.97824644000818e-05, "loss": 1.1349, "step": 3807 }, { "epoch": 0.5986362475191103, "grad_norm": 0.15460748970508575, "learning_rate": 3.9777481361484956e-05, "loss": 1.0716, "step": 3808 }, { "epoch": 0.5987934524160428, "grad_norm": 0.13242261111736298, "learning_rate": 3.9772497420335774e-05, "loss": 1.1981, "step": 3809 }, { "epoch": 0.5989506573129753, "grad_norm": 0.14284269511699677, "learning_rate": 3.976751257693865e-05, "loss": 1.1462, "step": 3810 }, { "epoch": 0.5991078622099079, "grad_norm": 0.14825420081615448, "learning_rate": 3.976252683159806e-05, "loss": 1.0104, "step": 3811 }, { "epoch": 0.5992650671068404, "grad_norm": 0.13737694919109344, "learning_rate": 3.975754018461848e-05, "loss": 1.0234, "step": 3812 }, { "epoch": 0.599422272003773, "grad_norm": 0.19531135261058807, "learning_rate": 3.9752552636304504e-05, "loss": 1.0967, "step": 3813 }, { "epoch": 0.5995794769007055, "grad_norm": 0.17012959718704224, "learning_rate": 3.9747564186960744e-05, "loss": 1.1456, "step": 3814 }, { "epoch": 0.599736681797638, "grad_norm": 0.1392662674188614, "learning_rate": 3.974257483689188e-05, "loss": 1.2154, "step": 3815 }, { "epoch": 0.5998938866945706, "grad_norm": 0.15269821882247925, "learning_rate": 3.9737584586402624e-05, "loss": 1.1929, "step": 3816 }, { "epoch": 0.600051091591503, "grad_norm": 0.13864651322364807, "learning_rate": 3.9732593435797774e-05, "loss": 1.1531, "step": 3817 }, { "epoch": 0.6002082964884357, "grad_norm": 0.12776337563991547, "learning_rate": 3.9727601385382174e-05, "loss": 1.1405, "step": 3818 }, { "epoch": 0.6003655013853681, "grad_norm": 0.14198248088359833, "learning_rate": 3.9722608435460716e-05, "loss": 1.152, "step": 3819 }, { "epoch": 0.6005227062823006, "grad_norm": 0.12717603147029877, "learning_rate": 3.971761458633836e-05, "loss": 1.0789, "step": 3820 }, { "epoch": 0.6006799111792333, "grad_norm": 0.13504749536514282, "learning_rate": 3.97126198383201e-05, "loss": 1.1775, "step": 3821 }, { "epoch": 0.6008371160761657, "grad_norm": 0.14354047179222107, "learning_rate": 3.9707624191710984e-05, "loss": 1.2396, "step": 3822 }, { "epoch": 0.6009943209730984, "grad_norm": 0.12898609042167664, "learning_rate": 3.9702627646816146e-05, "loss": 1.1291, "step": 3823 }, { "epoch": 0.6011515258700308, "grad_norm": 0.14688414335250854, "learning_rate": 3.969763020394076e-05, "loss": 1.1159, "step": 3824 }, { "epoch": 0.6013087307669633, "grad_norm": 0.1664854884147644, "learning_rate": 3.969263186339004e-05, "loss": 1.0544, "step": 3825 }, { "epoch": 0.601465935663896, "grad_norm": 0.14804497361183167, "learning_rate": 3.9687632625469264e-05, "loss": 1.122, "step": 3826 }, { "epoch": 0.6016231405608284, "grad_norm": 0.1647578924894333, "learning_rate": 3.9682632490483765e-05, "loss": 1.0659, "step": 3827 }, { "epoch": 0.601780345457761, "grad_norm": 0.1529950052499771, "learning_rate": 3.967763145873895e-05, "loss": 1.1689, "step": 3828 }, { "epoch": 0.6019375503546935, "grad_norm": 0.1398983597755432, "learning_rate": 3.967262953054024e-05, "loss": 1.0378, "step": 3829 }, { "epoch": 0.602094755251626, "grad_norm": 0.1268242448568344, "learning_rate": 3.966762670619315e-05, "loss": 1.1719, "step": 3830 }, { "epoch": 0.6022519601485586, "grad_norm": 0.143156036734581, "learning_rate": 3.9662622986003226e-05, "loss": 1.1132, "step": 3831 }, { "epoch": 0.6024091650454911, "grad_norm": 0.13624730706214905, "learning_rate": 3.9657618370276076e-05, "loss": 1.2035, "step": 3832 }, { "epoch": 0.6025663699424237, "grad_norm": 0.13455921411514282, "learning_rate": 3.9652612859317364e-05, "loss": 1.134, "step": 3833 }, { "epoch": 0.6027235748393562, "grad_norm": 0.13268525898456573, "learning_rate": 3.964760645343281e-05, "loss": 0.9777, "step": 3834 }, { "epoch": 0.6028807797362887, "grad_norm": 0.1300792098045349, "learning_rate": 3.964259915292818e-05, "loss": 1.0246, "step": 3835 }, { "epoch": 0.6030379846332213, "grad_norm": 0.11833825707435608, "learning_rate": 3.963759095810931e-05, "loss": 1.1349, "step": 3836 }, { "epoch": 0.6031951895301538, "grad_norm": 0.15196771919727325, "learning_rate": 3.9632581869282076e-05, "loss": 1.1064, "step": 3837 }, { "epoch": 0.6033523944270864, "grad_norm": 0.13854347169399261, "learning_rate": 3.962757188675241e-05, "loss": 1.0556, "step": 3838 }, { "epoch": 0.6035095993240189, "grad_norm": 0.12719032168388367, "learning_rate": 3.962256101082632e-05, "loss": 1.148, "step": 3839 }, { "epoch": 0.6036668042209515, "grad_norm": 0.14315147697925568, "learning_rate": 3.9617549241809826e-05, "loss": 1.109, "step": 3840 }, { "epoch": 0.6036668042209515, "eval_loss": 1.1036489009857178, "eval_runtime": 2338.1192, "eval_samples_per_second": 3.96, "eval_steps_per_second": 1.98, "step": 3840 }, { "epoch": 0.603824009117884, "grad_norm": 0.13646598160266876, "learning_rate": 3.961253658000904e-05, "loss": 1.115, "step": 3841 }, { "epoch": 0.6039812140148165, "grad_norm": 0.12857642769813538, "learning_rate": 3.960752302573012e-05, "loss": 1.1206, "step": 3842 }, { "epoch": 0.6041384189117491, "grad_norm": 0.15237180888652802, "learning_rate": 3.960250857927928e-05, "loss": 1.2445, "step": 3843 }, { "epoch": 0.6042956238086816, "grad_norm": 0.1883225291967392, "learning_rate": 3.959749324096277e-05, "loss": 1.1447, "step": 3844 }, { "epoch": 0.6044528287056142, "grad_norm": 0.13263137638568878, "learning_rate": 3.959247701108691e-05, "loss": 1.2384, "step": 3845 }, { "epoch": 0.6046100336025467, "grad_norm": 0.12633422017097473, "learning_rate": 3.958745988995807e-05, "loss": 1.177, "step": 3846 }, { "epoch": 0.6047672384994792, "grad_norm": 0.14455291628837585, "learning_rate": 3.9582441877882695e-05, "loss": 0.958, "step": 3847 }, { "epoch": 0.6049244433964118, "grad_norm": 0.15000420808792114, "learning_rate": 3.9577422975167245e-05, "loss": 1.2108, "step": 3848 }, { "epoch": 0.6050816482933443, "grad_norm": 0.13494277000427246, "learning_rate": 3.957240318211826e-05, "loss": 1.109, "step": 3849 }, { "epoch": 0.6052388531902769, "grad_norm": 0.12015588581562042, "learning_rate": 3.9567382499042337e-05, "loss": 1.1693, "step": 3850 }, { "epoch": 0.6053960580872094, "grad_norm": 0.13013339042663574, "learning_rate": 3.956236092624611e-05, "loss": 1.2344, "step": 3851 }, { "epoch": 0.6055532629841419, "grad_norm": 0.1262013465166092, "learning_rate": 3.955733846403629e-05, "loss": 1.069, "step": 3852 }, { "epoch": 0.6057104678810745, "grad_norm": 0.12954431772232056, "learning_rate": 3.9552315112719626e-05, "loss": 1.1141, "step": 3853 }, { "epoch": 0.605867672778007, "grad_norm": 0.12079882621765137, "learning_rate": 3.954729087260291e-05, "loss": 1.0836, "step": 3854 }, { "epoch": 0.6060248776749396, "grad_norm": 0.13711245357990265, "learning_rate": 3.9542265743993036e-05, "loss": 1.2398, "step": 3855 }, { "epoch": 0.6061820825718721, "grad_norm": 0.12721870839595795, "learning_rate": 3.9537239727196886e-05, "loss": 1.2091, "step": 3856 }, { "epoch": 0.6063392874688046, "grad_norm": 0.12898989021778107, "learning_rate": 3.9532212822521454e-05, "loss": 1.0693, "step": 3857 }, { "epoch": 0.6064964923657372, "grad_norm": 0.12627077102661133, "learning_rate": 3.952718503027375e-05, "loss": 1.1112, "step": 3858 }, { "epoch": 0.6066536972626697, "grad_norm": 0.15694749355316162, "learning_rate": 3.9522156350760855e-05, "loss": 1.0982, "step": 3859 }, { "epoch": 0.6068109021596023, "grad_norm": 0.137521430850029, "learning_rate": 3.9517126784289896e-05, "loss": 1.1493, "step": 3860 }, { "epoch": 0.6069681070565348, "grad_norm": 0.13872124254703522, "learning_rate": 3.9512096331168076e-05, "loss": 1.1129, "step": 3861 }, { "epoch": 0.6071253119534673, "grad_norm": 0.18209943175315857, "learning_rate": 3.9507064991702625e-05, "loss": 1.1326, "step": 3862 }, { "epoch": 0.6072825168503999, "grad_norm": 0.1378040462732315, "learning_rate": 3.950203276620084e-05, "loss": 1.1655, "step": 3863 }, { "epoch": 0.6074397217473324, "grad_norm": 0.13413548469543457, "learning_rate": 3.949699965497007e-05, "loss": 1.1299, "step": 3864 }, { "epoch": 0.607596926644265, "grad_norm": 0.12779201567173004, "learning_rate": 3.949196565831772e-05, "loss": 1.2032, "step": 3865 }, { "epoch": 0.6077541315411975, "grad_norm": 0.1494266390800476, "learning_rate": 3.9486930776551246e-05, "loss": 1.1007, "step": 3866 }, { "epoch": 0.60791133643813, "grad_norm": 0.12856054306030273, "learning_rate": 3.948189500997816e-05, "loss": 1.0411, "step": 3867 }, { "epoch": 0.6080685413350626, "grad_norm": 0.13846808671951294, "learning_rate": 3.947685835890602e-05, "loss": 1.1598, "step": 3868 }, { "epoch": 0.6082257462319951, "grad_norm": 0.13360343873500824, "learning_rate": 3.947182082364246e-05, "loss": 1.1168, "step": 3869 }, { "epoch": 0.6083829511289277, "grad_norm": 0.12072020769119263, "learning_rate": 3.946678240449515e-05, "loss": 1.1159, "step": 3870 }, { "epoch": 0.6085401560258602, "grad_norm": 0.12124241888523102, "learning_rate": 3.94617431017718e-05, "loss": 1.0516, "step": 3871 }, { "epoch": 0.6086973609227927, "grad_norm": 0.14225207269191742, "learning_rate": 3.945670291578021e-05, "loss": 1.1411, "step": 3872 }, { "epoch": 0.6088545658197253, "grad_norm": 0.1412978172302246, "learning_rate": 3.9451661846828216e-05, "loss": 1.139, "step": 3873 }, { "epoch": 0.6090117707166578, "grad_norm": 0.12855301797389984, "learning_rate": 3.9446619895223696e-05, "loss": 1.1952, "step": 3874 }, { "epoch": 0.6091689756135904, "grad_norm": 0.1284981071949005, "learning_rate": 3.94415770612746e-05, "loss": 1.0926, "step": 3875 }, { "epoch": 0.6093261805105229, "grad_norm": 0.14576299488544464, "learning_rate": 3.943653334528892e-05, "loss": 0.9897, "step": 3876 }, { "epoch": 0.6094833854074554, "grad_norm": 0.12897971272468567, "learning_rate": 3.9431488747574715e-05, "loss": 1.1691, "step": 3877 }, { "epoch": 0.609640590304388, "grad_norm": 0.13588838279247284, "learning_rate": 3.9426443268440086e-05, "loss": 1.0245, "step": 3878 }, { "epoch": 0.6097977952013205, "grad_norm": 0.12872371077537537, "learning_rate": 3.942139690819319e-05, "loss": 1.1282, "step": 3879 }, { "epoch": 0.6099550000982531, "grad_norm": 0.12690573930740356, "learning_rate": 3.9416349667142236e-05, "loss": 1.0472, "step": 3880 }, { "epoch": 0.6101122049951856, "grad_norm": 0.14904765784740448, "learning_rate": 3.94113015455955e-05, "loss": 1.1463, "step": 3881 }, { "epoch": 0.6102694098921181, "grad_norm": 0.1351795345544815, "learning_rate": 3.94062525438613e-05, "loss": 1.2001, "step": 3882 }, { "epoch": 0.6104266147890507, "grad_norm": 0.15208245813846588, "learning_rate": 3.9401202662248004e-05, "loss": 1.1609, "step": 3883 }, { "epoch": 0.6105838196859832, "grad_norm": 0.15503764152526855, "learning_rate": 3.939615190106404e-05, "loss": 1.1118, "step": 3884 }, { "epoch": 0.6107410245829158, "grad_norm": 0.1458703875541687, "learning_rate": 3.93911002606179e-05, "loss": 1.1229, "step": 3885 }, { "epoch": 0.6108982294798483, "grad_norm": 0.18219946324825287, "learning_rate": 3.9386047741218096e-05, "loss": 1.0334, "step": 3886 }, { "epoch": 0.6110554343767808, "grad_norm": 0.1569867730140686, "learning_rate": 3.938099434317324e-05, "loss": 1.0899, "step": 3887 }, { "epoch": 0.6112126392737134, "grad_norm": 0.11686835438013077, "learning_rate": 3.937594006679197e-05, "loss": 1.1626, "step": 3888 }, { "epoch": 0.6113698441706459, "grad_norm": 0.11868549883365631, "learning_rate": 3.9370884912382965e-05, "loss": 1.1415, "step": 3889 }, { "epoch": 0.6115270490675785, "grad_norm": 0.14847975969314575, "learning_rate": 3.9365828880254994e-05, "loss": 1.1623, "step": 3890 }, { "epoch": 0.611684253964511, "grad_norm": 0.11808676272630692, "learning_rate": 3.936077197071686e-05, "loss": 1.0998, "step": 3891 }, { "epoch": 0.6118414588614436, "grad_norm": 0.1390334814786911, "learning_rate": 3.935571418407741e-05, "loss": 1.0401, "step": 3892 }, { "epoch": 0.6119986637583761, "grad_norm": 0.13492408394813538, "learning_rate": 3.935065552064555e-05, "loss": 1.0843, "step": 3893 }, { "epoch": 0.6121558686553086, "grad_norm": 0.152793750166893, "learning_rate": 3.934559598073025e-05, "loss": 1.086, "step": 3894 }, { "epoch": 0.6123130735522412, "grad_norm": 0.13042302429676056, "learning_rate": 3.9340535564640534e-05, "loss": 1.1583, "step": 3895 }, { "epoch": 0.6124702784491737, "grad_norm": 0.14149799942970276, "learning_rate": 3.933547427268547e-05, "loss": 1.0554, "step": 3896 }, { "epoch": 0.6126274833461063, "grad_norm": 0.1303536593914032, "learning_rate": 3.933041210517419e-05, "loss": 1.0976, "step": 3897 }, { "epoch": 0.6127846882430388, "grad_norm": 0.15663042664527893, "learning_rate": 3.932534906241585e-05, "loss": 1.1031, "step": 3898 }, { "epoch": 0.6129418931399713, "grad_norm": 0.13524563610553741, "learning_rate": 3.9320285144719695e-05, "loss": 1.064, "step": 3899 }, { "epoch": 0.6130990980369039, "grad_norm": 0.1357249617576599, "learning_rate": 3.931522035239501e-05, "loss": 1.1611, "step": 3900 }, { "epoch": 0.6132563029338364, "grad_norm": 0.13968004286289215, "learning_rate": 3.931015468575113e-05, "loss": 1.1555, "step": 3901 }, { "epoch": 0.613413507830769, "grad_norm": 0.13817796111106873, "learning_rate": 3.9305088145097447e-05, "loss": 1.036, "step": 3902 }, { "epoch": 0.6135707127277015, "grad_norm": 0.13282924890518188, "learning_rate": 3.930002073074341e-05, "loss": 1.1606, "step": 3903 }, { "epoch": 0.613727917624634, "grad_norm": 0.15224741399288177, "learning_rate": 3.9294952442998514e-05, "loss": 1.2341, "step": 3904 }, { "epoch": 0.6138851225215666, "grad_norm": 0.1290319263935089, "learning_rate": 3.928988328217231e-05, "loss": 1.2607, "step": 3905 }, { "epoch": 0.6140423274184991, "grad_norm": 0.17039501667022705, "learning_rate": 3.9284813248574405e-05, "loss": 0.9922, "step": 3906 }, { "epoch": 0.6141995323154317, "grad_norm": 0.14699743688106537, "learning_rate": 3.927974234251446e-05, "loss": 1.0983, "step": 3907 }, { "epoch": 0.6143567372123642, "grad_norm": 0.15270787477493286, "learning_rate": 3.927467056430218e-05, "loss": 1.1628, "step": 3908 }, { "epoch": 0.6145139421092967, "grad_norm": 0.12988269329071045, "learning_rate": 3.9269597914247335e-05, "loss": 1.1579, "step": 3909 }, { "epoch": 0.6146711470062293, "grad_norm": 0.1486128568649292, "learning_rate": 3.926452439265974e-05, "loss": 1.2035, "step": 3910 }, { "epoch": 0.6148283519031618, "grad_norm": 0.1319809854030609, "learning_rate": 3.925944999984927e-05, "loss": 1.1176, "step": 3911 }, { "epoch": 0.6149855568000944, "grad_norm": 0.1273583173751831, "learning_rate": 3.925437473612585e-05, "loss": 1.1432, "step": 3912 }, { "epoch": 0.6151427616970269, "grad_norm": 0.13283202052116394, "learning_rate": 3.924929860179946e-05, "loss": 1.0856, "step": 3913 }, { "epoch": 0.6152999665939594, "grad_norm": 0.12940795719623566, "learning_rate": 3.924422159718011e-05, "loss": 1.0855, "step": 3914 }, { "epoch": 0.615457171490892, "grad_norm": 0.1384333074092865, "learning_rate": 3.9239143722577915e-05, "loss": 1.0048, "step": 3915 }, { "epoch": 0.6156143763878245, "grad_norm": 0.1223512664437294, "learning_rate": 3.923406497830299e-05, "loss": 1.1227, "step": 3916 }, { "epoch": 0.6157715812847571, "grad_norm": 0.1405000239610672, "learning_rate": 3.922898536466554e-05, "loss": 1.0386, "step": 3917 }, { "epoch": 0.6159287861816896, "grad_norm": 0.1473666876554489, "learning_rate": 3.92239048819758e-05, "loss": 1.0323, "step": 3918 }, { "epoch": 0.616085991078622, "grad_norm": 0.1441134363412857, "learning_rate": 3.921882353054407e-05, "loss": 1.0195, "step": 3919 }, { "epoch": 0.6162431959755547, "grad_norm": 0.13485924899578094, "learning_rate": 3.9213741310680686e-05, "loss": 1.0685, "step": 3920 }, { "epoch": 0.6164004008724872, "grad_norm": 0.15113161504268646, "learning_rate": 3.920865822269607e-05, "loss": 0.9724, "step": 3921 }, { "epoch": 0.6165576057694198, "grad_norm": 0.153519868850708, "learning_rate": 3.920357426690067e-05, "loss": 1.1156, "step": 3922 }, { "epoch": 0.6167148106663523, "grad_norm": 0.13160768151283264, "learning_rate": 3.9198489443605e-05, "loss": 1.1649, "step": 3923 }, { "epoch": 0.6168720155632847, "grad_norm": 0.14783591032028198, "learning_rate": 3.919340375311961e-05, "loss": 1.0885, "step": 3924 }, { "epoch": 0.6170292204602174, "grad_norm": 0.12354061752557755, "learning_rate": 3.918831719575512e-05, "loss": 1.1117, "step": 3925 }, { "epoch": 0.6171864253571498, "grad_norm": 0.15359634160995483, "learning_rate": 3.91832297718222e-05, "loss": 1.1198, "step": 3926 }, { "epoch": 0.6173436302540825, "grad_norm": 0.13819736242294312, "learning_rate": 3.917814148163158e-05, "loss": 1.1171, "step": 3927 }, { "epoch": 0.617500835151015, "grad_norm": 0.13880357146263123, "learning_rate": 3.917305232549401e-05, "loss": 1.1716, "step": 3928 }, { "epoch": 0.6176580400479474, "grad_norm": 0.13249677419662476, "learning_rate": 3.916796230372034e-05, "loss": 1.0433, "step": 3929 }, { "epoch": 0.61781524494488, "grad_norm": 0.1238926574587822, "learning_rate": 3.916287141662142e-05, "loss": 1.1879, "step": 3930 }, { "epoch": 0.6179724498418125, "grad_norm": 0.15673570334911346, "learning_rate": 3.915777966450821e-05, "loss": 1.1016, "step": 3931 }, { "epoch": 0.6181296547387451, "grad_norm": 0.1377524733543396, "learning_rate": 3.9152687047691695e-05, "loss": 1.1478, "step": 3932 }, { "epoch": 0.6182868596356776, "grad_norm": 0.13096009194850922, "learning_rate": 3.914759356648289e-05, "loss": 1.1211, "step": 3933 }, { "epoch": 0.6184440645326101, "grad_norm": 0.14554426074028015, "learning_rate": 3.9142499221192894e-05, "loss": 1.1488, "step": 3934 }, { "epoch": 0.6186012694295427, "grad_norm": 0.20322880148887634, "learning_rate": 3.9137404012132866e-05, "loss": 0.9873, "step": 3935 }, { "epoch": 0.6187584743264752, "grad_norm": 0.1772369146347046, "learning_rate": 3.913230793961399e-05, "loss": 1.1642, "step": 3936 }, { "epoch": 0.6189156792234078, "grad_norm": 0.1685638576745987, "learning_rate": 3.91272110039475e-05, "loss": 1.1079, "step": 3937 }, { "epoch": 0.6190728841203403, "grad_norm": 0.14887399971485138, "learning_rate": 3.912211320544473e-05, "loss": 1.1486, "step": 3938 }, { "epoch": 0.6192300890172728, "grad_norm": 0.13565029203891754, "learning_rate": 3.911701454441701e-05, "loss": 1.0728, "step": 3939 }, { "epoch": 0.6193872939142054, "grad_norm": 0.13880778849124908, "learning_rate": 3.911191502117576e-05, "loss": 1.1221, "step": 3940 }, { "epoch": 0.6195444988111379, "grad_norm": 0.13235561549663544, "learning_rate": 3.910681463603242e-05, "loss": 1.0952, "step": 3941 }, { "epoch": 0.6197017037080705, "grad_norm": 0.16491159796714783, "learning_rate": 3.9101713389298525e-05, "loss": 1.0765, "step": 3942 }, { "epoch": 0.619858908605003, "grad_norm": 0.14132866263389587, "learning_rate": 3.909661128128562e-05, "loss": 1.2179, "step": 3943 }, { "epoch": 0.6200161135019356, "grad_norm": 0.13405713438987732, "learning_rate": 3.909150831230534e-05, "loss": 1.1696, "step": 3944 }, { "epoch": 0.6201733183988681, "grad_norm": 0.1398204267024994, "learning_rate": 3.908640448266934e-05, "loss": 0.9907, "step": 3945 }, { "epoch": 0.6203305232958006, "grad_norm": 0.16295889019966125, "learning_rate": 3.908129979268936e-05, "loss": 1.0666, "step": 3946 }, { "epoch": 0.6204877281927332, "grad_norm": 0.14149579405784607, "learning_rate": 3.9076194242677156e-05, "loss": 1.1172, "step": 3947 }, { "epoch": 0.6206449330896657, "grad_norm": 0.16177891194820404, "learning_rate": 3.907108783294457e-05, "loss": 1.0195, "step": 3948 }, { "epoch": 0.6208021379865983, "grad_norm": 0.12914012372493744, "learning_rate": 3.906598056380347e-05, "loss": 1.2259, "step": 3949 }, { "epoch": 0.6209593428835308, "grad_norm": 0.14361891150474548, "learning_rate": 3.9060872435565796e-05, "loss": 0.9796, "step": 3950 }, { "epoch": 0.6211165477804633, "grad_norm": 0.12880343198776245, "learning_rate": 3.905576344854354e-05, "loss": 1.1487, "step": 3951 }, { "epoch": 0.6212737526773959, "grad_norm": 0.13725869357585907, "learning_rate": 3.9050653603048725e-05, "loss": 1.1024, "step": 3952 }, { "epoch": 0.6214309575743284, "grad_norm": 0.13440468907356262, "learning_rate": 3.904554289939345e-05, "loss": 1.171, "step": 3953 }, { "epoch": 0.621588162471261, "grad_norm": 0.12612858414649963, "learning_rate": 3.904043133788984e-05, "loss": 1.0411, "step": 3954 }, { "epoch": 0.6217453673681935, "grad_norm": 0.12622110545635223, "learning_rate": 3.903531891885012e-05, "loss": 1.1089, "step": 3955 }, { "epoch": 0.621902572265126, "grad_norm": 0.1516236960887909, "learning_rate": 3.9030205642586514e-05, "loss": 1.242, "step": 3956 }, { "epoch": 0.6220597771620586, "grad_norm": 0.1355689913034439, "learning_rate": 3.9025091509411336e-05, "loss": 1.0582, "step": 3957 }, { "epoch": 0.6222169820589911, "grad_norm": 0.15789024531841278, "learning_rate": 3.901997651963692e-05, "loss": 1.1701, "step": 3958 }, { "epoch": 0.6223741869559237, "grad_norm": 0.1915074735879898, "learning_rate": 3.901486067357569e-05, "loss": 1.1742, "step": 3959 }, { "epoch": 0.6225313918528562, "grad_norm": 0.1438111662864685, "learning_rate": 3.900974397154009e-05, "loss": 1.1453, "step": 3960 }, { "epoch": 0.6226885967497887, "grad_norm": 0.12249389290809631, "learning_rate": 3.900462641384264e-05, "loss": 1.0477, "step": 3961 }, { "epoch": 0.6228458016467213, "grad_norm": 0.15517558157444, "learning_rate": 3.899950800079588e-05, "loss": 1.0476, "step": 3962 }, { "epoch": 0.6230030065436538, "grad_norm": 0.14869697391986847, "learning_rate": 3.899438873271244e-05, "loss": 1.1311, "step": 3963 }, { "epoch": 0.6231602114405864, "grad_norm": 0.1483510434627533, "learning_rate": 3.8989268609904985e-05, "loss": 1.0019, "step": 3964 }, { "epoch": 0.6233174163375189, "grad_norm": 0.13164103031158447, "learning_rate": 3.898414763268622e-05, "loss": 1.0782, "step": 3965 }, { "epoch": 0.6234746212344514, "grad_norm": 0.14316119253635406, "learning_rate": 3.8979025801368936e-05, "loss": 1.0564, "step": 3966 }, { "epoch": 0.623631826131384, "grad_norm": 0.18632963299751282, "learning_rate": 3.8973903116265936e-05, "loss": 0.9668, "step": 3967 }, { "epoch": 0.6237890310283165, "grad_norm": 0.1400153636932373, "learning_rate": 3.8968779577690105e-05, "loss": 1.2211, "step": 3968 }, { "epoch": 0.6239462359252491, "grad_norm": 0.1399480402469635, "learning_rate": 3.896365518595436e-05, "loss": 1.1038, "step": 3969 }, { "epoch": 0.6241034408221816, "grad_norm": 0.14281897246837616, "learning_rate": 3.895852994137168e-05, "loss": 1.0784, "step": 3970 }, { "epoch": 0.6242606457191141, "grad_norm": 0.1383058726787567, "learning_rate": 3.8953403844255106e-05, "loss": 1.0747, "step": 3971 }, { "epoch": 0.6244178506160467, "grad_norm": 0.13327376544475555, "learning_rate": 3.894827689491772e-05, "loss": 1.16, "step": 3972 }, { "epoch": 0.6245750555129792, "grad_norm": 0.12594720721244812, "learning_rate": 3.8943149093672646e-05, "loss": 1.1264, "step": 3973 }, { "epoch": 0.6247322604099118, "grad_norm": 0.12865445017814636, "learning_rate": 3.8938020440833066e-05, "loss": 1.0779, "step": 3974 }, { "epoch": 0.6248894653068443, "grad_norm": 0.13021129369735718, "learning_rate": 3.893289093671224e-05, "loss": 1.2198, "step": 3975 }, { "epoch": 0.6250466702037768, "grad_norm": 0.15246430039405823, "learning_rate": 3.8927760581623455e-05, "loss": 1.229, "step": 3976 }, { "epoch": 0.6252038751007094, "grad_norm": 0.1328427493572235, "learning_rate": 3.892262937588002e-05, "loss": 1.0821, "step": 3977 }, { "epoch": 0.6253610799976419, "grad_norm": 0.14552165567874908, "learning_rate": 3.8917497319795385e-05, "loss": 1.0284, "step": 3978 }, { "epoch": 0.6255182848945745, "grad_norm": 0.13598152995109558, "learning_rate": 3.891236441368294e-05, "loss": 1.204, "step": 3979 }, { "epoch": 0.625675489791507, "grad_norm": 0.12847700715065002, "learning_rate": 3.890723065785622e-05, "loss": 0.9571, "step": 3980 }, { "epoch": 0.6258326946884395, "grad_norm": 0.1271078735589981, "learning_rate": 3.890209605262877e-05, "loss": 1.067, "step": 3981 }, { "epoch": 0.6259898995853721, "grad_norm": 0.12795038521289825, "learning_rate": 3.889696059831418e-05, "loss": 1.1389, "step": 3982 }, { "epoch": 0.6261471044823046, "grad_norm": 0.14186857640743256, "learning_rate": 3.8891824295226115e-05, "loss": 1.0729, "step": 3983 }, { "epoch": 0.6263043093792372, "grad_norm": 0.16656219959259033, "learning_rate": 3.8886687143678275e-05, "loss": 1.0958, "step": 3984 }, { "epoch": 0.6264615142761697, "grad_norm": 0.13067491352558136, "learning_rate": 3.888154914398442e-05, "loss": 1.1101, "step": 3985 }, { "epoch": 0.6266187191731022, "grad_norm": 0.14706555008888245, "learning_rate": 3.887641029645836e-05, "loss": 1.1012, "step": 3986 }, { "epoch": 0.6267759240700348, "grad_norm": 0.14502152800559998, "learning_rate": 3.887127060141396e-05, "loss": 1.1221, "step": 3987 }, { "epoch": 0.6269331289669673, "grad_norm": 0.13042514026165009, "learning_rate": 3.8866130059165115e-05, "loss": 1.2059, "step": 3988 }, { "epoch": 0.6270903338638999, "grad_norm": 0.13619203865528107, "learning_rate": 3.886098867002581e-05, "loss": 1.0648, "step": 3989 }, { "epoch": 0.6272475387608324, "grad_norm": 0.12742075324058533, "learning_rate": 3.885584643431006e-05, "loss": 1.1937, "step": 3990 }, { "epoch": 0.6274047436577649, "grad_norm": 0.15529866516590118, "learning_rate": 3.8850703352331925e-05, "loss": 1.084, "step": 3991 }, { "epoch": 0.6275619485546975, "grad_norm": 0.16348528861999512, "learning_rate": 3.8845559424405534e-05, "loss": 1.1566, "step": 3992 }, { "epoch": 0.62771915345163, "grad_norm": 0.11867035925388336, "learning_rate": 3.884041465084504e-05, "loss": 0.9744, "step": 3993 }, { "epoch": 0.6278763583485626, "grad_norm": 0.14471690356731415, "learning_rate": 3.8835269031964685e-05, "loss": 1.1047, "step": 3994 }, { "epoch": 0.6280335632454951, "grad_norm": 0.14537517726421356, "learning_rate": 3.883012256807873e-05, "loss": 1.1548, "step": 3995 }, { "epoch": 0.6281907681424277, "grad_norm": 0.1430378556251526, "learning_rate": 3.882497525950152e-05, "loss": 1.0705, "step": 3996 }, { "epoch": 0.6283479730393602, "grad_norm": 0.1448555290699005, "learning_rate": 3.881982710654741e-05, "loss": 1.1347, "step": 3997 }, { "epoch": 0.6285051779362927, "grad_norm": 0.14756466448307037, "learning_rate": 3.881467810953085e-05, "loss": 1.185, "step": 3998 }, { "epoch": 0.6286623828332253, "grad_norm": 0.13334186375141144, "learning_rate": 3.8809528268766304e-05, "loss": 1.2077, "step": 3999 }, { "epoch": 0.6288195877301578, "grad_norm": 0.16169632971286774, "learning_rate": 3.8804377584568324e-05, "loss": 1.0994, "step": 4000 }, { "epoch": 0.6288195877301578, "eval_loss": 1.1022133827209473, "eval_runtime": 2321.8951, "eval_samples_per_second": 3.987, "eval_steps_per_second": 1.994, "step": 4000 }, { "epoch": 0.6289767926270904, "grad_norm": 0.12798309326171875, "learning_rate": 3.879922605725148e-05, "loss": 1.2744, "step": 4001 }, { "epoch": 0.6291339975240229, "grad_norm": 0.1319848597049713, "learning_rate": 3.8794073687130414e-05, "loss": 1.0977, "step": 4002 }, { "epoch": 0.6292912024209554, "grad_norm": 0.14508125185966492, "learning_rate": 3.87889204745198e-05, "loss": 1.0866, "step": 4003 }, { "epoch": 0.629448407317888, "grad_norm": 0.1388920098543167, "learning_rate": 3.878376641973439e-05, "loss": 1.1524, "step": 4004 }, { "epoch": 0.6296056122148205, "grad_norm": 0.14365117251873016, "learning_rate": 3.8778611523088976e-05, "loss": 1.1457, "step": 4005 }, { "epoch": 0.6297628171117531, "grad_norm": 0.16236121952533722, "learning_rate": 3.877345578489839e-05, "loss": 1.13, "step": 4006 }, { "epoch": 0.6299200220086856, "grad_norm": 0.14283150434494019, "learning_rate": 3.876829920547753e-05, "loss": 1.0183, "step": 4007 }, { "epoch": 0.6300772269056181, "grad_norm": 0.16025055944919586, "learning_rate": 3.876314178514134e-05, "loss": 1.1702, "step": 4008 }, { "epoch": 0.6302344318025507, "grad_norm": 0.15556633472442627, "learning_rate": 3.875798352420482e-05, "loss": 1.1579, "step": 4009 }, { "epoch": 0.6303916366994832, "grad_norm": 0.16086506843566895, "learning_rate": 3.875282442298301e-05, "loss": 1.2713, "step": 4010 }, { "epoch": 0.6305488415964158, "grad_norm": 0.14218272268772125, "learning_rate": 3.8747664481791e-05, "loss": 1.1606, "step": 4011 }, { "epoch": 0.6307060464933483, "grad_norm": 0.13343515992164612, "learning_rate": 3.874250370094397e-05, "loss": 0.9663, "step": 4012 }, { "epoch": 0.6308632513902808, "grad_norm": 0.13490377366542816, "learning_rate": 3.873734208075709e-05, "loss": 1.0989, "step": 4013 }, { "epoch": 0.6310204562872134, "grad_norm": 0.13085541129112244, "learning_rate": 3.873217962154562e-05, "loss": 1.0909, "step": 4014 }, { "epoch": 0.6311776611841459, "grad_norm": 0.182506263256073, "learning_rate": 3.872701632362486e-05, "loss": 1.1672, "step": 4015 }, { "epoch": 0.6313348660810785, "grad_norm": 0.18165533244609833, "learning_rate": 3.8721852187310184e-05, "loss": 1.0913, "step": 4016 }, { "epoch": 0.631492070978011, "grad_norm": 0.2167358249425888, "learning_rate": 3.871668721291698e-05, "loss": 1.1286, "step": 4017 }, { "epoch": 0.6316492758749435, "grad_norm": 0.13552582263946533, "learning_rate": 3.871152140076071e-05, "loss": 1.0448, "step": 4018 }, { "epoch": 0.6318064807718761, "grad_norm": 0.1412697434425354, "learning_rate": 3.8706354751156875e-05, "loss": 1.0886, "step": 4019 }, { "epoch": 0.6319636856688086, "grad_norm": 0.18857620656490326, "learning_rate": 3.8701187264421046e-05, "loss": 1.1056, "step": 4020 }, { "epoch": 0.6321208905657412, "grad_norm": 0.13617004454135895, "learning_rate": 3.8696018940868835e-05, "loss": 1.2482, "step": 4021 }, { "epoch": 0.6322780954626737, "grad_norm": 0.15233376622200012, "learning_rate": 3.869084978081589e-05, "loss": 1.0117, "step": 4022 }, { "epoch": 0.6324353003596062, "grad_norm": 0.1499616652727127, "learning_rate": 3.868567978457793e-05, "loss": 1.1485, "step": 4023 }, { "epoch": 0.6325925052565388, "grad_norm": 0.15782010555267334, "learning_rate": 3.8680508952470726e-05, "loss": 0.9957, "step": 4024 }, { "epoch": 0.6327497101534713, "grad_norm": 0.1355133205652237, "learning_rate": 3.867533728481008e-05, "loss": 1.1428, "step": 4025 }, { "epoch": 0.6329069150504039, "grad_norm": 0.14103074371814728, "learning_rate": 3.8670164781911864e-05, "loss": 1.191, "step": 4026 }, { "epoch": 0.6330641199473364, "grad_norm": 0.15501828491687775, "learning_rate": 3.8664991444091994e-05, "loss": 1.0547, "step": 4027 }, { "epoch": 0.6332213248442689, "grad_norm": 0.16231434047222137, "learning_rate": 3.865981727166644e-05, "loss": 1.0961, "step": 4028 }, { "epoch": 0.6333785297412015, "grad_norm": 0.13422849774360657, "learning_rate": 3.8654642264951224e-05, "loss": 1.1408, "step": 4029 }, { "epoch": 0.633535734638134, "grad_norm": 0.13933904469013214, "learning_rate": 3.86494664242624e-05, "loss": 1.1404, "step": 4030 }, { "epoch": 0.6336929395350666, "grad_norm": 0.12841267883777618, "learning_rate": 3.8644289749916116e-05, "loss": 0.9775, "step": 4031 }, { "epoch": 0.633850144431999, "grad_norm": 0.13102753460407257, "learning_rate": 3.863911224222851e-05, "loss": 1.0729, "step": 4032 }, { "epoch": 0.6340073493289315, "grad_norm": 0.12041983008384705, "learning_rate": 3.8633933901515834e-05, "loss": 1.1076, "step": 4033 }, { "epoch": 0.6341645542258642, "grad_norm": 0.14172932505607605, "learning_rate": 3.862875472809434e-05, "loss": 1.1153, "step": 4034 }, { "epoch": 0.6343217591227966, "grad_norm": 0.15133321285247803, "learning_rate": 3.862357472228037e-05, "loss": 1.0577, "step": 4035 }, { "epoch": 0.6344789640197293, "grad_norm": 0.13426773250102997, "learning_rate": 3.861839388439029e-05, "loss": 1.1876, "step": 4036 }, { "epoch": 0.6346361689166617, "grad_norm": 0.1450241208076477, "learning_rate": 3.861321221474052e-05, "loss": 1.1069, "step": 4037 }, { "epoch": 0.6347933738135942, "grad_norm": 0.15062105655670166, "learning_rate": 3.8608029713647545e-05, "loss": 1.1346, "step": 4038 }, { "epoch": 0.6349505787105268, "grad_norm": 0.15360552072525024, "learning_rate": 3.860284638142789e-05, "loss": 1.1386, "step": 4039 }, { "epoch": 0.6351077836074593, "grad_norm": 0.15498320758342743, "learning_rate": 3.8597662218398136e-05, "loss": 1.0031, "step": 4040 }, { "epoch": 0.635264988504392, "grad_norm": 0.15017934143543243, "learning_rate": 3.85924772248749e-05, "loss": 1.0386, "step": 4041 }, { "epoch": 0.6354221934013244, "grad_norm": 0.14026129245758057, "learning_rate": 3.8587291401174886e-05, "loss": 1.1104, "step": 4042 }, { "epoch": 0.6355793982982569, "grad_norm": 0.12707599997520447, "learning_rate": 3.85821047476148e-05, "loss": 1.1505, "step": 4043 }, { "epoch": 0.6357366031951895, "grad_norm": 0.17259611189365387, "learning_rate": 3.857691726451143e-05, "loss": 1.0531, "step": 4044 }, { "epoch": 0.635893808092122, "grad_norm": 0.13623571395874023, "learning_rate": 3.857172895218162e-05, "loss": 1.0596, "step": 4045 }, { "epoch": 0.6360510129890546, "grad_norm": 0.1337416023015976, "learning_rate": 3.856653981094224e-05, "loss": 0.9934, "step": 4046 }, { "epoch": 0.6362082178859871, "grad_norm": 0.16031776368618011, "learning_rate": 3.8561349841110215e-05, "loss": 1.109, "step": 4047 }, { "epoch": 0.6363654227829197, "grad_norm": 0.1332205832004547, "learning_rate": 3.855615904300255e-05, "loss": 1.0332, "step": 4048 }, { "epoch": 0.6365226276798522, "grad_norm": 0.1302390694618225, "learning_rate": 3.855096741693627e-05, "loss": 0.9843, "step": 4049 }, { "epoch": 0.6366798325767847, "grad_norm": 0.13197927176952362, "learning_rate": 3.854577496322845e-05, "loss": 1.0397, "step": 4050 }, { "epoch": 0.6368370374737173, "grad_norm": 0.14378191530704498, "learning_rate": 3.854058168219624e-05, "loss": 1.0665, "step": 4051 }, { "epoch": 0.6369942423706498, "grad_norm": 0.137291818857193, "learning_rate": 3.853538757415681e-05, "loss": 1.0029, "step": 4052 }, { "epoch": 0.6371514472675824, "grad_norm": 0.1508316695690155, "learning_rate": 3.853019263942741e-05, "loss": 0.9818, "step": 4053 }, { "epoch": 0.6373086521645149, "grad_norm": 0.1240345910191536, "learning_rate": 3.852499687832533e-05, "loss": 1.0345, "step": 4054 }, { "epoch": 0.6374658570614474, "grad_norm": 0.14110183715820312, "learning_rate": 3.85198002911679e-05, "loss": 1.0664, "step": 4055 }, { "epoch": 0.63762306195838, "grad_norm": 0.12327904999256134, "learning_rate": 3.8514602878272496e-05, "loss": 1.0971, "step": 4056 }, { "epoch": 0.6377802668553125, "grad_norm": 0.13333137333393097, "learning_rate": 3.850940463995658e-05, "loss": 1.0969, "step": 4057 }, { "epoch": 0.6379374717522451, "grad_norm": 0.17208878695964813, "learning_rate": 3.850420557653762e-05, "loss": 1.0696, "step": 4058 }, { "epoch": 0.6380946766491776, "grad_norm": 0.13075406849384308, "learning_rate": 3.8499005688333165e-05, "loss": 1.1611, "step": 4059 }, { "epoch": 0.6382518815461101, "grad_norm": 0.13193902373313904, "learning_rate": 3.849380497566081e-05, "loss": 1.0657, "step": 4060 }, { "epoch": 0.6384090864430427, "grad_norm": 0.14207960665225983, "learning_rate": 3.848860343883818e-05, "loss": 1.0669, "step": 4061 }, { "epoch": 0.6385662913399752, "grad_norm": 0.12478265911340714, "learning_rate": 3.848340107818298e-05, "loss": 1.1441, "step": 4062 }, { "epoch": 0.6387234962369078, "grad_norm": 0.14924664795398712, "learning_rate": 3.847819789401294e-05, "loss": 1.0968, "step": 4063 }, { "epoch": 0.6388807011338403, "grad_norm": 0.15374234318733215, "learning_rate": 3.847299388664585e-05, "loss": 1.0329, "step": 4064 }, { "epoch": 0.6390379060307728, "grad_norm": 0.1404561996459961, "learning_rate": 3.8467789056399554e-05, "loss": 1.1609, "step": 4065 }, { "epoch": 0.6391951109277054, "grad_norm": 0.13482248783111572, "learning_rate": 3.846258340359195e-05, "loss": 1.1606, "step": 4066 }, { "epoch": 0.6393523158246379, "grad_norm": 0.13773567974567413, "learning_rate": 3.8457376928540966e-05, "loss": 1.2042, "step": 4067 }, { "epoch": 0.6395095207215705, "grad_norm": 0.14411531388759613, "learning_rate": 3.8452169631564604e-05, "loss": 1.1219, "step": 4068 }, { "epoch": 0.639666725618503, "grad_norm": 0.12763367593288422, "learning_rate": 3.8446961512980906e-05, "loss": 1.0552, "step": 4069 }, { "epoch": 0.6398239305154355, "grad_norm": 0.1240297257900238, "learning_rate": 3.844175257310796e-05, "loss": 1.128, "step": 4070 }, { "epoch": 0.6399811354123681, "grad_norm": 0.132890522480011, "learning_rate": 3.843654281226391e-05, "loss": 1.1883, "step": 4071 }, { "epoch": 0.6401383403093006, "grad_norm": 0.14669398963451385, "learning_rate": 3.843133223076695e-05, "loss": 1.1783, "step": 4072 }, { "epoch": 0.6402955452062332, "grad_norm": 0.15040293335914612, "learning_rate": 3.842612082893531e-05, "loss": 1.0935, "step": 4073 }, { "epoch": 0.6404527501031657, "grad_norm": 0.1313762068748474, "learning_rate": 3.84209086070873e-05, "loss": 1.1244, "step": 4074 }, { "epoch": 0.6406099550000982, "grad_norm": 0.14827388525009155, "learning_rate": 3.841569556554126e-05, "loss": 1.1807, "step": 4075 }, { "epoch": 0.6407671598970308, "grad_norm": 0.16525235772132874, "learning_rate": 3.8410481704615574e-05, "loss": 1.096, "step": 4076 }, { "epoch": 0.6409243647939633, "grad_norm": 0.1288774460554123, "learning_rate": 3.840526702462869e-05, "loss": 1.1513, "step": 4077 }, { "epoch": 0.6410815696908959, "grad_norm": 0.16332295536994934, "learning_rate": 3.84000515258991e-05, "loss": 1.1474, "step": 4078 }, { "epoch": 0.6412387745878284, "grad_norm": 0.20217250287532806, "learning_rate": 3.8394835208745343e-05, "loss": 1.0317, "step": 4079 }, { "epoch": 0.6413959794847609, "grad_norm": 0.12176761776208878, "learning_rate": 3.838961807348602e-05, "loss": 1.1813, "step": 4080 }, { "epoch": 0.6415531843816935, "grad_norm": 0.13961894810199738, "learning_rate": 3.838440012043977e-05, "loss": 1.0547, "step": 4081 }, { "epoch": 0.641710389278626, "grad_norm": 0.14181359112262726, "learning_rate": 3.837918134992528e-05, "loss": 1.0766, "step": 4082 }, { "epoch": 0.6418675941755586, "grad_norm": 0.12520286440849304, "learning_rate": 3.83739617622613e-05, "loss": 1.2473, "step": 4083 }, { "epoch": 0.6420247990724911, "grad_norm": 0.14975857734680176, "learning_rate": 3.836874135776662e-05, "loss": 1.1195, "step": 4084 }, { "epoch": 0.6421820039694236, "grad_norm": 0.1481516808271408, "learning_rate": 3.836352013676008e-05, "loss": 1.0374, "step": 4085 }, { "epoch": 0.6423392088663562, "grad_norm": 0.15783464908599854, "learning_rate": 3.835829809956058e-05, "loss": 1.165, "step": 4086 }, { "epoch": 0.6424964137632887, "grad_norm": 0.14074595272541046, "learning_rate": 3.8353075246487044e-05, "loss": 1.1495, "step": 4087 }, { "epoch": 0.6426536186602213, "grad_norm": 0.15384332835674286, "learning_rate": 3.834785157785849e-05, "loss": 1.0054, "step": 4088 }, { "epoch": 0.6428108235571538, "grad_norm": 0.1430482268333435, "learning_rate": 3.834262709399395e-05, "loss": 1.0299, "step": 4089 }, { "epoch": 0.6429680284540863, "grad_norm": 0.15135294198989868, "learning_rate": 3.8337401795212514e-05, "loss": 0.9536, "step": 4090 }, { "epoch": 0.6431252333510189, "grad_norm": 0.13654224574565887, "learning_rate": 3.833217568183331e-05, "loss": 1.112, "step": 4091 }, { "epoch": 0.6432824382479514, "grad_norm": 0.15030381083488464, "learning_rate": 3.832694875417554e-05, "loss": 1.0778, "step": 4092 }, { "epoch": 0.643439643144884, "grad_norm": 0.14391788840293884, "learning_rate": 3.8321721012558456e-05, "loss": 1.1318, "step": 4093 }, { "epoch": 0.6435968480418165, "grad_norm": 0.13817065954208374, "learning_rate": 3.8316492457301334e-05, "loss": 1.1262, "step": 4094 }, { "epoch": 0.643754052938749, "grad_norm": 0.1388375461101532, "learning_rate": 3.831126308872352e-05, "loss": 1.0942, "step": 4095 }, { "epoch": 0.6439112578356816, "grad_norm": 0.1540645956993103, "learning_rate": 3.83060329071444e-05, "loss": 1.1978, "step": 4096 }, { "epoch": 0.6440684627326141, "grad_norm": 0.15924197435379028, "learning_rate": 3.830080191288342e-05, "loss": 1.1066, "step": 4097 }, { "epoch": 0.6442256676295467, "grad_norm": 0.15522272884845734, "learning_rate": 3.829557010626006e-05, "loss": 1.1649, "step": 4098 }, { "epoch": 0.6443828725264792, "grad_norm": 0.1274399310350418, "learning_rate": 3.829033748759386e-05, "loss": 1.1831, "step": 4099 }, { "epoch": 0.6445400774234118, "grad_norm": 0.15869344770908356, "learning_rate": 3.8285104057204426e-05, "loss": 1.1169, "step": 4100 }, { "epoch": 0.6446972823203443, "grad_norm": 0.12494777143001556, "learning_rate": 3.827986981541138e-05, "loss": 1.0256, "step": 4101 }, { "epoch": 0.6448544872172768, "grad_norm": 0.1447725147008896, "learning_rate": 3.8274634762534405e-05, "loss": 1.1523, "step": 4102 }, { "epoch": 0.6450116921142094, "grad_norm": 0.1429455280303955, "learning_rate": 3.826939889889325e-05, "loss": 1.1314, "step": 4103 }, { "epoch": 0.6451688970111419, "grad_norm": 0.15775367617607117, "learning_rate": 3.8264162224807696e-05, "loss": 1.1693, "step": 4104 }, { "epoch": 0.6453261019080745, "grad_norm": 0.13923032581806183, "learning_rate": 3.825892474059758e-05, "loss": 1.1584, "step": 4105 }, { "epoch": 0.645483306805007, "grad_norm": 0.13384175300598145, "learning_rate": 3.825368644658279e-05, "loss": 1.1482, "step": 4106 }, { "epoch": 0.6456405117019395, "grad_norm": 0.15843184292316437, "learning_rate": 3.8248447343083255e-05, "loss": 1.02, "step": 4107 }, { "epoch": 0.6457977165988721, "grad_norm": 0.1540258228778839, "learning_rate": 3.8243207430418965e-05, "loss": 1.0609, "step": 4108 }, { "epoch": 0.6459549214958046, "grad_norm": 0.13326622545719147, "learning_rate": 3.823796670890996e-05, "loss": 1.1519, "step": 4109 }, { "epoch": 0.6461121263927372, "grad_norm": 0.18570023775100708, "learning_rate": 3.823272517887631e-05, "loss": 1.1447, "step": 4110 }, { "epoch": 0.6462693312896697, "grad_norm": 0.1267216056585312, "learning_rate": 3.8227482840638144e-05, "loss": 1.1803, "step": 4111 }, { "epoch": 0.6464265361866022, "grad_norm": 0.14555345475673676, "learning_rate": 3.822223969451566e-05, "loss": 1.2036, "step": 4112 }, { "epoch": 0.6465837410835348, "grad_norm": 0.15878638625144958, "learning_rate": 3.821699574082908e-05, "loss": 1.0386, "step": 4113 }, { "epoch": 0.6467409459804673, "grad_norm": 0.15997201204299927, "learning_rate": 3.82117509798987e-05, "loss": 1.1584, "step": 4114 }, { "epoch": 0.6468981508773999, "grad_norm": 0.13046613335609436, "learning_rate": 3.820650541204482e-05, "loss": 1.1935, "step": 4115 }, { "epoch": 0.6470553557743324, "grad_norm": 0.1658896803855896, "learning_rate": 3.820125903758786e-05, "loss": 1.1556, "step": 4116 }, { "epoch": 0.6472125606712649, "grad_norm": 0.13119108974933624, "learning_rate": 3.8196011856848204e-05, "loss": 1.0129, "step": 4117 }, { "epoch": 0.6473697655681975, "grad_norm": 0.13990801572799683, "learning_rate": 3.8190763870146355e-05, "loss": 1.1842, "step": 4118 }, { "epoch": 0.64752697046513, "grad_norm": 0.15538953244686127, "learning_rate": 3.818551507780284e-05, "loss": 1.0767, "step": 4119 }, { "epoch": 0.6476841753620626, "grad_norm": 0.1424800306558609, "learning_rate": 3.8180265480138236e-05, "loss": 1.0018, "step": 4120 }, { "epoch": 0.6478413802589951, "grad_norm": 0.13954682648181915, "learning_rate": 3.817501507747316e-05, "loss": 1.072, "step": 4121 }, { "epoch": 0.6479985851559276, "grad_norm": 0.18287043273448944, "learning_rate": 3.8169763870128284e-05, "loss": 1.1309, "step": 4122 }, { "epoch": 0.6481557900528602, "grad_norm": 0.13996580243110657, "learning_rate": 3.816451185842435e-05, "loss": 1.0904, "step": 4123 }, { "epoch": 0.6483129949497927, "grad_norm": 0.18551260232925415, "learning_rate": 3.815925904268211e-05, "loss": 1.0991, "step": 4124 }, { "epoch": 0.6484701998467253, "grad_norm": 0.1717006415128708, "learning_rate": 3.81540054232224e-05, "loss": 1.1385, "step": 4125 }, { "epoch": 0.6486274047436578, "grad_norm": 0.1935938596725464, "learning_rate": 3.814875100036609e-05, "loss": 1.0321, "step": 4126 }, { "epoch": 0.6487846096405903, "grad_norm": 0.16522254049777985, "learning_rate": 3.814349577443408e-05, "loss": 1.1219, "step": 4127 }, { "epoch": 0.6489418145375229, "grad_norm": 0.15041324496269226, "learning_rate": 3.813823974574738e-05, "loss": 1.0763, "step": 4128 }, { "epoch": 0.6490990194344554, "grad_norm": 0.13009963929653168, "learning_rate": 3.813298291462697e-05, "loss": 1.2183, "step": 4129 }, { "epoch": 0.649256224331388, "grad_norm": 0.1726483553647995, "learning_rate": 3.812772528139394e-05, "loss": 1.0157, "step": 4130 }, { "epoch": 0.6494134292283205, "grad_norm": 0.1214425191283226, "learning_rate": 3.812246684636939e-05, "loss": 1.0584, "step": 4131 }, { "epoch": 0.649570634125253, "grad_norm": 0.1466849148273468, "learning_rate": 3.81172076098745e-05, "loss": 0.9647, "step": 4132 }, { "epoch": 0.6497278390221856, "grad_norm": 0.1390989124774933, "learning_rate": 3.811194757223046e-05, "loss": 1.1379, "step": 4133 }, { "epoch": 0.649885043919118, "grad_norm": 0.16147755086421967, "learning_rate": 3.810668673375856e-05, "loss": 1.1648, "step": 4134 }, { "epoch": 0.6500422488160507, "grad_norm": 0.13957399129867554, "learning_rate": 3.810142509478011e-05, "loss": 1.0383, "step": 4135 }, { "epoch": 0.6501994537129832, "grad_norm": 0.14382591843605042, "learning_rate": 3.809616265561645e-05, "loss": 1.0436, "step": 4136 }, { "epoch": 0.6503566586099156, "grad_norm": 0.13169275224208832, "learning_rate": 3.809089941658901e-05, "loss": 1.1344, "step": 4137 }, { "epoch": 0.6505138635068483, "grad_norm": 0.14189770817756653, "learning_rate": 3.808563537801924e-05, "loss": 1.1493, "step": 4138 }, { "epoch": 0.6506710684037808, "grad_norm": 0.15206053853034973, "learning_rate": 3.808037054022865e-05, "loss": 1.0069, "step": 4139 }, { "epoch": 0.6508282733007134, "grad_norm": 0.18518586456775665, "learning_rate": 3.8075104903538795e-05, "loss": 1.0699, "step": 4140 }, { "epoch": 0.6509854781976459, "grad_norm": 0.1503700166940689, "learning_rate": 3.806983846827128e-05, "loss": 1.0635, "step": 4141 }, { "epoch": 0.6511426830945783, "grad_norm": 0.1372813582420349, "learning_rate": 3.806457123474776e-05, "loss": 1.017, "step": 4142 }, { "epoch": 0.651299887991511, "grad_norm": 0.14780773222446442, "learning_rate": 3.805930320328993e-05, "loss": 1.1236, "step": 4143 }, { "epoch": 0.6514570928884434, "grad_norm": 0.16201084852218628, "learning_rate": 3.805403437421955e-05, "loss": 0.9939, "step": 4144 }, { "epoch": 0.651614297785376, "grad_norm": 0.13731682300567627, "learning_rate": 3.804876474785842e-05, "loss": 1.1471, "step": 4145 }, { "epoch": 0.6517715026823085, "grad_norm": 0.15947549045085907, "learning_rate": 3.804349432452838e-05, "loss": 1.0879, "step": 4146 }, { "epoch": 0.651928707579241, "grad_norm": 0.1373717188835144, "learning_rate": 3.8038223104551344e-05, "loss": 1.0824, "step": 4147 }, { "epoch": 0.6520859124761736, "grad_norm": 0.13640908896923065, "learning_rate": 3.8032951088249245e-05, "loss": 1.1553, "step": 4148 }, { "epoch": 0.6522431173731061, "grad_norm": 0.13748691976070404, "learning_rate": 3.802767827594408e-05, "loss": 1.0725, "step": 4149 }, { "epoch": 0.6524003222700387, "grad_norm": 0.14089974761009216, "learning_rate": 3.80224046679579e-05, "loss": 1.1241, "step": 4150 }, { "epoch": 0.6525575271669712, "grad_norm": 0.13118582963943481, "learning_rate": 3.8017130264612775e-05, "loss": 1.0418, "step": 4151 }, { "epoch": 0.6527147320639037, "grad_norm": 0.1608380675315857, "learning_rate": 3.8011855066230866e-05, "loss": 1.1589, "step": 4152 }, { "epoch": 0.6528719369608363, "grad_norm": 0.18779993057250977, "learning_rate": 3.800657907313436e-05, "loss": 1.162, "step": 4153 }, { "epoch": 0.6530291418577688, "grad_norm": 0.1772323101758957, "learning_rate": 3.800130228564549e-05, "loss": 1.0781, "step": 4154 }, { "epoch": 0.6531863467547014, "grad_norm": 0.13229519128799438, "learning_rate": 3.799602470408654e-05, "loss": 1.1686, "step": 4155 }, { "epoch": 0.6533435516516339, "grad_norm": 0.1577441245317459, "learning_rate": 3.799074632877985e-05, "loss": 1.08, "step": 4156 }, { "epoch": 0.6535007565485665, "grad_norm": 0.14009861648082733, "learning_rate": 3.7985467160047804e-05, "loss": 1.147, "step": 4157 }, { "epoch": 0.653657961445499, "grad_norm": 0.1450236737728119, "learning_rate": 3.798018719821283e-05, "loss": 1.1681, "step": 4158 }, { "epoch": 0.6538151663424315, "grad_norm": 0.153233140707016, "learning_rate": 3.79749064435974e-05, "loss": 1.0823, "step": 4159 }, { "epoch": 0.6539723712393641, "grad_norm": 0.1304878294467926, "learning_rate": 3.796962489652406e-05, "loss": 1.058, "step": 4160 }, { "epoch": 0.6539723712393641, "eval_loss": 1.1008325815200806, "eval_runtime": 2357.0176, "eval_samples_per_second": 3.928, "eval_steps_per_second": 1.964, "step": 4160 }, { "epoch": 0.6541295761362966, "grad_norm": 0.1540244072675705, "learning_rate": 3.796434255731537e-05, "loss": 1.0523, "step": 4161 }, { "epoch": 0.6542867810332292, "grad_norm": 0.19584552943706512, "learning_rate": 3.7959059426293964e-05, "loss": 1.0906, "step": 4162 }, { "epoch": 0.6544439859301617, "grad_norm": 0.1526932269334793, "learning_rate": 3.795377550378252e-05, "loss": 1.0324, "step": 4163 }, { "epoch": 0.6546011908270942, "grad_norm": 0.15011286735534668, "learning_rate": 3.794849079010375e-05, "loss": 1.0482, "step": 4164 }, { "epoch": 0.6547583957240268, "grad_norm": 0.14008505642414093, "learning_rate": 3.794320528558044e-05, "loss": 1.1551, "step": 4165 }, { "epoch": 0.6549156006209593, "grad_norm": 0.12673290073871613, "learning_rate": 3.7937918990535376e-05, "loss": 1.0247, "step": 4166 }, { "epoch": 0.6550728055178919, "grad_norm": 0.13903503119945526, "learning_rate": 3.793263190529146e-05, "loss": 1.0566, "step": 4167 }, { "epoch": 0.6552300104148244, "grad_norm": 0.13928020000457764, "learning_rate": 3.7927344030171584e-05, "loss": 1.1154, "step": 4168 }, { "epoch": 0.6553872153117569, "grad_norm": 0.13580933213233948, "learning_rate": 3.7922055365498726e-05, "loss": 1.1776, "step": 4169 }, { "epoch": 0.6555444202086895, "grad_norm": 0.14068977534770966, "learning_rate": 3.79167659115959e-05, "loss": 1.0519, "step": 4170 }, { "epoch": 0.655701625105622, "grad_norm": 0.12532474100589752, "learning_rate": 3.7911475668786135e-05, "loss": 1.216, "step": 4171 }, { "epoch": 0.6558588300025546, "grad_norm": 1.0994434356689453, "learning_rate": 3.790618463739258e-05, "loss": 1.1033, "step": 4172 }, { "epoch": 0.6560160348994871, "grad_norm": 0.22691218554973602, "learning_rate": 3.790089281773837e-05, "loss": 1.2606, "step": 4173 }, { "epoch": 0.6561732397964196, "grad_norm": 0.19565100967884064, "learning_rate": 3.7895600210146696e-05, "loss": 1.0829, "step": 4174 }, { "epoch": 0.6563304446933522, "grad_norm": 0.16726306080818176, "learning_rate": 3.789030681494084e-05, "loss": 1.0931, "step": 4175 }, { "epoch": 0.6564876495902847, "grad_norm": 0.15301238000392914, "learning_rate": 3.788501263244408e-05, "loss": 1.2654, "step": 4176 }, { "epoch": 0.6566448544872173, "grad_norm": 0.16582754254341125, "learning_rate": 3.7879717662979785e-05, "loss": 0.9883, "step": 4177 }, { "epoch": 0.6568020593841498, "grad_norm": 0.16992676258087158, "learning_rate": 3.787442190687133e-05, "loss": 1.0806, "step": 4178 }, { "epoch": 0.6569592642810823, "grad_norm": 0.46979808807373047, "learning_rate": 3.786912536444217e-05, "loss": 1.2047, "step": 4179 }, { "epoch": 0.6571164691780149, "grad_norm": 0.21897472441196442, "learning_rate": 3.7863828036015805e-05, "loss": 1.0024, "step": 4180 }, { "epoch": 0.6572736740749474, "grad_norm": 0.17460548877716064, "learning_rate": 3.785852992191575e-05, "loss": 1.1766, "step": 4181 }, { "epoch": 0.65743087897188, "grad_norm": 0.16413532197475433, "learning_rate": 3.785323102246562e-05, "loss": 1.1307, "step": 4182 }, { "epoch": 0.6575880838688125, "grad_norm": 0.24324509501457214, "learning_rate": 3.784793133798904e-05, "loss": 0.998, "step": 4183 }, { "epoch": 0.657745288765745, "grad_norm": 0.2441328763961792, "learning_rate": 3.78426308688097e-05, "loss": 0.9856, "step": 4184 }, { "epoch": 0.6579024936626776, "grad_norm": 0.16712550818920135, "learning_rate": 3.7837329615251336e-05, "loss": 1.0543, "step": 4185 }, { "epoch": 0.6580596985596101, "grad_norm": 0.17282027006149292, "learning_rate": 3.783202757763771e-05, "loss": 1.0719, "step": 4186 }, { "epoch": 0.6582169034565427, "grad_norm": 0.20528116822242737, "learning_rate": 3.7826724756292666e-05, "loss": 1.2393, "step": 4187 }, { "epoch": 0.6583741083534752, "grad_norm": 0.2005816549062729, "learning_rate": 3.7821421151540084e-05, "loss": 0.9683, "step": 4188 }, { "epoch": 0.6585313132504077, "grad_norm": 0.21068783104419708, "learning_rate": 3.7816116763703874e-05, "loss": 1.0571, "step": 4189 }, { "epoch": 0.6586885181473403, "grad_norm": 0.17092068493366241, "learning_rate": 3.781081159310801e-05, "loss": 0.9867, "step": 4190 }, { "epoch": 0.6588457230442728, "grad_norm": 0.19518724083900452, "learning_rate": 3.780550564007652e-05, "loss": 1.131, "step": 4191 }, { "epoch": 0.6590029279412054, "grad_norm": 0.15224303305149078, "learning_rate": 3.780019890493347e-05, "loss": 1.1173, "step": 4192 }, { "epoch": 0.6591601328381379, "grad_norm": 0.1946234405040741, "learning_rate": 3.779489138800297e-05, "loss": 1.149, "step": 4193 }, { "epoch": 0.6593173377350704, "grad_norm": 0.16867277026176453, "learning_rate": 3.778958308960919e-05, "loss": 1.019, "step": 4194 }, { "epoch": 0.659474542632003, "grad_norm": 0.21311645209789276, "learning_rate": 3.778427401007632e-05, "loss": 1.0941, "step": 4195 }, { "epoch": 0.6596317475289355, "grad_norm": 0.18073083460330963, "learning_rate": 3.777896414972866e-05, "loss": 1.1035, "step": 4196 }, { "epoch": 0.6597889524258681, "grad_norm": 0.19108764827251434, "learning_rate": 3.7773653508890475e-05, "loss": 1.1679, "step": 4197 }, { "epoch": 0.6599461573228006, "grad_norm": 0.17400071024894714, "learning_rate": 3.776834208788613e-05, "loss": 1.0681, "step": 4198 }, { "epoch": 0.6601033622197331, "grad_norm": 0.173418328166008, "learning_rate": 3.776302988704004e-05, "loss": 1.1563, "step": 4199 }, { "epoch": 0.6602605671166657, "grad_norm": 0.18360170722007751, "learning_rate": 3.775771690667665e-05, "loss": 1.1159, "step": 4200 }, { "epoch": 0.6604177720135982, "grad_norm": 0.17133785784244537, "learning_rate": 3.775240314712043e-05, "loss": 1.1207, "step": 4201 }, { "epoch": 0.6605749769105308, "grad_norm": 0.1588124930858612, "learning_rate": 3.7747088608695965e-05, "loss": 1.0254, "step": 4202 }, { "epoch": 0.6607321818074633, "grad_norm": 0.20954452455043793, "learning_rate": 3.7741773291727815e-05, "loss": 1.108, "step": 4203 }, { "epoch": 0.6608893867043958, "grad_norm": 0.19729499518871307, "learning_rate": 3.773645719654064e-05, "loss": 1.1431, "step": 4204 }, { "epoch": 0.6610465916013284, "grad_norm": 0.20525553822517395, "learning_rate": 3.773114032345911e-05, "loss": 1.0959, "step": 4205 }, { "epoch": 0.6612037964982609, "grad_norm": 0.1458091139793396, "learning_rate": 3.772582267280798e-05, "loss": 1.1896, "step": 4206 }, { "epoch": 0.6613610013951935, "grad_norm": 0.18808773159980774, "learning_rate": 3.772050424491201e-05, "loss": 1.0664, "step": 4207 }, { "epoch": 0.661518206292126, "grad_norm": 0.1506444215774536, "learning_rate": 3.7715185040096046e-05, "loss": 1.1904, "step": 4208 }, { "epoch": 0.6616754111890586, "grad_norm": 0.15354827046394348, "learning_rate": 3.7709865058684944e-05, "loss": 1.1939, "step": 4209 }, { "epoch": 0.6618326160859911, "grad_norm": 0.15871380269527435, "learning_rate": 3.770454430100365e-05, "loss": 1.1425, "step": 4210 }, { "epoch": 0.6619898209829236, "grad_norm": 0.17379145324230194, "learning_rate": 3.7699222767377135e-05, "loss": 1.1903, "step": 4211 }, { "epoch": 0.6621470258798562, "grad_norm": 0.16655685007572174, "learning_rate": 3.769390045813041e-05, "loss": 1.0508, "step": 4212 }, { "epoch": 0.6623042307767887, "grad_norm": 0.16712385416030884, "learning_rate": 3.768857737358854e-05, "loss": 1.144, "step": 4213 }, { "epoch": 0.6624614356737213, "grad_norm": 0.20002847909927368, "learning_rate": 3.768325351407664e-05, "loss": 1.1763, "step": 4214 }, { "epoch": 0.6626186405706538, "grad_norm": 0.19525060057640076, "learning_rate": 3.7677928879919866e-05, "loss": 1.0668, "step": 4215 }, { "epoch": 0.6627758454675863, "grad_norm": 0.16536962985992432, "learning_rate": 3.767260347144344e-05, "loss": 1.1823, "step": 4216 }, { "epoch": 0.6629330503645189, "grad_norm": 0.25156453251838684, "learning_rate": 3.76672772889726e-05, "loss": 0.9883, "step": 4217 }, { "epoch": 0.6630902552614514, "grad_norm": 0.1674017310142517, "learning_rate": 3.766195033283267e-05, "loss": 1.0279, "step": 4218 }, { "epoch": 0.663247460158384, "grad_norm": 0.14695732295513153, "learning_rate": 3.765662260334899e-05, "loss": 0.9986, "step": 4219 }, { "epoch": 0.6634046650553165, "grad_norm": 0.17739038169384003, "learning_rate": 3.765129410084694e-05, "loss": 1.0437, "step": 4220 }, { "epoch": 0.663561869952249, "grad_norm": 0.1540808528661728, "learning_rate": 3.7645964825652e-05, "loss": 1.0727, "step": 4221 }, { "epoch": 0.6637190748491816, "grad_norm": 0.16241209208965302, "learning_rate": 3.7640634778089635e-05, "loss": 1.166, "step": 4222 }, { "epoch": 0.6638762797461141, "grad_norm": 0.1566782295703888, "learning_rate": 3.76353039584854e-05, "loss": 1.1314, "step": 4223 }, { "epoch": 0.6640334846430467, "grad_norm": 0.19483767449855804, "learning_rate": 3.762997236716487e-05, "loss": 1.0905, "step": 4224 }, { "epoch": 0.6641906895399792, "grad_norm": 0.1881464421749115, "learning_rate": 3.7624640004453674e-05, "loss": 1.1489, "step": 4225 }, { "epoch": 0.6643478944369117, "grad_norm": 0.15651607513427734, "learning_rate": 3.76193068706775e-05, "loss": 1.1743, "step": 4226 }, { "epoch": 0.6645050993338443, "grad_norm": 0.17168089747428894, "learning_rate": 3.761397296616208e-05, "loss": 0.9814, "step": 4227 }, { "epoch": 0.6646623042307768, "grad_norm": 0.18237245082855225, "learning_rate": 3.760863829123319e-05, "loss": 1.1592, "step": 4228 }, { "epoch": 0.6648195091277094, "grad_norm": 0.1554557830095291, "learning_rate": 3.760330284621664e-05, "loss": 1.1879, "step": 4229 }, { "epoch": 0.6649767140246419, "grad_norm": 0.17463083565235138, "learning_rate": 3.759796663143831e-05, "loss": 1.1055, "step": 4230 }, { "epoch": 0.6651339189215744, "grad_norm": 0.2073536515235901, "learning_rate": 3.75926296472241e-05, "loss": 1.0116, "step": 4231 }, { "epoch": 0.665291123818507, "grad_norm": 0.34921008348464966, "learning_rate": 3.758729189389999e-05, "loss": 1.1515, "step": 4232 }, { "epoch": 0.6654483287154395, "grad_norm": 0.17489458620548248, "learning_rate": 3.7581953371791985e-05, "loss": 1.1912, "step": 4233 }, { "epoch": 0.6656055336123721, "grad_norm": 0.16053788363933563, "learning_rate": 3.757661408122614e-05, "loss": 1.0192, "step": 4234 }, { "epoch": 0.6657627385093046, "grad_norm": 0.14950202405452728, "learning_rate": 3.757127402252855e-05, "loss": 1.0427, "step": 4235 }, { "epoch": 0.6659199434062371, "grad_norm": 0.19765916466712952, "learning_rate": 3.756593319602537e-05, "loss": 1.0708, "step": 4236 }, { "epoch": 0.6660771483031697, "grad_norm": 0.16286489367485046, "learning_rate": 3.756059160204281e-05, "loss": 1.2351, "step": 4237 }, { "epoch": 0.6662343532001022, "grad_norm": 0.17618192732334137, "learning_rate": 3.755524924090711e-05, "loss": 1.1426, "step": 4238 }, { "epoch": 0.6663915580970348, "grad_norm": 0.14757370948791504, "learning_rate": 3.7549906112944546e-05, "loss": 1.0719, "step": 4239 }, { "epoch": 0.6665487629939673, "grad_norm": 0.16622844338417053, "learning_rate": 3.754456221848146e-05, "loss": 1.0361, "step": 4240 }, { "epoch": 0.6667059678908998, "grad_norm": 0.19668567180633545, "learning_rate": 3.753921755784425e-05, "loss": 1.0828, "step": 4241 }, { "epoch": 0.6668631727878324, "grad_norm": 0.16749325394630432, "learning_rate": 3.753387213135935e-05, "loss": 1.1456, "step": 4242 }, { "epoch": 0.6670203776847649, "grad_norm": 0.16815859079360962, "learning_rate": 3.752852593935322e-05, "loss": 1.1183, "step": 4243 }, { "epoch": 0.6671775825816975, "grad_norm": 0.1528685837984085, "learning_rate": 3.752317898215239e-05, "loss": 1.1304, "step": 4244 }, { "epoch": 0.66733478747863, "grad_norm": 0.15588803589344025, "learning_rate": 3.751783126008344e-05, "loss": 1.0748, "step": 4245 }, { "epoch": 0.6674919923755624, "grad_norm": 0.16677920520305634, "learning_rate": 3.751248277347298e-05, "loss": 1.0492, "step": 4246 }, { "epoch": 0.667649197272495, "grad_norm": 0.1643311083316803, "learning_rate": 3.750713352264768e-05, "loss": 1.2256, "step": 4247 }, { "epoch": 0.6678064021694275, "grad_norm": 0.1470828801393509, "learning_rate": 3.750178350793425e-05, "loss": 1.0181, "step": 4248 }, { "epoch": 0.6679636070663602, "grad_norm": 0.17522110044956207, "learning_rate": 3.749643272965946e-05, "loss": 1.145, "step": 4249 }, { "epoch": 0.6681208119632926, "grad_norm": 0.17564639449119568, "learning_rate": 3.749108118815009e-05, "loss": 1.0818, "step": 4250 }, { "epoch": 0.6682780168602251, "grad_norm": 0.18222050368785858, "learning_rate": 3.748572888373302e-05, "loss": 1.0535, "step": 4251 }, { "epoch": 0.6684352217571577, "grad_norm": 0.1446535885334015, "learning_rate": 3.748037581673513e-05, "loss": 1.0991, "step": 4252 }, { "epoch": 0.6685924266540902, "grad_norm": 0.20465463399887085, "learning_rate": 3.747502198748336e-05, "loss": 1.0645, "step": 4253 }, { "epoch": 0.6687496315510228, "grad_norm": 0.1818159520626068, "learning_rate": 3.746966739630473e-05, "loss": 1.1188, "step": 4254 }, { "epoch": 0.6689068364479553, "grad_norm": 0.19995780289173126, "learning_rate": 3.746431204352624e-05, "loss": 1.1536, "step": 4255 }, { "epoch": 0.6690640413448878, "grad_norm": 0.20652185380458832, "learning_rate": 3.7458955929475e-05, "loss": 1.0296, "step": 4256 }, { "epoch": 0.6692212462418204, "grad_norm": 0.14761137962341309, "learning_rate": 3.745359905447814e-05, "loss": 1.0662, "step": 4257 }, { "epoch": 0.6693784511387529, "grad_norm": 0.18381749093532562, "learning_rate": 3.744824141886283e-05, "loss": 1.2777, "step": 4258 }, { "epoch": 0.6695356560356855, "grad_norm": 0.16218112409114838, "learning_rate": 3.7442883022956294e-05, "loss": 1.1225, "step": 4259 }, { "epoch": 0.669692860932618, "grad_norm": 0.13938245177268982, "learning_rate": 3.7437523867085813e-05, "loss": 1.0624, "step": 4260 }, { "epoch": 0.6698500658295506, "grad_norm": 0.1756437122821808, "learning_rate": 3.743216395157869e-05, "loss": 1.1881, "step": 4261 }, { "epoch": 0.6700072707264831, "grad_norm": 0.1338120400905609, "learning_rate": 3.74268032767623e-05, "loss": 1.1023, "step": 4262 }, { "epoch": 0.6701644756234156, "grad_norm": 0.17289410531520844, "learning_rate": 3.742144184296404e-05, "loss": 1.0453, "step": 4263 }, { "epoch": 0.6703216805203482, "grad_norm": 0.14961934089660645, "learning_rate": 3.741607965051137e-05, "loss": 1.1886, "step": 4264 }, { "epoch": 0.6704788854172807, "grad_norm": 0.18203534185886383, "learning_rate": 3.7410716699731805e-05, "loss": 1.1251, "step": 4265 }, { "epoch": 0.6706360903142133, "grad_norm": 0.17501042783260345, "learning_rate": 3.740535299095287e-05, "loss": 1.1852, "step": 4266 }, { "epoch": 0.6707932952111458, "grad_norm": 0.2523060142993927, "learning_rate": 3.739998852450218e-05, "loss": 0.8827, "step": 4267 }, { "epoch": 0.6709505001080783, "grad_norm": 0.1660834401845932, "learning_rate": 3.7394623300707375e-05, "loss": 1.1962, "step": 4268 }, { "epoch": 0.6711077050050109, "grad_norm": 0.1615104079246521, "learning_rate": 3.7389257319896135e-05, "loss": 1.1052, "step": 4269 }, { "epoch": 0.6712649099019434, "grad_norm": 0.1610841453075409, "learning_rate": 3.73838905823962e-05, "loss": 1.1558, "step": 4270 }, { "epoch": 0.671422114798876, "grad_norm": 0.17448994517326355, "learning_rate": 3.737852308853533e-05, "loss": 1.1081, "step": 4271 }, { "epoch": 0.6715793196958085, "grad_norm": 0.16625918447971344, "learning_rate": 3.737315483864138e-05, "loss": 0.9769, "step": 4272 }, { "epoch": 0.671736524592741, "grad_norm": 0.1489565372467041, "learning_rate": 3.736778583304221e-05, "loss": 1.0799, "step": 4273 }, { "epoch": 0.6718937294896736, "grad_norm": 0.1871234029531479, "learning_rate": 3.736241607206573e-05, "loss": 1.1174, "step": 4274 }, { "epoch": 0.6720509343866061, "grad_norm": 0.15307629108428955, "learning_rate": 3.735704555603992e-05, "loss": 1.0429, "step": 4275 }, { "epoch": 0.6722081392835387, "grad_norm": 0.18358135223388672, "learning_rate": 3.7351674285292785e-05, "loss": 1.0812, "step": 4276 }, { "epoch": 0.6723653441804712, "grad_norm": 0.1625143587589264, "learning_rate": 3.734630226015238e-05, "loss": 0.9433, "step": 4277 }, { "epoch": 0.6725225490774037, "grad_norm": 0.15999294817447662, "learning_rate": 3.73409294809468e-05, "loss": 1.1477, "step": 4278 }, { "epoch": 0.6726797539743363, "grad_norm": 0.25717711448669434, "learning_rate": 3.73355559480042e-05, "loss": 1.0463, "step": 4279 }, { "epoch": 0.6728369588712688, "grad_norm": 0.17935602366924286, "learning_rate": 3.733018166165277e-05, "loss": 1.1075, "step": 4280 }, { "epoch": 0.6729941637682014, "grad_norm": 0.16021014750003815, "learning_rate": 3.732480662222077e-05, "loss": 1.0991, "step": 4281 }, { "epoch": 0.6731513686651339, "grad_norm": 0.1721874475479126, "learning_rate": 3.7319430830036475e-05, "loss": 1.1008, "step": 4282 }, { "epoch": 0.6733085735620664, "grad_norm": 0.18219947814941406, "learning_rate": 3.731405428542821e-05, "loss": 1.075, "step": 4283 }, { "epoch": 0.673465778458999, "grad_norm": 0.1574048399925232, "learning_rate": 3.7308676988724366e-05, "loss": 1.2457, "step": 4284 }, { "epoch": 0.6736229833559315, "grad_norm": 0.14693421125411987, "learning_rate": 3.730329894025336e-05, "loss": 1.118, "step": 4285 }, { "epoch": 0.6737801882528641, "grad_norm": 0.24711671471595764, "learning_rate": 3.729792014034367e-05, "loss": 0.9959, "step": 4286 }, { "epoch": 0.6739373931497966, "grad_norm": 0.17833338677883148, "learning_rate": 3.729254058932381e-05, "loss": 1.0796, "step": 4287 }, { "epoch": 0.6740945980467291, "grad_norm": 0.17984211444854736, "learning_rate": 3.728716028752234e-05, "loss": 1.1121, "step": 4288 }, { "epoch": 0.6742518029436617, "grad_norm": 0.19105543196201324, "learning_rate": 3.728177923526786e-05, "loss": 1.1225, "step": 4289 }, { "epoch": 0.6744090078405942, "grad_norm": 0.14651957154273987, "learning_rate": 3.727639743288904e-05, "loss": 1.007, "step": 4290 }, { "epoch": 0.6745662127375268, "grad_norm": 0.14845982193946838, "learning_rate": 3.7271014880714577e-05, "loss": 1.1094, "step": 4291 }, { "epoch": 0.6747234176344593, "grad_norm": 0.1587723046541214, "learning_rate": 3.726563157907321e-05, "loss": 1.1088, "step": 4292 }, { "epoch": 0.6748806225313918, "grad_norm": 0.16230261325836182, "learning_rate": 3.726024752829373e-05, "loss": 1.1123, "step": 4293 }, { "epoch": 0.6750378274283244, "grad_norm": 0.1781100183725357, "learning_rate": 3.725486272870498e-05, "loss": 0.9464, "step": 4294 }, { "epoch": 0.6751950323252569, "grad_norm": 0.13477860391139984, "learning_rate": 3.724947718063585e-05, "loss": 1.0361, "step": 4295 }, { "epoch": 0.6753522372221895, "grad_norm": 0.1615627259016037, "learning_rate": 3.724409088441525e-05, "loss": 1.1301, "step": 4296 }, { "epoch": 0.675509442119122, "grad_norm": 0.1631171554327011, "learning_rate": 3.7238703840372166e-05, "loss": 1.0858, "step": 4297 }, { "epoch": 0.6756666470160545, "grad_norm": 0.1773926168680191, "learning_rate": 3.7233316048835615e-05, "loss": 0.9948, "step": 4298 }, { "epoch": 0.6758238519129871, "grad_norm": 0.16207563877105713, "learning_rate": 3.722792751013467e-05, "loss": 1.1216, "step": 4299 }, { "epoch": 0.6759810568099196, "grad_norm": 0.15376627445220947, "learning_rate": 3.722253822459843e-05, "loss": 1.0546, "step": 4300 }, { "epoch": 0.6761382617068522, "grad_norm": 0.14326240122318268, "learning_rate": 3.7217148192556065e-05, "loss": 1.1124, "step": 4301 }, { "epoch": 0.6762954666037847, "grad_norm": 0.13520918786525726, "learning_rate": 3.7211757414336775e-05, "loss": 1.1624, "step": 4302 }, { "epoch": 0.6764526715007172, "grad_norm": 0.1636790782213211, "learning_rate": 3.72063658902698e-05, "loss": 1.0277, "step": 4303 }, { "epoch": 0.6766098763976498, "grad_norm": 0.12657029926776886, "learning_rate": 3.720097362068443e-05, "loss": 1.1938, "step": 4304 }, { "epoch": 0.6767670812945823, "grad_norm": 0.16188089549541473, "learning_rate": 3.719558060591003e-05, "loss": 1.065, "step": 4305 }, { "epoch": 0.6769242861915149, "grad_norm": 0.20939183235168457, "learning_rate": 3.7190186846275954e-05, "loss": 1.1164, "step": 4306 }, { "epoch": 0.6770814910884474, "grad_norm": 0.1548239290714264, "learning_rate": 3.718479234211165e-05, "loss": 1.1214, "step": 4307 }, { "epoch": 0.6772386959853799, "grad_norm": 0.16959509253501892, "learning_rate": 3.7179397093746595e-05, "loss": 1.2162, "step": 4308 }, { "epoch": 0.6773959008823125, "grad_norm": 0.1429489105939865, "learning_rate": 3.7174001101510295e-05, "loss": 1.1158, "step": 4309 }, { "epoch": 0.677553105779245, "grad_norm": 0.15084104239940643, "learning_rate": 3.716860436573234e-05, "loss": 1.0594, "step": 4310 }, { "epoch": 0.6777103106761776, "grad_norm": 0.1567157655954361, "learning_rate": 3.716320688674232e-05, "loss": 1.0448, "step": 4311 }, { "epoch": 0.6778675155731101, "grad_norm": 0.13432151079177856, "learning_rate": 3.71578086648699e-05, "loss": 1.1063, "step": 4312 }, { "epoch": 0.6780247204700427, "grad_norm": 0.14147740602493286, "learning_rate": 3.715240970044479e-05, "loss": 1.2635, "step": 4313 }, { "epoch": 0.6781819253669752, "grad_norm": 0.14541368186473846, "learning_rate": 3.7147009993796726e-05, "loss": 1.167, "step": 4314 }, { "epoch": 0.6783391302639077, "grad_norm": 0.15351711213588715, "learning_rate": 3.714160954525551e-05, "loss": 1.2112, "step": 4315 }, { "epoch": 0.6784963351608403, "grad_norm": 0.1674315184354782, "learning_rate": 3.713620835515098e-05, "loss": 1.0714, "step": 4316 }, { "epoch": 0.6786535400577728, "grad_norm": 0.1620711088180542, "learning_rate": 3.713080642381303e-05, "loss": 1.1816, "step": 4317 }, { "epoch": 0.6788107449547054, "grad_norm": 0.16441158950328827, "learning_rate": 3.7125403751571565e-05, "loss": 1.1193, "step": 4318 }, { "epoch": 0.6789679498516379, "grad_norm": 0.16214364767074585, "learning_rate": 3.7120000338756574e-05, "loss": 1.1117, "step": 4319 }, { "epoch": 0.6791251547485704, "grad_norm": 0.1726810336112976, "learning_rate": 3.711459618569808e-05, "loss": 1.1549, "step": 4320 }, { "epoch": 0.6791251547485704, "eval_loss": 1.1005306243896484, "eval_runtime": 2322.7293, "eval_samples_per_second": 3.986, "eval_steps_per_second": 1.993, "step": 4320 }, { "epoch": 0.679282359645503, "grad_norm": 0.1818498969078064, "learning_rate": 3.710919129272614e-05, "loss": 1.1902, "step": 4321 }, { "epoch": 0.6794395645424355, "grad_norm": 0.13222762942314148, "learning_rate": 3.710378566017087e-05, "loss": 1.1281, "step": 4322 }, { "epoch": 0.6795967694393681, "grad_norm": 0.22074824571609497, "learning_rate": 3.709837928836242e-05, "loss": 1.0326, "step": 4323 }, { "epoch": 0.6797539743363006, "grad_norm": 0.20594502985477448, "learning_rate": 3.7092972177631e-05, "loss": 0.971, "step": 4324 }, { "epoch": 0.6799111792332331, "grad_norm": 0.1570967435836792, "learning_rate": 3.708756432830685e-05, "loss": 1.1011, "step": 4325 }, { "epoch": 0.6800683841301657, "grad_norm": 0.15465806424617767, "learning_rate": 3.708215574072026e-05, "loss": 1.1032, "step": 4326 }, { "epoch": 0.6802255890270982, "grad_norm": 0.14896081387996674, "learning_rate": 3.707674641520156e-05, "loss": 1.1976, "step": 4327 }, { "epoch": 0.6803827939240308, "grad_norm": 0.15382733941078186, "learning_rate": 3.707133635208114e-05, "loss": 1.1458, "step": 4328 }, { "epoch": 0.6805399988209633, "grad_norm": 0.14555004239082336, "learning_rate": 3.706592555168943e-05, "loss": 1.1442, "step": 4329 }, { "epoch": 0.6806972037178958, "grad_norm": 0.13018903136253357, "learning_rate": 3.70605140143569e-05, "loss": 1.1133, "step": 4330 }, { "epoch": 0.6808544086148284, "grad_norm": 0.17980679869651794, "learning_rate": 3.705510174041406e-05, "loss": 1.1452, "step": 4331 }, { "epoch": 0.6810116135117609, "grad_norm": 0.15200766921043396, "learning_rate": 3.704968873019145e-05, "loss": 1.0226, "step": 4332 }, { "epoch": 0.6811688184086935, "grad_norm": 0.15224698185920715, "learning_rate": 3.704427498401972e-05, "loss": 1.2318, "step": 4333 }, { "epoch": 0.681326023305626, "grad_norm": 0.15590931475162506, "learning_rate": 3.70388605022295e-05, "loss": 1.0923, "step": 4334 }, { "epoch": 0.6814832282025585, "grad_norm": 0.16589437425136566, "learning_rate": 3.703344528515147e-05, "loss": 1.1986, "step": 4335 }, { "epoch": 0.6816404330994911, "grad_norm": 0.15944154560565948, "learning_rate": 3.7028029333116406e-05, "loss": 1.1923, "step": 4336 }, { "epoch": 0.6817976379964236, "grad_norm": 0.14723898470401764, "learning_rate": 3.7022612646455064e-05, "loss": 1.1159, "step": 4337 }, { "epoch": 0.6819548428933562, "grad_norm": 0.1588645726442337, "learning_rate": 3.701719522549828e-05, "loss": 1.1289, "step": 4338 }, { "epoch": 0.6821120477902887, "grad_norm": 0.1596093773841858, "learning_rate": 3.701177707057694e-05, "loss": 1.0452, "step": 4339 }, { "epoch": 0.6822692526872212, "grad_norm": 0.17482325434684753, "learning_rate": 3.700635818202196e-05, "loss": 1.0804, "step": 4340 }, { "epoch": 0.6824264575841538, "grad_norm": 0.16940900683403015, "learning_rate": 3.70009385601643e-05, "loss": 1.1467, "step": 4341 }, { "epoch": 0.6825836624810863, "grad_norm": 0.18156111240386963, "learning_rate": 3.699551820533498e-05, "loss": 0.9255, "step": 4342 }, { "epoch": 0.6827408673780189, "grad_norm": 0.16947069764137268, "learning_rate": 3.6990097117865036e-05, "loss": 1.1077, "step": 4343 }, { "epoch": 0.6828980722749514, "grad_norm": 0.15319743752479553, "learning_rate": 3.698467529808559e-05, "loss": 1.1573, "step": 4344 }, { "epoch": 0.6830552771718839, "grad_norm": 0.15475665032863617, "learning_rate": 3.697925274632777e-05, "loss": 1.1847, "step": 4345 }, { "epoch": 0.6832124820688165, "grad_norm": 0.1482115238904953, "learning_rate": 3.697382946292277e-05, "loss": 1.0272, "step": 4346 }, { "epoch": 0.683369686965749, "grad_norm": 0.16407650709152222, "learning_rate": 3.696840544820182e-05, "loss": 1.1283, "step": 4347 }, { "epoch": 0.6835268918626816, "grad_norm": 0.165381520986557, "learning_rate": 3.696298070249621e-05, "loss": 1.174, "step": 4348 }, { "epoch": 0.6836840967596141, "grad_norm": 0.15581317245960236, "learning_rate": 3.6957555226137255e-05, "loss": 1.1043, "step": 4349 }, { "epoch": 0.6838413016565466, "grad_norm": 0.17735551297664642, "learning_rate": 3.695212901945632e-05, "loss": 1.1663, "step": 4350 }, { "epoch": 0.6839985065534792, "grad_norm": 0.14878909289836884, "learning_rate": 3.6946702082784815e-05, "loss": 1.1423, "step": 4351 }, { "epoch": 0.6841557114504117, "grad_norm": 0.1387113630771637, "learning_rate": 3.694127441645421e-05, "loss": 1.094, "step": 4352 }, { "epoch": 0.6843129163473443, "grad_norm": 0.13688206672668457, "learning_rate": 3.6935846020795986e-05, "loss": 0.9774, "step": 4353 }, { "epoch": 0.6844701212442768, "grad_norm": 0.1413954645395279, "learning_rate": 3.6930416896141714e-05, "loss": 1.0235, "step": 4354 }, { "epoch": 0.6846273261412092, "grad_norm": 0.1530650556087494, "learning_rate": 3.6924987042822964e-05, "loss": 1.1557, "step": 4355 }, { "epoch": 0.6847845310381419, "grad_norm": 0.16509467363357544, "learning_rate": 3.691955646117137e-05, "loss": 1.0712, "step": 4356 }, { "epoch": 0.6849417359350743, "grad_norm": 0.1299772709608078, "learning_rate": 3.691412515151863e-05, "loss": 1.0695, "step": 4357 }, { "epoch": 0.685098940832007, "grad_norm": 0.14452426135540009, "learning_rate": 3.690869311419644e-05, "loss": 1.1729, "step": 4358 }, { "epoch": 0.6852561457289394, "grad_norm": 0.17244118452072144, "learning_rate": 3.69032603495366e-05, "loss": 1.0492, "step": 4359 }, { "epoch": 0.6854133506258719, "grad_norm": 0.1782621592283249, "learning_rate": 3.68978268578709e-05, "loss": 1.0566, "step": 4360 }, { "epoch": 0.6855705555228045, "grad_norm": 0.17903123795986176, "learning_rate": 3.6892392639531204e-05, "loss": 1.1614, "step": 4361 }, { "epoch": 0.685727760419737, "grad_norm": 0.17313703894615173, "learning_rate": 3.6886957694849414e-05, "loss": 1.0283, "step": 4362 }, { "epoch": 0.6858849653166696, "grad_norm": 0.13656044006347656, "learning_rate": 3.688152202415747e-05, "loss": 1.1608, "step": 4363 }, { "epoch": 0.6860421702136021, "grad_norm": 0.18269652128219604, "learning_rate": 3.6876085627787376e-05, "loss": 1.0267, "step": 4364 }, { "epoch": 0.6861993751105347, "grad_norm": 0.13547973334789276, "learning_rate": 3.6870648506071154e-05, "loss": 1.1485, "step": 4365 }, { "epoch": 0.6863565800074672, "grad_norm": 0.13134713470935822, "learning_rate": 3.686521065934089e-05, "loss": 1.091, "step": 4366 }, { "epoch": 0.6865137849043997, "grad_norm": 0.1429309844970703, "learning_rate": 3.6859772087928694e-05, "loss": 1.3158, "step": 4367 }, { "epoch": 0.6866709898013323, "grad_norm": 0.14805296063423157, "learning_rate": 3.6854332792166745e-05, "loss": 1.1289, "step": 4368 }, { "epoch": 0.6868281946982648, "grad_norm": 0.16160231828689575, "learning_rate": 3.684889277238726e-05, "loss": 1.1209, "step": 4369 }, { "epoch": 0.6869853995951974, "grad_norm": 0.1638854295015335, "learning_rate": 3.684345202892248e-05, "loss": 1.188, "step": 4370 }, { "epoch": 0.6871426044921299, "grad_norm": 0.1331411898136139, "learning_rate": 3.683801056210471e-05, "loss": 1.0899, "step": 4371 }, { "epoch": 0.6872998093890624, "grad_norm": 0.1672300398349762, "learning_rate": 3.6832568372266294e-05, "loss": 1.1262, "step": 4372 }, { "epoch": 0.687457014285995, "grad_norm": 0.19106687605381012, "learning_rate": 3.682712545973963e-05, "loss": 1.0008, "step": 4373 }, { "epoch": 0.6876142191829275, "grad_norm": 0.18553289771080017, "learning_rate": 3.682168182485713e-05, "loss": 1.0962, "step": 4374 }, { "epoch": 0.6877714240798601, "grad_norm": 0.1664271205663681, "learning_rate": 3.681623746795129e-05, "loss": 1.0545, "step": 4375 }, { "epoch": 0.6879286289767926, "grad_norm": 0.16832974553108215, "learning_rate": 3.681079238935463e-05, "loss": 1.0419, "step": 4376 }, { "epoch": 0.6880858338737251, "grad_norm": 0.14619651436805725, "learning_rate": 3.6805346589399695e-05, "loss": 1.071, "step": 4377 }, { "epoch": 0.6882430387706577, "grad_norm": 0.16446255147457123, "learning_rate": 3.679990006841911e-05, "loss": 1.058, "step": 4378 }, { "epoch": 0.6884002436675902, "grad_norm": 0.13051556050777435, "learning_rate": 3.679445282674553e-05, "loss": 1.1425, "step": 4379 }, { "epoch": 0.6885574485645228, "grad_norm": 0.1575997918844223, "learning_rate": 3.6789004864711644e-05, "loss": 1.1478, "step": 4380 }, { "epoch": 0.6887146534614553, "grad_norm": 0.15147651731967926, "learning_rate": 3.67835561826502e-05, "loss": 1.1867, "step": 4381 }, { "epoch": 0.6888718583583878, "grad_norm": 0.17867539823055267, "learning_rate": 3.677810678089397e-05, "loss": 1.1013, "step": 4382 }, { "epoch": 0.6890290632553204, "grad_norm": 0.16259698569774628, "learning_rate": 3.677265665977579e-05, "loss": 1.229, "step": 4383 }, { "epoch": 0.6891862681522529, "grad_norm": 0.13512615859508514, "learning_rate": 3.6767205819628534e-05, "loss": 1.171, "step": 4384 }, { "epoch": 0.6893434730491855, "grad_norm": 0.17127491533756256, "learning_rate": 3.6761754260785126e-05, "loss": 1.1358, "step": 4385 }, { "epoch": 0.689500677946118, "grad_norm": 0.15448449552059174, "learning_rate": 3.675630198357851e-05, "loss": 1.1093, "step": 4386 }, { "epoch": 0.6896578828430505, "grad_norm": 0.17164385318756104, "learning_rate": 3.6750848988341704e-05, "loss": 1.0478, "step": 4387 }, { "epoch": 0.6898150877399831, "grad_norm": 0.17493198812007904, "learning_rate": 3.6745395275407744e-05, "loss": 1.1615, "step": 4388 }, { "epoch": 0.6899722926369156, "grad_norm": 0.14925609529018402, "learning_rate": 3.6739940845109735e-05, "loss": 1.0931, "step": 4389 }, { "epoch": 0.6901294975338482, "grad_norm": 0.12435558438301086, "learning_rate": 3.6734485697780806e-05, "loss": 1.137, "step": 4390 }, { "epoch": 0.6902867024307807, "grad_norm": 0.16357684135437012, "learning_rate": 3.672902983375413e-05, "loss": 1.0787, "step": 4391 }, { "epoch": 0.6904439073277132, "grad_norm": 0.1529734581708908, "learning_rate": 3.6723573253362945e-05, "loss": 1.111, "step": 4392 }, { "epoch": 0.6906011122246458, "grad_norm": 0.19325505197048187, "learning_rate": 3.671811595694051e-05, "loss": 1.0739, "step": 4393 }, { "epoch": 0.6907583171215783, "grad_norm": 0.14742138981819153, "learning_rate": 3.6712657944820144e-05, "loss": 1.1334, "step": 4394 }, { "epoch": 0.6909155220185109, "grad_norm": 0.16689634323120117, "learning_rate": 3.670719921733519e-05, "loss": 0.9864, "step": 4395 }, { "epoch": 0.6910727269154434, "grad_norm": 0.14415030181407928, "learning_rate": 3.6701739774819046e-05, "loss": 1.1618, "step": 4396 }, { "epoch": 0.6912299318123759, "grad_norm": 0.15743502974510193, "learning_rate": 3.6696279617605155e-05, "loss": 1.0823, "step": 4397 }, { "epoch": 0.6913871367093085, "grad_norm": 0.15715019404888153, "learning_rate": 3.669081874602701e-05, "loss": 1.093, "step": 4398 }, { "epoch": 0.691544341606241, "grad_norm": 0.1599857062101364, "learning_rate": 3.668535716041814e-05, "loss": 1.1724, "step": 4399 }, { "epoch": 0.6917015465031736, "grad_norm": 0.15201207995414734, "learning_rate": 3.667989486111212e-05, "loss": 1.1116, "step": 4400 }, { "epoch": 0.6918587514001061, "grad_norm": 0.15862667560577393, "learning_rate": 3.667443184844256e-05, "loss": 1.1513, "step": 4401 }, { "epoch": 0.6920159562970386, "grad_norm": 0.1607983559370041, "learning_rate": 3.666896812274311e-05, "loss": 1.0895, "step": 4402 }, { "epoch": 0.6921731611939712, "grad_norm": 0.14757487177848816, "learning_rate": 3.666350368434749e-05, "loss": 1.0656, "step": 4403 }, { "epoch": 0.6923303660909037, "grad_norm": 0.19606012105941772, "learning_rate": 3.665803853358944e-05, "loss": 1.1677, "step": 4404 }, { "epoch": 0.6924875709878363, "grad_norm": 0.1427164077758789, "learning_rate": 3.6652572670802754e-05, "loss": 1.047, "step": 4405 }, { "epoch": 0.6926447758847688, "grad_norm": 0.12801557779312134, "learning_rate": 3.664710609632127e-05, "loss": 1.1052, "step": 4406 }, { "epoch": 0.6928019807817013, "grad_norm": 0.17031121253967285, "learning_rate": 3.664163881047884e-05, "loss": 1.0519, "step": 4407 }, { "epoch": 0.6929591856786339, "grad_norm": 0.14045396447181702, "learning_rate": 3.6636170813609425e-05, "loss": 1.0799, "step": 4408 }, { "epoch": 0.6931163905755664, "grad_norm": 0.16111011803150177, "learning_rate": 3.663070210604697e-05, "loss": 1.0628, "step": 4409 }, { "epoch": 0.693273595472499, "grad_norm": 0.15665438771247864, "learning_rate": 3.662523268812547e-05, "loss": 1.2111, "step": 4410 }, { "epoch": 0.6934308003694315, "grad_norm": 0.1392172873020172, "learning_rate": 3.6619762560179006e-05, "loss": 1.1297, "step": 4411 }, { "epoch": 0.693588005266364, "grad_norm": 0.20096223056316376, "learning_rate": 3.661429172254163e-05, "loss": 1.1301, "step": 4412 }, { "epoch": 0.6937452101632966, "grad_norm": 0.16046035289764404, "learning_rate": 3.6608820175547526e-05, "loss": 1.125, "step": 4413 }, { "epoch": 0.6939024150602291, "grad_norm": 0.13595367968082428, "learning_rate": 3.660334791953085e-05, "loss": 1.0301, "step": 4414 }, { "epoch": 0.6940596199571617, "grad_norm": 0.1444806009531021, "learning_rate": 3.659787495482583e-05, "loss": 1.1592, "step": 4415 }, { "epoch": 0.6942168248540942, "grad_norm": 0.16791272163391113, "learning_rate": 3.659240128176673e-05, "loss": 1.1495, "step": 4416 }, { "epoch": 0.6943740297510267, "grad_norm": 0.13716629147529602, "learning_rate": 3.658692690068787e-05, "loss": 1.127, "step": 4417 }, { "epoch": 0.6945312346479593, "grad_norm": 0.12987127900123596, "learning_rate": 3.6581451811923596e-05, "loss": 1.2504, "step": 4418 }, { "epoch": 0.6946884395448918, "grad_norm": 0.1585957407951355, "learning_rate": 3.657597601580831e-05, "loss": 1.0458, "step": 4419 }, { "epoch": 0.6948456444418244, "grad_norm": 0.15420947968959808, "learning_rate": 3.6570499512676465e-05, "loss": 1.0563, "step": 4420 }, { "epoch": 0.6950028493387569, "grad_norm": 0.12740959227085114, "learning_rate": 3.6565022302862526e-05, "loss": 1.0257, "step": 4421 }, { "epoch": 0.6951600542356895, "grad_norm": 0.15865959227085114, "learning_rate": 3.6559544386701015e-05, "loss": 1.1003, "step": 4422 }, { "epoch": 0.695317259132622, "grad_norm": 0.16942833364009857, "learning_rate": 3.6554065764526524e-05, "loss": 1.054, "step": 4423 }, { "epoch": 0.6954744640295545, "grad_norm": 0.1516052782535553, "learning_rate": 3.654858643667365e-05, "loss": 1.0383, "step": 4424 }, { "epoch": 0.6956316689264871, "grad_norm": 0.12675218284130096, "learning_rate": 3.654310640347707e-05, "loss": 1.1814, "step": 4425 }, { "epoch": 0.6957888738234196, "grad_norm": 0.1705823838710785, "learning_rate": 3.653762566527146e-05, "loss": 1.1676, "step": 4426 }, { "epoch": 0.6959460787203522, "grad_norm": 0.14712105691432953, "learning_rate": 3.653214422239157e-05, "loss": 1.2246, "step": 4427 }, { "epoch": 0.6961032836172847, "grad_norm": 0.13124018907546997, "learning_rate": 3.652666207517219e-05, "loss": 1.1254, "step": 4428 }, { "epoch": 0.6962604885142172, "grad_norm": 0.13952194154262543, "learning_rate": 3.6521179223948153e-05, "loss": 1.1184, "step": 4429 }, { "epoch": 0.6964176934111498, "grad_norm": 0.16852107644081116, "learning_rate": 3.651569566905432e-05, "loss": 1.0189, "step": 4430 }, { "epoch": 0.6965748983080823, "grad_norm": 0.15509940683841705, "learning_rate": 3.6510211410825614e-05, "loss": 1.018, "step": 4431 }, { "epoch": 0.6967321032050149, "grad_norm": 0.15195202827453613, "learning_rate": 3.650472644959698e-05, "loss": 1.1238, "step": 4432 }, { "epoch": 0.6968893081019474, "grad_norm": 0.18177133798599243, "learning_rate": 3.6499240785703426e-05, "loss": 1.2951, "step": 4433 }, { "epoch": 0.6970465129988799, "grad_norm": 0.17335766553878784, "learning_rate": 3.649375441948001e-05, "loss": 1.1246, "step": 4434 }, { "epoch": 0.6972037178958125, "grad_norm": 0.15300196409225464, "learning_rate": 3.648826735126179e-05, "loss": 1.0736, "step": 4435 }, { "epoch": 0.697360922792745, "grad_norm": 0.1609075963497162, "learning_rate": 3.648277958138392e-05, "loss": 1.1375, "step": 4436 }, { "epoch": 0.6975181276896776, "grad_norm": 0.1483910232782364, "learning_rate": 3.647729111018156e-05, "loss": 1.2151, "step": 4437 }, { "epoch": 0.6976753325866101, "grad_norm": 0.16368529200553894, "learning_rate": 3.647180193798992e-05, "loss": 1.0947, "step": 4438 }, { "epoch": 0.6978325374835426, "grad_norm": 0.16666406393051147, "learning_rate": 3.646631206514427e-05, "loss": 1.1475, "step": 4439 }, { "epoch": 0.6979897423804752, "grad_norm": 0.17518015205860138, "learning_rate": 3.646082149197991e-05, "loss": 1.242, "step": 4440 }, { "epoch": 0.6981469472774077, "grad_norm": 0.15665574371814728, "learning_rate": 3.645533021883218e-05, "loss": 1.1221, "step": 4441 }, { "epoch": 0.6983041521743403, "grad_norm": 0.1375490128993988, "learning_rate": 3.644983824603645e-05, "loss": 0.9732, "step": 4442 }, { "epoch": 0.6984613570712728, "grad_norm": 0.19858404994010925, "learning_rate": 3.644434557392818e-05, "loss": 1.0138, "step": 4443 }, { "epoch": 0.6986185619682053, "grad_norm": 0.15151138603687286, "learning_rate": 3.643885220284282e-05, "loss": 1.0626, "step": 4444 }, { "epoch": 0.6987757668651379, "grad_norm": 0.13844822347164154, "learning_rate": 3.6433358133115884e-05, "loss": 1.0452, "step": 4445 }, { "epoch": 0.6989329717620704, "grad_norm": 0.1529357135295868, "learning_rate": 3.642786336508294e-05, "loss": 1.0192, "step": 4446 }, { "epoch": 0.699090176659003, "grad_norm": 0.19727185368537903, "learning_rate": 3.642236789907958e-05, "loss": 1.0846, "step": 4447 }, { "epoch": 0.6992473815559355, "grad_norm": 0.16505931317806244, "learning_rate": 3.6416871735441446e-05, "loss": 1.1061, "step": 4448 }, { "epoch": 0.699404586452868, "grad_norm": 0.1713247001171112, "learning_rate": 3.6411374874504236e-05, "loss": 1.0497, "step": 4449 }, { "epoch": 0.6995617913498006, "grad_norm": 0.14336654543876648, "learning_rate": 3.640587731660366e-05, "loss": 1.2052, "step": 4450 }, { "epoch": 0.6997189962467331, "grad_norm": 0.15254364907741547, "learning_rate": 3.640037906207549e-05, "loss": 1.1133, "step": 4451 }, { "epoch": 0.6998762011436657, "grad_norm": 0.17495720088481903, "learning_rate": 3.639488011125553e-05, "loss": 1.0429, "step": 4452 }, { "epoch": 0.7000334060405982, "grad_norm": 0.14323261380195618, "learning_rate": 3.638938046447967e-05, "loss": 1.0586, "step": 4453 }, { "epoch": 0.7001906109375307, "grad_norm": 0.1447620689868927, "learning_rate": 3.6383880122083775e-05, "loss": 1.121, "step": 4454 }, { "epoch": 0.7003478158344633, "grad_norm": 0.15612784028053284, "learning_rate": 3.6378379084403804e-05, "loss": 1.09, "step": 4455 }, { "epoch": 0.7005050207313958, "grad_norm": 0.14692716300487518, "learning_rate": 3.637287735177571e-05, "loss": 1.1235, "step": 4456 }, { "epoch": 0.7006622256283284, "grad_norm": 0.14650915563106537, "learning_rate": 3.6367374924535556e-05, "loss": 1.0611, "step": 4457 }, { "epoch": 0.7008194305252609, "grad_norm": 0.13822227716445923, "learning_rate": 3.636187180301939e-05, "loss": 1.1958, "step": 4458 }, { "epoch": 0.7009766354221934, "grad_norm": 0.13127368688583374, "learning_rate": 3.6356367987563316e-05, "loss": 1.0414, "step": 4459 }, { "epoch": 0.701133840319126, "grad_norm": 0.15684054791927338, "learning_rate": 3.6350863478503505e-05, "loss": 1.0821, "step": 4460 }, { "epoch": 0.7012910452160585, "grad_norm": 0.1420765221118927, "learning_rate": 3.634535827617612e-05, "loss": 1.0357, "step": 4461 }, { "epoch": 0.7014482501129911, "grad_norm": 0.1303970217704773, "learning_rate": 3.633985238091744e-05, "loss": 1.1562, "step": 4462 }, { "epoch": 0.7016054550099236, "grad_norm": 0.1747986227273941, "learning_rate": 3.633434579306371e-05, "loss": 1.2047, "step": 4463 }, { "epoch": 0.701762659906856, "grad_norm": 0.15176185965538025, "learning_rate": 3.632883851295127e-05, "loss": 0.9414, "step": 4464 }, { "epoch": 0.7019198648037887, "grad_norm": 0.16095110774040222, "learning_rate": 3.6323330540916474e-05, "loss": 1.0578, "step": 4465 }, { "epoch": 0.7020770697007211, "grad_norm": 0.14516471326351166, "learning_rate": 3.6317821877295724e-05, "loss": 0.99, "step": 4466 }, { "epoch": 0.7022342745976538, "grad_norm": 0.1713501513004303, "learning_rate": 3.631231252242549e-05, "loss": 1.1529, "step": 4467 }, { "epoch": 0.7023914794945862, "grad_norm": 0.13815224170684814, "learning_rate": 3.630680247664223e-05, "loss": 1.1611, "step": 4468 }, { "epoch": 0.7025486843915187, "grad_norm": 0.15194745361804962, "learning_rate": 3.63012917402825e-05, "loss": 1.1695, "step": 4469 }, { "epoch": 0.7027058892884513, "grad_norm": 0.15428611636161804, "learning_rate": 3.629578031368288e-05, "loss": 1.1073, "step": 4470 }, { "epoch": 0.7028630941853838, "grad_norm": 0.1365196257829666, "learning_rate": 3.6290268197179966e-05, "loss": 1.1479, "step": 4471 }, { "epoch": 0.7030202990823164, "grad_norm": 0.15833613276481628, "learning_rate": 3.628475539111043e-05, "loss": 1.1053, "step": 4472 }, { "epoch": 0.7031775039792489, "grad_norm": 0.15446895360946655, "learning_rate": 3.627924189581097e-05, "loss": 1.0302, "step": 4473 }, { "epoch": 0.7033347088761815, "grad_norm": 0.1326589435338974, "learning_rate": 3.627372771161833e-05, "loss": 1.1417, "step": 4474 }, { "epoch": 0.703491913773114, "grad_norm": 0.13878080248832703, "learning_rate": 3.62682128388693e-05, "loss": 1.0901, "step": 4475 }, { "epoch": 0.7036491186700465, "grad_norm": 0.19546456634998322, "learning_rate": 3.6262697277900694e-05, "loss": 1.0839, "step": 4476 }, { "epoch": 0.7038063235669791, "grad_norm": 0.1378042995929718, "learning_rate": 3.625718102904939e-05, "loss": 1.1619, "step": 4477 }, { "epoch": 0.7039635284639116, "grad_norm": 0.13764554262161255, "learning_rate": 3.6251664092652305e-05, "loss": 1.0876, "step": 4478 }, { "epoch": 0.7041207333608442, "grad_norm": 0.1276378333568573, "learning_rate": 3.624614646904638e-05, "loss": 0.9421, "step": 4479 }, { "epoch": 0.7042779382577767, "grad_norm": 0.1680934727191925, "learning_rate": 3.624062815856862e-05, "loss": 1.1248, "step": 4480 }, { "epoch": 0.7042779382577767, "eval_loss": 1.0991666316986084, "eval_runtime": 2304.3083, "eval_samples_per_second": 4.018, "eval_steps_per_second": 2.009, "step": 4480 }, { "epoch": 0.7044351431547092, "grad_norm": 0.15627016127109528, "learning_rate": 3.623510916155607e-05, "loss": 1.19, "step": 4481 }, { "epoch": 0.7045923480516418, "grad_norm": 0.1757209748029709, "learning_rate": 3.622958947834579e-05, "loss": 1.1513, "step": 4482 }, { "epoch": 0.7047495529485743, "grad_norm": 0.17841759324073792, "learning_rate": 3.6224069109274914e-05, "loss": 1.1358, "step": 4483 }, { "epoch": 0.7049067578455069, "grad_norm": 0.19921208918094635, "learning_rate": 3.6218548054680595e-05, "loss": 1.1385, "step": 4484 }, { "epoch": 0.7050639627424394, "grad_norm": 0.13649779558181763, "learning_rate": 3.6213026314900055e-05, "loss": 1.0355, "step": 4485 }, { "epoch": 0.7052211676393719, "grad_norm": 0.1654793620109558, "learning_rate": 3.620750389027051e-05, "loss": 1.1228, "step": 4486 }, { "epoch": 0.7053783725363045, "grad_norm": 0.1438949704170227, "learning_rate": 3.620198078112929e-05, "loss": 1.1667, "step": 4487 }, { "epoch": 0.705535577433237, "grad_norm": 0.14017674326896667, "learning_rate": 3.6196456987813704e-05, "loss": 1.0307, "step": 4488 }, { "epoch": 0.7056927823301696, "grad_norm": 0.13663023710250854, "learning_rate": 3.619093251066112e-05, "loss": 1.0255, "step": 4489 }, { "epoch": 0.7058499872271021, "grad_norm": 0.1306449919939041, "learning_rate": 3.618540735000896e-05, "loss": 0.9965, "step": 4490 }, { "epoch": 0.7060071921240346, "grad_norm": 0.13589157164096832, "learning_rate": 3.6179881506194665e-05, "loss": 1.1561, "step": 4491 }, { "epoch": 0.7061643970209672, "grad_norm": 0.15856143832206726, "learning_rate": 3.6174354979555756e-05, "loss": 0.9947, "step": 4492 }, { "epoch": 0.7063216019178997, "grad_norm": 0.20200398564338684, "learning_rate": 3.616882777042975e-05, "loss": 1.1324, "step": 4493 }, { "epoch": 0.7064788068148323, "grad_norm": 0.12902216613292694, "learning_rate": 3.616329987915425e-05, "loss": 1.1257, "step": 4494 }, { "epoch": 0.7066360117117648, "grad_norm": 0.12593093514442444, "learning_rate": 3.615777130606687e-05, "loss": 1.0938, "step": 4495 }, { "epoch": 0.7067932166086973, "grad_norm": 0.17199790477752686, "learning_rate": 3.615224205150525e-05, "loss": 1.013, "step": 4496 }, { "epoch": 0.7069504215056299, "grad_norm": 0.12172434478998184, "learning_rate": 3.6146712115807134e-05, "loss": 1.0747, "step": 4497 }, { "epoch": 0.7071076264025624, "grad_norm": 0.12670786678791046, "learning_rate": 3.614118149931025e-05, "loss": 1.0288, "step": 4498 }, { "epoch": 0.707264831299495, "grad_norm": 0.14240224659442902, "learning_rate": 3.613565020235239e-05, "loss": 1.2203, "step": 4499 }, { "epoch": 0.7074220361964275, "grad_norm": 0.15334568917751312, "learning_rate": 3.613011822527138e-05, "loss": 1.1555, "step": 4500 }, { "epoch": 0.70757924109336, "grad_norm": 0.19086161255836487, "learning_rate": 3.61245855684051e-05, "loss": 0.9874, "step": 4501 }, { "epoch": 0.7077364459902926, "grad_norm": 0.1404600292444229, "learning_rate": 3.611905223209145e-05, "loss": 1.1017, "step": 4502 }, { "epoch": 0.7078936508872251, "grad_norm": 0.35620176792144775, "learning_rate": 3.611351821666841e-05, "loss": 1.0504, "step": 4503 }, { "epoch": 0.7080508557841577, "grad_norm": 0.13687248528003693, "learning_rate": 3.610798352247396e-05, "loss": 1.0093, "step": 4504 }, { "epoch": 0.7082080606810902, "grad_norm": 0.14304716885089874, "learning_rate": 3.6102448149846125e-05, "loss": 1.1835, "step": 4505 }, { "epoch": 0.7083652655780227, "grad_norm": 0.14237457513809204, "learning_rate": 3.609691209912302e-05, "loss": 1.2334, "step": 4506 }, { "epoch": 0.7085224704749553, "grad_norm": 0.15722329914569855, "learning_rate": 3.609137537064272e-05, "loss": 1.114, "step": 4507 }, { "epoch": 0.7086796753718878, "grad_norm": 0.1474800407886505, "learning_rate": 3.608583796474343e-05, "loss": 1.0025, "step": 4508 }, { "epoch": 0.7088368802688204, "grad_norm": 0.15540659427642822, "learning_rate": 3.6080299881763336e-05, "loss": 1.1032, "step": 4509 }, { "epoch": 0.7089940851657529, "grad_norm": 0.14503051340579987, "learning_rate": 3.6074761122040665e-05, "loss": 1.1193, "step": 4510 }, { "epoch": 0.7091512900626854, "grad_norm": 0.15747176110744476, "learning_rate": 3.606922168591374e-05, "loss": 1.0396, "step": 4511 }, { "epoch": 0.709308494959618, "grad_norm": 0.1525149792432785, "learning_rate": 3.606368157372087e-05, "loss": 1.1112, "step": 4512 }, { "epoch": 0.7094656998565505, "grad_norm": 0.20445869863033295, "learning_rate": 3.605814078580042e-05, "loss": 0.956, "step": 4513 }, { "epoch": 0.7096229047534831, "grad_norm": 0.1403074860572815, "learning_rate": 3.6052599322490805e-05, "loss": 1.1497, "step": 4514 }, { "epoch": 0.7097801096504156, "grad_norm": 0.16936302185058594, "learning_rate": 3.604705718413047e-05, "loss": 1.153, "step": 4515 }, { "epoch": 0.7099373145473481, "grad_norm": 0.14044791460037231, "learning_rate": 3.6041514371057916e-05, "loss": 1.1045, "step": 4516 }, { "epoch": 0.7100945194442807, "grad_norm": 0.13026654720306396, "learning_rate": 3.6035970883611675e-05, "loss": 1.0103, "step": 4517 }, { "epoch": 0.7102517243412132, "grad_norm": 0.1425241082906723, "learning_rate": 3.603042672213033e-05, "loss": 1.1188, "step": 4518 }, { "epoch": 0.7104089292381458, "grad_norm": 0.17885011434555054, "learning_rate": 3.6024881886952474e-05, "loss": 0.9175, "step": 4519 }, { "epoch": 0.7105661341350783, "grad_norm": 0.1704678237438202, "learning_rate": 3.601933637841679e-05, "loss": 1.2171, "step": 4520 }, { "epoch": 0.7107233390320108, "grad_norm": 0.1702074110507965, "learning_rate": 3.601379019686196e-05, "loss": 1.1282, "step": 4521 }, { "epoch": 0.7108805439289434, "grad_norm": 0.14040248095989227, "learning_rate": 3.6008243342626734e-05, "loss": 1.1557, "step": 4522 }, { "epoch": 0.7110377488258759, "grad_norm": 0.22482609748840332, "learning_rate": 3.6002695816049884e-05, "loss": 1.0938, "step": 4523 }, { "epoch": 0.7111949537228085, "grad_norm": 0.16553983092308044, "learning_rate": 3.599714761747024e-05, "loss": 1.2154, "step": 4524 }, { "epoch": 0.711352158619741, "grad_norm": 0.15201883018016815, "learning_rate": 3.599159874722666e-05, "loss": 1.0501, "step": 4525 }, { "epoch": 0.7115093635166736, "grad_norm": 0.1263444870710373, "learning_rate": 3.5986049205658046e-05, "loss": 1.0935, "step": 4526 }, { "epoch": 0.7116665684136061, "grad_norm": 0.1617809683084488, "learning_rate": 3.598049899310335e-05, "loss": 1.1585, "step": 4527 }, { "epoch": 0.7118237733105386, "grad_norm": 0.14360898733139038, "learning_rate": 3.5974948109901554e-05, "loss": 1.1632, "step": 4528 }, { "epoch": 0.7119809782074712, "grad_norm": 0.17009443044662476, "learning_rate": 3.5969396556391674e-05, "loss": 1.086, "step": 4529 }, { "epoch": 0.7121381831044037, "grad_norm": 0.1356435865163803, "learning_rate": 3.59638443329128e-05, "loss": 1.0879, "step": 4530 }, { "epoch": 0.7122953880013363, "grad_norm": 0.16926179826259613, "learning_rate": 3.595829143980403e-05, "loss": 1.0006, "step": 4531 }, { "epoch": 0.7124525928982688, "grad_norm": 0.18381322920322418, "learning_rate": 3.5952737877404506e-05, "loss": 1.1344, "step": 4532 }, { "epoch": 0.7126097977952013, "grad_norm": 0.15497080981731415, "learning_rate": 3.594718364605342e-05, "loss": 1.0882, "step": 4533 }, { "epoch": 0.7127670026921339, "grad_norm": 0.14986175298690796, "learning_rate": 3.5941628746090017e-05, "loss": 1.1863, "step": 4534 }, { "epoch": 0.7129242075890664, "grad_norm": 0.15243493020534515, "learning_rate": 3.593607317785356e-05, "loss": 1.0519, "step": 4535 }, { "epoch": 0.713081412485999, "grad_norm": 0.15663480758666992, "learning_rate": 3.593051694168336e-05, "loss": 1.1227, "step": 4536 }, { "epoch": 0.7132386173829315, "grad_norm": 0.15480025112628937, "learning_rate": 3.5924960037918775e-05, "loss": 1.1673, "step": 4537 }, { "epoch": 0.713395822279864, "grad_norm": 0.13647949695587158, "learning_rate": 3.5919402466899196e-05, "loss": 1.1236, "step": 4538 }, { "epoch": 0.7135530271767966, "grad_norm": 0.13751398026943207, "learning_rate": 3.591384422896406e-05, "loss": 1.1066, "step": 4539 }, { "epoch": 0.7137102320737291, "grad_norm": 0.17009837925434113, "learning_rate": 3.590828532445284e-05, "loss": 1.113, "step": 4540 }, { "epoch": 0.7138674369706617, "grad_norm": 0.14258937537670135, "learning_rate": 3.590272575370506e-05, "loss": 1.1194, "step": 4541 }, { "epoch": 0.7140246418675942, "grad_norm": 0.16682106256484985, "learning_rate": 3.5897165517060275e-05, "loss": 0.9974, "step": 4542 }, { "epoch": 0.7141818467645267, "grad_norm": 0.14075370132923126, "learning_rate": 3.589160461485807e-05, "loss": 1.1454, "step": 4543 }, { "epoch": 0.7143390516614593, "grad_norm": 0.14308702945709229, "learning_rate": 3.5886043047438107e-05, "loss": 1.1113, "step": 4544 }, { "epoch": 0.7144962565583918, "grad_norm": 0.16267889738082886, "learning_rate": 3.5880480815140044e-05, "loss": 1.0237, "step": 4545 }, { "epoch": 0.7146534614553244, "grad_norm": 0.13113896548748016, "learning_rate": 3.587491791830362e-05, "loss": 1.0811, "step": 4546 }, { "epoch": 0.7148106663522569, "grad_norm": 0.15190038084983826, "learning_rate": 3.5869354357268583e-05, "loss": 1.1487, "step": 4547 }, { "epoch": 0.7149678712491894, "grad_norm": 0.15138080716133118, "learning_rate": 3.5863790132374736e-05, "loss": 1.0786, "step": 4548 }, { "epoch": 0.715125076146122, "grad_norm": 0.18089433014392853, "learning_rate": 3.585822524396192e-05, "loss": 1.0491, "step": 4549 }, { "epoch": 0.7152822810430545, "grad_norm": 0.14020849764347076, "learning_rate": 3.5852659692370014e-05, "loss": 1.147, "step": 4550 }, { "epoch": 0.7154394859399871, "grad_norm": 0.13302665948867798, "learning_rate": 3.5847093477938956e-05, "loss": 1.127, "step": 4551 }, { "epoch": 0.7155966908369196, "grad_norm": 0.12847675383090973, "learning_rate": 3.584152660100869e-05, "loss": 1.1067, "step": 4552 }, { "epoch": 0.7157538957338521, "grad_norm": 0.14483784139156342, "learning_rate": 3.583595906191924e-05, "loss": 1.1292, "step": 4553 }, { "epoch": 0.7159111006307847, "grad_norm": 0.21397073566913605, "learning_rate": 3.583039086101063e-05, "loss": 1.046, "step": 4554 }, { "epoch": 0.7160683055277172, "grad_norm": 0.148904949426651, "learning_rate": 3.582482199862295e-05, "loss": 1.0681, "step": 4555 }, { "epoch": 0.7162255104246498, "grad_norm": 0.15180173516273499, "learning_rate": 3.5819252475096335e-05, "loss": 1.1507, "step": 4556 }, { "epoch": 0.7163827153215823, "grad_norm": 0.14372770488262177, "learning_rate": 3.5813682290770944e-05, "loss": 1.0862, "step": 4557 }, { "epoch": 0.7165399202185148, "grad_norm": 0.1429951786994934, "learning_rate": 3.580811144598698e-05, "loss": 1.1292, "step": 4558 }, { "epoch": 0.7166971251154474, "grad_norm": 0.13750120997428894, "learning_rate": 3.580253994108469e-05, "loss": 1.1985, "step": 4559 }, { "epoch": 0.7168543300123799, "grad_norm": 0.1471010148525238, "learning_rate": 3.579696777640436e-05, "loss": 1.1356, "step": 4560 }, { "epoch": 0.7170115349093125, "grad_norm": 0.1405636966228485, "learning_rate": 3.579139495228632e-05, "loss": 1.1764, "step": 4561 }, { "epoch": 0.717168739806245, "grad_norm": 0.1493627279996872, "learning_rate": 3.578582146907094e-05, "loss": 0.9842, "step": 4562 }, { "epoch": 0.7173259447031775, "grad_norm": 0.14724910259246826, "learning_rate": 3.5780247327098614e-05, "loss": 1.2037, "step": 4563 }, { "epoch": 0.7174831496001101, "grad_norm": 0.15836381912231445, "learning_rate": 3.5774672526709805e-05, "loss": 1.1379, "step": 4564 }, { "epoch": 0.7176403544970426, "grad_norm": 0.14133360981941223, "learning_rate": 3.5769097068244985e-05, "loss": 0.9887, "step": 4565 }, { "epoch": 0.7177975593939752, "grad_norm": 0.15250210464000702, "learning_rate": 3.576352095204469e-05, "loss": 1.0526, "step": 4566 }, { "epoch": 0.7179547642909077, "grad_norm": 0.15717212855815887, "learning_rate": 3.575794417844949e-05, "loss": 1.2018, "step": 4567 }, { "epoch": 0.7181119691878401, "grad_norm": 0.15343016386032104, "learning_rate": 3.5752366747799995e-05, "loss": 1.1506, "step": 4568 }, { "epoch": 0.7182691740847728, "grad_norm": 0.16115228831768036, "learning_rate": 3.574678866043685e-05, "loss": 1.0947, "step": 4569 }, { "epoch": 0.7184263789817052, "grad_norm": 0.1491878181695938, "learning_rate": 3.574120991670074e-05, "loss": 1.014, "step": 4570 }, { "epoch": 0.7185835838786379, "grad_norm": 0.16374894976615906, "learning_rate": 3.573563051693238e-05, "loss": 1.1004, "step": 4571 }, { "epoch": 0.7187407887755703, "grad_norm": 0.2055385261774063, "learning_rate": 3.573005046147258e-05, "loss": 1.0941, "step": 4572 }, { "epoch": 0.7188979936725028, "grad_norm": 0.14261139929294586, "learning_rate": 3.572446975066211e-05, "loss": 1.1006, "step": 4573 }, { "epoch": 0.7190551985694354, "grad_norm": 0.14790108799934387, "learning_rate": 3.571888838484183e-05, "loss": 1.1123, "step": 4574 }, { "epoch": 0.7192124034663679, "grad_norm": 0.13406376540660858, "learning_rate": 3.571330636435263e-05, "loss": 1.1246, "step": 4575 }, { "epoch": 0.7193696083633006, "grad_norm": 0.1513693481683731, "learning_rate": 3.570772368953545e-05, "loss": 1.1425, "step": 4576 }, { "epoch": 0.719526813260233, "grad_norm": 0.1454620510339737, "learning_rate": 3.570214036073124e-05, "loss": 1.2008, "step": 4577 }, { "epoch": 0.7196840181571657, "grad_norm": 0.1506110578775406, "learning_rate": 3.569655637828101e-05, "loss": 1.31, "step": 4578 }, { "epoch": 0.7198412230540981, "grad_norm": 0.12902851402759552, "learning_rate": 3.569097174252582e-05, "loss": 1.1247, "step": 4579 }, { "epoch": 0.7199984279510306, "grad_norm": 0.14839795231819153, "learning_rate": 3.568538645380675e-05, "loss": 1.0916, "step": 4580 }, { "epoch": 0.7201556328479632, "grad_norm": 0.14236797392368317, "learning_rate": 3.567980051246494e-05, "loss": 1.08, "step": 4581 }, { "epoch": 0.7203128377448957, "grad_norm": 0.1483723521232605, "learning_rate": 3.5674213918841534e-05, "loss": 1.1429, "step": 4582 }, { "epoch": 0.7204700426418283, "grad_norm": 0.15774306654930115, "learning_rate": 3.566862667327777e-05, "loss": 1.0761, "step": 4583 }, { "epoch": 0.7206272475387608, "grad_norm": 0.1318364143371582, "learning_rate": 3.566303877611487e-05, "loss": 1.116, "step": 4584 }, { "epoch": 0.7207844524356933, "grad_norm": 0.13808578252792358, "learning_rate": 3.565745022769413e-05, "loss": 1.0402, "step": 4585 }, { "epoch": 0.7209416573326259, "grad_norm": 0.13911092281341553, "learning_rate": 3.5651861028356884e-05, "loss": 1.1032, "step": 4586 }, { "epoch": 0.7210988622295584, "grad_norm": 0.14403727650642395, "learning_rate": 3.56462711784445e-05, "loss": 1.0561, "step": 4587 }, { "epoch": 0.721256067126491, "grad_norm": 0.14054661989212036, "learning_rate": 3.564068067829837e-05, "loss": 1.1411, "step": 4588 }, { "epoch": 0.7214132720234235, "grad_norm": 0.1546674221754074, "learning_rate": 3.563508952825995e-05, "loss": 1.0848, "step": 4589 }, { "epoch": 0.721570476920356, "grad_norm": 0.1409955769777298, "learning_rate": 3.5629497728670725e-05, "loss": 1.0394, "step": 4590 }, { "epoch": 0.7217276818172886, "grad_norm": 0.1401342898607254, "learning_rate": 3.562390527987222e-05, "loss": 1.1356, "step": 4591 }, { "epoch": 0.7218848867142211, "grad_norm": 0.15182948112487793, "learning_rate": 3.5618312182206006e-05, "loss": 1.0383, "step": 4592 }, { "epoch": 0.7220420916111537, "grad_norm": 0.14586354792118073, "learning_rate": 3.561271843601369e-05, "loss": 0.9826, "step": 4593 }, { "epoch": 0.7221992965080862, "grad_norm": 0.14797888696193695, "learning_rate": 3.56071240416369e-05, "loss": 1.0666, "step": 4594 }, { "epoch": 0.7223565014050187, "grad_norm": 0.13578853011131287, "learning_rate": 3.560152899941733e-05, "loss": 1.2295, "step": 4595 }, { "epoch": 0.7225137063019513, "grad_norm": 0.12927323579788208, "learning_rate": 3.559593330969671e-05, "loss": 1.1353, "step": 4596 }, { "epoch": 0.7226709111988838, "grad_norm": 0.14741744101047516, "learning_rate": 3.559033697281679e-05, "loss": 1.0358, "step": 4597 }, { "epoch": 0.7228281160958164, "grad_norm": 0.14150522649288177, "learning_rate": 3.5584739989119395e-05, "loss": 1.1996, "step": 4598 }, { "epoch": 0.7229853209927489, "grad_norm": 0.13429418206214905, "learning_rate": 3.557914235894635e-05, "loss": 1.0684, "step": 4599 }, { "epoch": 0.7231425258896814, "grad_norm": 0.13873820006847382, "learning_rate": 3.557354408263954e-05, "loss": 0.9561, "step": 4600 }, { "epoch": 0.723299730786614, "grad_norm": 0.147738978266716, "learning_rate": 3.5567945160540884e-05, "loss": 1.128, "step": 4601 }, { "epoch": 0.7234569356835465, "grad_norm": 0.14353159070014954, "learning_rate": 3.556234559299235e-05, "loss": 1.0746, "step": 4602 }, { "epoch": 0.7236141405804791, "grad_norm": 0.1310272216796875, "learning_rate": 3.5556745380335934e-05, "loss": 1.05, "step": 4603 }, { "epoch": 0.7237713454774116, "grad_norm": 0.14752885699272156, "learning_rate": 3.555114452291367e-05, "loss": 1.1376, "step": 4604 }, { "epoch": 0.7239285503743441, "grad_norm": 0.1673475056886673, "learning_rate": 3.5545543021067645e-05, "loss": 1.0393, "step": 4605 }, { "epoch": 0.7240857552712767, "grad_norm": 0.15939272940158844, "learning_rate": 3.553994087513998e-05, "loss": 1.102, "step": 4606 }, { "epoch": 0.7242429601682092, "grad_norm": 0.14392653107643127, "learning_rate": 3.553433808547283e-05, "loss": 1.1948, "step": 4607 }, { "epoch": 0.7244001650651418, "grad_norm": 0.12796008586883545, "learning_rate": 3.552873465240838e-05, "loss": 1.2053, "step": 4608 }, { "epoch": 0.7245573699620743, "grad_norm": 0.12037085741758347, "learning_rate": 3.552313057628888e-05, "loss": 1.0953, "step": 4609 }, { "epoch": 0.7247145748590068, "grad_norm": 0.13031119108200073, "learning_rate": 3.5517525857456604e-05, "loss": 1.0254, "step": 4610 }, { "epoch": 0.7248717797559394, "grad_norm": 0.13989228010177612, "learning_rate": 3.551192049625387e-05, "loss": 1.1947, "step": 4611 }, { "epoch": 0.7250289846528719, "grad_norm": 0.17071443796157837, "learning_rate": 3.550631449302302e-05, "loss": 1.1944, "step": 4612 }, { "epoch": 0.7251861895498045, "grad_norm": 0.13762187957763672, "learning_rate": 3.550070784810646e-05, "loss": 1.0757, "step": 4613 }, { "epoch": 0.725343394446737, "grad_norm": 0.13957257568836212, "learning_rate": 3.5495100561846615e-05, "loss": 1.192, "step": 4614 }, { "epoch": 0.7255005993436695, "grad_norm": 0.15514232218265533, "learning_rate": 3.5489492634585955e-05, "loss": 1.1077, "step": 4615 }, { "epoch": 0.7256578042406021, "grad_norm": 0.1375376582145691, "learning_rate": 3.5483884066667006e-05, "loss": 1.1309, "step": 4616 }, { "epoch": 0.7258150091375346, "grad_norm": 0.1555032879114151, "learning_rate": 3.54782748584323e-05, "loss": 1.1224, "step": 4617 }, { "epoch": 0.7259722140344672, "grad_norm": 0.14163243770599365, "learning_rate": 3.5472665010224434e-05, "loss": 1.0983, "step": 4618 }, { "epoch": 0.7261294189313997, "grad_norm": 0.13645827770233154, "learning_rate": 3.546705452238603e-05, "loss": 1.1141, "step": 4619 }, { "epoch": 0.7262866238283322, "grad_norm": 0.154319167137146, "learning_rate": 3.546144339525976e-05, "loss": 1.1441, "step": 4620 }, { "epoch": 0.7264438287252648, "grad_norm": 0.13194586336612701, "learning_rate": 3.5455831629188343e-05, "loss": 1.1506, "step": 4621 }, { "epoch": 0.7266010336221973, "grad_norm": 0.14997638761997223, "learning_rate": 3.5450219224514506e-05, "loss": 1.1211, "step": 4622 }, { "epoch": 0.7267582385191299, "grad_norm": 0.19300997257232666, "learning_rate": 3.5444606181581034e-05, "loss": 0.9954, "step": 4623 }, { "epoch": 0.7269154434160624, "grad_norm": 0.12999020516872406, "learning_rate": 3.543899250073075e-05, "loss": 0.948, "step": 4624 }, { "epoch": 0.7270726483129949, "grad_norm": 0.13173635303974152, "learning_rate": 3.5433378182306534e-05, "loss": 1.035, "step": 4625 }, { "epoch": 0.7272298532099275, "grad_norm": 0.1882297694683075, "learning_rate": 3.542776322665128e-05, "loss": 1.0028, "step": 4626 }, { "epoch": 0.72738705810686, "grad_norm": 0.1392630636692047, "learning_rate": 3.54221476341079e-05, "loss": 1.1616, "step": 4627 }, { "epoch": 0.7275442630037926, "grad_norm": 0.1386382132768631, "learning_rate": 3.5416531405019416e-05, "loss": 0.9877, "step": 4628 }, { "epoch": 0.7277014679007251, "grad_norm": 0.12188861519098282, "learning_rate": 3.5410914539728827e-05, "loss": 1.121, "step": 4629 }, { "epoch": 0.7278586727976577, "grad_norm": 0.145427867770195, "learning_rate": 3.540529703857918e-05, "loss": 1.052, "step": 4630 }, { "epoch": 0.7280158776945902, "grad_norm": 0.16741080582141876, "learning_rate": 3.539967890191358e-05, "loss": 1.1455, "step": 4631 }, { "epoch": 0.7281730825915227, "grad_norm": 0.15054307878017426, "learning_rate": 3.539406013007516e-05, "loss": 1.133, "step": 4632 }, { "epoch": 0.7283302874884553, "grad_norm": 0.15563537180423737, "learning_rate": 3.5388440723407104e-05, "loss": 1.0843, "step": 4633 }, { "epoch": 0.7284874923853878, "grad_norm": 0.12449619174003601, "learning_rate": 3.5382820682252605e-05, "loss": 1.0446, "step": 4634 }, { "epoch": 0.7286446972823204, "grad_norm": 0.1355639100074768, "learning_rate": 3.5377200006954924e-05, "loss": 1.1817, "step": 4635 }, { "epoch": 0.7288019021792529, "grad_norm": 0.13719028234481812, "learning_rate": 3.537157869785735e-05, "loss": 1.1178, "step": 4636 }, { "epoch": 0.7289591070761854, "grad_norm": 0.13218499720096588, "learning_rate": 3.5365956755303216e-05, "loss": 1.1066, "step": 4637 }, { "epoch": 0.729116311973118, "grad_norm": 0.131890669465065, "learning_rate": 3.536033417963587e-05, "loss": 1.214, "step": 4638 }, { "epoch": 0.7292735168700505, "grad_norm": 0.12651421129703522, "learning_rate": 3.5354710971198744e-05, "loss": 1.204, "step": 4639 }, { "epoch": 0.7294307217669831, "grad_norm": 0.16133572161197662, "learning_rate": 3.5349087130335265e-05, "loss": 1.105, "step": 4640 }, { "epoch": 0.7294307217669831, "eval_loss": 1.0980340242385864, "eval_runtime": 2311.687, "eval_samples_per_second": 4.005, "eval_steps_per_second": 2.002, "step": 4640 }, { "epoch": 0.7295879266639156, "grad_norm": 0.13650627434253693, "learning_rate": 3.534346265738891e-05, "loss": 1.0481, "step": 4641 }, { "epoch": 0.7297451315608481, "grad_norm": 0.13792318105697632, "learning_rate": 3.533783755270322e-05, "loss": 1.1098, "step": 4642 }, { "epoch": 0.7299023364577807, "grad_norm": 0.16713480651378632, "learning_rate": 3.5332211816621744e-05, "loss": 1.0176, "step": 4643 }, { "epoch": 0.7300595413547132, "grad_norm": 0.15072934329509735, "learning_rate": 3.532658544948809e-05, "loss": 0.9831, "step": 4644 }, { "epoch": 0.7302167462516458, "grad_norm": 0.17281381785869598, "learning_rate": 3.532095845164588e-05, "loss": 1.1876, "step": 4645 }, { "epoch": 0.7303739511485783, "grad_norm": 0.1437559276819229, "learning_rate": 3.531533082343878e-05, "loss": 1.0361, "step": 4646 }, { "epoch": 0.7305311560455108, "grad_norm": 0.13455119729042053, "learning_rate": 3.530970256521055e-05, "loss": 1.0722, "step": 4647 }, { "epoch": 0.7306883609424434, "grad_norm": 0.14255169034004211, "learning_rate": 3.530407367730489e-05, "loss": 0.9882, "step": 4648 }, { "epoch": 0.7308455658393759, "grad_norm": 0.14039060473442078, "learning_rate": 3.5298444160065626e-05, "loss": 1.0624, "step": 4649 }, { "epoch": 0.7310027707363085, "grad_norm": 0.14412268996238708, "learning_rate": 3.5292814013836575e-05, "loss": 1.1823, "step": 4650 }, { "epoch": 0.731159975633241, "grad_norm": 0.15416951477527618, "learning_rate": 3.5287183238961605e-05, "loss": 1.1518, "step": 4651 }, { "epoch": 0.7313171805301735, "grad_norm": 0.14736409485340118, "learning_rate": 3.528155183578462e-05, "loss": 1.1206, "step": 4652 }, { "epoch": 0.7314743854271061, "grad_norm": 0.13994631171226501, "learning_rate": 3.5275919804649564e-05, "loss": 1.1733, "step": 4653 }, { "epoch": 0.7316315903240386, "grad_norm": 0.1334182471036911, "learning_rate": 3.527028714590043e-05, "loss": 1.0611, "step": 4654 }, { "epoch": 0.7317887952209712, "grad_norm": 0.1282099187374115, "learning_rate": 3.526465385988123e-05, "loss": 1.049, "step": 4655 }, { "epoch": 0.7319460001179037, "grad_norm": 0.16863442957401276, "learning_rate": 3.525901994693603e-05, "loss": 1.0266, "step": 4656 }, { "epoch": 0.7321032050148362, "grad_norm": 0.15601807832717896, "learning_rate": 3.5253385407408925e-05, "loss": 1.1386, "step": 4657 }, { "epoch": 0.7322604099117688, "grad_norm": 0.2561866044998169, "learning_rate": 3.524775024164405e-05, "loss": 1.0715, "step": 4658 }, { "epoch": 0.7324176148087013, "grad_norm": 0.19980670511722565, "learning_rate": 3.524211444998557e-05, "loss": 1.0759, "step": 4659 }, { "epoch": 0.7325748197056339, "grad_norm": 0.15558764338493347, "learning_rate": 3.523647803277772e-05, "loss": 1.0321, "step": 4660 }, { "epoch": 0.7327320246025664, "grad_norm": 0.13551028072834015, "learning_rate": 3.5230840990364736e-05, "loss": 1.124, "step": 4661 }, { "epoch": 0.7328892294994989, "grad_norm": 0.1487666517496109, "learning_rate": 3.522520332309091e-05, "loss": 1.0066, "step": 4662 }, { "epoch": 0.7330464343964315, "grad_norm": 0.13485178351402283, "learning_rate": 3.521956503130057e-05, "loss": 1.0981, "step": 4663 }, { "epoch": 0.733203639293364, "grad_norm": 0.1500270813703537, "learning_rate": 3.521392611533808e-05, "loss": 1.1002, "step": 4664 }, { "epoch": 0.7333608441902966, "grad_norm": 0.14526516199111938, "learning_rate": 3.520828657554785e-05, "loss": 1.1052, "step": 4665 }, { "epoch": 0.7335180490872291, "grad_norm": 0.14108894765377045, "learning_rate": 3.520264641227431e-05, "loss": 1.0931, "step": 4666 }, { "epoch": 0.7336752539841616, "grad_norm": 0.17181095480918884, "learning_rate": 3.519700562586195e-05, "loss": 1.0911, "step": 4667 }, { "epoch": 0.7338324588810942, "grad_norm": 0.1537475883960724, "learning_rate": 3.519136421665528e-05, "loss": 1.0408, "step": 4668 }, { "epoch": 0.7339896637780267, "grad_norm": 0.1326746642589569, "learning_rate": 3.518572218499885e-05, "loss": 1.0928, "step": 4669 }, { "epoch": 0.7341468686749593, "grad_norm": 0.14614926278591156, "learning_rate": 3.5180079531237273e-05, "loss": 1.1291, "step": 4670 }, { "epoch": 0.7343040735718918, "grad_norm": 0.14450791478157043, "learning_rate": 3.517443625571517e-05, "loss": 1.0901, "step": 4671 }, { "epoch": 0.7344612784688243, "grad_norm": 0.12363743782043457, "learning_rate": 3.5168792358777216e-05, "loss": 1.0032, "step": 4672 }, { "epoch": 0.7346184833657569, "grad_norm": 0.14431117475032806, "learning_rate": 3.5163147840768106e-05, "loss": 1.0301, "step": 4673 }, { "epoch": 0.7347756882626894, "grad_norm": 0.13479335606098175, "learning_rate": 3.5157502702032605e-05, "loss": 1.1135, "step": 4674 }, { "epoch": 0.734932893159622, "grad_norm": 0.1340409368276596, "learning_rate": 3.5151856942915475e-05, "loss": 1.3005, "step": 4675 }, { "epoch": 0.7350900980565545, "grad_norm": 0.12972420454025269, "learning_rate": 3.514621056376155e-05, "loss": 0.9629, "step": 4676 }, { "epoch": 0.735247302953487, "grad_norm": 0.14823409914970398, "learning_rate": 3.5140563564915694e-05, "loss": 1.1048, "step": 4677 }, { "epoch": 0.7354045078504196, "grad_norm": 0.12902098894119263, "learning_rate": 3.513491594672279e-05, "loss": 1.0985, "step": 4678 }, { "epoch": 0.735561712747352, "grad_norm": 0.14853690564632416, "learning_rate": 3.512926770952779e-05, "loss": 1.1909, "step": 4679 }, { "epoch": 0.7357189176442847, "grad_norm": 0.14038996398448944, "learning_rate": 3.512361885367565e-05, "loss": 1.0486, "step": 4680 }, { "epoch": 0.7358761225412171, "grad_norm": 0.13920599222183228, "learning_rate": 3.511796937951139e-05, "loss": 1.1348, "step": 4681 }, { "epoch": 0.7360333274381498, "grad_norm": 0.14314356446266174, "learning_rate": 3.5112319287380055e-05, "loss": 1.0857, "step": 4682 }, { "epoch": 0.7361905323350822, "grad_norm": 0.12702102959156036, "learning_rate": 3.510666857762673e-05, "loss": 1.2012, "step": 4683 }, { "epoch": 0.7363477372320147, "grad_norm": 0.16372568905353546, "learning_rate": 3.5101017250596556e-05, "loss": 1.0962, "step": 4684 }, { "epoch": 0.7365049421289473, "grad_norm": 0.13005401194095612, "learning_rate": 3.509536530663467e-05, "loss": 1.0554, "step": 4685 }, { "epoch": 0.7366621470258798, "grad_norm": 0.16866400837898254, "learning_rate": 3.508971274608628e-05, "loss": 1.1306, "step": 4686 }, { "epoch": 0.7368193519228124, "grad_norm": 0.1414596289396286, "learning_rate": 3.5084059569296624e-05, "loss": 1.114, "step": 4687 }, { "epoch": 0.7369765568197449, "grad_norm": 0.1379203200340271, "learning_rate": 3.507840577661099e-05, "loss": 1.094, "step": 4688 }, { "epoch": 0.7371337617166774, "grad_norm": 0.15202675759792328, "learning_rate": 3.5072751368374656e-05, "loss": 1.0731, "step": 4689 }, { "epoch": 0.73729096661361, "grad_norm": 0.19967341423034668, "learning_rate": 3.5067096344933e-05, "loss": 1.1763, "step": 4690 }, { "epoch": 0.7374481715105425, "grad_norm": 0.16265927255153656, "learning_rate": 3.5061440706631414e-05, "loss": 1.0624, "step": 4691 }, { "epoch": 0.7376053764074751, "grad_norm": 0.13775219023227692, "learning_rate": 3.5055784453815296e-05, "loss": 1.1432, "step": 4692 }, { "epoch": 0.7377625813044076, "grad_norm": 0.142997145652771, "learning_rate": 3.505012758683013e-05, "loss": 1.1118, "step": 4693 }, { "epoch": 0.7379197862013401, "grad_norm": 0.13343292474746704, "learning_rate": 3.504447010602141e-05, "loss": 1.2013, "step": 4694 }, { "epoch": 0.7380769910982727, "grad_norm": 0.14398351311683655, "learning_rate": 3.503881201173467e-05, "loss": 1.0959, "step": 4695 }, { "epoch": 0.7382341959952052, "grad_norm": 0.1524992734193802, "learning_rate": 3.50331533043155e-05, "loss": 1.1024, "step": 4696 }, { "epoch": 0.7383914008921378, "grad_norm": 0.12862329185009003, "learning_rate": 3.502749398410948e-05, "loss": 1.2062, "step": 4697 }, { "epoch": 0.7385486057890703, "grad_norm": 0.14112304151058197, "learning_rate": 3.502183405146229e-05, "loss": 1.1228, "step": 4698 }, { "epoch": 0.7387058106860028, "grad_norm": 0.13773949444293976, "learning_rate": 3.501617350671961e-05, "loss": 1.0286, "step": 4699 }, { "epoch": 0.7388630155829354, "grad_norm": 0.1456059068441391, "learning_rate": 3.501051235022716e-05, "loss": 1.1286, "step": 4700 }, { "epoch": 0.7390202204798679, "grad_norm": 0.13958464562892914, "learning_rate": 3.50048505823307e-05, "loss": 1.1567, "step": 4701 }, { "epoch": 0.7391774253768005, "grad_norm": 0.13271421194076538, "learning_rate": 3.499918820337602e-05, "loss": 1.1753, "step": 4702 }, { "epoch": 0.739334630273733, "grad_norm": 0.152462437748909, "learning_rate": 3.4993525213708986e-05, "loss": 1.0905, "step": 4703 }, { "epoch": 0.7394918351706655, "grad_norm": 0.13484671711921692, "learning_rate": 3.498786161367544e-05, "loss": 1.0469, "step": 4704 }, { "epoch": 0.7396490400675981, "grad_norm": 0.12940473854541779, "learning_rate": 3.498219740362133e-05, "loss": 1.1719, "step": 4705 }, { "epoch": 0.7398062449645306, "grad_norm": 0.16455109417438507, "learning_rate": 3.497653258389256e-05, "loss": 1.173, "step": 4706 }, { "epoch": 0.7399634498614632, "grad_norm": 0.12805430591106415, "learning_rate": 3.4970867154835153e-05, "loss": 1.1369, "step": 4707 }, { "epoch": 0.7401206547583957, "grad_norm": 0.16718123853206635, "learning_rate": 3.496520111679511e-05, "loss": 1.0984, "step": 4708 }, { "epoch": 0.7402778596553282, "grad_norm": 0.1421629935503006, "learning_rate": 3.49595344701185e-05, "loss": 1.0621, "step": 4709 }, { "epoch": 0.7404350645522608, "grad_norm": 0.12086508423089981, "learning_rate": 3.495386721515142e-05, "loss": 1.0634, "step": 4710 }, { "epoch": 0.7405922694491933, "grad_norm": 0.15074370801448822, "learning_rate": 3.494819935224e-05, "loss": 1.1158, "step": 4711 }, { "epoch": 0.7407494743461259, "grad_norm": 0.1683443784713745, "learning_rate": 3.4942530881730414e-05, "loss": 0.9991, "step": 4712 }, { "epoch": 0.7409066792430584, "grad_norm": 0.12071124464273453, "learning_rate": 3.4936861803968865e-05, "loss": 1.0965, "step": 4713 }, { "epoch": 0.7410638841399909, "grad_norm": 0.15003018081188202, "learning_rate": 3.493119211930162e-05, "loss": 1.0252, "step": 4714 }, { "epoch": 0.7412210890369235, "grad_norm": 0.144414022564888, "learning_rate": 3.4925521828074934e-05, "loss": 1.2352, "step": 4715 }, { "epoch": 0.741378293933856, "grad_norm": 0.13215228915214539, "learning_rate": 3.491985093063514e-05, "loss": 1.0834, "step": 4716 }, { "epoch": 0.7415354988307886, "grad_norm": 0.14399638772010803, "learning_rate": 3.491417942732859e-05, "loss": 1.0894, "step": 4717 }, { "epoch": 0.7416927037277211, "grad_norm": 0.12594108283519745, "learning_rate": 3.490850731850169e-05, "loss": 1.1273, "step": 4718 }, { "epoch": 0.7418499086246536, "grad_norm": 0.14319142699241638, "learning_rate": 3.490283460450086e-05, "loss": 1.133, "step": 4719 }, { "epoch": 0.7420071135215862, "grad_norm": 0.1428254246711731, "learning_rate": 3.489716128567257e-05, "loss": 1.0948, "step": 4720 }, { "epoch": 0.7421643184185187, "grad_norm": 0.1410217434167862, "learning_rate": 3.489148736236333e-05, "loss": 1.1161, "step": 4721 }, { "epoch": 0.7423215233154513, "grad_norm": 0.13551457226276398, "learning_rate": 3.488581283491966e-05, "loss": 1.16, "step": 4722 }, { "epoch": 0.7424787282123838, "grad_norm": 0.15328136086463928, "learning_rate": 3.488013770368817e-05, "loss": 1.1669, "step": 4723 }, { "epoch": 0.7426359331093163, "grad_norm": 0.13792365789413452, "learning_rate": 3.487446196901546e-05, "loss": 1.1167, "step": 4724 }, { "epoch": 0.7427931380062489, "grad_norm": 0.148483008146286, "learning_rate": 3.486878563124817e-05, "loss": 1.1051, "step": 4725 }, { "epoch": 0.7429503429031814, "grad_norm": 0.12647677958011627, "learning_rate": 3.4863108690733024e-05, "loss": 1.0418, "step": 4726 }, { "epoch": 0.743107547800114, "grad_norm": 0.13564087450504303, "learning_rate": 3.4857431147816696e-05, "loss": 1.0032, "step": 4727 }, { "epoch": 0.7432647526970465, "grad_norm": 0.1215532198548317, "learning_rate": 3.4851753002846006e-05, "loss": 1.1842, "step": 4728 }, { "epoch": 0.743421957593979, "grad_norm": 0.14689278602600098, "learning_rate": 3.4846074256167714e-05, "loss": 1.0337, "step": 4729 }, { "epoch": 0.7435791624909116, "grad_norm": 0.1328740268945694, "learning_rate": 3.484039490812867e-05, "loss": 0.9731, "step": 4730 }, { "epoch": 0.7437363673878441, "grad_norm": 0.12815256416797638, "learning_rate": 3.483471495907575e-05, "loss": 1.1345, "step": 4731 }, { "epoch": 0.7438935722847767, "grad_norm": 0.11724529415369034, "learning_rate": 3.4829034409355846e-05, "loss": 1.0029, "step": 4732 }, { "epoch": 0.7440507771817092, "grad_norm": 0.16131702065467834, "learning_rate": 3.4823353259315926e-05, "loss": 1.1775, "step": 4733 }, { "epoch": 0.7442079820786417, "grad_norm": 0.12095880508422852, "learning_rate": 3.4817671509302965e-05, "loss": 0.9847, "step": 4734 }, { "epoch": 0.7443651869755743, "grad_norm": 0.12888216972351074, "learning_rate": 3.4811989159663984e-05, "loss": 1.1294, "step": 4735 }, { "epoch": 0.7445223918725068, "grad_norm": 0.13063958287239075, "learning_rate": 3.4806306210746035e-05, "loss": 1.0426, "step": 4736 }, { "epoch": 0.7446795967694394, "grad_norm": 0.13246606290340424, "learning_rate": 3.480062266289621e-05, "loss": 1.1522, "step": 4737 }, { "epoch": 0.7448368016663719, "grad_norm": 0.1366496980190277, "learning_rate": 3.479493851646164e-05, "loss": 1.1525, "step": 4738 }, { "epoch": 0.7449940065633045, "grad_norm": 0.16373786330223083, "learning_rate": 3.47892537717895e-05, "loss": 1.0834, "step": 4739 }, { "epoch": 0.745151211460237, "grad_norm": 0.17399144172668457, "learning_rate": 3.478356842922699e-05, "loss": 1.1678, "step": 4740 }, { "epoch": 0.7453084163571695, "grad_norm": 0.1297912895679474, "learning_rate": 3.4777882489121325e-05, "loss": 0.8803, "step": 4741 }, { "epoch": 0.7454656212541021, "grad_norm": 0.1584755927324295, "learning_rate": 3.477219595181981e-05, "loss": 0.9579, "step": 4742 }, { "epoch": 0.7456228261510346, "grad_norm": 0.16625195741653442, "learning_rate": 3.476650881766975e-05, "loss": 1.102, "step": 4743 }, { "epoch": 0.7457800310479672, "grad_norm": 0.14477670192718506, "learning_rate": 3.476082108701849e-05, "loss": 1.1615, "step": 4744 }, { "epoch": 0.7459372359448997, "grad_norm": 0.16770561039447784, "learning_rate": 3.4755132760213415e-05, "loss": 1.029, "step": 4745 }, { "epoch": 0.7460944408418322, "grad_norm": 0.12708137929439545, "learning_rate": 3.474944383760195e-05, "loss": 1.089, "step": 4746 }, { "epoch": 0.7462516457387648, "grad_norm": 0.1296709030866623, "learning_rate": 3.474375431953154e-05, "loss": 1.1327, "step": 4747 }, { "epoch": 0.7464088506356973, "grad_norm": 0.126552015542984, "learning_rate": 3.4738064206349694e-05, "loss": 1.046, "step": 4748 }, { "epoch": 0.7465660555326299, "grad_norm": 0.13531428575515747, "learning_rate": 3.473237349840394e-05, "loss": 1.1643, "step": 4749 }, { "epoch": 0.7467232604295624, "grad_norm": 0.13297897577285767, "learning_rate": 3.4726682196041844e-05, "loss": 1.0452, "step": 4750 }, { "epoch": 0.7468804653264949, "grad_norm": 0.12808465957641602, "learning_rate": 3.4720990299611e-05, "loss": 1.0513, "step": 4751 }, { "epoch": 0.7470376702234275, "grad_norm": 0.1238451898097992, "learning_rate": 3.4715297809459055e-05, "loss": 1.1153, "step": 4752 }, { "epoch": 0.74719487512036, "grad_norm": 0.15730857849121094, "learning_rate": 3.4709604725933685e-05, "loss": 1.0497, "step": 4753 }, { "epoch": 0.7473520800172926, "grad_norm": 0.18751071393489838, "learning_rate": 3.470391104938261e-05, "loss": 1.0898, "step": 4754 }, { "epoch": 0.7475092849142251, "grad_norm": 0.12530651688575745, "learning_rate": 3.469821678015356e-05, "loss": 1.0435, "step": 4755 }, { "epoch": 0.7476664898111576, "grad_norm": 0.17319822311401367, "learning_rate": 3.469252191859432e-05, "loss": 1.172, "step": 4756 }, { "epoch": 0.7478236947080902, "grad_norm": 0.12704682350158691, "learning_rate": 3.468682646505273e-05, "loss": 1.064, "step": 4757 }, { "epoch": 0.7479808996050227, "grad_norm": 0.17880849540233612, "learning_rate": 3.468113041987664e-05, "loss": 1.1394, "step": 4758 }, { "epoch": 0.7481381045019553, "grad_norm": 0.14523322880268097, "learning_rate": 3.4675433783413936e-05, "loss": 1.1393, "step": 4759 }, { "epoch": 0.7482953093988878, "grad_norm": 0.1540546715259552, "learning_rate": 3.466973655601254e-05, "loss": 1.1145, "step": 4760 }, { "epoch": 0.7484525142958203, "grad_norm": 0.1491522341966629, "learning_rate": 3.466403873802043e-05, "loss": 1.0209, "step": 4761 }, { "epoch": 0.7486097191927529, "grad_norm": 0.135720893740654, "learning_rate": 3.4658340329785606e-05, "loss": 1.1023, "step": 4762 }, { "epoch": 0.7487669240896854, "grad_norm": 0.1523171365261078, "learning_rate": 3.46526413316561e-05, "loss": 1.2391, "step": 4763 }, { "epoch": 0.748924128986618, "grad_norm": 0.1411243975162506, "learning_rate": 3.464694174397999e-05, "loss": 1.1447, "step": 4764 }, { "epoch": 0.7490813338835505, "grad_norm": 0.1597915142774582, "learning_rate": 3.464124156710538e-05, "loss": 1.164, "step": 4765 }, { "epoch": 0.749238538780483, "grad_norm": 0.13481174409389496, "learning_rate": 3.463554080138042e-05, "loss": 1.0763, "step": 4766 }, { "epoch": 0.7493957436774156, "grad_norm": 0.13792438805103302, "learning_rate": 3.4629839447153286e-05, "loss": 1.0839, "step": 4767 }, { "epoch": 0.7495529485743481, "grad_norm": 0.1663919985294342, "learning_rate": 3.46241375047722e-05, "loss": 1.1405, "step": 4768 }, { "epoch": 0.7497101534712807, "grad_norm": 0.13219061493873596, "learning_rate": 3.461843497458541e-05, "loss": 1.0549, "step": 4769 }, { "epoch": 0.7498673583682132, "grad_norm": 0.1282496452331543, "learning_rate": 3.461273185694121e-05, "loss": 1.0713, "step": 4770 }, { "epoch": 0.7500245632651457, "grad_norm": 0.12896636128425598, "learning_rate": 3.460702815218792e-05, "loss": 1.0364, "step": 4771 }, { "epoch": 0.7501817681620783, "grad_norm": 0.13691388070583344, "learning_rate": 3.4601323860673904e-05, "loss": 1.038, "step": 4772 }, { "epoch": 0.7503389730590108, "grad_norm": 0.12374941259622574, "learning_rate": 3.459561898274756e-05, "loss": 1.0836, "step": 4773 }, { "epoch": 0.7504961779559434, "grad_norm": 0.14701367914676666, "learning_rate": 3.4589913518757313e-05, "loss": 1.2139, "step": 4774 }, { "epoch": 0.7506533828528759, "grad_norm": 0.14951729774475098, "learning_rate": 3.458420746905164e-05, "loss": 1.0835, "step": 4775 }, { "epoch": 0.7508105877498084, "grad_norm": 0.1360500305891037, "learning_rate": 3.457850083397903e-05, "loss": 1.1761, "step": 4776 }, { "epoch": 0.750967792646741, "grad_norm": 0.14075034856796265, "learning_rate": 3.4572793613888046e-05, "loss": 1.1391, "step": 4777 }, { "epoch": 0.7511249975436735, "grad_norm": 0.1566086709499359, "learning_rate": 3.456708580912725e-05, "loss": 1.0243, "step": 4778 }, { "epoch": 0.7512822024406061, "grad_norm": 0.12710240483283997, "learning_rate": 3.4561377420045246e-05, "loss": 0.9759, "step": 4779 }, { "epoch": 0.7514394073375386, "grad_norm": 0.17147888243198395, "learning_rate": 3.455566844699069e-05, "loss": 0.9989, "step": 4780 }, { "epoch": 0.751596612234471, "grad_norm": 0.1665210723876953, "learning_rate": 3.4549958890312256e-05, "loss": 1.1576, "step": 4781 }, { "epoch": 0.7517538171314037, "grad_norm": 0.17183759808540344, "learning_rate": 3.4544248750358677e-05, "loss": 1.1207, "step": 4782 }, { "epoch": 0.7519110220283362, "grad_norm": 0.13655908405780792, "learning_rate": 3.4538538027478696e-05, "loss": 1.0299, "step": 4783 }, { "epoch": 0.7520682269252688, "grad_norm": 0.13791604340076447, "learning_rate": 3.45328267220211e-05, "loss": 1.1945, "step": 4784 }, { "epoch": 0.7522254318222013, "grad_norm": 0.1784014105796814, "learning_rate": 3.4527114834334726e-05, "loss": 1.0836, "step": 4785 }, { "epoch": 0.7523826367191337, "grad_norm": 0.1311030387878418, "learning_rate": 3.452140236476842e-05, "loss": 1.1196, "step": 4786 }, { "epoch": 0.7525398416160664, "grad_norm": 0.14000438153743744, "learning_rate": 3.451568931367108e-05, "loss": 1.1135, "step": 4787 }, { "epoch": 0.7526970465129988, "grad_norm": 0.11728193610906601, "learning_rate": 3.450997568139165e-05, "loss": 1.0402, "step": 4788 }, { "epoch": 0.7528542514099315, "grad_norm": 0.15927207469940186, "learning_rate": 3.450426146827909e-05, "loss": 1.0004, "step": 4789 }, { "epoch": 0.753011456306864, "grad_norm": 0.12737083435058594, "learning_rate": 3.44985466746824e-05, "loss": 1.0429, "step": 4790 }, { "epoch": 0.7531686612037966, "grad_norm": 0.13107572495937347, "learning_rate": 3.449283130095061e-05, "loss": 1.1063, "step": 4791 }, { "epoch": 0.753325866100729, "grad_norm": 0.14925448596477509, "learning_rate": 3.448711534743281e-05, "loss": 1.1381, "step": 4792 }, { "epoch": 0.7534830709976615, "grad_norm": 0.1519125998020172, "learning_rate": 3.4481398814478096e-05, "loss": 0.9555, "step": 4793 }, { "epoch": 0.7536402758945941, "grad_norm": 0.13391917943954468, "learning_rate": 3.447568170243562e-05, "loss": 1.1197, "step": 4794 }, { "epoch": 0.7537974807915266, "grad_norm": 0.13998793065547943, "learning_rate": 3.446996401165455e-05, "loss": 1.1089, "step": 4795 }, { "epoch": 0.7539546856884592, "grad_norm": 0.12591344118118286, "learning_rate": 3.446424574248412e-05, "loss": 1.0845, "step": 4796 }, { "epoch": 0.7541118905853917, "grad_norm": 0.14241157472133636, "learning_rate": 3.445852689527356e-05, "loss": 1.1163, "step": 4797 }, { "epoch": 0.7542690954823242, "grad_norm": 0.14727558195590973, "learning_rate": 3.445280747037217e-05, "loss": 1.1268, "step": 4798 }, { "epoch": 0.7544263003792568, "grad_norm": 0.13568533957004547, "learning_rate": 3.444708746812927e-05, "loss": 1.1031, "step": 4799 }, { "epoch": 0.7545835052761893, "grad_norm": 0.16703443229198456, "learning_rate": 3.44413668888942e-05, "loss": 1.0572, "step": 4800 }, { "epoch": 0.7545835052761893, "eval_loss": 1.096881628036499, "eval_runtime": 2329.3288, "eval_samples_per_second": 3.975, "eval_steps_per_second": 1.987, "step": 4800 }, { "epoch": 0.7547407101731219, "grad_norm": 0.137963205575943, "learning_rate": 3.443564573301637e-05, "loss": 1.2142, "step": 4801 }, { "epoch": 0.7548979150700544, "grad_norm": 0.14127731323242188, "learning_rate": 3.4429924000845196e-05, "loss": 1.1542, "step": 4802 }, { "epoch": 0.7550551199669869, "grad_norm": 0.1367490291595459, "learning_rate": 3.442420169273015e-05, "loss": 1.0598, "step": 4803 }, { "epoch": 0.7552123248639195, "grad_norm": 0.155628502368927, "learning_rate": 3.441847880902071e-05, "loss": 1.0234, "step": 4804 }, { "epoch": 0.755369529760852, "grad_norm": 0.1460132598876953, "learning_rate": 3.4412755350066425e-05, "loss": 1.0925, "step": 4805 }, { "epoch": 0.7555267346577846, "grad_norm": 0.15177485346794128, "learning_rate": 3.4407031316216856e-05, "loss": 1.1072, "step": 4806 }, { "epoch": 0.7556839395547171, "grad_norm": 0.1640729159116745, "learning_rate": 3.440130670782161e-05, "loss": 1.1392, "step": 4807 }, { "epoch": 0.7558411444516496, "grad_norm": 0.13145369291305542, "learning_rate": 3.439558152523031e-05, "loss": 1.1667, "step": 4808 }, { "epoch": 0.7559983493485822, "grad_norm": 0.12419599294662476, "learning_rate": 3.438985576879265e-05, "loss": 0.9409, "step": 4809 }, { "epoch": 0.7561555542455147, "grad_norm": 0.14216910302639008, "learning_rate": 3.4384129438858315e-05, "loss": 1.1439, "step": 4810 }, { "epoch": 0.7563127591424473, "grad_norm": 0.14362764358520508, "learning_rate": 3.4378402535777065e-05, "loss": 1.0857, "step": 4811 }, { "epoch": 0.7564699640393798, "grad_norm": 0.12433180958032608, "learning_rate": 3.4372675059898676e-05, "loss": 1.171, "step": 4812 }, { "epoch": 0.7566271689363123, "grad_norm": 0.14561037719249725, "learning_rate": 3.436694701157294e-05, "loss": 1.035, "step": 4813 }, { "epoch": 0.7567843738332449, "grad_norm": 0.13551869988441467, "learning_rate": 3.436121839114973e-05, "loss": 1.1413, "step": 4814 }, { "epoch": 0.7569415787301774, "grad_norm": 0.14296530187129974, "learning_rate": 3.4355489198978905e-05, "loss": 1.1458, "step": 4815 }, { "epoch": 0.75709878362711, "grad_norm": 0.12551701068878174, "learning_rate": 3.434975943541041e-05, "loss": 1.1601, "step": 4816 }, { "epoch": 0.7572559885240425, "grad_norm": 0.13226158916950226, "learning_rate": 3.434402910079418e-05, "loss": 1.0699, "step": 4817 }, { "epoch": 0.757413193420975, "grad_norm": 0.1264914870262146, "learning_rate": 3.43382981954802e-05, "loss": 1.1619, "step": 4818 }, { "epoch": 0.7575703983179076, "grad_norm": 0.12864720821380615, "learning_rate": 3.4332566719818496e-05, "loss": 1.1159, "step": 4819 }, { "epoch": 0.7577276032148401, "grad_norm": 0.15113262832164764, "learning_rate": 3.4326834674159124e-05, "loss": 1.1242, "step": 4820 }, { "epoch": 0.7578848081117727, "grad_norm": 0.14772279560565948, "learning_rate": 3.432110205885218e-05, "loss": 1.2477, "step": 4821 }, { "epoch": 0.7580420130087052, "grad_norm": 0.12309836596250534, "learning_rate": 3.431536887424779e-05, "loss": 0.9678, "step": 4822 }, { "epoch": 0.7581992179056377, "grad_norm": 0.14076700806617737, "learning_rate": 3.4309635120696107e-05, "loss": 1.0875, "step": 4823 }, { "epoch": 0.7583564228025703, "grad_norm": 0.1606525033712387, "learning_rate": 3.4303900798547326e-05, "loss": 1.039, "step": 4824 }, { "epoch": 0.7585136276995028, "grad_norm": 0.17205810546875, "learning_rate": 3.429816590815169e-05, "loss": 0.9445, "step": 4825 }, { "epoch": 0.7586708325964354, "grad_norm": 0.1370888203382492, "learning_rate": 3.429243044985946e-05, "loss": 1.0914, "step": 4826 }, { "epoch": 0.7588280374933679, "grad_norm": 0.13176867365837097, "learning_rate": 3.428669442402093e-05, "loss": 1.1661, "step": 4827 }, { "epoch": 0.7589852423903004, "grad_norm": 0.1363096684217453, "learning_rate": 3.428095783098644e-05, "loss": 1.1407, "step": 4828 }, { "epoch": 0.759142447287233, "grad_norm": 0.14611127972602844, "learning_rate": 3.4275220671106354e-05, "loss": 1.1294, "step": 4829 }, { "epoch": 0.7592996521841655, "grad_norm": 0.1419224739074707, "learning_rate": 3.4269482944731074e-05, "loss": 1.1525, "step": 4830 }, { "epoch": 0.7594568570810981, "grad_norm": 0.12255913764238358, "learning_rate": 3.4263744652211055e-05, "loss": 1.0963, "step": 4831 }, { "epoch": 0.7596140619780306, "grad_norm": 0.13457930088043213, "learning_rate": 3.425800579389675e-05, "loss": 1.0591, "step": 4832 }, { "epoch": 0.7597712668749631, "grad_norm": 0.13149945437908173, "learning_rate": 3.4252266370138684e-05, "loss": 1.1598, "step": 4833 }, { "epoch": 0.7599284717718957, "grad_norm": 0.1281062662601471, "learning_rate": 3.424652638128739e-05, "loss": 1.1028, "step": 4834 }, { "epoch": 0.7600856766688282, "grad_norm": 0.12758252024650574, "learning_rate": 3.4240785827693435e-05, "loss": 1.1538, "step": 4835 }, { "epoch": 0.7602428815657608, "grad_norm": 0.13412296772003174, "learning_rate": 3.423504470970745e-05, "loss": 1.2111, "step": 4836 }, { "epoch": 0.7604000864626933, "grad_norm": 0.13403989374637604, "learning_rate": 3.422930302768007e-05, "loss": 1.0381, "step": 4837 }, { "epoch": 0.7605572913596258, "grad_norm": 0.14702297747135162, "learning_rate": 3.422356078196198e-05, "loss": 1.096, "step": 4838 }, { "epoch": 0.7607144962565584, "grad_norm": 0.1320294737815857, "learning_rate": 3.421781797290389e-05, "loss": 1.1756, "step": 4839 }, { "epoch": 0.7608717011534909, "grad_norm": 0.12777256965637207, "learning_rate": 3.4212074600856536e-05, "loss": 0.9583, "step": 4840 }, { "epoch": 0.7610289060504235, "grad_norm": 0.15317098796367645, "learning_rate": 3.4206330666170725e-05, "loss": 1.0876, "step": 4841 }, { "epoch": 0.761186110947356, "grad_norm": 0.1403750777244568, "learning_rate": 3.4200586169197265e-05, "loss": 1.1199, "step": 4842 }, { "epoch": 0.7613433158442886, "grad_norm": 0.13177213072776794, "learning_rate": 3.4194841110287016e-05, "loss": 1.0894, "step": 4843 }, { "epoch": 0.7615005207412211, "grad_norm": 0.13566197454929352, "learning_rate": 3.418909548979084e-05, "loss": 1.1449, "step": 4844 }, { "epoch": 0.7616577256381536, "grad_norm": 0.1385362595319748, "learning_rate": 3.418334930805968e-05, "loss": 1.128, "step": 4845 }, { "epoch": 0.7618149305350862, "grad_norm": 0.12482000142335892, "learning_rate": 3.417760256544449e-05, "loss": 1.0858, "step": 4846 }, { "epoch": 0.7619721354320187, "grad_norm": 0.11928500980138779, "learning_rate": 3.417185526229625e-05, "loss": 1.1047, "step": 4847 }, { "epoch": 0.7621293403289513, "grad_norm": 0.12555529177188873, "learning_rate": 3.4166107398965987e-05, "loss": 1.087, "step": 4848 }, { "epoch": 0.7622865452258838, "grad_norm": 0.1462515890598297, "learning_rate": 3.4160358975804755e-05, "loss": 1.0634, "step": 4849 }, { "epoch": 0.7624437501228163, "grad_norm": 0.1577952355146408, "learning_rate": 3.415460999316366e-05, "loss": 1.1449, "step": 4850 }, { "epoch": 0.7626009550197489, "grad_norm": 0.14062662422657013, "learning_rate": 3.41488604513938e-05, "loss": 1.0793, "step": 4851 }, { "epoch": 0.7627581599166814, "grad_norm": 0.13146206736564636, "learning_rate": 3.414311035084637e-05, "loss": 1.139, "step": 4852 }, { "epoch": 0.762915364813614, "grad_norm": 0.12374719232320786, "learning_rate": 3.413735969187254e-05, "loss": 1.1886, "step": 4853 }, { "epoch": 0.7630725697105465, "grad_norm": 0.1335606724023819, "learning_rate": 3.413160847482355e-05, "loss": 1.1146, "step": 4854 }, { "epoch": 0.763229774607479, "grad_norm": 0.1613902747631073, "learning_rate": 3.4125856700050645e-05, "loss": 1.0692, "step": 4855 }, { "epoch": 0.7633869795044116, "grad_norm": 0.15744736790657043, "learning_rate": 3.4120104367905145e-05, "loss": 1.1159, "step": 4856 }, { "epoch": 0.7635441844013441, "grad_norm": 0.1367904543876648, "learning_rate": 3.411435147873837e-05, "loss": 1.132, "step": 4857 }, { "epoch": 0.7637013892982767, "grad_norm": 0.16840605437755585, "learning_rate": 3.410859803290168e-05, "loss": 1.1439, "step": 4858 }, { "epoch": 0.7638585941952092, "grad_norm": 0.13757860660552979, "learning_rate": 3.4102844030746485e-05, "loss": 1.0952, "step": 4859 }, { "epoch": 0.7640157990921417, "grad_norm": 0.1161893904209137, "learning_rate": 3.40970894726242e-05, "loss": 1.1324, "step": 4860 }, { "epoch": 0.7641730039890743, "grad_norm": 0.13835035264492035, "learning_rate": 3.409133435888632e-05, "loss": 1.1272, "step": 4861 }, { "epoch": 0.7643302088860068, "grad_norm": 0.12219596654176712, "learning_rate": 3.408557868988431e-05, "loss": 1.0751, "step": 4862 }, { "epoch": 0.7644874137829394, "grad_norm": 0.1328803300857544, "learning_rate": 3.407982246596972e-05, "loss": 1.0784, "step": 4863 }, { "epoch": 0.7646446186798719, "grad_norm": 0.1300496906042099, "learning_rate": 3.407406568749414e-05, "loss": 1.0277, "step": 4864 }, { "epoch": 0.7648018235768044, "grad_norm": 0.1289186179637909, "learning_rate": 3.4068308354809134e-05, "loss": 1.1356, "step": 4865 }, { "epoch": 0.764959028473737, "grad_norm": 0.12275593727827072, "learning_rate": 3.406255046826637e-05, "loss": 1.1002, "step": 4866 }, { "epoch": 0.7651162333706695, "grad_norm": 0.13451841473579407, "learning_rate": 3.4056792028217494e-05, "loss": 1.0173, "step": 4867 }, { "epoch": 0.7652734382676021, "grad_norm": 0.14932623505592346, "learning_rate": 3.405103303501422e-05, "loss": 1.1156, "step": 4868 }, { "epoch": 0.7654306431645346, "grad_norm": 0.13008396327495575, "learning_rate": 3.404527348900829e-05, "loss": 1.1443, "step": 4869 }, { "epoch": 0.7655878480614671, "grad_norm": 0.15099525451660156, "learning_rate": 3.403951339055147e-05, "loss": 1.1328, "step": 4870 }, { "epoch": 0.7657450529583997, "grad_norm": 0.12122808396816254, "learning_rate": 3.4033752739995563e-05, "loss": 1.0865, "step": 4871 }, { "epoch": 0.7659022578553322, "grad_norm": 0.12879222631454468, "learning_rate": 3.402799153769241e-05, "loss": 1.0814, "step": 4872 }, { "epoch": 0.7660594627522648, "grad_norm": 0.17125335335731506, "learning_rate": 3.402222978399389e-05, "loss": 1.076, "step": 4873 }, { "epoch": 0.7662166676491973, "grad_norm": 0.11798808723688126, "learning_rate": 3.401646747925189e-05, "loss": 0.9454, "step": 4874 }, { "epoch": 0.7663738725461298, "grad_norm": 0.1341928094625473, "learning_rate": 3.401070462381837e-05, "loss": 1.1656, "step": 4875 }, { "epoch": 0.7665310774430624, "grad_norm": 0.14401687681674957, "learning_rate": 3.40049412180453e-05, "loss": 1.1035, "step": 4876 }, { "epoch": 0.7666882823399949, "grad_norm": 0.13822953402996063, "learning_rate": 3.399917726228466e-05, "loss": 1.111, "step": 4877 }, { "epoch": 0.7668454872369275, "grad_norm": 0.12841646373271942, "learning_rate": 3.3993412756888535e-05, "loss": 1.0493, "step": 4878 }, { "epoch": 0.76700269213386, "grad_norm": 0.15926672518253326, "learning_rate": 3.398764770220896e-05, "loss": 1.1567, "step": 4879 }, { "epoch": 0.7671598970307925, "grad_norm": 0.16407644748687744, "learning_rate": 3.3981882098598075e-05, "loss": 1.1008, "step": 4880 }, { "epoch": 0.7673171019277251, "grad_norm": 0.1251910775899887, "learning_rate": 3.3976115946408e-05, "loss": 1.1036, "step": 4881 }, { "epoch": 0.7674743068246576, "grad_norm": 0.1440126895904541, "learning_rate": 3.397034924599091e-05, "loss": 1.0757, "step": 4882 }, { "epoch": 0.7676315117215902, "grad_norm": 0.1343274861574173, "learning_rate": 3.3964581997699026e-05, "loss": 1.139, "step": 4883 }, { "epoch": 0.7677887166185227, "grad_norm": 0.13377784192562103, "learning_rate": 3.395881420188457e-05, "loss": 1.2354, "step": 4884 }, { "epoch": 0.7679459215154552, "grad_norm": 0.1382710486650467, "learning_rate": 3.395304585889984e-05, "loss": 1.0949, "step": 4885 }, { "epoch": 0.7681031264123878, "grad_norm": 0.12571008503437042, "learning_rate": 3.3947276969097124e-05, "loss": 1.1223, "step": 4886 }, { "epoch": 0.7682603313093203, "grad_norm": 0.15523536503314972, "learning_rate": 3.394150753282878e-05, "loss": 1.0913, "step": 4887 }, { "epoch": 0.7684175362062529, "grad_norm": 0.14620168507099152, "learning_rate": 3.3935737550447175e-05, "loss": 1.1561, "step": 4888 }, { "epoch": 0.7685747411031854, "grad_norm": 0.156593456864357, "learning_rate": 3.3929967022304714e-05, "loss": 1.0524, "step": 4889 }, { "epoch": 0.7687319460001178, "grad_norm": 0.13538812100887299, "learning_rate": 3.392419594875385e-05, "loss": 1.1074, "step": 4890 }, { "epoch": 0.7688891508970505, "grad_norm": 0.1342657506465912, "learning_rate": 3.3918424330147045e-05, "loss": 1.0716, "step": 4891 }, { "epoch": 0.769046355793983, "grad_norm": 0.1275315284729004, "learning_rate": 3.391265216683682e-05, "loss": 1.038, "step": 4892 }, { "epoch": 0.7692035606909156, "grad_norm": 0.14354261755943298, "learning_rate": 3.390687945917571e-05, "loss": 1.08, "step": 4893 }, { "epoch": 0.769360765587848, "grad_norm": 0.1632268726825714, "learning_rate": 3.3901106207516285e-05, "loss": 1.0238, "step": 4894 }, { "epoch": 0.7695179704847807, "grad_norm": 0.14514832198619843, "learning_rate": 3.389533241221117e-05, "loss": 1.1181, "step": 4895 }, { "epoch": 0.7696751753817132, "grad_norm": 0.15159910917282104, "learning_rate": 3.388955807361299e-05, "loss": 1.1007, "step": 4896 }, { "epoch": 0.7698323802786456, "grad_norm": 0.13943949341773987, "learning_rate": 3.388378319207443e-05, "loss": 0.9868, "step": 4897 }, { "epoch": 0.7699895851755783, "grad_norm": 0.12545351684093475, "learning_rate": 3.387800776794818e-05, "loss": 1.1619, "step": 4898 }, { "epoch": 0.7701467900725107, "grad_norm": 0.11740878969430923, "learning_rate": 3.387223180158701e-05, "loss": 1.0607, "step": 4899 }, { "epoch": 0.7703039949694434, "grad_norm": 0.13455021381378174, "learning_rate": 3.3866455293343666e-05, "loss": 1.0897, "step": 4900 }, { "epoch": 0.7704611998663758, "grad_norm": 0.12867052853107452, "learning_rate": 3.386067824357098e-05, "loss": 1.1811, "step": 4901 }, { "epoch": 0.7706184047633083, "grad_norm": 0.1307537704706192, "learning_rate": 3.385490065262176e-05, "loss": 1.0823, "step": 4902 }, { "epoch": 0.770775609660241, "grad_norm": 0.13356691598892212, "learning_rate": 3.3849122520848915e-05, "loss": 1.245, "step": 4903 }, { "epoch": 0.7709328145571734, "grad_norm": 0.13139644265174866, "learning_rate": 3.384334384860533e-05, "loss": 1.04, "step": 4904 }, { "epoch": 0.771090019454106, "grad_norm": 0.13805650174617767, "learning_rate": 3.3837564636243944e-05, "loss": 1.0628, "step": 4905 }, { "epoch": 0.7712472243510385, "grad_norm": 0.13503144681453705, "learning_rate": 3.383178488411775e-05, "loss": 1.0897, "step": 4906 }, { "epoch": 0.771404429247971, "grad_norm": 0.1391112506389618, "learning_rate": 3.382600459257973e-05, "loss": 1.034, "step": 4907 }, { "epoch": 0.7715616341449036, "grad_norm": 0.1330227553844452, "learning_rate": 3.3820223761982926e-05, "loss": 1.1307, "step": 4908 }, { "epoch": 0.7717188390418361, "grad_norm": 0.13399213552474976, "learning_rate": 3.381444239268041e-05, "loss": 1.0304, "step": 4909 }, { "epoch": 0.7718760439387687, "grad_norm": 0.11677687615156174, "learning_rate": 3.380866048502531e-05, "loss": 1.103, "step": 4910 }, { "epoch": 0.7720332488357012, "grad_norm": 0.17897523939609528, "learning_rate": 3.380287803937072e-05, "loss": 1.0233, "step": 4911 }, { "epoch": 0.7721904537326337, "grad_norm": 0.12963220477104187, "learning_rate": 3.3797095056069836e-05, "loss": 1.0871, "step": 4912 }, { "epoch": 0.7723476586295663, "grad_norm": 0.13949821889400482, "learning_rate": 3.379131153547587e-05, "loss": 1.2007, "step": 4913 }, { "epoch": 0.7725048635264988, "grad_norm": 0.13470464944839478, "learning_rate": 3.378552747794203e-05, "loss": 1.0367, "step": 4914 }, { "epoch": 0.7726620684234314, "grad_norm": 0.1272779107093811, "learning_rate": 3.377974288382161e-05, "loss": 1.0967, "step": 4915 }, { "epoch": 0.7728192733203639, "grad_norm": 0.13421685993671417, "learning_rate": 3.377395775346789e-05, "loss": 1.1023, "step": 4916 }, { "epoch": 0.7729764782172964, "grad_norm": 0.12207071483135223, "learning_rate": 3.376817208723422e-05, "loss": 1.1236, "step": 4917 }, { "epoch": 0.773133683114229, "grad_norm": 0.12212815135717392, "learning_rate": 3.376238588547396e-05, "loss": 1.0593, "step": 4918 }, { "epoch": 0.7732908880111615, "grad_norm": 0.1423274278640747, "learning_rate": 3.3756599148540494e-05, "loss": 1.1556, "step": 4919 }, { "epoch": 0.7734480929080941, "grad_norm": 0.12456110864877701, "learning_rate": 3.375081187678729e-05, "loss": 0.9687, "step": 4920 }, { "epoch": 0.7736052978050266, "grad_norm": 0.13654069602489471, "learning_rate": 3.3745024070567774e-05, "loss": 1.1634, "step": 4921 }, { "epoch": 0.7737625027019591, "grad_norm": 0.13371238112449646, "learning_rate": 3.373923573023547e-05, "loss": 1.1374, "step": 4922 }, { "epoch": 0.7739197075988917, "grad_norm": 0.14525705575942993, "learning_rate": 3.3733446856143895e-05, "loss": 1.0809, "step": 4923 }, { "epoch": 0.7740769124958242, "grad_norm": 0.1339891105890274, "learning_rate": 3.3727657448646614e-05, "loss": 1.1174, "step": 4924 }, { "epoch": 0.7742341173927568, "grad_norm": 0.13140228390693665, "learning_rate": 3.372186750809723e-05, "loss": 1.1715, "step": 4925 }, { "epoch": 0.7743913222896893, "grad_norm": 0.17299087345600128, "learning_rate": 3.3716077034849345e-05, "loss": 0.95, "step": 4926 }, { "epoch": 0.7745485271866218, "grad_norm": 0.15158572793006897, "learning_rate": 3.371028602925666e-05, "loss": 1.1738, "step": 4927 }, { "epoch": 0.7747057320835544, "grad_norm": 0.13169889152050018, "learning_rate": 3.3704494491672837e-05, "loss": 1.0398, "step": 4928 }, { "epoch": 0.7748629369804869, "grad_norm": 0.13216854631900787, "learning_rate": 3.36987024224516e-05, "loss": 1.1112, "step": 4929 }, { "epoch": 0.7750201418774195, "grad_norm": 0.14911629259586334, "learning_rate": 3.369290982194671e-05, "loss": 0.9443, "step": 4930 }, { "epoch": 0.775177346774352, "grad_norm": 0.13613444566726685, "learning_rate": 3.368711669051198e-05, "loss": 1.0903, "step": 4931 }, { "epoch": 0.7753345516712845, "grad_norm": 0.12087051570415497, "learning_rate": 3.368132302850121e-05, "loss": 1.1767, "step": 4932 }, { "epoch": 0.7754917565682171, "grad_norm": 0.13154035806655884, "learning_rate": 3.367552883626825e-05, "loss": 1.0404, "step": 4933 }, { "epoch": 0.7756489614651496, "grad_norm": 0.13925373554229736, "learning_rate": 3.366973411416702e-05, "loss": 0.9788, "step": 4934 }, { "epoch": 0.7758061663620822, "grad_norm": 0.1517360657453537, "learning_rate": 3.366393886255139e-05, "loss": 1.1265, "step": 4935 }, { "epoch": 0.7759633712590147, "grad_norm": 0.1432923823595047, "learning_rate": 3.365814308177536e-05, "loss": 1.088, "step": 4936 }, { "epoch": 0.7761205761559472, "grad_norm": 0.12432681024074554, "learning_rate": 3.365234677219288e-05, "loss": 1.0405, "step": 4937 }, { "epoch": 0.7762777810528798, "grad_norm": 0.1287374645471573, "learning_rate": 3.364654993415798e-05, "loss": 1.1894, "step": 4938 }, { "epoch": 0.7764349859498123, "grad_norm": 0.1255466192960739, "learning_rate": 3.3640752568024706e-05, "loss": 1.1598, "step": 4939 }, { "epoch": 0.7765921908467449, "grad_norm": 0.13246101140975952, "learning_rate": 3.3634954674147146e-05, "loss": 1.0111, "step": 4940 }, { "epoch": 0.7767493957436774, "grad_norm": 0.1282433420419693, "learning_rate": 3.362915625287941e-05, "loss": 1.1041, "step": 4941 }, { "epoch": 0.7769066006406099, "grad_norm": 0.13137435913085938, "learning_rate": 3.362335730457564e-05, "loss": 1.1897, "step": 4942 }, { "epoch": 0.7770638055375425, "grad_norm": 0.15391333401203156, "learning_rate": 3.3617557829590015e-05, "loss": 1.0739, "step": 4943 }, { "epoch": 0.777221010434475, "grad_norm": 0.13495557010173798, "learning_rate": 3.361175782827675e-05, "loss": 1.1336, "step": 4944 }, { "epoch": 0.7773782153314076, "grad_norm": 0.12705542147159576, "learning_rate": 3.3605957300990073e-05, "loss": 1.0736, "step": 4945 }, { "epoch": 0.7775354202283401, "grad_norm": 0.12318447977304459, "learning_rate": 3.360015624808427e-05, "loss": 1.0195, "step": 4946 }, { "epoch": 0.7776926251252727, "grad_norm": 0.1322765052318573, "learning_rate": 3.3594354669913654e-05, "loss": 1.1146, "step": 4947 }, { "epoch": 0.7778498300222052, "grad_norm": 0.14443641901016235, "learning_rate": 3.3588552566832545e-05, "loss": 1.0598, "step": 4948 }, { "epoch": 0.7780070349191377, "grad_norm": 0.11805952340364456, "learning_rate": 3.358274993919532e-05, "loss": 0.9953, "step": 4949 }, { "epoch": 0.7781642398160703, "grad_norm": 0.12899643182754517, "learning_rate": 3.357694678735639e-05, "loss": 1.1646, "step": 4950 }, { "epoch": 0.7783214447130028, "grad_norm": 0.14672844111919403, "learning_rate": 3.3571143111670187e-05, "loss": 1.0165, "step": 4951 }, { "epoch": 0.7784786496099354, "grad_norm": 0.1444154977798462, "learning_rate": 3.3565338912491165e-05, "loss": 1.0543, "step": 4952 }, { "epoch": 0.7786358545068679, "grad_norm": 0.12198904156684875, "learning_rate": 3.3559534190173834e-05, "loss": 1.0839, "step": 4953 }, { "epoch": 0.7787930594038004, "grad_norm": 0.13399964570999146, "learning_rate": 3.355372894507272e-05, "loss": 1.0607, "step": 4954 }, { "epoch": 0.778950264300733, "grad_norm": 0.13722872734069824, "learning_rate": 3.3547923177542396e-05, "loss": 1.1332, "step": 4955 }, { "epoch": 0.7791074691976655, "grad_norm": 0.12063074856996536, "learning_rate": 3.354211688793744e-05, "loss": 1.0272, "step": 4956 }, { "epoch": 0.7792646740945981, "grad_norm": 0.13837161660194397, "learning_rate": 3.353631007661249e-05, "loss": 1.1066, "step": 4957 }, { "epoch": 0.7794218789915306, "grad_norm": 0.1389688104391098, "learning_rate": 3.353050274392219e-05, "loss": 1.0007, "step": 4958 }, { "epoch": 0.7795790838884631, "grad_norm": 0.11978980898857117, "learning_rate": 3.3524694890221244e-05, "loss": 1.1153, "step": 4959 }, { "epoch": 0.7797362887853957, "grad_norm": 0.13067257404327393, "learning_rate": 3.3518886515864366e-05, "loss": 1.1104, "step": 4960 }, { "epoch": 0.7797362887853957, "eval_loss": 1.0962234735488892, "eval_runtime": 2341.8407, "eval_samples_per_second": 3.953, "eval_steps_per_second": 1.977, "step": 4960 }, { "epoch": 0.7798934936823282, "grad_norm": 0.14436736702919006, "learning_rate": 3.351307762120631e-05, "loss": 1.1186, "step": 4961 }, { "epoch": 0.7800506985792608, "grad_norm": 0.13643218576908112, "learning_rate": 3.350726820660187e-05, "loss": 1.2117, "step": 4962 }, { "epoch": 0.7802079034761933, "grad_norm": 0.15105631947517395, "learning_rate": 3.350145827240585e-05, "loss": 1.082, "step": 4963 }, { "epoch": 0.7803651083731258, "grad_norm": 0.1495438516139984, "learning_rate": 3.349564781897311e-05, "loss": 1.1437, "step": 4964 }, { "epoch": 0.7805223132700584, "grad_norm": 0.13507118821144104, "learning_rate": 3.348983684665852e-05, "loss": 1.0525, "step": 4965 }, { "epoch": 0.7806795181669909, "grad_norm": 0.1323672980070114, "learning_rate": 3.348402535581701e-05, "loss": 1.0791, "step": 4966 }, { "epoch": 0.7808367230639235, "grad_norm": 0.15458525717258453, "learning_rate": 3.34782133468035e-05, "loss": 1.0797, "step": 4967 }, { "epoch": 0.780993927960856, "grad_norm": 0.1274881511926651, "learning_rate": 3.347240081997297e-05, "loss": 1.0706, "step": 4968 }, { "epoch": 0.7811511328577885, "grad_norm": 0.19001010060310364, "learning_rate": 3.3466587775680444e-05, "loss": 1.1347, "step": 4969 }, { "epoch": 0.7813083377547211, "grad_norm": 0.13060776889324188, "learning_rate": 3.3460774214280944e-05, "loss": 1.1397, "step": 4970 }, { "epoch": 0.7814655426516536, "grad_norm": 0.11697187274694443, "learning_rate": 3.345496013612955e-05, "loss": 1.0687, "step": 4971 }, { "epoch": 0.7816227475485862, "grad_norm": 0.13726262748241425, "learning_rate": 3.344914554158136e-05, "loss": 1.0154, "step": 4972 }, { "epoch": 0.7817799524455187, "grad_norm": 0.12287222594022751, "learning_rate": 3.3443330430991506e-05, "loss": 0.9618, "step": 4973 }, { "epoch": 0.7819371573424512, "grad_norm": 0.14394275844097137, "learning_rate": 3.343751480471515e-05, "loss": 1.1896, "step": 4974 }, { "epoch": 0.7820943622393838, "grad_norm": 0.13699425756931305, "learning_rate": 3.3431698663107496e-05, "loss": 1.1249, "step": 4975 }, { "epoch": 0.7822515671363163, "grad_norm": 0.13724492490291595, "learning_rate": 3.3425882006523765e-05, "loss": 1.1185, "step": 4976 }, { "epoch": 0.7824087720332489, "grad_norm": 0.13086111843585968, "learning_rate": 3.3420064835319224e-05, "loss": 1.122, "step": 4977 }, { "epoch": 0.7825659769301814, "grad_norm": 0.13510918617248535, "learning_rate": 3.341424714984916e-05, "loss": 1.117, "step": 4978 }, { "epoch": 0.7827231818271139, "grad_norm": 0.13024483621120453, "learning_rate": 3.340842895046889e-05, "loss": 1.0512, "step": 4979 }, { "epoch": 0.7828803867240465, "grad_norm": 0.12385273724794388, "learning_rate": 3.340261023753377e-05, "loss": 0.9738, "step": 4980 }, { "epoch": 0.783037591620979, "grad_norm": 0.1568519026041031, "learning_rate": 3.33967910113992e-05, "loss": 1.1059, "step": 4981 }, { "epoch": 0.7831947965179116, "grad_norm": 0.1357090175151825, "learning_rate": 3.339097127242057e-05, "loss": 1.0016, "step": 4982 }, { "epoch": 0.7833520014148441, "grad_norm": 0.13885211944580078, "learning_rate": 3.3385151020953345e-05, "loss": 1.1129, "step": 4983 }, { "epoch": 0.7835092063117766, "grad_norm": 0.15067574381828308, "learning_rate": 3.337933025735299e-05, "loss": 1.1112, "step": 4984 }, { "epoch": 0.7836664112087092, "grad_norm": 0.22319741547107697, "learning_rate": 3.337350898197504e-05, "loss": 1.0822, "step": 4985 }, { "epoch": 0.7838236161056417, "grad_norm": 0.13410452008247375, "learning_rate": 3.3367687195175006e-05, "loss": 1.1652, "step": 4986 }, { "epoch": 0.7839808210025743, "grad_norm": 0.15852157771587372, "learning_rate": 3.3361864897308484e-05, "loss": 1.0636, "step": 4987 }, { "epoch": 0.7841380258995068, "grad_norm": 0.18127258121967316, "learning_rate": 3.335604208873106e-05, "loss": 1.099, "step": 4988 }, { "epoch": 0.7842952307964393, "grad_norm": 0.14409101009368896, "learning_rate": 3.335021876979838e-05, "loss": 1.0831, "step": 4989 }, { "epoch": 0.7844524356933719, "grad_norm": 0.15617966651916504, "learning_rate": 3.334439494086612e-05, "loss": 1.0458, "step": 4990 }, { "epoch": 0.7846096405903044, "grad_norm": 0.12643630802631378, "learning_rate": 3.333857060228995e-05, "loss": 1.1234, "step": 4991 }, { "epoch": 0.784766845487237, "grad_norm": 0.14369435608386993, "learning_rate": 3.3332745754425626e-05, "loss": 1.1506, "step": 4992 }, { "epoch": 0.7849240503841695, "grad_norm": 0.12761743366718292, "learning_rate": 3.332692039762889e-05, "loss": 1.1512, "step": 4993 }, { "epoch": 0.785081255281102, "grad_norm": 0.1430346816778183, "learning_rate": 3.332109453225554e-05, "loss": 0.99, "step": 4994 }, { "epoch": 0.7852384601780346, "grad_norm": 0.12227287888526917, "learning_rate": 3.3315268158661396e-05, "loss": 1.1705, "step": 4995 }, { "epoch": 0.785395665074967, "grad_norm": 0.1261235624551773, "learning_rate": 3.33094412772023e-05, "loss": 1.0471, "step": 4996 }, { "epoch": 0.7855528699718997, "grad_norm": 0.14919154345989227, "learning_rate": 3.330361388823416e-05, "loss": 1.0884, "step": 4997 }, { "epoch": 0.7857100748688322, "grad_norm": 0.1251683533191681, "learning_rate": 3.329778599211287e-05, "loss": 0.9639, "step": 4998 }, { "epoch": 0.7858672797657648, "grad_norm": 0.13637672364711761, "learning_rate": 3.3291957589194385e-05, "loss": 1.1214, "step": 4999 }, { "epoch": 0.7860244846626973, "grad_norm": 0.16353115439414978, "learning_rate": 3.328612867983468e-05, "loss": 1.0878, "step": 5000 }, { "epoch": 0.7861816895596297, "grad_norm": 0.12601494789123535, "learning_rate": 3.328029926438976e-05, "loss": 1.1214, "step": 5001 }, { "epoch": 0.7863388944565624, "grad_norm": 0.13055525720119476, "learning_rate": 3.3274469343215666e-05, "loss": 1.1049, "step": 5002 }, { "epoch": 0.7864960993534948, "grad_norm": 0.14286868274211884, "learning_rate": 3.326863891666846e-05, "loss": 1.2236, "step": 5003 }, { "epoch": 0.7866533042504275, "grad_norm": 0.12616075575351715, "learning_rate": 3.326280798510426e-05, "loss": 1.1382, "step": 5004 }, { "epoch": 0.78681050914736, "grad_norm": 0.12813489139080048, "learning_rate": 3.3256976548879184e-05, "loss": 1.0599, "step": 5005 }, { "epoch": 0.7869677140442924, "grad_norm": 0.12909750640392303, "learning_rate": 3.325114460834939e-05, "loss": 0.9082, "step": 5006 }, { "epoch": 0.787124918941225, "grad_norm": 0.12364039570093155, "learning_rate": 3.324531216387108e-05, "loss": 1.0784, "step": 5007 }, { "epoch": 0.7872821238381575, "grad_norm": 0.13159489631652832, "learning_rate": 3.3239479215800476e-05, "loss": 1.1308, "step": 5008 }, { "epoch": 0.7874393287350901, "grad_norm": 0.1293668895959854, "learning_rate": 3.323364576449383e-05, "loss": 1.0157, "step": 5009 }, { "epoch": 0.7875965336320226, "grad_norm": 0.13375996053218842, "learning_rate": 3.3227811810307426e-05, "loss": 1.1795, "step": 5010 }, { "epoch": 0.7877537385289551, "grad_norm": 0.13124480843544006, "learning_rate": 3.322197735359759e-05, "loss": 0.9774, "step": 5011 }, { "epoch": 0.7879109434258877, "grad_norm": 0.1319187879562378, "learning_rate": 3.321614239472064e-05, "loss": 1.0874, "step": 5012 }, { "epoch": 0.7880681483228202, "grad_norm": 0.1557384580373764, "learning_rate": 3.3210306934032995e-05, "loss": 1.0408, "step": 5013 }, { "epoch": 0.7882253532197528, "grad_norm": 0.17597688734531403, "learning_rate": 3.3204470971891026e-05, "loss": 1.0429, "step": 5014 }, { "epoch": 0.7883825581166853, "grad_norm": 0.1346769481897354, "learning_rate": 3.31986345086512e-05, "loss": 1.1029, "step": 5015 }, { "epoch": 0.7885397630136178, "grad_norm": 0.16205650568008423, "learning_rate": 3.319279754466996e-05, "loss": 1.1131, "step": 5016 }, { "epoch": 0.7886969679105504, "grad_norm": 0.17671433091163635, "learning_rate": 3.3186960080303816e-05, "loss": 1.0771, "step": 5017 }, { "epoch": 0.7888541728074829, "grad_norm": 0.15653863549232483, "learning_rate": 3.31811221159093e-05, "loss": 1.1959, "step": 5018 }, { "epoch": 0.7890113777044155, "grad_norm": 0.14297321438789368, "learning_rate": 3.317528365184298e-05, "loss": 1.1692, "step": 5019 }, { "epoch": 0.789168582601348, "grad_norm": 0.16521745920181274, "learning_rate": 3.316944468846144e-05, "loss": 1.0477, "step": 5020 }, { "epoch": 0.7893257874982805, "grad_norm": 0.13730360567569733, "learning_rate": 3.3163605226121296e-05, "loss": 1.1641, "step": 5021 }, { "epoch": 0.7894829923952131, "grad_norm": 0.16981300711631775, "learning_rate": 3.315776526517921e-05, "loss": 1.106, "step": 5022 }, { "epoch": 0.7896401972921456, "grad_norm": 0.1734813153743744, "learning_rate": 3.315192480599185e-05, "loss": 1.1008, "step": 5023 }, { "epoch": 0.7897974021890782, "grad_norm": 0.14239849150180817, "learning_rate": 3.314608384891594e-05, "loss": 1.0765, "step": 5024 }, { "epoch": 0.7899546070860107, "grad_norm": 0.14138491451740265, "learning_rate": 3.314024239430824e-05, "loss": 1.1847, "step": 5025 }, { "epoch": 0.7901118119829432, "grad_norm": 0.1398213654756546, "learning_rate": 3.313440044252549e-05, "loss": 1.1479, "step": 5026 }, { "epoch": 0.7902690168798758, "grad_norm": 0.12144920229911804, "learning_rate": 3.3128557993924516e-05, "loss": 1.0971, "step": 5027 }, { "epoch": 0.7904262217768083, "grad_norm": 0.14630118012428284, "learning_rate": 3.312271504886214e-05, "loss": 1.102, "step": 5028 }, { "epoch": 0.7905834266737409, "grad_norm": 0.14643485844135284, "learning_rate": 3.311687160769524e-05, "loss": 1.0097, "step": 5029 }, { "epoch": 0.7907406315706734, "grad_norm": 0.12107902765274048, "learning_rate": 3.311102767078071e-05, "loss": 1.1308, "step": 5030 }, { "epoch": 0.7908978364676059, "grad_norm": 0.13668377697467804, "learning_rate": 3.3105183238475455e-05, "loss": 1.0909, "step": 5031 }, { "epoch": 0.7910550413645385, "grad_norm": 0.13928672671318054, "learning_rate": 3.309933831113646e-05, "loss": 1.0461, "step": 5032 }, { "epoch": 0.791212246261471, "grad_norm": 0.13744144141674042, "learning_rate": 3.309349288912069e-05, "loss": 0.9517, "step": 5033 }, { "epoch": 0.7913694511584036, "grad_norm": 0.14722764492034912, "learning_rate": 3.308764697278518e-05, "loss": 1.1708, "step": 5034 }, { "epoch": 0.7915266560553361, "grad_norm": 0.13956117630004883, "learning_rate": 3.3081800562486946e-05, "loss": 1.043, "step": 5035 }, { "epoch": 0.7916838609522686, "grad_norm": 0.1356237679719925, "learning_rate": 3.307595365858309e-05, "loss": 1.1325, "step": 5036 }, { "epoch": 0.7918410658492012, "grad_norm": 0.1514151692390442, "learning_rate": 3.307010626143071e-05, "loss": 1.177, "step": 5037 }, { "epoch": 0.7919982707461337, "grad_norm": 0.11895293742418289, "learning_rate": 3.306425837138695e-05, "loss": 1.1792, "step": 5038 }, { "epoch": 0.7921554756430663, "grad_norm": 0.13698509335517883, "learning_rate": 3.305840998880897e-05, "loss": 1.0816, "step": 5039 }, { "epoch": 0.7923126805399988, "grad_norm": 0.13310359418392181, "learning_rate": 3.3052561114053965e-05, "loss": 1.0018, "step": 5040 }, { "epoch": 0.7924698854369313, "grad_norm": 0.1219170093536377, "learning_rate": 3.3046711747479166e-05, "loss": 0.9989, "step": 5041 }, { "epoch": 0.7926270903338639, "grad_norm": 0.13483086228370667, "learning_rate": 3.304086188944183e-05, "loss": 1.114, "step": 5042 }, { "epoch": 0.7927842952307964, "grad_norm": 0.1272246390581131, "learning_rate": 3.3035011540299245e-05, "loss": 1.1405, "step": 5043 }, { "epoch": 0.792941500127729, "grad_norm": 0.16075541079044342, "learning_rate": 3.302916070040872e-05, "loss": 1.1148, "step": 5044 }, { "epoch": 0.7930987050246615, "grad_norm": 0.12627846002578735, "learning_rate": 3.302330937012761e-05, "loss": 1.095, "step": 5045 }, { "epoch": 0.793255909921594, "grad_norm": 0.13641822338104248, "learning_rate": 3.3017457549813304e-05, "loss": 1.0507, "step": 5046 }, { "epoch": 0.7934131148185266, "grad_norm": 0.14821748435497284, "learning_rate": 3.301160523982317e-05, "loss": 1.0466, "step": 5047 }, { "epoch": 0.7935703197154591, "grad_norm": 0.13757331669330597, "learning_rate": 3.3005752440514694e-05, "loss": 1.1447, "step": 5048 }, { "epoch": 0.7937275246123917, "grad_norm": 0.1334858238697052, "learning_rate": 3.299989915224531e-05, "loss": 1.0936, "step": 5049 }, { "epoch": 0.7938847295093242, "grad_norm": 0.13829028606414795, "learning_rate": 3.299404537537252e-05, "loss": 1.1703, "step": 5050 }, { "epoch": 0.7940419344062567, "grad_norm": 0.1311250925064087, "learning_rate": 3.2988191110253866e-05, "loss": 1.0749, "step": 5051 }, { "epoch": 0.7941991393031893, "grad_norm": 0.13316184282302856, "learning_rate": 3.2982336357246875e-05, "loss": 1.1073, "step": 5052 }, { "epoch": 0.7943563442001218, "grad_norm": 0.1327652633190155, "learning_rate": 3.297648111670916e-05, "loss": 0.986, "step": 5053 }, { "epoch": 0.7945135490970544, "grad_norm": 0.13477909564971924, "learning_rate": 3.297062538899832e-05, "loss": 1.1242, "step": 5054 }, { "epoch": 0.7946707539939869, "grad_norm": 0.1262103021144867, "learning_rate": 3.296476917447202e-05, "loss": 1.0395, "step": 5055 }, { "epoch": 0.7948279588909195, "grad_norm": 0.1377694308757782, "learning_rate": 3.295891247348791e-05, "loss": 1.0956, "step": 5056 }, { "epoch": 0.794985163787852, "grad_norm": 0.14717347919940948, "learning_rate": 3.295305528640371e-05, "loss": 1.1344, "step": 5057 }, { "epoch": 0.7951423686847845, "grad_norm": 0.13811689615249634, "learning_rate": 3.2947197613577156e-05, "loss": 1.0311, "step": 5058 }, { "epoch": 0.7952995735817171, "grad_norm": 0.13427940011024475, "learning_rate": 3.294133945536601e-05, "loss": 0.9746, "step": 5059 }, { "epoch": 0.7954567784786496, "grad_norm": 0.14605213701725006, "learning_rate": 3.293548081212807e-05, "loss": 1.0583, "step": 5060 }, { "epoch": 0.7956139833755822, "grad_norm": 0.11798227578401566, "learning_rate": 3.292962168422114e-05, "loss": 1.1211, "step": 5061 }, { "epoch": 0.7957711882725147, "grad_norm": 0.1400463730096817, "learning_rate": 3.2923762072003094e-05, "loss": 1.1712, "step": 5062 }, { "epoch": 0.7959283931694472, "grad_norm": 0.1203552633523941, "learning_rate": 3.291790197583181e-05, "loss": 1.0153, "step": 5063 }, { "epoch": 0.7960855980663798, "grad_norm": 0.14843995869159698, "learning_rate": 3.2912041396065205e-05, "loss": 1.0718, "step": 5064 }, { "epoch": 0.7962428029633123, "grad_norm": 0.12414611876010895, "learning_rate": 3.290618033306121e-05, "loss": 1.1268, "step": 5065 }, { "epoch": 0.7964000078602449, "grad_norm": 0.13710583746433258, "learning_rate": 3.2900318787177794e-05, "loss": 1.0722, "step": 5066 }, { "epoch": 0.7965572127571774, "grad_norm": 0.14502912759780884, "learning_rate": 3.2894456758772976e-05, "loss": 1.1507, "step": 5067 }, { "epoch": 0.7967144176541099, "grad_norm": 0.14237187802791595, "learning_rate": 3.288859424820477e-05, "loss": 1.0388, "step": 5068 }, { "epoch": 0.7968716225510425, "grad_norm": 0.1451392024755478, "learning_rate": 3.288273125583124e-05, "loss": 1.1638, "step": 5069 }, { "epoch": 0.797028827447975, "grad_norm": 0.16580519080162048, "learning_rate": 3.2876867782010477e-05, "loss": 1.036, "step": 5070 }, { "epoch": 0.7971860323449076, "grad_norm": 0.22899524867534637, "learning_rate": 3.28710038271006e-05, "loss": 1.0015, "step": 5071 }, { "epoch": 0.7973432372418401, "grad_norm": 0.2474881261587143, "learning_rate": 3.2865139391459764e-05, "loss": 1.0732, "step": 5072 }, { "epoch": 0.7975004421387726, "grad_norm": 0.15084420144557953, "learning_rate": 3.2859274475446134e-05, "loss": 1.0875, "step": 5073 }, { "epoch": 0.7976576470357052, "grad_norm": 0.36738401651382446, "learning_rate": 3.2853409079417915e-05, "loss": 1.143, "step": 5074 }, { "epoch": 0.7978148519326377, "grad_norm": 0.25699618458747864, "learning_rate": 3.284754320373336e-05, "loss": 1.1118, "step": 5075 }, { "epoch": 0.7979720568295703, "grad_norm": 0.24444612860679626, "learning_rate": 3.284167684875072e-05, "loss": 1.047, "step": 5076 }, { "epoch": 0.7981292617265028, "grad_norm": 0.1936510056257248, "learning_rate": 3.283581001482829e-05, "loss": 1.2441, "step": 5077 }, { "epoch": 0.7982864666234353, "grad_norm": 0.23741251230239868, "learning_rate": 3.28299427023244e-05, "loss": 1.0017, "step": 5078 }, { "epoch": 0.7984436715203679, "grad_norm": 0.13592703640460968, "learning_rate": 3.28240749115974e-05, "loss": 1.1265, "step": 5079 }, { "epoch": 0.7986008764173004, "grad_norm": 0.16751481592655182, "learning_rate": 3.2818206643005675e-05, "loss": 1.0984, "step": 5080 }, { "epoch": 0.798758081314233, "grad_norm": 0.1578437089920044, "learning_rate": 3.2812337896907635e-05, "loss": 1.1604, "step": 5081 }, { "epoch": 0.7989152862111655, "grad_norm": 0.13940554857254028, "learning_rate": 3.280646867366172e-05, "loss": 0.9019, "step": 5082 }, { "epoch": 0.799072491108098, "grad_norm": 0.17531779408454895, "learning_rate": 3.28005989736264e-05, "loss": 1.1489, "step": 5083 }, { "epoch": 0.7992296960050306, "grad_norm": 0.15064075589179993, "learning_rate": 3.279472879716017e-05, "loss": 1.195, "step": 5084 }, { "epoch": 0.7993869009019631, "grad_norm": 0.13430656492710114, "learning_rate": 3.278885814462157e-05, "loss": 1.106, "step": 5085 }, { "epoch": 0.7995441057988957, "grad_norm": 0.15016251802444458, "learning_rate": 3.278298701636914e-05, "loss": 1.1596, "step": 5086 }, { "epoch": 0.7997013106958282, "grad_norm": 0.13735370337963104, "learning_rate": 3.277711541276148e-05, "loss": 1.1242, "step": 5087 }, { "epoch": 0.7998585155927607, "grad_norm": 0.13054972887039185, "learning_rate": 3.277124333415721e-05, "loss": 1.2209, "step": 5088 }, { "epoch": 0.8000157204896933, "grad_norm": 0.13098429143428802, "learning_rate": 3.276537078091495e-05, "loss": 1.1635, "step": 5089 }, { "epoch": 0.8001729253866258, "grad_norm": 0.14684736728668213, "learning_rate": 3.275949775339339e-05, "loss": 1.0943, "step": 5090 }, { "epoch": 0.8003301302835584, "grad_norm": 0.12680160999298096, "learning_rate": 3.2753624251951234e-05, "loss": 1.1548, "step": 5091 }, { "epoch": 0.8004873351804909, "grad_norm": 0.12368624657392502, "learning_rate": 3.274775027694721e-05, "loss": 1.1061, "step": 5092 }, { "epoch": 0.8006445400774234, "grad_norm": 0.12971846759319305, "learning_rate": 3.274187582874008e-05, "loss": 1.0889, "step": 5093 }, { "epoch": 0.800801744974356, "grad_norm": 0.1387680470943451, "learning_rate": 3.2736000907688624e-05, "loss": 1.0168, "step": 5094 }, { "epoch": 0.8009589498712885, "grad_norm": 0.17290528118610382, "learning_rate": 3.273012551415168e-05, "loss": 1.0913, "step": 5095 }, { "epoch": 0.8011161547682211, "grad_norm": 0.14590062201023102, "learning_rate": 3.272424964848806e-05, "loss": 1.0417, "step": 5096 }, { "epoch": 0.8012733596651536, "grad_norm": 0.12929320335388184, "learning_rate": 3.271837331105667e-05, "loss": 1.2211, "step": 5097 }, { "epoch": 0.8014305645620861, "grad_norm": 0.1567511260509491, "learning_rate": 3.27124965022164e-05, "loss": 1.1004, "step": 5098 }, { "epoch": 0.8015877694590187, "grad_norm": 0.12611301243305206, "learning_rate": 3.2706619222326194e-05, "loss": 1.1534, "step": 5099 }, { "epoch": 0.8017449743559512, "grad_norm": 0.12038855254650116, "learning_rate": 3.2700741471745014e-05, "loss": 0.9957, "step": 5100 }, { "epoch": 0.8019021792528838, "grad_norm": 0.13091440498828888, "learning_rate": 3.269486325083183e-05, "loss": 1.1319, "step": 5101 }, { "epoch": 0.8020593841498163, "grad_norm": 0.1311814785003662, "learning_rate": 3.2688984559945686e-05, "loss": 1.1079, "step": 5102 }, { "epoch": 0.8022165890467488, "grad_norm": 0.14918267726898193, "learning_rate": 3.268310539944561e-05, "loss": 1.2066, "step": 5103 }, { "epoch": 0.8023737939436814, "grad_norm": 0.14358769357204437, "learning_rate": 3.2677225769690695e-05, "loss": 1.1974, "step": 5104 }, { "epoch": 0.8025309988406139, "grad_norm": 0.16421452164649963, "learning_rate": 3.267134567104004e-05, "loss": 1.1073, "step": 5105 }, { "epoch": 0.8026882037375465, "grad_norm": 0.13539628684520721, "learning_rate": 3.266546510385278e-05, "loss": 1.0468, "step": 5106 }, { "epoch": 0.802845408634479, "grad_norm": 0.12118083983659744, "learning_rate": 3.265958406848807e-05, "loss": 1.0258, "step": 5107 }, { "epoch": 0.8030026135314116, "grad_norm": 0.12715213000774384, "learning_rate": 3.2653702565305114e-05, "loss": 1.152, "step": 5108 }, { "epoch": 0.803159818428344, "grad_norm": 0.12841090559959412, "learning_rate": 3.264782059466313e-05, "loss": 1.091, "step": 5109 }, { "epoch": 0.8033170233252765, "grad_norm": 0.14065606892108917, "learning_rate": 3.264193815692136e-05, "loss": 1.1107, "step": 5110 }, { "epoch": 0.8034742282222092, "grad_norm": 0.12508265674114227, "learning_rate": 3.2636055252439075e-05, "loss": 1.219, "step": 5111 }, { "epoch": 0.8036314331191416, "grad_norm": 0.14038169384002686, "learning_rate": 3.2630171881575587e-05, "loss": 1.1776, "step": 5112 }, { "epoch": 0.8037886380160743, "grad_norm": 0.18364542722702026, "learning_rate": 3.2624288044690244e-05, "loss": 1.0645, "step": 5113 }, { "epoch": 0.8039458429130067, "grad_norm": 0.13592836260795593, "learning_rate": 3.261840374214239e-05, "loss": 1.1253, "step": 5114 }, { "epoch": 0.8041030478099392, "grad_norm": 0.16061215102672577, "learning_rate": 3.261251897429142e-05, "loss": 1.1096, "step": 5115 }, { "epoch": 0.8042602527068718, "grad_norm": 0.13377389311790466, "learning_rate": 3.260663374149675e-05, "loss": 1.0646, "step": 5116 }, { "epoch": 0.8044174576038043, "grad_norm": 0.13679906725883484, "learning_rate": 3.260074804411784e-05, "loss": 1.0069, "step": 5117 }, { "epoch": 0.804574662500737, "grad_norm": 0.1492318958044052, "learning_rate": 3.2594861882514156e-05, "loss": 1.0913, "step": 5118 }, { "epoch": 0.8047318673976694, "grad_norm": 0.13761930167675018, "learning_rate": 3.2588975257045207e-05, "loss": 1.0727, "step": 5119 }, { "epoch": 0.8048890722946019, "grad_norm": 0.13349227607250214, "learning_rate": 3.258308816807052e-05, "loss": 1.1991, "step": 5120 }, { "epoch": 0.8048890722946019, "eval_loss": 1.0947285890579224, "eval_runtime": 2315.7281, "eval_samples_per_second": 3.998, "eval_steps_per_second": 1.999, "step": 5120 }, { "epoch": 0.8050462771915345, "grad_norm": 0.1252872198820114, "learning_rate": 3.2577200615949664e-05, "loss": 1.185, "step": 5121 }, { "epoch": 0.805203482088467, "grad_norm": 0.1414654552936554, "learning_rate": 3.2571312601042217e-05, "loss": 1.0538, "step": 5122 }, { "epoch": 0.8053606869853996, "grad_norm": 0.1252065896987915, "learning_rate": 3.256542412370781e-05, "loss": 1.082, "step": 5123 }, { "epoch": 0.8055178918823321, "grad_norm": 0.12409151345491409, "learning_rate": 3.2559535184306076e-05, "loss": 1.0973, "step": 5124 }, { "epoch": 0.8056750967792646, "grad_norm": 0.14512157440185547, "learning_rate": 3.25536457831967e-05, "loss": 1.1707, "step": 5125 }, { "epoch": 0.8058323016761972, "grad_norm": 0.12563149631023407, "learning_rate": 3.254775592073937e-05, "loss": 1.1245, "step": 5126 }, { "epoch": 0.8059895065731297, "grad_norm": 0.14275604486465454, "learning_rate": 3.254186559729384e-05, "loss": 1.1181, "step": 5127 }, { "epoch": 0.8061467114700623, "grad_norm": 0.13207076489925385, "learning_rate": 3.2535974813219857e-05, "loss": 1.1979, "step": 5128 }, { "epoch": 0.8063039163669948, "grad_norm": 0.1410505622625351, "learning_rate": 3.253008356887719e-05, "loss": 1.0827, "step": 5129 }, { "epoch": 0.8064611212639273, "grad_norm": 0.12954916059970856, "learning_rate": 3.252419186462568e-05, "loss": 1.0511, "step": 5130 }, { "epoch": 0.8066183261608599, "grad_norm": 0.1363353431224823, "learning_rate": 3.251829970082515e-05, "loss": 1.0479, "step": 5131 }, { "epoch": 0.8067755310577924, "grad_norm": 0.1394612044095993, "learning_rate": 3.251240707783548e-05, "loss": 1.1188, "step": 5132 }, { "epoch": 0.806932735954725, "grad_norm": 0.1582544445991516, "learning_rate": 3.2506513996016576e-05, "loss": 1.0661, "step": 5133 }, { "epoch": 0.8070899408516575, "grad_norm": 0.1461925208568573, "learning_rate": 3.250062045572835e-05, "loss": 1.1376, "step": 5134 }, { "epoch": 0.80724714574859, "grad_norm": 0.13171181082725525, "learning_rate": 3.249472645733076e-05, "loss": 1.1843, "step": 5135 }, { "epoch": 0.8074043506455226, "grad_norm": 0.12942472100257874, "learning_rate": 3.24888320011838e-05, "loss": 1.1095, "step": 5136 }, { "epoch": 0.8075615555424551, "grad_norm": 0.1437719464302063, "learning_rate": 3.248293708764748e-05, "loss": 1.1822, "step": 5137 }, { "epoch": 0.8077187604393877, "grad_norm": 0.12577079236507416, "learning_rate": 3.247704171708183e-05, "loss": 1.1092, "step": 5138 }, { "epoch": 0.8078759653363202, "grad_norm": 0.15389268100261688, "learning_rate": 3.247114588984692e-05, "loss": 1.078, "step": 5139 }, { "epoch": 0.8080331702332527, "grad_norm": 0.12741175293922424, "learning_rate": 3.246524960630284e-05, "loss": 1.1281, "step": 5140 }, { "epoch": 0.8081903751301853, "grad_norm": 0.13801594078540802, "learning_rate": 3.245935286680972e-05, "loss": 0.9421, "step": 5141 }, { "epoch": 0.8083475800271178, "grad_norm": 0.1279487907886505, "learning_rate": 3.245345567172771e-05, "loss": 1.1482, "step": 5142 }, { "epoch": 0.8085047849240504, "grad_norm": 0.13442997634410858, "learning_rate": 3.244755802141699e-05, "loss": 1.1681, "step": 5143 }, { "epoch": 0.8086619898209829, "grad_norm": 0.13333237171173096, "learning_rate": 3.2441659916237754e-05, "loss": 1.0835, "step": 5144 }, { "epoch": 0.8088191947179154, "grad_norm": 0.12792964279651642, "learning_rate": 3.2435761356550244e-05, "loss": 1.0747, "step": 5145 }, { "epoch": 0.808976399614848, "grad_norm": 0.1448972523212433, "learning_rate": 3.242986234271473e-05, "loss": 1.1094, "step": 5146 }, { "epoch": 0.8091336045117805, "grad_norm": 0.12385497987270355, "learning_rate": 3.2423962875091486e-05, "loss": 1.0085, "step": 5147 }, { "epoch": 0.8092908094087131, "grad_norm": 0.13567715883255005, "learning_rate": 3.241806295404084e-05, "loss": 1.0447, "step": 5148 }, { "epoch": 0.8094480143056456, "grad_norm": 0.1407817155122757, "learning_rate": 3.241216257992313e-05, "loss": 1.1313, "step": 5149 }, { "epoch": 0.8096052192025781, "grad_norm": 0.16297848522663116, "learning_rate": 3.240626175309873e-05, "loss": 1.1056, "step": 5150 }, { "epoch": 0.8097624240995107, "grad_norm": 0.14074040949344635, "learning_rate": 3.2400360473928046e-05, "loss": 1.082, "step": 5151 }, { "epoch": 0.8099196289964432, "grad_norm": 0.14632785320281982, "learning_rate": 3.2394458742771494e-05, "loss": 1.0199, "step": 5152 }, { "epoch": 0.8100768338933758, "grad_norm": 0.1421334147453308, "learning_rate": 3.238855655998954e-05, "loss": 1.1271, "step": 5153 }, { "epoch": 0.8102340387903083, "grad_norm": 0.1565493792295456, "learning_rate": 3.238265392594266e-05, "loss": 1.0984, "step": 5154 }, { "epoch": 0.8103912436872408, "grad_norm": 0.14498156309127808, "learning_rate": 3.237675084099137e-05, "loss": 1.186, "step": 5155 }, { "epoch": 0.8105484485841734, "grad_norm": 0.13381299376487732, "learning_rate": 3.23708473054962e-05, "loss": 1.0743, "step": 5156 }, { "epoch": 0.8107056534811059, "grad_norm": 0.14723144471645355, "learning_rate": 3.236494331981773e-05, "loss": 1.0761, "step": 5157 }, { "epoch": 0.8108628583780385, "grad_norm": 0.12565001845359802, "learning_rate": 3.235903888431654e-05, "loss": 1.0359, "step": 5158 }, { "epoch": 0.811020063274971, "grad_norm": 0.14763836562633514, "learning_rate": 3.235313399935326e-05, "loss": 1.1093, "step": 5159 }, { "epoch": 0.8111772681719036, "grad_norm": 0.18552212417125702, "learning_rate": 3.234722866528852e-05, "loss": 1.0349, "step": 5160 }, { "epoch": 0.8113344730688361, "grad_norm": 0.14310771226882935, "learning_rate": 3.234132288248302e-05, "loss": 1.0869, "step": 5161 }, { "epoch": 0.8114916779657686, "grad_norm": 0.1727200150489807, "learning_rate": 3.2335416651297436e-05, "loss": 1.1094, "step": 5162 }, { "epoch": 0.8116488828627012, "grad_norm": 0.1451343148946762, "learning_rate": 3.2329509972092524e-05, "loss": 1.1691, "step": 5163 }, { "epoch": 0.8118060877596337, "grad_norm": 0.13956843316555023, "learning_rate": 3.232360284522903e-05, "loss": 0.9013, "step": 5164 }, { "epoch": 0.8119632926565663, "grad_norm": 0.16323736310005188, "learning_rate": 3.2317695271067725e-05, "loss": 1.1315, "step": 5165 }, { "epoch": 0.8121204975534988, "grad_norm": 0.2190149426460266, "learning_rate": 3.231178724996945e-05, "loss": 1.2037, "step": 5166 }, { "epoch": 0.8122777024504313, "grad_norm": 0.14855903387069702, "learning_rate": 3.230587878229502e-05, "loss": 1.1752, "step": 5167 }, { "epoch": 0.8124349073473639, "grad_norm": 0.13913439214229584, "learning_rate": 3.2299969868405324e-05, "loss": 1.1198, "step": 5168 }, { "epoch": 0.8125921122442964, "grad_norm": 0.13375224173069, "learning_rate": 3.2294060508661225e-05, "loss": 1.0448, "step": 5169 }, { "epoch": 0.812749317141229, "grad_norm": 0.16045208275318146, "learning_rate": 3.228815070342368e-05, "loss": 1.1786, "step": 5170 }, { "epoch": 0.8129065220381615, "grad_norm": 0.1577804535627365, "learning_rate": 3.22822404530536e-05, "loss": 1.1587, "step": 5171 }, { "epoch": 0.813063726935094, "grad_norm": 0.15006138384342194, "learning_rate": 3.2276329757912e-05, "loss": 1.0087, "step": 5172 }, { "epoch": 0.8132209318320266, "grad_norm": 0.13959920406341553, "learning_rate": 3.227041861835985e-05, "loss": 1.0801, "step": 5173 }, { "epoch": 0.8133781367289591, "grad_norm": 0.130475252866745, "learning_rate": 3.2264507034758195e-05, "loss": 1.1616, "step": 5174 }, { "epoch": 0.8135353416258917, "grad_norm": 0.12751032412052155, "learning_rate": 3.2258595007468096e-05, "loss": 1.1243, "step": 5175 }, { "epoch": 0.8136925465228242, "grad_norm": 0.13321606814861298, "learning_rate": 3.225268253685062e-05, "loss": 1.1581, "step": 5176 }, { "epoch": 0.8138497514197567, "grad_norm": 0.14121800661087036, "learning_rate": 3.224676962326691e-05, "loss": 1.1686, "step": 5177 }, { "epoch": 0.8140069563166893, "grad_norm": 0.13203032314777374, "learning_rate": 3.2240856267078065e-05, "loss": 1.0636, "step": 5178 }, { "epoch": 0.8141641612136218, "grad_norm": 0.15097764134407043, "learning_rate": 3.223494246864527e-05, "loss": 1.0259, "step": 5179 }, { "epoch": 0.8143213661105544, "grad_norm": 0.13323961198329926, "learning_rate": 3.2229028228329714e-05, "loss": 1.1322, "step": 5180 }, { "epoch": 0.8144785710074869, "grad_norm": 0.15312063694000244, "learning_rate": 3.222311354649263e-05, "loss": 0.9622, "step": 5181 }, { "epoch": 0.8146357759044194, "grad_norm": 0.13417083024978638, "learning_rate": 3.2217198423495245e-05, "loss": 1.0706, "step": 5182 }, { "epoch": 0.814792980801352, "grad_norm": 0.13406860828399658, "learning_rate": 3.2211282859698846e-05, "loss": 1.0491, "step": 5183 }, { "epoch": 0.8149501856982845, "grad_norm": 0.15189844369888306, "learning_rate": 3.220536685546472e-05, "loss": 1.2357, "step": 5184 }, { "epoch": 0.8151073905952171, "grad_norm": 0.1376749873161316, "learning_rate": 3.21994504111542e-05, "loss": 1.1067, "step": 5185 }, { "epoch": 0.8152645954921496, "grad_norm": 0.15771864354610443, "learning_rate": 3.2193533527128654e-05, "loss": 1.1533, "step": 5186 }, { "epoch": 0.8154218003890821, "grad_norm": 0.1384822279214859, "learning_rate": 3.218761620374944e-05, "loss": 1.2118, "step": 5187 }, { "epoch": 0.8155790052860147, "grad_norm": 0.16346484422683716, "learning_rate": 3.218169844137797e-05, "loss": 1.0801, "step": 5188 }, { "epoch": 0.8157362101829472, "grad_norm": 0.1343608945608139, "learning_rate": 3.2175780240375695e-05, "loss": 1.0816, "step": 5189 }, { "epoch": 0.8158934150798798, "grad_norm": 0.14006014168262482, "learning_rate": 3.216986160110406e-05, "loss": 0.9928, "step": 5190 }, { "epoch": 0.8160506199768123, "grad_norm": 0.1414293646812439, "learning_rate": 3.216394252392456e-05, "loss": 1.2733, "step": 5191 }, { "epoch": 0.8162078248737448, "grad_norm": 0.11960668116807938, "learning_rate": 3.2158023009198706e-05, "loss": 1.0705, "step": 5192 }, { "epoch": 0.8163650297706774, "grad_norm": 0.13784858584403992, "learning_rate": 3.2152103057288045e-05, "loss": 1.0802, "step": 5193 }, { "epoch": 0.8165222346676099, "grad_norm": 0.15226618945598602, "learning_rate": 3.214618266855413e-05, "loss": 1.1768, "step": 5194 }, { "epoch": 0.8166794395645425, "grad_norm": 0.1458420753479004, "learning_rate": 3.2140261843358574e-05, "loss": 1.0452, "step": 5195 }, { "epoch": 0.816836644461475, "grad_norm": 0.12530644237995148, "learning_rate": 3.2134340582062994e-05, "loss": 0.9575, "step": 5196 }, { "epoch": 0.8169938493584075, "grad_norm": 0.13370977342128754, "learning_rate": 3.2128418885029034e-05, "loss": 1.1113, "step": 5197 }, { "epoch": 0.8171510542553401, "grad_norm": 0.13998764753341675, "learning_rate": 3.212249675261838e-05, "loss": 1.152, "step": 5198 }, { "epoch": 0.8173082591522726, "grad_norm": 0.16114076972007751, "learning_rate": 3.21165741851927e-05, "loss": 1.0632, "step": 5199 }, { "epoch": 0.8174654640492052, "grad_norm": 0.14692504703998566, "learning_rate": 3.211065118311377e-05, "loss": 1.1338, "step": 5200 }, { "epoch": 0.8176226689461377, "grad_norm": 0.17466497421264648, "learning_rate": 3.21047277467433e-05, "loss": 1.0663, "step": 5201 }, { "epoch": 0.8177798738430702, "grad_norm": 0.1547803282737732, "learning_rate": 3.2098803876443103e-05, "loss": 1.048, "step": 5202 }, { "epoch": 0.8179370787400028, "grad_norm": 0.14291557669639587, "learning_rate": 3.2092879572574975e-05, "loss": 1.0921, "step": 5203 }, { "epoch": 0.8180942836369353, "grad_norm": 0.13960640132427216, "learning_rate": 3.2086954835500736e-05, "loss": 1.1639, "step": 5204 }, { "epoch": 0.8182514885338679, "grad_norm": 0.17912665009498596, "learning_rate": 3.2081029665582274e-05, "loss": 1.0639, "step": 5205 }, { "epoch": 0.8184086934308004, "grad_norm": 0.14091219007968903, "learning_rate": 3.207510406318146e-05, "loss": 1.1496, "step": 5206 }, { "epoch": 0.8185658983277329, "grad_norm": 0.14809809625148773, "learning_rate": 3.206917802866021e-05, "loss": 1.1601, "step": 5207 }, { "epoch": 0.8187231032246655, "grad_norm": 0.1427210122346878, "learning_rate": 3.206325156238045e-05, "loss": 1.1745, "step": 5208 }, { "epoch": 0.818880308121598, "grad_norm": 0.1384812444448471, "learning_rate": 3.205732466470417e-05, "loss": 1.1849, "step": 5209 }, { "epoch": 0.8190375130185306, "grad_norm": 0.14473986625671387, "learning_rate": 3.2051397335993347e-05, "loss": 1.0299, "step": 5210 }, { "epoch": 0.819194717915463, "grad_norm": 0.1633637249469757, "learning_rate": 3.204546957661001e-05, "loss": 1.0674, "step": 5211 }, { "epoch": 0.8193519228123957, "grad_norm": 0.1735648661851883, "learning_rate": 3.203954138691619e-05, "loss": 1.1624, "step": 5212 }, { "epoch": 0.8195091277093282, "grad_norm": 0.13555194437503815, "learning_rate": 3.203361276727397e-05, "loss": 1.1849, "step": 5213 }, { "epoch": 0.8196663326062607, "grad_norm": 0.14566943049430847, "learning_rate": 3.202768371804544e-05, "loss": 1.0632, "step": 5214 }, { "epoch": 0.8198235375031933, "grad_norm": 0.15393325686454773, "learning_rate": 3.202175423959272e-05, "loss": 1.0622, "step": 5215 }, { "epoch": 0.8199807424001258, "grad_norm": 0.1464414745569229, "learning_rate": 3.201582433227798e-05, "loss": 1.1167, "step": 5216 }, { "epoch": 0.8201379472970584, "grad_norm": 0.14175230264663696, "learning_rate": 3.2009893996463384e-05, "loss": 1.0924, "step": 5217 }, { "epoch": 0.8202951521939909, "grad_norm": 0.1299310177564621, "learning_rate": 3.200396323251112e-05, "loss": 1.0429, "step": 5218 }, { "epoch": 0.8204523570909233, "grad_norm": 0.15049515664577484, "learning_rate": 3.199803204078344e-05, "loss": 1.1402, "step": 5219 }, { "epoch": 0.820609561987856, "grad_norm": 0.13316011428833008, "learning_rate": 3.199210042164259e-05, "loss": 1.1151, "step": 5220 }, { "epoch": 0.8207667668847884, "grad_norm": 0.13929545879364014, "learning_rate": 3.1986168375450845e-05, "loss": 1.0497, "step": 5221 }, { "epoch": 0.820923971781721, "grad_norm": 0.14065782725811005, "learning_rate": 3.198023590257052e-05, "loss": 1.1291, "step": 5222 }, { "epoch": 0.8210811766786535, "grad_norm": 0.13490915298461914, "learning_rate": 3.197430300336394e-05, "loss": 1.1718, "step": 5223 }, { "epoch": 0.821238381575586, "grad_norm": 0.1275462955236435, "learning_rate": 3.196836967819347e-05, "loss": 1.1754, "step": 5224 }, { "epoch": 0.8213955864725186, "grad_norm": 0.1298583745956421, "learning_rate": 3.196243592742148e-05, "loss": 0.9097, "step": 5225 }, { "epoch": 0.8215527913694511, "grad_norm": 0.13181264698505402, "learning_rate": 3.1956501751410416e-05, "loss": 1.2361, "step": 5226 }, { "epoch": 0.8217099962663837, "grad_norm": 0.12378641217947006, "learning_rate": 3.195056715052268e-05, "loss": 0.9561, "step": 5227 }, { "epoch": 0.8218672011633162, "grad_norm": 0.13846050202846527, "learning_rate": 3.194463212512075e-05, "loss": 1.1274, "step": 5228 }, { "epoch": 0.8220244060602487, "grad_norm": 0.13270175457000732, "learning_rate": 3.1938696675567114e-05, "loss": 1.1153, "step": 5229 }, { "epoch": 0.8221816109571813, "grad_norm": 0.13590794801712036, "learning_rate": 3.1932760802224285e-05, "loss": 1.1178, "step": 5230 }, { "epoch": 0.8223388158541138, "grad_norm": 0.13039462268352509, "learning_rate": 3.192682450545481e-05, "loss": 1.1305, "step": 5231 }, { "epoch": 0.8224960207510464, "grad_norm": 0.13966499269008636, "learning_rate": 3.1920887785621235e-05, "loss": 1.1426, "step": 5232 }, { "epoch": 0.8226532256479789, "grad_norm": 0.1593773514032364, "learning_rate": 3.191495064308618e-05, "loss": 1.0802, "step": 5233 }, { "epoch": 0.8228104305449114, "grad_norm": 0.14048096537590027, "learning_rate": 3.1909013078212235e-05, "loss": 1.0785, "step": 5234 }, { "epoch": 0.822967635441844, "grad_norm": 0.1394653022289276, "learning_rate": 3.190307509136207e-05, "loss": 1.0112, "step": 5235 }, { "epoch": 0.8231248403387765, "grad_norm": 0.15474697947502136, "learning_rate": 3.189713668289834e-05, "loss": 1.1056, "step": 5236 }, { "epoch": 0.8232820452357091, "grad_norm": 0.15830832719802856, "learning_rate": 3.1891197853183744e-05, "loss": 1.2166, "step": 5237 }, { "epoch": 0.8234392501326416, "grad_norm": 0.1479511260986328, "learning_rate": 3.1885258602581e-05, "loss": 0.9892, "step": 5238 }, { "epoch": 0.8235964550295741, "grad_norm": 0.1471664309501648, "learning_rate": 3.187931893145285e-05, "loss": 1.0284, "step": 5239 }, { "epoch": 0.8237536599265067, "grad_norm": 0.15496090054512024, "learning_rate": 3.1873378840162086e-05, "loss": 1.0534, "step": 5240 }, { "epoch": 0.8239108648234392, "grad_norm": 0.1371716558933258, "learning_rate": 3.186743832907149e-05, "loss": 1.1141, "step": 5241 }, { "epoch": 0.8240680697203718, "grad_norm": 0.1419869214296341, "learning_rate": 3.186149739854389e-05, "loss": 1.0923, "step": 5242 }, { "epoch": 0.8242252746173043, "grad_norm": 0.13275684416294098, "learning_rate": 3.185555604894214e-05, "loss": 1.2053, "step": 5243 }, { "epoch": 0.8243824795142368, "grad_norm": 0.14531686902046204, "learning_rate": 3.1849614280629096e-05, "loss": 1.0359, "step": 5244 }, { "epoch": 0.8245396844111694, "grad_norm": 0.1412031352519989, "learning_rate": 3.1843672093967685e-05, "loss": 1.0295, "step": 5245 }, { "epoch": 0.8246968893081019, "grad_norm": 0.13067127764225006, "learning_rate": 3.183772948932082e-05, "loss": 1.0293, "step": 5246 }, { "epoch": 0.8248540942050345, "grad_norm": 0.1574161797761917, "learning_rate": 3.183178646705146e-05, "loss": 1.1704, "step": 5247 }, { "epoch": 0.825011299101967, "grad_norm": 0.14514204859733582, "learning_rate": 3.1825843027522554e-05, "loss": 1.0959, "step": 5248 }, { "epoch": 0.8251685039988995, "grad_norm": 0.1517404168844223, "learning_rate": 3.1819899171097146e-05, "loss": 1.035, "step": 5249 }, { "epoch": 0.8253257088958321, "grad_norm": 0.13299009203910828, "learning_rate": 3.181395489813824e-05, "loss": 1.0324, "step": 5250 }, { "epoch": 0.8254829137927646, "grad_norm": 0.14681966602802277, "learning_rate": 3.180801020900889e-05, "loss": 1.1329, "step": 5251 }, { "epoch": 0.8256401186896972, "grad_norm": 0.11702141910791397, "learning_rate": 3.180206510407218e-05, "loss": 1.0655, "step": 5252 }, { "epoch": 0.8257973235866297, "grad_norm": 0.12597863376140594, "learning_rate": 3.1796119583691214e-05, "loss": 1.1097, "step": 5253 }, { "epoch": 0.8259545284835622, "grad_norm": 0.13226962089538574, "learning_rate": 3.179017364822913e-05, "loss": 1.0156, "step": 5254 }, { "epoch": 0.8261117333804948, "grad_norm": 0.13883601129055023, "learning_rate": 3.178422729804906e-05, "loss": 1.1682, "step": 5255 }, { "epoch": 0.8262689382774273, "grad_norm": 0.12312841415405273, "learning_rate": 3.177828053351421e-05, "loss": 1.1061, "step": 5256 }, { "epoch": 0.8264261431743599, "grad_norm": 0.1382024735212326, "learning_rate": 3.177233335498777e-05, "loss": 1.062, "step": 5257 }, { "epoch": 0.8265833480712924, "grad_norm": 0.138153076171875, "learning_rate": 3.176638576283298e-05, "loss": 1.1267, "step": 5258 }, { "epoch": 0.8267405529682249, "grad_norm": 0.15296030044555664, "learning_rate": 3.176043775741308e-05, "loss": 1.1957, "step": 5259 }, { "epoch": 0.8268977578651575, "grad_norm": 0.1348961442708969, "learning_rate": 3.175448933909138e-05, "loss": 1.1818, "step": 5260 }, { "epoch": 0.82705496276209, "grad_norm": 0.14340707659721375, "learning_rate": 3.1748540508231165e-05, "loss": 1.1147, "step": 5261 }, { "epoch": 0.8272121676590226, "grad_norm": 0.14010478556156158, "learning_rate": 3.174259126519576e-05, "loss": 1.0012, "step": 5262 }, { "epoch": 0.8273693725559551, "grad_norm": 0.13827413320541382, "learning_rate": 3.173664161034855e-05, "loss": 1.0526, "step": 5263 }, { "epoch": 0.8275265774528877, "grad_norm": 0.1358298659324646, "learning_rate": 3.1730691544052894e-05, "loss": 1.1188, "step": 5264 }, { "epoch": 0.8276837823498202, "grad_norm": 0.13605011999607086, "learning_rate": 3.172474106667221e-05, "loss": 1.1694, "step": 5265 }, { "epoch": 0.8278409872467527, "grad_norm": 0.13801071047782898, "learning_rate": 3.171879017856993e-05, "loss": 1.1848, "step": 5266 }, { "epoch": 0.8279981921436853, "grad_norm": 0.15380828082561493, "learning_rate": 3.1712838880109506e-05, "loss": 1.087, "step": 5267 }, { "epoch": 0.8281553970406178, "grad_norm": 0.13930104672908783, "learning_rate": 3.170688717165442e-05, "loss": 1.0753, "step": 5268 }, { "epoch": 0.8283126019375504, "grad_norm": 0.14911097288131714, "learning_rate": 3.170093505356819e-05, "loss": 1.0268, "step": 5269 }, { "epoch": 0.8284698068344829, "grad_norm": 0.14786234498023987, "learning_rate": 3.169498252621434e-05, "loss": 1.1067, "step": 5270 }, { "epoch": 0.8286270117314154, "grad_norm": 0.12906505167484283, "learning_rate": 3.168902958995643e-05, "loss": 1.0424, "step": 5271 }, { "epoch": 0.828784216628348, "grad_norm": 0.144206240773201, "learning_rate": 3.1683076245158036e-05, "loss": 1.2012, "step": 5272 }, { "epoch": 0.8289414215252805, "grad_norm": 0.1476116180419922, "learning_rate": 3.167712249218278e-05, "loss": 1.0983, "step": 5273 }, { "epoch": 0.8290986264222131, "grad_norm": 0.13781993091106415, "learning_rate": 3.167116833139428e-05, "loss": 1.0777, "step": 5274 }, { "epoch": 0.8292558313191456, "grad_norm": 0.1404123455286026, "learning_rate": 3.166521376315621e-05, "loss": 1.0654, "step": 5275 }, { "epoch": 0.8294130362160781, "grad_norm": 0.15808063745498657, "learning_rate": 3.1659258787832245e-05, "loss": 1.093, "step": 5276 }, { "epoch": 0.8295702411130107, "grad_norm": 0.14129871129989624, "learning_rate": 3.165330340578608e-05, "loss": 1.0843, "step": 5277 }, { "epoch": 0.8297274460099432, "grad_norm": 0.1428525745868683, "learning_rate": 3.1647347617381464e-05, "loss": 1.067, "step": 5278 }, { "epoch": 0.8298846509068758, "grad_norm": 0.12404271215200424, "learning_rate": 3.164139142298214e-05, "loss": 1.048, "step": 5279 }, { "epoch": 0.8300418558038083, "grad_norm": 0.12726539373397827, "learning_rate": 3.16354348229519e-05, "loss": 1.0153, "step": 5280 }, { "epoch": 0.8300418558038083, "eval_loss": 1.0945003032684326, "eval_runtime": 2386.806, "eval_samples_per_second": 3.879, "eval_steps_per_second": 1.939, "step": 5280 } ], "logging_steps": 1, "max_steps": 12722, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 160, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.7046171333707694e+19, "train_batch_size": 1, "trial_name": null, "trial_params": null }